initial code repo

This patch creates the initial code repo. For ceph, the luminous stable release is used as the base code, and subsequent changes and optimizations for ceph will be added on top of it. For opensds, any changes can currently be upstreamed into the original opensds repo (https://github.com/opensds/opensds), so stor4nfv will clone the opensds code directly to deploy the stor4nfv environment. The deployment scripts based on ceph and opensds will be put into the 'ci' directory. Change-Id: I46a32218884c75dda2936337604ff03c554648e4 Signed-off-by: Qiaowei Ren <qiaowei.ren@intel.com>
author: Qiaowei Ren <qiaowei.ren@intel.com> 2018-01-04 13:43:33 +0800
committer: Qiaowei Ren <qiaowei.ren@intel.com> 2018-01-05 11:59:39 +0800
commit: 812ff6ca9fcd3e629e49d4328905f33eee8ca3f5 (patch)
tree: 04ece7b4da00d9d2f98093774594f4057ae561d4 /src/ceph/qa/tasks
parent: 15280273faafb77777eab341909a3f495cf248d9 (diff)
141 files changed, 35401 insertions, 0 deletions
diff --git a/src/ceph/qa/tasks/__init__.py b/src/ceph/qa/tasks/__init__.py
new file mode 100644
index 0000000..9a7949a
--- /dev/null
+++ b/src/ceph/qa/tasks/__init__.py
@@ -0,0 +1,6 @@
+import logging
+
+# Inherit teuthology's log level
+teuthology_log = logging.getLogger('teuthology')
+log = logging.getLogger(__name__)
+log.setLevel(teuthology_log.level)
diff --git a/src/ceph/qa/tasks/admin_socket.py b/src/ceph/qa/tasks/admin_socket.py
new file mode 100644
index 0000000..3301372
--- /dev/null
+++ b/src/ceph/qa/tasks/admin_socket.py
@@ -0,0 +1,199 @@
+"""
+Admin Socket task -- used in rados, powercycle, and smoke testing
+"""
+from cStringIO import StringIO
+
+import json
+import logging
+import os
+import time
+
+from teuthology.orchestra import run
+from teuthology import misc as teuthology
+from teuthology.parallel import parallel
+from teuthology.config import config as teuth_config
+
+log = logging.getLogger(__name__)
+
+
+def task(ctx, config):
+    """
+    Run an admin socket command, make sure the output is json, and run
+    a test program on it. The test program should read json from
+    stdin. This task succeeds if the test program exits with status 0.
+
+    To run the same test on all clients::
+
+        tasks:
+        - ceph:
+        - rados:
+        - admin_socket:
+            all:
+              dump_requests:
+                test: http://example.com/script
+
+    To restrict it to certain clients::
+
+        tasks:
+        - ceph:
+        - rados: [client.1]
+        - admin_socket:
+            client.1:
+              dump_requests:
+                test: http://example.com/script
+
+    If an admin socket command has arguments, they can be specified as
+    a list::
+
+        tasks:
+        - ceph:
+        - rados: [client.0]
+        - admin_socket:
+            client.0:
+              dump_requests:
+                test: http://example.com/script
+              help:
+                test: http://example.com/test_help_version
+                args: [version]
+
+    Note that there must be a ceph client with an admin socket running
+    before this task is run. The tests are parallelized at the client
+    level. Tests for a single client are run serially.
+
+    :param ctx: Context
+    :param config: Configuration
+    """
+    assert isinstance(config, dict), \
+        'admin_socket task requires a dict for configuration'
+    teuthology.replace_all_with_clients(ctx.cluster, config)
+
+    with parallel() as ptask:
+        for client, tests in config.iteritems():
+            ptask.spawn(_run_tests, ctx, client, tests)
+
+
+def _socket_command(ctx, remote, socket_path, command, args):
+    """
+    Run an admin socket command and return the result as a string.
+
+    :param ctx: Context
+    :param remote: Remote site
+    :param socket_path: path to socket
+    :param command: command to be run remotely
+    :param args: command arguments
+
+    :returns: output of command in json format
+    """
+    json_fp = StringIO()
+    testdir = teuthology.get_testdir(ctx)
+    max_tries = 120
+    while True:
+        proc = remote.run(
+            args=[
+                'sudo',
+                'adjust-ulimits',
+                'ceph-coverage',
+                '{tdir}/archive/coverage'.format(tdir=testdir),
+                'ceph',
+                '--admin-daemon', socket_path,
+                ] + command.split(' ') + args,
+            stdout=json_fp,
+            check_status=False,
+            )
+        if proc.exitstatus == 0:
+            break
+        assert max_tries > 0
+        max_tries -= 1
+        log.info('ceph cli returned an error, command not registered yet?')
+        log.info('sleeping and retrying ...')
+        time.sleep(1)
+    out = json_fp.getvalue()
+    json_fp.close()
+    log.debug('admin socket command %s returned %s', command, out)
+    return json.loads(out)
+
+def _run_tests(ctx, client, tests):
+    """
+    Create a temp directory and wait for a client socket to be created.
+    For each test, copy the executable locally and run the test.
+    Remove temp directory when finished.
+
+    :param ctx: Context
+    :param client: client machine to run the test
+    :param tests: list of tests to run
+    """
+    testdir = teuthology.get_testdir(ctx)
+    log.debug('Running admin socket tests on %s', client)
+    (remote,) = ctx.cluster.only(client).remotes.iterkeys()
+    socket_path = '/var/run/ceph/ceph-{name}.asok'.format(name=client)
+    overrides = ctx.config.get('overrides', {}).get('admin_socket', {})
+
+    try:
+        tmp_dir = os.path.join(
+            testdir,
+            'admin_socket_{client}'.format(client=client),
+            )
+        remote.run(
+            args=[
+                'mkdir',
+                '--',
+                tmp_dir,
+                run.Raw('&&'),
+                # wait for client process to create the socket
+                'while', 'test', '!', '-e', socket_path, run.Raw(';'),
+                'do', 'sleep', '1', run.Raw(';'), 'done',
+                ],
+            )
+
+        for command, config in tests.iteritems():
+            if config is None:
+                config = {}
+            teuthology.deep_merge(config, overrides)
+            log.debug('Testing %s with config %s', command, str(config))
+
+            test_path = None
+            if 'test' in config:
+                # hack: the git_url is always ceph-ci or ceph
+                git_url = teuth_config.get_ceph_git_url()
+                repo_name = 'ceph.git'
+                if git_url.count('ceph-ci'):
+                    repo_name = 'ceph-ci.git'
+                url = config['test'].format(
+                    branch=config.get('branch', 'master'),
+                    repo=repo_name,
+                    )
+                test_path = os.path.join(tmp_dir, command)
+                remote.run(
+                    args=[
+                        'wget',
+                        '-q',
+                        '-O',
+                        test_path,
+                        '--',
+                        url,
+                        run.Raw('&&'),
+                        'chmod',
+                        'u=rx',
+                        '--',
+                        test_path,
+                        ],
+                    )
+
+            args = config.get('args', [])
+            assert isinstance(args, list), \
+                'admin socket command args must be a list'
+            sock_out = _socket_command(ctx, remote, socket_path, command, args)
+            if test_path is not None:
+                remote.run(
+                    args=[
+                        test_path,
+                        ],
+                    stdin=json.dumps(sock_out),
+                    )
+
+    finally:
+        remote.run(
+            args=[
+                'rm', '-rf', '--', tmp_dir,
+                ],
+            )
diff --git a/src/ceph/qa/tasks/autotest.py b/src/ceph/qa/tasks/autotest.py
new file mode 100644
index 0000000..efa9721
--- /dev/null
+++ b/src/ceph/qa/tasks/autotest.py
@@ -0,0 +1,166 @@
+""" 
+Run an autotest test on the ceph cluster.
+"""
+import json
+import logging
+import os
+
+from teuthology import misc as teuthology
+from teuthology.parallel import parallel
+from teuthology.orchestra import run
+
+log = logging.getLogger(__name__)
+
+def task(ctx, config):
+    """
+    Run an autotest test on the ceph cluster.
+
+    Only autotest client tests are supported.
+
+    The config is a mapping from role name to list of tests to run on
+    that client.
+
+    For example::
+
+        tasks:
+        - ceph:
+        - ceph-fuse: [client.0, client.1]
+        - autotest:
+            client.0: [dbench]
+            client.1: [bonnie]
+
+    You can also specify a list of tests to run on all clients::
+
+        tasks:
+        - ceph:
+        - ceph-fuse:
+        - autotest:
+            all: [dbench]
+    """
+    assert isinstance(config, dict)
+    config = teuthology.replace_all_with_clients(ctx.cluster, config)
+    log.info('Setting up autotest...')
+    testdir = teuthology.get_testdir(ctx)
+    with parallel() as p:
+        for role in config.iterkeys():
+            (remote,) = ctx.cluster.only(role).remotes.keys()
+            p.spawn(_download, testdir, remote)
+
+    log.info('Making a separate scratch dir for every client...')
+    for role in config.iterkeys():
+        assert isinstance(role, basestring)
+        PREFIX = 'client.'
+        assert role.startswith(PREFIX)
+        id_ = role[len(PREFIX):]
+        (remote,) = ctx.cluster.only(role).remotes.iterkeys()
+        mnt = os.path.join(testdir, 'mnt.{id}'.format(id=id_))
+        scratch = os.path.join(mnt, 'client.{id}'.format(id=id_))
+        remote.run(
+            args=[
+                'sudo',
+                'install',
+                '-d',
+                '-m', '0755',
+                '--owner={user}'.format(user='ubuntu'), #TODO
+                '--',
+                scratch,
+                ],
+            )
+
+    with parallel() as p:
+        for role, tests in config.iteritems():
+            (remote,) = ctx.cluster.only(role).remotes.keys()
+            p.spawn(_run_tests, testdir, remote, role, tests)
+
+def _download(testdir, remote):
+    """
+    Download.  Does not explicitly support muliple tasks in a single run.
+    """
+    remote.run(
+        args=[
+            # explicitly does not support multiple autotest tasks
+            # in a single run; the result archival would conflict
+            'mkdir', '{tdir}/archive/autotest'.format(tdir=testdir),
+            run.Raw('&&'),
+            'mkdir', '{tdir}/autotest'.format(tdir=testdir),
+            run.Raw('&&'),
+            'wget',
+            '-nv',
+            '--no-check-certificate',
+            'https://github.com/ceph/autotest/tarball/ceph',
+            '-O-',
+            run.Raw('|'),
+            'tar',
+            '-C', '{tdir}/autotest'.format(tdir=testdir),
+            '-x',
+            '-z',
+            '-f-',
+            '--strip-components=1',
+            ],
+        )
+
+def _run_tests(testdir, remote, role, tests):
+    """
+    Spawned to run test on remote site
+    """
+    assert isinstance(role, basestring)
+    PREFIX = 'client.'
+    assert role.startswith(PREFIX)
+    id_ = role[len(PREFIX):]
+    mnt = os.path.join(testdir, 'mnt.{id}'.format(id=id_))
+    scratch = os.path.join(mnt, 'client.{id}'.format(id=id_))
+
+    assert isinstance(tests, list)
+    for idx, testname in enumerate(tests):
+        log.info('Running autotest client test #%d: %s...', idx, testname)
+
+        tag = 'client.{id}.num{idx}.{testname}'.format(
+            idx=idx,
+            testname=testname,
+            id=id_,
+            )
+        control = '{tdir}/control.{tag}'.format(tdir=testdir, tag=tag)
+        teuthology.write_file(
+            remote=remote,
+            path=control,
+            data='import json; data=json.loads({data!r}); job.run_test(**data)'.format(
+                data=json.dumps(dict(
+                        url=testname,
+                        dir=scratch,
+                        # TODO perhaps tag
+                        # results will be in {testdir}/autotest/client/results/dbench
+                        # or {testdir}/autotest/client/results/dbench.{tag}
+                        )),
+                ),
+            )
+        remote.run(
+            args=[
+                '{tdir}/autotest/client/bin/autotest'.format(tdir=testdir),
+                '--verbose',
+                '--harness=simple',
+                '--tag={tag}'.format(tag=tag),
+                control,
+                run.Raw('3>&1'),
+                ],
+            )
+
+        remote.run(
+            args=[
+                'rm', '-rf', '--', control,
+                ],
+            )
+
+        remote.run(
+            args=[
+                'mv',
+                '--',
+                '{tdir}/autotest/client/results/{tag}'.format(tdir=testdir, tag=tag),
+                '{tdir}/archive/autotest/{tag}'.format(tdir=testdir, tag=tag),
+                ],
+            )
+
+    remote.run(
+        args=[
+            'rm', '-rf', '--', '{tdir}/autotest'.format(tdir=testdir),
+            ],
+        )
diff --git a/src/ceph/qa/tasks/aver.py b/src/ceph/qa/tasks/aver.py
new file mode 100644
index 0000000..79ee18c
--- /dev/null
+++ b/src/ceph/qa/tasks/aver.py
@@ -0,0 +1,67 @@
+"""
+Aver wrapper task
+"""
+import contextlib
+import logging
+from subprocess import check_call, Popen, PIPE
+
+log = logging.getLogger(__name__)
+
+
+@contextlib.contextmanager
+def task(ctx, config):
+    """
+    Execute an aver assertion
+
+    Parameters:
+
+        input: file containing data referred to by the assertions. File name is
+               relative to the job's archive path
+        validations: list of validations in the Aver language
+
+    Example:
+    - aver:
+        input: bench_output.csv
+        validations:
+        - expect performance(alg='ceph') > performance(alg='raw')
+        - for size > 3 expect avg_throughput > 2000
+    """
+    log.info('Beginning aver...')
+    assert isinstance(config, dict), 'expecting dictionary for configuration'
+
+    if 'input' not in config:
+        raise Exception("Expecting 'input' option")
+    if len(config.get('validations', [])) < 1:
+        raise Exception("Expecting at least one entry in 'validations'")
+
+    url = ('https://github.com/ivotron/aver/releases/download/'
+           'v0.3.0/aver-linux-amd64.tar.bz2')
+
+    aver_path = ctx.archive + '/aver'
+
+    # download binary
+    check_call(['wget', '-O', aver_path + '.tbz', url])
+    check_call(['tar', 'xfj', aver_path + '.tbz', '-C', ctx.archive])
+
+    # print version
+    process = Popen([aver_path, '-v'], stdout=PIPE)
+    log.info(process.communicate()[0])
+
+    # validate
+    for validation in config['validations']:
+        cmd = (aver_path + ' -s -i ' + (ctx.archive + '/' + config['input']) +
+               ' "' + validation + '"')
+        log.info("executing: " + cmd)
+        process = Popen(cmd, stdout=PIPE, stderr=PIPE, shell=True)
+        (stdout, stderr) = process.communicate()
+        if stderr:
+            log.info('aver stderr: ' + stderr)
+        log.info('aver result: ' + stdout)
+        if stdout.strip(' \t\n\r') != 'true':
+            raise Exception('Failed validation: ' + validation)
+
+    try:
+        yield
+    finally:
+        log.info('Removing aver binary...')
+        check_call(['rm', aver_path, aver_path + '.tbz'])
diff --git a/src/ceph/qa/tasks/blktrace.py b/src/ceph/qa/tasks/blktrace.py
new file mode 100644
index 0000000..96aaf50
--- /dev/null
+++ b/src/ceph/qa/tasks/blktrace.py
@@ -0,0 +1,96 @@
+"""
+Run blktrace program through teuthology
+"""
+import contextlib
+import logging
+
+from teuthology import misc as teuthology
+from teuthology import contextutil
+from teuthology.orchestra import run
+
+log = logging.getLogger(__name__)
+# Path of the blktrace binary that execute() runs (via sudo) on the remotes.
+blktrace = '/usr/sbin/blktrace'
+# Signal name passed as the first argument to daemon-helper in execute().
+daemon_signal = 'term'
+
+@contextlib.contextmanager
+def setup(ctx, config):
+    """
+    Setup all the remotes
+    """
+    osds = ctx.cluster.only(teuthology.is_type('osd', config['cluster']))
+    log_dir = '{tdir}/archive/performance/blktrace'.format(tdir=teuthology.get_testdir(ctx))
+
+    for remote, roles_for_host in osds.remotes.iteritems():
+        log.info('Creating %s on %s' % (log_dir, remote.name))
+        remote.run(
+            args=['mkdir', '-p', '-m0755', '--', log_dir],
+            wait=False,
+            )
+    yield
+
+@contextlib.contextmanager
+def execute(ctx, config):
+    """
+    Run the blktrace program on remote machines.
+    """
+    procs = []
+    testdir = teuthology.get_testdir(ctx)
+    log_dir = '{tdir}/archive/performance/blktrace'.format(tdir=testdir)
+
+    osds = ctx.cluster.only(teuthology.is_type('osd'))
+    for remote, roles_for_host in osds.remotes.iteritems():
+        roles_to_devs = ctx.disk_config.remote_to_roles_to_dev[remote]
+        for role in teuthology.cluster_roles_of_type(roles_for_host, 'osd',
+                                                     config['cluster']):
+            if roles_to_devs.get(role):
+                dev = roles_to_devs[role]
+                log.info("running blktrace on %s: %s" % (remote.name, dev))
+
+                proc = remote.run(
+                    args=[
+                        'cd',
+                        log_dir,
+                        run.Raw(';'),
+                        'daemon-helper',
+                        daemon_signal,
+                        'sudo',
+                        blktrace,
+                        '-o',
+                        dev.rsplit("/", 1)[1],
+                        '-d',
+                        dev,
+                        ],
+                    wait=False,
+                    stdin=run.PIPE,
+                    )
+                procs.append(proc)
+    try:
+        yield
+    finally:
+        osds = ctx.cluster.only(teuthology.is_type('osd'))
+        log.info('stopping blktrace processs')
+        for proc in procs:
+            proc.stdin.close()
+
+@contextlib.contextmanager
+def task(ctx, config):
+    """
+    Usage:
+        blktrace:
+
+    or:
+        blktrace:
+          cluster: backup
+
+    Runs blktrace on all osds in the specified cluster (the 'ceph' cluster by
+    default).
+    """
+    if config is None:
+        config = {}
+    config['cluster'] = config.get('cluster', 'ceph')
+
+    with contextutil.nested(
+        lambda: setup(ctx=ctx, config=config),
+        lambda: execute(ctx=ctx, config=config),
+        ):
+        yield
diff --git a/src/ceph/qa/tasks/boto.cfg.template b/src/ceph/qa/tasks/boto.cfg.template
new file mode 100644
index 0000000..cdfe887
--- /dev/null
+++ b/src/ceph/qa/tasks/boto.cfg.template
@@ -0,0 +1,2 @@
+[Boto]
+http_socket_timeout = {idle_timeout}
diff --git a/src/ceph/qa/tasks/calamari_nosetests.py b/src/ceph/qa/tasks/calamari_nosetests.py
new file mode 100644
index 0000000..c6bbaf3
--- /dev/null
+++ b/src/ceph/qa/tasks/calamari_nosetests.py
@@ -0,0 +1,289 @@
+import contextlib
+import logging
+import os
+import textwrap
+import yaml
+
+from cStringIO import StringIO
+from teuthology import contextutil
+from teuthology import misc
+from teuthology import packaging
+from teuthology.orchestra import run
+
+log = logging.getLogger(__name__)
+
+# extra stuff we need to do our job here; installed by install_extra_pkgs()
+# on entry and removed again on exit
+EXTRA_PKGS = [
+    'git',
+]
+
+# stuff that would be in a devmode install, but should be
+# installed in the system for running nosetests against
+# a production install (handled by prepare_nosetest_env()).
+EXTRA_NOSETEST_PKGS = [
+    'python-psutil',
+    'python-mock',
+]
+
+
+def find_client0(cluster):
+    ''' Find remote that has client.0 role, or None '''
+    for rem, roles in cluster.remotes.iteritems():
+        if 'client.0' in roles:
+            return rem
+    return None
+
+
+def pip(remote, package, venv=None, uninstall=False, force=False):
+    ''' {un}install a package with pip, possibly in a virtualenv '''
+    if venv:
+        pip = os.path.join(venv, 'bin', 'pip')
+        args = ['sudo', pip]
+    else:
+        args = ['sudo', 'pip']
+
+    if uninstall:
+        args.extend(['uninstall', '-y'])
+    else:
+        args.append('install')
+        if force:
+            args.append('-I')
+
+    args.append(package)
+    remote.run(args=args)
+
+
+@contextlib.contextmanager
+def install_epel(remote):
+    ''' install a disabled-by-default epel repo config file '''
+    remove = False
+    try:
+        if remote.os.package_type == 'deb':
+            yield
+        else:
+            remove = True
+            distromajor = remote.os.version.split('.')[0]
+
+            repofiledata = textwrap.dedent('''
+                [epel]
+                name=epel{version}
+                metalink=http://mirrors.fedoraproject.org/metalink?repo=epel-{version}&arch=$basearch
+                enabled=0
+                gpgcheck=0
+            ''').format(version=distromajor)
+
+            misc.create_file(remote, '/etc/yum.repos.d/epel.repo',
+                             data=repofiledata, sudo=True)
+            remote.run(args='sudo yum clean all')
+            yield
+
+    finally:
+        if remove:
+            misc.delete_file(remote, '/etc/yum.repos.d/epel.repo', sudo=True)
+
+
+def enable_epel(remote, enable=True):
+    ''' enable/disable the epel repo '''
+    args = 'sudo sed -i'.split()
+    if enable:
+        args.extend(['s/enabled=0/enabled=1/'])
+    else:
+        args.extend(['s/enabled=1/enabled=0/'])
+    args.extend(['/etc/yum.repos.d/epel.repo'])
+
+    remote.run(args=args)
+    remote.run(args='sudo yum clean all')
+
+
+@contextlib.contextmanager
+def install_extra_pkgs(client):
+    ''' Install EXTRA_PKGS '''
+    try:
+        for pkg in EXTRA_PKGS:
+            packaging.install_package(pkg, client)
+        yield
+
+    finally:
+        for pkg in EXTRA_PKGS:
+            packaging.remove_package(pkg, client)
+
+
+@contextlib.contextmanager
+def clone_calamari(config, client):
+    ''' clone calamari source into current directory on remote '''
+    branch = config.get('calamari_branch', 'master')
+    url = config.get('calamari_giturl', 'git://github.com/ceph/calamari')
+    try:
+        out = StringIO()
+        # ensure branch is present (clone -b will succeed even if
+        # the branch doesn't exist, falling back to master)
+        client.run(
+            args='git ls-remote %s %s' % (url, branch),
+            stdout=out,
+            label='check for calamari branch %s existence' % branch
+        )
+        if len(out.getvalue()) == 0:
+            raise RuntimeError("Calamari branch %s doesn't exist" % branch)
+        client.run(args='git clone -b %s %s' % (branch, url))
+        yield
+    finally:
+        # sudo python setup.py develop may have left some root files around
+        client.run(args='sudo rm -rf calamari')
+
+
+@contextlib.contextmanager
+def write_info_yaml(cluster, client):
+    ''' write info.yaml to client for nosetests '''
+    try:
+        info = {
+            'cluster': {
+                rem.name: {'roles': roles}
+                for rem, roles in cluster.remotes.iteritems()
+            }
+        }
+        misc.create_file(client, 'calamari/info.yaml',
+                         data=yaml.safe_dump(info, default_flow_style=False))
+        yield
+    finally:
+        misc.delete_file(client, 'calamari/info.yaml')
+
+
+@contextlib.contextmanager
+def write_test_conf(client):
+    ''' write calamari/tests/test.conf to client for nosetests '''
+    try:
+        testconf = textwrap.dedent('''
+            [testing]
+
+            calamari_control = external
+            ceph_control = external
+            bootstrap = False
+            api_username = admin
+            api_password = admin
+            embedded_timeout_factor = 1
+            external_timeout_factor = 3
+            external_cluster_path = info.yaml
+        ''')
+        misc.create_file(client, 'calamari/tests/test.conf', data=testconf)
+        yield
+
+    finally:
+        misc.delete_file(client, 'calamari/tests/test.conf')
+
+
+@contextlib.contextmanager
+def prepare_nosetest_env(client):
+    try:
+        # extra dependencies that would be in the devmode venv
+        if client.os.package_type == 'rpm':
+            enable_epel(client, enable=True)
+        for package in EXTRA_NOSETEST_PKGS:
+            packaging.install_package(package, client)
+        if client.os.package_type == 'rpm':
+            enable_epel(client, enable=False)
+
+        # install nose itself into the calamari venv, force it in case it's
+        # already installed in the system, so we can invoke it by path without
+        # fear that it's not present
+        pip(client, 'nose', venv='/opt/calamari/venv', force=True)
+
+        # install a later version of requests into the venv as well
+        # (for precise)
+        pip(client, 'requests', venv='/opt/calamari/venv', force=True)
+
+        # link (setup.py develop) calamari/rest-api into the production venv
+        # because production does not include calamari_rest.management, needed
+        # for test_rest_api.py's ApiIntrospection
+        args = 'cd calamari/rest-api'.split() + [run.Raw(';')] + \
+               'sudo /opt/calamari/venv/bin/python setup.py develop'.split()
+        client.run(args=args)
+
+        # because, at least in Python 2.6/Centos, site.py uses
+        # 'os.path.exists()' to process .pth file entries, and exists() uses
+        # access(2) to check for existence, all the paths leading up to
+        # $HOME/calamari/rest-api need to be searchable by all users of
+        # the package, which will include the WSGI/Django app, running
+        # as the Apache user.  So make them all world-read-and-execute.
+        args = 'sudo chmod a+x'.split() + \
+            ['.', './calamari', './calamari/rest-api']
+        client.run(args=args)
+
+        # make one dummy request just to get the WSGI app to do
+        # all its log creation here, before the chmod below (I'm
+        # looking at you, graphite -- /var/log/calamari/info.log and
+        # /var/log/calamari/exception.log)
+        client.run(args='wget -q -O /dev/null http://localhost')
+
+        # /var/log/calamari/* is root-or-apache write-only
+        client.run(args='sudo chmod a+w /var/log/calamari/*')
+
+        yield
+
+    finally:
+        args = 'cd calamari/rest-api'.split() + [run.Raw(';')] + \
+               'sudo /opt/calamari/venv/bin/python setup.py develop -u'.split()
+        client.run(args=args)
+        for pkg in ('nose', 'requests'):
+            pip(client, pkg, venv='/opt/calamari/venv', uninstall=True)
+        for package in EXTRA_NOSETEST_PKGS:
+            packaging.remove_package(package, client)
+
+
+@contextlib.contextmanager
+def run_nosetests(client):
+    ''' Actually run the tests '''
+    args = [
+        'cd',
+        'calamari',
+        run.Raw(';'),
+        'CALAMARI_CONFIG=/etc/calamari/calamari.conf',
+        '/opt/calamari/venv/bin/nosetests',
+        '-v',
+        'tests/',
+    ]
+    client.run(args=args)
+    yield
+
+
+@contextlib.contextmanager
+def task(ctx, config):
+    """
+    Run Calamari tests against an instance set up by 'calamari_server'.
+
+    -- clone the Calamari source into $HOME (see options)
+    -- write calamari/info.yaml describing the cluster
+    -- write calamari/tests/test.conf containing
+        'external' for calamari_control and ceph_control
+        'bootstrap = False' to disable test bootstrapping (installing minions)
+        no api_url necessary (inferred from client.0)
+        'external_cluster_path = info.yaml'
+    -- modify the production Calamari install to allow test runs:
+        install nose in the venv
+        install EXTRA_NOSETEST_PKGS
+        link in, with setup.py develop, calamari_rest (for ApiIntrospection)
+    -- set CALAMARI_CONFIG to point to /etc/calamari/calamari.conf
+    -- nosetests -v tests/
+
+    Options are:
+        calamari_giturl: url from which to git clone calamari
+                         (default: git://github.com/ceph/calamari)
+        calamari_branch: git branch of calamari to check out
+                         (default: master)
+
+    Note: the tests must find a clean cluster, so don't forget to
+    set the crush default type appropriately, or install min_size OSD hosts
+    """
+    client0 = find_client0(ctx.cluster)
+    if client0 is None:
+        raise RuntimeError("must have client.0 role")
+
+    with contextutil.nested(
+        lambda: install_epel(client0),
+        lambda: install_extra_pkgs(client0),
+        lambda: clone_calamari(config, client0),
+        lambda: write_info_yaml(ctx.cluster, client0),
+        lambda: write_test_conf(client0),
+        lambda: prepare_nosetest_env(client0),
+        lambda: run_nosetests(client0),
+    ):
+        yield
diff --git a/src/ceph/qa/tasks/calamari_setup.py b/src/ceph/qa/tasks/calamari_setup.py
new file mode 100644
index 0000000..8ef404f
--- /dev/null
+++ b/src/ceph/qa/tasks/calamari_setup.py
@@ -0,0 +1,467 @@
+"""
+Calamari setup task
+"""
+import contextlib
+import logging
+import os
+import requests
+import shutil
+import webbrowser
+
+from cStringIO import StringIO
+from teuthology.orchestra import run
+from teuthology import contextutil
+from teuthology import misc
+
+log = logging.getLogger(__name__)
+
+
+# Default option values for the calamari_setup task.  task() overlays the
+# caller-supplied config on top of these; the meaning of each key is
+# described in the task() docstring.
+DEFAULTS = {
+    'version': 'v0.80.9',
+    'test_image': None,
+    'start_browser': False,
+    'email': 'x@y.com',
+    'no_epel': True,
+    'calamari_user': 'admin',
+    'calamari_password': 'admin',
+}
+
+
+@contextlib.contextmanager
+def task(ctx, config):
+    """
+    Do the setup of a calamari server.
+
+    - calamari_setup:
+        version: 'v80.1'
+        test_image: <path to tarball or iso>
+
+    Options are (see DEFAULTS above):
+
+    version -- ceph version we are testing against
+    test_image -- Can be an HTTP URL, in which case fetch from this
+                  http path; can also be local path
+    start_browser -- If True, start a browser.  To be used by runs that will
+                     bring up a browser quickly for human use.  Set to False
+                     for overnight suites that are testing for problems in
+                     the installation itself
+    email -- email address for the user
+    no_epel -- indicates if we should remove epel files prior to yum
+               installations.
+    calamari_user -- user name to log into gui
+    calamari_password -- calamari user password
+    """
+    local_config = DEFAULTS
+    local_config.update(config)
+    config = local_config
+    cal_svr = None
+    for remote_, roles in ctx.cluster.remotes.items():
+        if 'client.0' in roles:
+            cal_svr = remote_
+            break
+    if not cal_svr:
+        raise RuntimeError('client.0 not found in roles')
+    with contextutil.nested(
+        lambda: adjust_yum_repos(ctx, cal_svr, config['no_epel']),
+        lambda: calamari_install(config, cal_svr),
+        lambda: ceph_install(ctx, cal_svr),
+        # do it again because ceph-deploy installed epel for centos
+        lambda: remove_epel(ctx, config['no_epel']),
+        lambda: calamari_connect(ctx, cal_svr),
+        lambda: browser(config['start_browser'], cal_svr.hostname),
+    ):
+        yield
+
+
+@contextlib.contextmanager
+def adjust_yum_repos(ctx, cal_svr, no_epel):
+    """
+    For each remote machine, fix the repos if yum is used.
+    """
+    ice_distro = str(cal_svr.os)
+    if ice_distro.startswith('rhel') or ice_distro.startswith('centos'):
+        if no_epel:
+            for remote in ctx.cluster.remotes:
+                fix_yum_repos(remote, ice_distro)
+    try:
+        yield
+    finally:
+        if ice_distro.startswith('rhel') or ice_distro.startswith('centos'):
+            if no_epel:
+                for remote in ctx.cluster.remotes:
+                    restore_yum_repos(remote)
+
+
+def restore_yum_repos(remote):
+    """
+    Copy the old saved repo back in.
+    """
+    if remote.run(args=['sudo', 'rm', '-rf', '/etc/yum.repos.d']).exitstatus:
+        return False
+    if remote.run(args=['sudo', 'mv', '/etc/yum.repos.d.old',
+                        '/etc/yum.repos.d']).exitstatus:
+        return False
+
+
+def fix_yum_repos(remote, distro):
+    """
+    For yum calamari installations, the repos.d directory should only
+    contain a repo file named rhel<version-number>.repo
+    """
+    if distro.startswith('centos'):
+        # hack alert: detour: install lttng for ceph
+        # this works because epel is preinstalled on the vpms
+        # this is not a generic solution
+        # this is here solely to test the one-off 1.3.0 release for centos6
+        remote.run(args="sudo yum -y install lttng-tools")
+        cmds = [
+            'sudo mkdir /etc/yum.repos.d.old'.split(),
+            ['sudo', 'cp', run.Raw('/etc/yum.repos.d/*'),
+             '/etc/yum.repos.d.old'],
+            ['sudo', 'rm', run.Raw('/etc/yum.repos.d/epel*')],
+        ]
+        for cmd in cmds:
+            if remote.run(args=cmd).exitstatus:
+                return False
+    else:
+        cmds = [
+            'sudo mv /etc/yum.repos.d /etc/yum.repos.d.old'.split(),
+            'sudo mkdir /etc/yum.repos.d'.split(),
+        ]
+        for cmd in cmds:
+            if remote.run(args=cmd).exitstatus:
+                return False
+
+        # map "distroversion" from Remote.os to a tuple of
+        # (repo title, repo name descriptor, apt-mirror repo path chunk)
+        yum_repo_params = {
+            'rhel 6.4': ('rhel6-server', 'RHEL', 'rhel6repo-server'),
+            'rhel 6.5': ('rhel6-server', 'RHEL', 'rhel6repo-server'),
+            'rhel 7.0': ('rhel7-server', 'RHEL', 'rhel7repo/server'),
+        }
+        repotitle, reponame, path = yum_repo_params[distro]
+        repopath = '/etc/yum.repos.d/%s.repo' % repotitle
+        # TO DO:  Make this data configurable too
+        repo_contents = '\n'.join(
+            ('[%s]' % repotitle,
+             'name=%s $releasever - $basearch' % reponame,
+             'baseurl=http://apt-mirror.front.sepia.ceph.com/' + path,
+             'gpgcheck=0',
+             'enabled=1')
+        )
+        misc.sudo_write_file(remote, repopath, repo_contents)
+    cmds = [
+        'sudo yum clean all'.split(),
+        'sudo yum makecache'.split(),
+    ]
+    for cmd in cmds:
+        if remote.run(args=cmd).exitstatus:
+            return False
+    return True
+
+
+@contextlib.contextmanager
+def remove_epel(ctx, no_epel):
+    """
+    just remove epel.  No undo; assumed that it's used after
+    adjust_yum_repos, and relies on its state-save/restore.
+    """
+    if no_epel:
+        for remote in ctx.cluster.remotes:
+            if remote.os.name.startswith('centos'):
+                remote.run(args=[
+                    'sudo', 'rm', '-f', run.Raw('/etc/yum.repos.d/epel*')
+                ])
+    try:
+        yield
+    finally:
+        pass
+
+
+def get_iceball_with_http(url, destdir):
+    '''
+    Copy iceball with http to destdir.  Try both .tar.gz and .iso.
+    '''
+    # stream=True means we don't download until copyfileobj below,
+    # and don't need a temp file
+    r = requests.get(url, stream=True)
+    if not r.ok:
+        raise RuntimeError("Failed to download %s", str(url))
+    filename = os.path.join(destdir, url.split('/')[-1])
+    with open(filename, 'w') as f:
+        shutil.copyfileobj(r.raw, f)
+    log.info('saved %s as %s' % (url, filename))
+    return filename
+
+
+@contextlib.contextmanager
+def calamari_install(config, cal_svr):
+    """
+    Install calamari
+
+    The steps here are:
+        -- Get the iceball, locally or from http
+        -- Copy the iceball to the calamari server, and untar/mount it.
+        -- Run ice-setup on the calamari server.
+        -- Run calamari-ctl initialize.
+    """
+    client_id = str(cal_svr)
+    at_loc = client_id.find('@')
+    if at_loc > 0:
+        client_id = client_id[at_loc + 1:]
+
+    test_image = config['test_image']
+
+    if not test_image:
+        raise RuntimeError('Must supply test image')
+    log.info('calamari test image: %s' % test_image)
+    delete_iceball = False
+
+    if test_image.startswith('http'):
+        iceball_file = get_iceball_with_http(test_image, '/tmp')
+        delete_iceball = True
+    else:
+        iceball_file = test_image
+
+    remote_iceball_file = os.path.join('/tmp', os.path.split(iceball_file)[1])
+    cal_svr.put_file(iceball_file, remote_iceball_file)
+    if iceball_file.endswith('.tar.gz'):   # XXX specify tar/iso in config?
+        icetype = 'tarball'
+    elif iceball_file.endswith('.iso'):
+        icetype = 'iso'
+    else:
+        raise RuntimeError('Can''t handle iceball {0}'.format(iceball_file))
+
+    if icetype == 'tarball':
+        ret = cal_svr.run(args=['gunzip', run.Raw('<'), remote_iceball_file,
+                          run.Raw('|'), 'tar', 'xvf', run.Raw('-')])
+        if ret.exitstatus:
+            raise RuntimeError('remote iceball untar failed')
+    elif icetype == 'iso':
+        mountpoint = '/mnt/'   # XXX create?
+        ret = cal_svr.run(
+            args=['sudo', 'mount', '-o', 'loop', '-r',
+                  remote_iceball_file, mountpoint]
+        )
+
+    # install ice_setup package
+    args = {
+        'deb': 'sudo dpkg -i /mnt/ice-setup*deb',
+        'rpm': 'sudo yum -y localinstall /mnt/ice_setup*rpm'
+    }.get(cal_svr.system_type, None)
+    if not args:
+        raise RuntimeError('{0}: unknown system type'.format(cal_svr))
+    ret = cal_svr.run(args=args)
+    if ret.exitstatus:
+        raise RuntimeError('ice_setup package install failed')
+
+    # Run ice_setup
+    icesetdata = 'yes\n\n%s\nhttp\n' % client_id
+    ice_in = StringIO(icesetdata)
+    ice_out = StringIO()
+    if icetype == 'tarball':
+        args = 'sudo python ice_setup.py'
+    else:
+        args = 'sudo ice_setup -d /mnt'
+    ret = cal_svr.run(args=args, stdin=ice_in, stdout=ice_out)
+    log.debug(ice_out.getvalue())
+    if ret.exitstatus:
+        raise RuntimeError('ice_setup failed')
+
+    # Run calamari-ctl initialize.
+    icesetdata = '%s\n%s\n%s\n%s\n' % (
+        config['calamari_user'],
+        config['email'],
+        config['calamari_password'],
+        config['calamari_password'],
+    )
+    ice_in = StringIO(icesetdata)
+    ret = cal_svr.run(args=['sudo', 'calamari-ctl', 'initialize'],
+                      stdin=ice_in, stdout=ice_out)
+    log.debug(ice_out.getvalue())
+    if ret.exitstatus:
+        raise RuntimeError('calamari-ctl initialize failed')
+    try:
+        yield
+    finally:
+        log.info('Cleaning up after Calamari installation')
+        if icetype == 'iso':
+            cal_svr.run(args=['sudo', 'umount', mountpoint])
+        if delete_iceball:
+            os.unlink(iceball_file)
+
+
+@contextlib.contextmanager
+def ceph_install(ctx, cal_svr):
+    """
+    Install ceph if ceph was not previously installed by teuthology.  This
+    code tests the case where calamari is installed on a brand new system.
+    """
+    loc_inst = False
+    if 'install' not in [x.keys()[0] for x in ctx.config['tasks']]:
+        loc_inst = True
+        ret = deploy_ceph(ctx, cal_svr)
+        if ret:
+            raise RuntimeError('ceph installs failed')
+    try:
+        yield
+    finally:
+        if loc_inst:
+            if not undeploy_ceph(ctx, cal_svr):
+                log.error('Cleanup of Ceph installed by Calamari-setup failed')
+
+
+def deploy_ceph(ctx, cal_svr):
+    """
+    Perform the ceph-deploy actions needed to bring up a Ceph cluster.  This
+    test is needed to check the ceph-deploy that comes with the calamari
+    package.
+    """
+    osd_to_name = {}
+    all_machines = set()
+    all_mons = set()
+    all_osds = set()
+
+    # collect which remotes are osds and which are mons
+    for remote in ctx.cluster.remotes:
+        all_machines.add(remote.shortname)
+        roles = ctx.cluster.remotes[remote]
+        for role in roles:
+            daemon_type, number = role.split('.')
+            if daemon_type == 'osd':
+                all_osds.add(remote.shortname)
+                osd_to_name[number] = remote.shortname
+            if daemon_type == 'mon':
+                all_mons.add(remote.shortname)
+
+    # figure out whether we're in "1.3+" mode: prior to 1.3, there was
+    # only one Ceph repo, and it was all installed on every Ceph host.
+    # with 1.3, we've split that into MON and OSD repos (in order to
+    # be able to separately track subscriptions per-node).  This
+    # requires new switches to ceph-deploy to select which locally-served
+    # repo is connected to which cluster host.
+    #
+    # (TODO: A further issue is that the installation/setup may not have
+    # created local repos at all, but that is the subject of a future
+    # change.)
+
+    r = cal_svr.run(args='/usr/bin/test -d /mnt/MON', check_status=False)
+    use_install_repo = (r.returncode == 0)
+
+    # pre-1.3:
+    # ceph-deploy new <all_mons>
+    # ceph-deploy install <all_machines>
+    # ceph-deploy mon create-initial
+    #
+    # 1.3 and later:
+    # ceph-deploy new <all_mons>
+    # ceph-deploy install --repo --release=ceph-mon <all_mons>
+    # ceph-deploy install <all_mons>
+    # ceph-deploy install --repo --release=ceph-osd <all_osds>
+    # ceph-deploy install <all_osds>
+    # ceph-deploy mon create-initial
+    #
+    # one might think the install <all_mons> and install <all_osds>
+    # commands would need --mon and --osd, but #12147 has not yet
+    # made it into RHCS 1.3.0; since the package split also hasn't
+    # landed, we can avoid using the flag and avoid the bug.
+
+    cmds = ['ceph-deploy new ' + ' '.join(all_mons)]
+
+    if use_install_repo:
+        cmds.append('ceph-deploy repo ceph-mon ' +
+                    ' '.join(all_mons))
+        cmds.append('ceph-deploy install --no-adjust-repos --mon ' +
+                    ' '.join(all_mons))
+        cmds.append('ceph-deploy repo ceph-osd ' +
+                    ' '.join(all_osds))
+        cmds.append('ceph-deploy install --no-adjust-repos --osd ' +
+                    ' '.join(all_osds))
+        # We tell users to use `hostname` in our docs. Do the same here.
+        cmds.append('ceph-deploy install --no-adjust-repos --cli `hostname`')
+    else:
+        cmds.append('ceph-deploy install ' + ' '.join(all_machines))
+
+    cmds.append('ceph-deploy mon create-initial')
+
+    for cmd in cmds:
+        cal_svr.run(args=cmd).exitstatus
+
+    disk_labels = '_dcba'
+    # NEEDS WORK assumes disks start with vd (need to check this somewhere)
+    for cmd_pts in [['disk', 'zap'], ['osd', 'prepare'], ['osd', 'activate']]:
+        mach_osd_cnt = {}
+        for osdn in osd_to_name:
+            osd_mac = osd_to_name[osdn]
+            mach_osd_cnt[osd_mac] = mach_osd_cnt.get(osd_mac, 0) + 1
+            arg_list = ['ceph-deploy']
+            arg_list.extend(cmd_pts)
+            disk_id = '%s:vd%s' % (osd_to_name[osdn],
+                                   disk_labels[mach_osd_cnt[osd_mac]])
+            if 'activate' in cmd_pts:
+                disk_id += '1'
+            arg_list.append(disk_id)
+            cal_svr.run(args=arg_list).exitstatus
+
+
+def undeploy_ceph(ctx, cal_svr):
+    """
+    Cleanup deployment of ceph.
+    """
+    all_machines = []
+    ret = True
+    for remote in ctx.cluster.remotes:
+        roles = ctx.cluster.remotes[remote]
+        if (
+            not any('osd' in role for role in roles) and
+            not any('mon' in role for role in roles)
+        ):
+            continue
+        ret &= remote.run(
+            args=['sudo', 'stop', 'ceph-all', run.Raw('||'),
+                  'sudo', 'service', 'ceph', 'stop']
+        ).exitstatus
+        all_machines.append(remote.shortname)
+    all_machines = set(all_machines)
+    cmd1 = ['ceph-deploy', 'uninstall']
+    cmd1.extend(all_machines)
+    ret &= cal_svr.run(args=cmd1).exitstatus
+    cmd2 = ['ceph-deploy', 'purge']
+    cmd2.extend(all_machines)
+    ret &= cal_svr.run(args=cmd2).exitstatus
+    for remote in ctx.cluster.remotes:
+        ret &= remote.run(args=['sudo', 'rm', '-rf',
+                                '.ssh/known_hosts']).exitstatus
+    return ret
+
+
+@contextlib.contextmanager
+def calamari_connect(ctx, cal_svr):
+    """
+    Connect calamari to the ceph nodes.
+    """
+    connects = ['ceph-deploy', 'calamari', 'connect']
+    for machine_info in ctx.cluster.remotes:
+        if 'client.0' not in ctx.cluster.remotes[machine_info]:
+            connects.append(machine_info.shortname)
+    ret = cal_svr.run(args=connects)
+    if ret.exitstatus:
+        raise RuntimeError('calamari connect failed')
+    try:
+        yield
+    finally:
+        log.info('Calamari test terminating')
+
+
+@contextlib.contextmanager
+def browser(start_browser, web_page):
+    """
+    Bring up a browser, if wanted.
+    """
+    if start_browser:
+        webbrowser.open('http://%s' % web_page)
+    try:
+        yield
+    finally:
+        if start_browser:
+            log.info('Web browser support terminating')
diff --git a/src/ceph/qa/tasks/ceph.py b/src/ceph/qa/tasks/ceph.py
new file mode 100644
index 0000000..72f2653
--- /dev/null
+++ b/src/ceph/qa/tasks/ceph.py
@@ -0,0 +1,1688 @@
+"""
+Ceph cluster task.
+
+Handle the setup, starting, and clean-up of a Ceph cluster.
+"""
+from cStringIO import StringIO
+
+import argparse
+import contextlib
+import errno
+import logging
+import os
+import json
+import time
+import gevent
+import socket
+
+from paramiko import SSHException
+from ceph_manager import CephManager, write_conf
+from tasks.cephfs.filesystem import Filesystem
+from teuthology import misc as teuthology
+from teuthology import contextutil
+from teuthology import exceptions
+from teuthology.orchestra import run
+import ceph_client as cclient
+from teuthology.orchestra.daemon import DaemonGroup
+
+CEPH_ROLE_TYPES = ['mon', 'mgr', 'osd', 'mds', 'rgw']
+
+log = logging.getLogger(__name__)
+
+
+def generate_caps(type_):
+    """
+    Each call will return the next capability for each system type
+    (essentially a subset of possible role values).  Valid types are osd,
+    mds and client.
+    """
+    defaults = dict(
+        osd=dict(
+            mon='allow *',
+            mgr='allow *',
+            osd='allow *',
+        ),
+        mgr=dict(
+            mon='allow profile mgr',
+            osd='allow *',
+            mds='allow *',
+        ),
+        mds=dict(
+            mon='allow *',
+            mgr='allow *',
+            osd='allow *',
+            mds='allow',
+        ),
+        client=dict(
+            mon='allow rw',
+            mgr='allow r',
+            osd='allow rwx',
+            mds='allow',
+        ),
+    )
+    for subsystem, capability in defaults[type_].items():
+        yield '--cap'
+        yield subsystem
+        yield capability
+
+
+@contextlib.contextmanager
+def ceph_log(ctx, config):
+    """
+    Create /var/log/ceph log directory that is open to everyone.
+    Add valgrind and profiling-logger directories.
+
+    If ctx.config has a 'log-rotate' entry, a logrotate configuration is
+    installed on every remote and logrotate is run periodically in the
+    background until the context exits.  On exit the logs are compressed
+    and pulled into ctx.archive, unless there is no archive or
+    'archive-on-error' is set and the run succeeded.
+
+    :param ctx: Context
+    :param config: Configuration
+    """
+    log.info('Making ceph log dir writeable by non-root...')
+    run.wait(
+        ctx.cluster.run(
+            args=[
+                'sudo',
+                'chmod',
+                '777',
+                '/var/log/ceph',
+            ],
+            wait=False,
+        )
+    )
+    log.info('Disabling ceph logrotate...')
+    run.wait(
+        ctx.cluster.run(
+            args=[
+                'sudo',
+                'rm', '-f', '--',
+                '/etc/logrotate.d/ceph',
+            ],
+            wait=False,
+        )
+    )
+    log.info('Creating extra log directories...')
+    run.wait(
+        ctx.cluster.run(
+            args=[
+                'sudo',
+                'install', '-d', '-m0777', '--',
+                '/var/log/ceph/valgrind',
+                '/var/log/ceph/profiling-logger',
+            ],
+            wait=False,
+        )
+    )
+
+    # Runs logrotate on the whole cluster from a background greenlet until
+    # end() is called.
+    class Rotater(object):
+        # Set by end() to make invoke_logrotate leave its loop.
+        stop_event = gevent.event.Event()
+
+        def invoke_logrotate(self):
+            # 1) install ceph-test.conf in /etc/logrotate.d
+            # 2) continuously loop over logrotate invocation with ceph-test.conf
+            while not self.stop_event.is_set():
+                # Sleep up to 30 seconds, waking early when the event is
+                # set; logrotate is still run once more after the wake-up
+                # before the loop condition is re-checked.
+                self.stop_event.wait(timeout=30)
+                try:
+                    run.wait(
+                        ctx.cluster.run(
+                            args=['sudo', 'logrotate', '/etc/logrotate.d/ceph-test.conf'
+                                  ],
+                            wait=False,
+                        )
+                    )
+                except exceptions.ConnectionLostError as e:
+                    # Some tests may power off nodes during test, in which
+                    # case we will see connection errors that we should ignore.
+                    log.debug("Missed logrotate, node '{0}' is offline".format(
+                        e.node))
+                except EOFError as e:
+                    # Paramiko sometimes raises this when it fails to
+                    # connect to a node during open_session.  As with
+                    # ConnectionLostError, we ignore this because nodes
+                    # are allowed to get power cycled during tests.
+                    log.debug("Missed logrotate, EOFError")
+                except SSHException as e:
+                    log.debug("Missed logrotate, SSHException")
+                except socket.error as e:
+                    # Only an unreachable host is tolerated; any other
+                    # socket error is propagated.
+                    if e.errno == errno.EHOSTUNREACH:
+                        log.debug("Missed logrotate, host unreachable")
+                    else:
+                        raise
+
+        def begin(self):
+            # Start the rotation loop in its own greenlet.
+            self.thread = gevent.spawn(self.invoke_logrotate)
+
+        def end(self):
+            # Ask the loop to stop, then wait for the greenlet; get()
+            # re-raises an exception that escaped invoke_logrotate.
+            self.stop_event.set()
+            self.thread.get()
+
+    def write_rotate_conf(ctx, daemons):
+        # Build one logrotate stanza per entry of daemons (daemon type ->
+        # max size) from the logrotate.conf template that sits next to
+        # this file, then install the result on every remote as
+        # /etc/logrotate.d/ceph-test.conf.
+        testdir = teuthology.get_testdir(ctx)
+        rotate_conf_path = os.path.join(os.path.dirname(__file__), 'logrotate.conf')
+        with file(rotate_conf_path, 'rb') as f:
+            conf = ""
+            for daemon, size in daemons.iteritems():
+                log.info('writing logrotate stanza for {daemon}'.format(daemon=daemon))
+                conf += f.read().format(daemon_type=daemon, max_size=size)
+                # rewind so the template can be read again for the next daemon
+                f.seek(0, 0)
+
+            for remote in ctx.cluster.remotes.iterkeys():
+                # Written to the test dir first, then moved into place
+                # with sudo and given mode 0644 and owner root.root.
+                teuthology.write_file(remote=remote,
+                                      path='{tdir}/logrotate.ceph-test.conf'.format(tdir=testdir),
+                                      data=StringIO(conf)
+                                      )
+                remote.run(
+                    args=[
+                        'sudo',
+                        'mv',
+                        '{tdir}/logrotate.ceph-test.conf'.format(tdir=testdir),
+                        '/etc/logrotate.d/ceph-test.conf',
+                        run.Raw('&&'),
+                        'sudo',
+                        'chmod',
+                        '0644',
+                        '/etc/logrotate.d/ceph-test.conf',
+                        run.Raw('&&'),
+                        'sudo',
+                        'chown',
+                        'root.root',
+                        '/etc/logrotate.d/ceph-test.conf'
+                    ]
+                )
+                remote.chcon('/etc/logrotate.d/ceph-test.conf',
+                             'system_u:object_r:etc_t:s0')
+
+    if ctx.config.get('log-rotate'):
+        daemons = ctx.config.get('log-rotate')
+        log.info('Setting up log rotation with ' + str(daemons))
+        write_rotate_conf(ctx, daemons)
+        logrotater = Rotater()
+        logrotater.begin()
+    try:
+        yield
+
+    finally:
+        if ctx.config.get('log-rotate'):
+            log.info('Shutting down logrotate')
+            logrotater.end()
+            ctx.cluster.run(
+                args=['sudo', 'rm', '/etc/logrotate.d/ceph-test.conf'
+                      ]
+            )
+        # Archive unless there is no archive directory, or only failures
+        # are to be archived and this run succeeded.
+        if ctx.archive is not None and \
+                not (ctx.config.get('archive-on-error') and ctx.summary['success']):
+            # and logs
+            log.info('Compressing logs...')
+            run.wait(
+                ctx.cluster.run(
+                    args=[
+                        'sudo',
+                        'find',
+                        '/var/log/ceph',
+                        '-name',
+                        '*.log',
+                        '-print0',
+                        run.Raw('|'),
+                        'sudo',
+                        'xargs',
+                        '-0',
+                        '--no-run-if-empty',
+                        '--',
+                        'gzip',
+                        '--',
+                    ],
+                    wait=False,
+                ),
+            )
+
+            log.info('Archiving logs...')
+            # Logs land in <archive>/remote/<shortname>/log, one
+            # directory per remote.
+            path = os.path.join(ctx.archive, 'remote')
+            os.makedirs(path)
+            for remote in ctx.cluster.remotes.iterkeys():
+                sub = os.path.join(path, remote.shortname)
+                os.makedirs(sub)
+                teuthology.pull_directory(remote, '/var/log/ceph',
+                                          os.path.join(sub, 'log'))
+
+
+def assign_devs(roles, devs):
+    """
+    Create a dictionary of devs indexed by roles
+
+    :param roles: List of roles
+    :param devs: Corresponding list of devices.
+    :returns: Dictionary of devs indexed by roles.
+    """
+    return dict(zip(roles, devs))
+
+
+@contextlib.contextmanager
+def valgrind_post(ctx, config):
+    """
+    After the tests run, look throught all the valgrind logs.  Exceptions are raised
+    if textual errors occured in the logs, or if valgrind exceptions were detected in
+    the logs.
+
+    :param ctx: Context
+    :param config: Configuration
+    """
+    try:
+        yield
+    finally:
+        lookup_procs = list()
+        log.info('Checking for errors in any valgrind logs...')
+        for remote in ctx.cluster.remotes.iterkeys():
+            # look at valgrind logs for each node
+            proc = remote.run(
+                args=[
+                    'sudo',
+                    'zgrep',
+                    '<kind>',
+                    run.Raw('/var/log/ceph/valgrind/*'),
+                    '/dev/null',  # include a second file so that we always get a filename prefix on the output
+                    run.Raw('|'),
+                    'sort',
+                    run.Raw('|'),
+                    'uniq',
+                ],
+                wait=False,
+                check_status=False,
+                stdout=StringIO(),
+            )
+            lookup_procs.append((proc, remote))
+
+        valgrind_exception = None
+        for (proc, remote) in lookup_procs:
+            proc.wait()
+            out = proc.stdout.getvalue()
+            for line in out.split('\n'):
+                if line == '':
+                    continue
+                try:
+                    (file, kind) = line.split(':')
+                except Exception:
+                    log.error('failed to split line %s', line)
+                    raise
+                log.debug('file %s kind %s', file, kind)
+                if (file.find('mds') >= 0) and kind.find('Lost') > 0:
+                    continue
+                log.error('saw valgrind issue %s in %s', kind, file)
+                valgrind_exception = Exception('saw valgrind issues')
+
+        if config.get('expect_valgrind_errors'):
+            if not valgrind_exception:
+                raise Exception('expected valgrind issues and found none')
+        else:
+            if valgrind_exception:
+                raise valgrind_exception
+
+
+@contextlib.contextmanager
+def crush_setup(ctx, config):
+    cluster_name = config['cluster']
+    first_mon = teuthology.get_first_mon(ctx, config, cluster_name)
+    (mon_remote,) = ctx.cluster.only(first_mon).remotes.iterkeys()
+
+    profile = config.get('crush_tunables', 'default')
+    log.info('Setting crush tunables to %s', profile)
+    mon_remote.run(
+        args=['sudo', 'ceph', '--cluster', cluster_name,
+              'osd', 'crush', 'tunables', profile])
+    yield
+
+
+@contextlib.contextmanager
+def create_rbd_pool(ctx, config):
+    cluster_name = config['cluster']
+    first_mon = teuthology.get_first_mon(ctx, config, cluster_name)
+    (mon_remote,) = ctx.cluster.only(first_mon).remotes.iterkeys()
+    log.info('Waiting for OSDs to come up')
+    teuthology.wait_until_osds_up(
+        ctx,
+        cluster=ctx.cluster,
+        remote=mon_remote,
+        ceph_cluster=cluster_name,
+    )
+    if config.get('create_rbd_pool', True):
+        log.info('Creating RBD pool')
+        mon_remote.run(
+            args=['sudo', 'ceph', '--cluster', cluster_name,
+                  'osd', 'pool', 'create', 'rbd', '8'])
+        mon_remote.run(
+            args=[
+                'sudo', 'ceph', '--cluster', cluster_name,
+                'osd', 'pool', 'application', 'enable',
+                'rbd', 'rbd', '--yes-i-really-mean-it'
+            ],
+            check_status=False)
+    yield
+
+@contextlib.contextmanager
+def cephfs_setup(ctx, config):
+    cluster_name = config['cluster']
+    testdir = teuthology.get_testdir(ctx)
+    coverage_dir = '{tdir}/archive/coverage'.format(tdir=testdir)
+
+    first_mon = teuthology.get_first_mon(ctx, config, cluster_name)
+    (mon_remote,) = ctx.cluster.only(first_mon).remotes.iterkeys()
+    mdss = ctx.cluster.only(teuthology.is_type('mds', cluster_name))
+    # If there are any MDSs, then create a filesystem for them to use
+    # Do this last because requires mon cluster to be up and running
+    if mdss.remotes:
+        log.info('Setting up CephFS filesystem...')
+
+        fs = Filesystem(ctx, name='cephfs', create=True,
+                        ec_profile=config.get('cephfs_ec_profile', None))
+
+        is_active_mds = lambda role: 'mds.' in role and not role.endswith('-s') and '-s-' not in role
+        all_roles = [item for remote_roles in mdss.remotes.values() for item in remote_roles]
+        num_active = len([r for r in all_roles if is_active_mds(r)])
+
+        fs.set_max_mds(num_active)
+        fs.set_allow_dirfrags(True)
+
+    yield
+
+
+@contextlib.contextmanager
+def cluster(ctx, config):
+    """
+    Handle the creation and removal of a ceph cluster.
+
+    On startup:
+        Create directories needed for the cluster.
+        Create remote journals for all osds.
+        Create and set keyring.
+        Copy the monmap to tht test systems.
+        Setup mon nodes.
+        Setup mds nodes.
+        Mkfs osd nodes.
+        Add keyring information to monmaps
+        Mkfs mon nodes.
+
+    On exit:
+        If errors occured, extract a failure message and store in ctx.summary.
+        Unmount all test files and temporary journaling files.
+        Save the monitor information and archive all ceph logs.
+        Cleanup the keyring setup, and remove all monitor map and data files left over.
+
+    :param ctx: Context
+    :param config: Configuration
+    """
+    if ctx.config.get('use_existing_cluster', False) is True:
+        log.info("'use_existing_cluster' is true; skipping cluster creation")
+        yield
+
+    testdir = teuthology.get_testdir(ctx)
+    cluster_name = config['cluster']
+    data_dir = '{tdir}/{cluster}.data'.format(tdir=testdir, cluster=cluster_name)
+    log.info('Creating ceph cluster %s...', cluster_name)
+    run.wait(
+        ctx.cluster.run(
+            args=[
+                'install', '-d', '-m0755', '--',
+                data_dir,
+            ],
+            wait=False,
+        )
+    )
+
+    run.wait(
+        ctx.cluster.run(
+            args=[
+                'sudo',
+                'install', '-d', '-m0777', '--', '/var/run/ceph',
+            ],
+            wait=False,
+        )
+    )
+
+    devs_to_clean = {}
+    remote_to_roles_to_devs = {}
+    remote_to_roles_to_journals = {}
+    osds = ctx.cluster.only(teuthology.is_type('osd', cluster_name))
+    for remote, roles_for_host in osds.remotes.iteritems():
+        devs = teuthology.get_scratch_devices(remote)
+        roles_to_devs = {}
+        roles_to_journals = {}
+        if config.get('fs'):
+            log.info('fs option selected, checking for scratch devs')
+            log.info('found devs: %s' % (str(devs),))
+            devs_id_map = teuthology.get_wwn_id_map(remote, devs)
+            iddevs = devs_id_map.values()
+            roles_to_devs = assign_devs(
+                teuthology.cluster_roles_of_type(roles_for_host, 'osd', cluster_name), iddevs
+            )
+            if len(roles_to_devs) < len(iddevs):
+                iddevs = iddevs[len(roles_to_devs):]
+            devs_to_clean[remote] = []
+
+        if config.get('block_journal'):
+            log.info('block journal enabled')
+            roles_to_journals = assign_devs(
+                teuthology.cluster_roles_of_type(roles_for_host, 'osd', cluster_name), iddevs
+            )
+            log.info('journal map: %s', roles_to_journals)
+
+        if config.get('tmpfs_journal'):
+            log.info('tmpfs journal enabled')
+            roles_to_journals = {}
+            remote.run(args=['sudo', 'mount', '-t', 'tmpfs', 'tmpfs', '/mnt'])
+            for role in teuthology.cluster_roles_of_type(roles_for_host, 'osd', cluster_name):
+                tmpfs = '/mnt/' + role
+                roles_to_journals[role] = tmpfs
+                remote.run(args=['truncate', '-s', '1500M', tmpfs])
+            log.info('journal map: %s', roles_to_journals)
+
+        log.info('dev map: %s' % (str(roles_to_devs),))
+        remote_to_roles_to_devs[remote] = roles_to_devs
+        remote_to_roles_to_journals[remote] = roles_to_journals
+
+    log.info('Generating config...')
+    remotes_and_roles = ctx.cluster.remotes.items()
+    roles = [role_list for (remote, role_list) in remotes_and_roles]
+    ips = [host for (host, port) in
+           (remote.ssh.get_transport().getpeername() for (remote, role_list) in remotes_and_roles)]
+    conf = teuthology.skeleton_config(ctx, roles=roles, ips=ips, cluster=cluster_name)
+    for remote, roles_to_journals in remote_to_roles_to_journals.iteritems():
+        for role, journal in roles_to_journals.iteritems():
+            name = teuthology.ceph_role(role)
+            if name not in conf:
+                conf[name] = {}
+            conf[name]['osd journal'] = journal
+    for section, keys in config['conf'].iteritems():
+        for key, value in keys.iteritems():
+            log.info("[%s] %s = %s" % (section, key, value))
+            if section not in conf:
+                conf[section] = {}
+            conf[section][key] = value
+
+    if config.get('tmpfs_journal'):
+        conf['journal dio'] = False
+
+    if not hasattr(ctx, 'ceph'):
+        ctx.ceph = {}
+    ctx.ceph[cluster_name] = argparse.Namespace()
+    ctx.ceph[cluster_name].conf = conf
+
+    default_keyring = '/etc/ceph/{cluster}.keyring'.format(cluster=cluster_name)
+    keyring_path = config.get('keyring_path', default_keyring)
+
+    coverage_dir = '{tdir}/archive/coverage'.format(tdir=testdir)
+
+    firstmon = teuthology.get_first_mon(ctx, config, cluster_name)
+
+    log.info('Setting up %s...' % firstmon)
+    ctx.cluster.only(firstmon).run(
+        args=[
+            'sudo',
+            'adjust-ulimits',
+            'ceph-coverage',
+            coverage_dir,
+            'ceph-authtool',
+            '--create-keyring',
+            keyring_path,
+        ],
+    )
+    ctx.cluster.only(firstmon).run(
+        args=[
+            'sudo',
+            'adjust-ulimits',
+            'ceph-coverage',
+            coverage_dir,
+            'ceph-authtool',
+            '--gen-key',
+            '--name=mon.',
+            keyring_path,
+        ],
+    )
+    ctx.cluster.only(firstmon).run(
+        args=[
+            'sudo',
+            'chmod',
+            '0644',
+            keyring_path,
+        ],
+    )
+    (mon0_remote,) = ctx.cluster.only(firstmon).remotes.keys()
+    monmap_path = '{tdir}/{cluster}.monmap'.format(tdir=testdir,
+                                                   cluster=cluster_name)
+    fsid = teuthology.create_simple_monmap(
+        ctx,
+        remote=mon0_remote,
+        conf=conf,
+        path=monmap_path,
+    )
+    if not 'global' in conf:
+        conf['global'] = {}
+    conf['global']['fsid'] = fsid
+
+    default_conf_path = '/etc/ceph/{cluster}.conf'.format(cluster=cluster_name)
+    conf_path = config.get('conf_path', default_conf_path)
+    log.info('Writing %s for FSID %s...' % (conf_path, fsid))
+    write_conf(ctx, conf_path, cluster_name)
+
+    log.info('Creating admin key on %s...' % firstmon)
+    ctx.cluster.only(firstmon).run(
+        args=[
+            'sudo',
+            'adjust-ulimits',
+            'ceph-coverage',
+            coverage_dir,
+            'ceph-authtool',
+            '--gen-key',
+            '--name=client.admin',
+            '--set-uid=0',
+            '--cap', 'mon', 'allow *',
+            '--cap', 'osd', 'allow *',
+            '--cap', 'mds', 'allow *',
+            '--cap', 'mgr', 'allow *',
+            keyring_path,
+        ],
+    )
+
+    log.info('Copying monmap to all nodes...')
+    keyring = teuthology.get_file(
+        remote=mon0_remote,
+        path=keyring_path,
+    )
+    monmap = teuthology.get_file(
+        remote=mon0_remote,
+        path=monmap_path,
+    )
+
+    for rem in ctx.cluster.remotes.iterkeys():
+        # copy mon key and initial monmap
+        log.info('Sending monmap to node {remote}'.format(remote=rem))
+        teuthology.sudo_write_file(
+            remote=rem,
+            path=keyring_path,
+            data=keyring,
+            perms='0644'
+        )
+        teuthology.write_file(
+            remote=rem,
+            path=monmap_path,
+            data=monmap,
+        )
+
+    log.info('Setting up mon nodes...')
+    mons = ctx.cluster.only(teuthology.is_type('mon', cluster_name))
+
+    if not config.get('skip_mgr_daemons', False):
+        log.info('Setting up mgr nodes...')
+        mgrs = ctx.cluster.only(teuthology.is_type('mgr', cluster_name))
+        for remote, roles_for_host in mgrs.remotes.iteritems():
+            for role in teuthology.cluster_roles_of_type(roles_for_host, 'mgr',
+                                                         cluster_name):
+                _, _, id_ = teuthology.split_role(role)
+                mgr_dir = '/var/lib/ceph/mgr/{cluster}-{id}'.format(
+                    cluster=cluster_name,
+                    id=id_,
+                )
+                remote.run(
+                    args=[
+                        'sudo',
+                        'mkdir',
+                        '-p',
+                        mgr_dir,
+                        run.Raw('&&'),
+                        'sudo',
+                        'adjust-ulimits',
+                        'ceph-coverage',
+                        coverage_dir,
+                        'ceph-authtool',
+                        '--create-keyring',
+                        '--gen-key',
+                        '--name=mgr.{id}'.format(id=id_),
+                        mgr_dir + '/keyring',
+                    ],
+                )
+
+    log.info('Setting up mds nodes...')
+    mdss = ctx.cluster.only(teuthology.is_type('mds', cluster_name))
+    for remote, roles_for_host in mdss.remotes.iteritems():
+        for role in teuthology.cluster_roles_of_type(roles_for_host, 'mds',
+                                                     cluster_name):
+            _, _, id_ = teuthology.split_role(role)
+            mds_dir = '/var/lib/ceph/mds/{cluster}-{id}'.format(
+                cluster=cluster_name,
+                id=id_,
+            )
+            remote.run(
+                args=[
+                    'sudo',
+                    'mkdir',
+                    '-p',
+                    mds_dir,
+                    run.Raw('&&'),
+                    'sudo',
+                    'adjust-ulimits',
+                    'ceph-coverage',
+                    coverage_dir,
+                    'ceph-authtool',
+                    '--create-keyring',
+                    '--gen-key',
+                    '--name=mds.{id}'.format(id=id_),
+                    mds_dir + '/keyring',
+                ],
+            )
+
+    cclient.create_keyring(ctx, cluster_name)
+    log.info('Running mkfs on osd nodes...')
+
+    if not hasattr(ctx, 'disk_config'):
+        ctx.disk_config = argparse.Namespace()
+    if not hasattr(ctx.disk_config, 'remote_to_roles_to_dev'):
+        ctx.disk_config.remote_to_roles_to_dev = {}
+    if not hasattr(ctx.disk_config, 'remote_to_roles_to_journals'):
+        ctx.disk_config.remote_to_roles_to_journals = {}
+    if not hasattr(ctx.disk_config, 'remote_to_roles_to_dev_mount_options'):
+        ctx.disk_config.remote_to_roles_to_dev_mount_options = {}
+    if not hasattr(ctx.disk_config, 'remote_to_roles_to_dev_fstype'):
+        ctx.disk_config.remote_to_roles_to_dev_fstype = {}
+
+    teuthology.deep_merge(ctx.disk_config.remote_to_roles_to_dev, remote_to_roles_to_devs)
+    teuthology.deep_merge(ctx.disk_config.remote_to_roles_to_journals, remote_to_roles_to_journals)
+
+    log.info("ctx.disk_config.remote_to_roles_to_dev: {r}".format(r=str(ctx.disk_config.remote_to_roles_to_dev)))
+    for remote, roles_for_host in osds.remotes.iteritems():
+        roles_to_devs = remote_to_roles_to_devs[remote]
+        roles_to_journals = remote_to_roles_to_journals[remote]
+
+        for role in teuthology.cluster_roles_of_type(roles_for_host, 'osd', cluster_name):
+            _, _, id_ = teuthology.split_role(role)
+            mnt_point = '/var/lib/ceph/osd/{cluster}-{id}'.format(cluster=cluster_name, id=id_)
+            remote.run(
+                args=[
+                    'sudo',
+                    'mkdir',
+                    '-p',
+                    mnt_point,
+                ])
+            log.info(str(roles_to_devs))
+            log.info(str(roles_to_journals))
+            log.info(role)
+            if roles_to_devs.get(role):
+                dev = roles_to_devs[role]
+                fs = config.get('fs')
+                package = None
+                mkfs_options = config.get('mkfs_options')
+                mount_options = config.get('mount_options')
+                if fs == 'btrfs':
+                    # package = 'btrfs-tools'
+                    if mount_options is None:
+                        mount_options = ['noatime', 'user_subvol_rm_allowed']
+                    if mkfs_options is None:
+                        mkfs_options = ['-m', 'single',
+                                        '-l', '32768',
+                                        '-n', '32768']
+                if fs == 'xfs':
+                    # package = 'xfsprogs'
+                    if mount_options is None:
+                        mount_options = ['noatime']
+                    if mkfs_options is None:
+                        mkfs_options = ['-f', '-i', 'size=2048']
+                if fs == 'ext4' or fs == 'ext3':
+                    if mount_options is None:
+                        mount_options = ['noatime', 'user_xattr']
+
+                if mount_options is None:
+                    mount_options = []
+                if mkfs_options is None:
+                    mkfs_options = []
+                mkfs = ['mkfs.%s' % fs] + mkfs_options
+                log.info('%s on %s on %s' % (mkfs, dev, remote))
+                if package is not None:
+                    remote.run(
+                        args=[
+                            'sudo',
+                            'apt-get', 'install', '-y', package
+                        ],
+                        stdout=StringIO(),
+                    )
+
+                try:
+                    remote.run(args=['yes', run.Raw('|')] + ['sudo'] + mkfs + [dev])
+                except run.CommandFailedError:
+                    # Newer btfs-tools doesn't prompt for overwrite, use -f
+                    if '-f' not in mount_options:
+                        mkfs_options.append('-f')
+                        mkfs = ['mkfs.%s' % fs] + mkfs_options
+                        log.info('%s on %s on %s' % (mkfs, dev, remote))
+                    remote.run(args=['yes', run.Raw('|')] + ['sudo'] + mkfs + [dev])
+
+                log.info('mount %s on %s -o %s' % (dev, remote,
+                                                   ','.join(mount_options)))
+                remote.run(
+                    args=[
+                        'sudo',
+                        'mount',
+                        '-t', fs,
+                        '-o', ','.join(mount_options),
+                        dev,
+                        mnt_point,
+                    ]
+                )
+                remote.run(
+                    args=[
+                        'sudo', '/sbin/restorecon', mnt_point,
+                    ],
+                    check_status=False,
+                )
+                if not remote in ctx.disk_config.remote_to_roles_to_dev_mount_options:
+                    ctx.disk_config.remote_to_roles_to_dev_mount_options[remote] = {}
+                ctx.disk_config.remote_to_roles_to_dev_mount_options[remote][role] = mount_options
+                if not remote in ctx.disk_config.remote_to_roles_to_dev_fstype:
+                    ctx.disk_config.remote_to_roles_to_dev_fstype[remote] = {}
+                ctx.disk_config.remote_to_roles_to_dev_fstype[remote][role] = fs
+                devs_to_clean[remote].append(mnt_point)
+
+        for role in teuthology.cluster_roles_of_type(roles_for_host, 'osd', cluster_name):
+            _, _, id_ = teuthology.split_role(role)
+            remote.run(
+                args=[
+                    'sudo',
+                    'MALLOC_CHECK_=3',
+                    'adjust-ulimits',
+                    'ceph-coverage',
+                    coverage_dir,
+                    'ceph-osd',
+                    '--cluster',
+                    cluster_name,
+                    '--mkfs',
+                    '--mkkey',
+                    '-i', id_,
+                    '--monmap', monmap_path,
+                ],
+            )
+
+    log.info('Reading keys from all nodes...')
+    keys_fp = StringIO()
+    keys = []
+    for remote, roles_for_host in ctx.cluster.remotes.iteritems():
+        for type_ in ['mgr',  'mds', 'osd']:
+            if type_ == 'mgr' and config.get('skip_mgr_daemons', False):
+                continue
+            for role in teuthology.cluster_roles_of_type(roles_for_host, type_, cluster_name):
+                _, _, id_ = teuthology.split_role(role)
+                data = teuthology.get_file(
+                    remote=remote,
+                    path='/var/lib/ceph/{type}/{cluster}-{id}/keyring'.format(
+                        type=type_,
+                        id=id_,
+                        cluster=cluster_name,
+                    ),
+                    sudo=True,
+                )
+                keys.append((type_, id_, data))
+                keys_fp.write(data)
+    for remote, roles_for_host in ctx.cluster.remotes.iteritems():
+        for role in teuthology.cluster_roles_of_type(roles_for_host, 'client', cluster_name):
+            _, _, id_ = teuthology.split_role(role)
+            data = teuthology.get_file(
+                remote=remote,
+                path='/etc/ceph/{cluster}.client.{id}.keyring'.format(id=id_, cluster=cluster_name)
+            )
+            keys.append(('client', id_, data))
+            keys_fp.write(data)
+
+    log.info('Adding keys to all mons...')
+    writes = mons.run(
+        args=[
+            'sudo', 'tee', '-a',
+            keyring_path,
+        ],
+        stdin=run.PIPE,
+        wait=False,
+        stdout=StringIO(),
+    )
+    keys_fp.seek(0)
+    teuthology.feed_many_stdins_and_close(keys_fp, writes)
+    run.wait(writes)
+    for type_, id_, data in keys:
+        run.wait(
+            mons.run(
+                args=[
+                         'sudo',
+                         'adjust-ulimits',
+                         'ceph-coverage',
+                         coverage_dir,
+                         'ceph-authtool',
+                         keyring_path,
+                         '--name={type}.{id}'.format(
+                             type=type_,
+                             id=id_,
+                         ),
+                     ] + list(generate_caps(type_)),
+                wait=False,
+            ),
+        )
+
+    log.info('Running mkfs on mon nodes...')
+    for remote, roles_for_host in mons.remotes.iteritems():
+        for role in teuthology.cluster_roles_of_type(roles_for_host, 'mon', cluster_name):
+            _, _, id_ = teuthology.split_role(role)
+            remote.run(
+                args=[
+                    'sudo',
+                    'mkdir',
+                    '-p',
+                    '/var/lib/ceph/mon/{cluster}-{id}'.format(id=id_, cluster=cluster_name),
+                ],
+            )
+            remote.run(
+                args=[
+                    'sudo',
+                    'adjust-ulimits',
+                    'ceph-coverage',
+                    coverage_dir,
+                    'ceph-mon',
+                    '--cluster', cluster_name,
+                    '--mkfs',
+                    '-i', id_,
+                    '--monmap', monmap_path,
+                    '--keyring', keyring_path,
+                ],
+            )
+
+    run.wait(
+        mons.run(
+            args=[
+                'rm',
+                '--',
+                monmap_path,
+            ],
+            wait=False,
+        ),
+    )
+
+    try:
+        yield
+    except Exception:
+        # we need to know this below
+        ctx.summary['success'] = False
+        raise
+    finally:
+        (mon0_remote,) = ctx.cluster.only(firstmon).remotes.keys()
+
+        log.info('Checking cluster log for badness...')
+
+        def first_in_ceph_log(pattern, excludes):
+            """
+            Find the first occurence of the pattern specified in the Ceph log,
+            Returns None if none found.
+
+            :param pattern: Pattern scanned for.
+            :param excludes: Patterns to ignore.
+            :return: First line of text (or None if not found)
+            """
+            args = [
+                'sudo',
+                'egrep', pattern,
+                '/var/log/ceph/{cluster}.log'.format(cluster=cluster_name),
+            ]
+            for exclude in excludes:
+                args.extend([run.Raw('|'), 'egrep', '-v', exclude])
+            args.extend([
+                run.Raw('|'), 'head', '-n', '1',
+            ])
+            r = mon0_remote.run(
+                stdout=StringIO(),
+                args=args,
+            )
+            stdout = r.stdout.getvalue()
+            if stdout != '':
+                return stdout
+            return None
+
+        if first_in_ceph_log('\[ERR\]|\[WRN\]|\[SEC\]',
+                             config['log_whitelist']) is not None:
+            log.warning('Found errors (ERR|WRN|SEC) in cluster log')
+            ctx.summary['success'] = False
+            # use the most severe problem as the failure reason
+            if 'failure_reason' not in ctx.summary:
+                for pattern in ['\[SEC\]', '\[ERR\]', '\[WRN\]']:
+                    match = first_in_ceph_log(pattern, config['log_whitelist'])
+                    if match is not None:
+                        ctx.summary['failure_reason'] = \
+                            '"{match}" in cluster log'.format(
+                                match=match.rstrip('\n'),
+                            )
+                        break
+
+        for remote, dirs in devs_to_clean.iteritems():
+            for dir_ in dirs:
+                log.info('Unmounting %s on %s' % (dir_, remote))
+                try:
+                    remote.run(
+                        args=[
+                            'sync',
+                            run.Raw('&&'),
+                            'sudo',
+                            'umount',
+                            '-f',
+                            dir_
+                        ]
+                    )
+                except Exception as e:
+                    remote.run(args=[
+                        'sudo',
+                        run.Raw('PATH=/usr/sbin:$PATH'),
+                        'lsof',
+                        run.Raw(';'),
+                        'ps', 'auxf',
+                    ])
+                    raise e
+
+        if config.get('tmpfs_journal'):
+            log.info('tmpfs journal enabled - unmounting tmpfs at /mnt')
+            for remote, roles_for_host in osds.remotes.iteritems():
+                remote.run(
+                    args=['sudo', 'umount', '-f', '/mnt'],
+                    check_status=False,
+                )
+
+        if ctx.archive is not None and \
+                not (ctx.config.get('archive-on-error') and ctx.summary['success']):
+
+            # archive mon data, too
+            log.info('Archiving mon data...')
+            path = os.path.join(ctx.archive, 'data')
+            try:
+                os.makedirs(path)
+            except OSError as e:
+                if e.errno == errno.EEXIST:
+                    pass
+                else:
+                    raise
+            for remote, roles in mons.remotes.iteritems():
+                for role in roles:
+                    is_mon = teuthology.is_type('mon', cluster_name)
+                    if is_mon(role):
+                        _, _, id_ = teuthology.split_role(role)
+                        mon_dir = '/var/lib/ceph/mon/' + \
+                                  '{0}-{1}'.format(cluster_name, id_)
+                        teuthology.pull_directory_tarball(
+                            remote,
+                            mon_dir,
+                            path + '/' + role + '.tgz')
+
+        log.info('Cleaning ceph cluster...')
+        run.wait(
+            ctx.cluster.run(
+                args=[
+                    'sudo',
+                    'rm',
+                    '-rf',
+                    '--',
+                    conf_path,
+                    keyring_path,
+                    data_dir,
+                    monmap_path,
+                    run.Raw('{tdir}/../*.pid'.format(tdir=testdir)),
+                ],
+                wait=False,
+            ),
+        )
+
+
+def osd_scrub_pgs(ctx, config):
+    """
+    Scrub pgs when we exit.
+
+    First make sure all pgs are active and clean.
+    Next scrub all osds.
+    Then periodically check until all pgs have scrub time stamps that
+    indicate the last scrub completed.  Time out if no progress is made
+    for more than ``retries`` consecutive polls (polls are ``delays``
+    seconds apart).
+
+    :param ctx: Context; ``ctx.managers`` and ``ctx.cluster`` are used
+    :param config: Configuration; must contain the ``cluster`` key
+    :raises RuntimeError: if the pgs never become active+clean, or if the
+                          scrub stops making progress
+    """
+    retries = 40
+    delays = 20
+    cluster_name = config['cluster']
+    manager = ctx.managers[cluster_name]
+
+    # Phase 1: wait for every pg to report active+clean.
+    all_clean = False
+    for _ in range(0, retries):
+        stats = manager.get_pg_stats()
+        bad = [stat['pgid'] for stat in stats if 'active+clean' not in stat['state']]
+        if not bad:
+            all_clean = True
+            break
+        log.info(
+            "Waiting for all PGs to be active and clean, waiting on %s" % bad)
+        time.sleep(delays)
+    if not all_clean:
+        raise RuntimeError("Scrubbing terminated -- not all pgs were active and clean.")
+
+    # Phase 2: remember "now" and ask every osd for a deep scrub.  The one
+    # second sleep guarantees that a scrub stamp taken after this point
+    # compares strictly greater at one-second resolution.
+    check_time_now = time.localtime()
+    time.sleep(1)
+    all_roles = teuthology.all_roles(ctx.cluster)
+    for role in teuthology.cluster_roles_of_type(all_roles, 'osd', cluster_name):
+        log.info("Scrubbing {osd}".format(osd=role))
+        _, _, id_ = teuthology.split_role(role)
+        # allow this to fail; in certain cases the OSD might not be up
+        # at this point.  we will catch all pgs below.
+        try:
+            manager.raw_cluster_cmd('osd', 'deep-scrub', id_)
+        except run.CommandFailedError:
+            pass
+
+    # Phase 3: poll until every pg's last_scrub_stamp is newer than
+    # check_time_now.
+    prev_good = 0
+    gap_cnt = 0
+    loop = True
+    while loop:
+        stats = manager.get_pg_stats()
+        timez = [(stat['pgid'], stat['last_scrub_stamp']) for stat in stats]
+        loop = False
+        thiscnt = 0
+        for (pgid, tmval) in timez:
+            # Drop the fractional seconds.  partition() leaves the stamp
+            # untouched when it has no '.', whereas slicing up to
+            # find('.') == -1 would silently chop the last digit.
+            pgtm = time.strptime(tmval.partition('.')[0], '%Y-%m-%d %H:%M:%S')
+            if pgtm > check_time_now:
+                thiscnt += 1
+            else:
+                log.info('pgid %s last_scrub_stamp %s %s <= %s', pgid, tmval, pgtm, check_time_now)
+                loop = True
+        if thiscnt > prev_good:
+            # progress was made since the previous poll
+            prev_good = thiscnt
+            gap_cnt = 0
+        else:
+            gap_cnt += 1
+            if gap_cnt % 6 == 0:
+                for (pgid, tmval) in timez:
+                    # re-request scrub every so often in case the earlier
+                    # request was missed.  do not do it every time because
+                    # the scrub may be in progress or not reported yet and
+                    # we will starve progress.
+                    manager.raw_cluster_cmd('pg', 'deep-scrub', pgid)
+            if gap_cnt > retries:
+                raise RuntimeError('Exiting scrub checking -- not all pgs scrubbed.')
+        if loop:
+            log.info('Still waiting for all pgs to be scrubbed.')
+            time.sleep(delays)
+
+
+@contextlib.contextmanager
+def run_daemon(ctx, config, type_):
+    """
+    Run daemons for a role type.  Handle the startup and termination of a daemon.
+    On startup -- register freshly made osds with the cluster (osd type
+    only), then build each daemon's command line from the coverage,
+    cpu_profile and valgrind settings, register it and (re)start it.
+    On cleanup -- Stop all existing daemons of this type.
+
+    :param ctx: Context
+    :param config: Configuration; must contain ``cluster``.  ``coverage``,
+                   ``valgrind``, ``cpu_profile``, ``add_osds_to_crush`` and
+                   ``skip_mgr_daemons`` are honoured when present.
+    :param type_: Role type
+    """
+    cluster_name = config['cluster']
+    log.info('Starting %s daemons in cluster %s...', type_, cluster_name)
+    testdir = teuthology.get_testdir(ctx)
+    daemons = ctx.cluster.only(teuthology.is_type(type_, cluster_name))
+
+    # check whether any daemons of this type are configured
+    if daemons is None:
+        # contextlib.contextmanager requires the generator to yield exactly
+        # once; a bare return here would raise RuntimeError on entry.
+        yield
+        return
+    coverage_dir = '{tdir}/archive/coverage'.format(tdir=testdir)
+
+    # daemon-helper is told to use 'term' instead of 'kill' when coverage
+    # or valgrind is configured.
+    daemon_signal = 'kill'
+    if config.get('coverage') or config.get('valgrind') is not None:
+        daemon_signal = 'term'
+
+    # create osds in order.  (this only matters for pre-luminous, which might
+    # be hammer, which doesn't take an id_ argument to legacy 'osd create').
+    osd_uuids = {}
+    for remote, roles_for_host in daemons.remotes.iteritems():
+        is_type_ = teuthology.is_type(type_, cluster_name)
+        for role in roles_for_host:
+            if not is_type_(role):
+                continue
+            _, _, id_ = teuthology.split_role(role)
+
+            if type_ == 'osd':
+                datadir = '/var/lib/ceph/osd/{cluster}-{id}'.format(
+                    cluster=cluster_name, id=id_)
+                osd_uuid = teuthology.get_file(
+                    remote=remote,
+                    path=datadir + '/fsid',
+                    sudo=True,
+                ).strip()
+                osd_uuids[id_] = osd_uuid
+    for osd_id in range(len(osd_uuids)):
+        id_ = str(osd_id)
+        osd_uuid = osd_uuids.get(id_)
+        # NOTE: 'remote' is whichever host the loop above visited last; the
+        # cluster-wide 'ceph osd ...' commands are issued from that host.
+        try:
+            remote.run(
+                args=[
+                    'sudo', 'ceph', '--cluster', cluster_name,
+                    'osd', 'new', osd_uuid, id_,
+                ]
+            )
+        except Exception:
+            # fallback to pre-luminous (hammer or jewel).  Catch Exception
+            # rather than everything so that KeyboardInterrupt/SystemExit
+            # still abort the run.
+            remote.run(
+                args=[
+                    'sudo', 'ceph', '--cluster', cluster_name,
+                    'osd', 'create', osd_uuid,
+                ]
+            )
+            # only the fallback path adds the osd to the crush map here
+            if config.get('add_osds_to_crush'):
+                remote.run(
+                    args=[
+                        'sudo', 'ceph', '--cluster', cluster_name,
+                        'osd', 'crush', 'create-or-move', 'osd.' + id_,
+                        '1.0', 'host=localhost', 'root=default',
+                    ]
+                )
+
+    for remote, roles_for_host in daemons.remotes.iteritems():
+        is_type_ = teuthology.is_type(type_, cluster_name)
+        for role in roles_for_host:
+            if not is_type_(role):
+                continue
+            _, _, id_ = teuthology.split_role(role)
+
+            # wrapper part of the command line; valgrind may replace it
+            run_cmd = [
+                'sudo',
+                'adjust-ulimits',
+                'ceph-coverage',
+                coverage_dir,
+                'daemon-helper',
+                daemon_signal,
+            ]
+            # the daemon invocation itself, appended after the wrappers
+            run_cmd_tail = [
+                'ceph-%s' % (type_),
+                '-f',
+                '--cluster', cluster_name,
+                '-i', id_]
+
+            if type_ in config.get('cpu_profile', []):
+                profile_path = '/var/log/ceph/profiling-logger/%s.prof' % (role)
+                run_cmd.extend(['env', 'CPUPROFILE=%s' % profile_path])
+
+            if config.get('valgrind') is not None:
+                valgrind_args = None
+                # a per-role entry overrides a per-type entry
+                if type_ in config['valgrind']:
+                    valgrind_args = config['valgrind'][type_]
+                if role in config['valgrind']:
+                    valgrind_args = config['valgrind'][role]
+                run_cmd = teuthology.get_valgrind_args(testdir, role,
+                                                       run_cmd,
+                                                       valgrind_args)
+
+            run_cmd.extend(run_cmd_tail)
+
+            # always register mgr; don't necessarily start
+            ctx.daemons.register_daemon(
+                remote, type_, id_,
+                cluster=cluster_name,
+                args=run_cmd,
+                logger=log.getChild(role),
+                stdin=run.PIPE,
+                wait=False
+            )
+            if type_ != 'mgr' or not config.get('skip_mgr_daemons', False):
+                ctx.daemons.get_daemon(type_, id_, cluster_name).restart()
+
+    try:
+        yield
+    finally:
+        teuthology.stop_daemons_of_type(ctx, type_, cluster_name)
+
+
+def healthy(ctx, config):
+    """
+    Block until the cluster looks healthy: the mgr is available (best
+    effort), all osds are up, pgs are clean, the health check passes and,
+    when mds roles exist, the filesystem daemons are ready.
+
+    :param ctx: Context
+    :param config: Configuration; anything that is not a dict is treated
+                   as an empty configuration
+    """
+    if not isinstance(config, dict):
+        config = dict()
+    cluster_name = config.get('cluster', 'ceph')
+    log.info('Waiting until %s daemons up and pgs clean...', cluster_name)
+    manager = ctx.managers[cluster_name]
+
+    # A missing mgr is tolerated; the wait is only best effort.
+    try:
+        manager.wait_for_mgr_available(timeout=30)
+    except (run.CommandFailedError, AssertionError) as err:
+        log.info('ignoring mgr wait error, probably testing upgrade: %s', err)
+
+    first_mon_role = teuthology.get_first_mon(ctx, config, cluster_name)
+    first_mon_remotes = ctx.cluster.only(first_mon_role).remotes.keys()
+    (mon0_remote,) = first_mon_remotes
+
+    osd_wait_kwargs = dict(
+        cluster=ctx.cluster,
+        remote=mon0_remote,
+        ceph_cluster=cluster_name,
+    )
+    teuthology.wait_until_osds_up(ctx, **osd_wait_kwargs)
+
+    # Flushing pg stats is best effort as well.
+    try:
+        manager.flush_all_pg_stats()
+    except (run.CommandFailedError, Exception) as err:
+        log.info('ignoring flush pg stats error, probably testing upgrade: %s', err)
+    manager.wait_for_clean()
+
+    log.info('Waiting until ceph cluster %s is healthy...', cluster_name)
+    teuthology.wait_until_healthy(
+        ctx,
+        remote=mon0_remote,
+        ceph_cluster=cluster_name,
+    )
+
+    mds_hosts = ctx.cluster.only(teuthology.is_type('mds', cluster_name))
+    if not mds_hosts.remotes:
+        return
+    # Some MDSs exist, wait for them to be healthy
+    ceph_fs = Filesystem(ctx) # TODO: make Filesystem cluster-aware
+    ceph_fs.wait_for_daemons(timeout=300)
+
+
+def wait_for_osds_up(ctx, config):
+    """
+    Wait for all osd's to come up.
+
+    :param ctx: Context
+    :param config: Configuration; an optional ``cluster`` key names the
+                   ceph cluster (default 'ceph').  Anything that is not a
+                   dict is treated as an empty configuration.
+    """
+    log.info('Waiting until ceph osds are all up...')
+    # Same normalisation as healthy(): tolerate a missing/non-dict config.
+    config = config if isinstance(config, dict) else dict()
+    cluster_name = config.get('cluster', 'ceph')
+    firstmon = teuthology.get_first_mon(ctx, config, cluster_name)
+    (mon0_remote,) = ctx.cluster.only(firstmon).remotes.keys()
+    teuthology.wait_until_osds_up(
+        ctx,
+        cluster=ctx.cluster,
+        remote=mon0_remote,
+        # pass the resolved cluster name so that a non-default cluster is
+        # the one being waited on, as healthy() does
+        ceph_cluster=cluster_name,
+    )
+
+
+def wait_for_mon_quorum(ctx, config):
+    """
+    Check remote ceph status until all monitors are up.
+
+    ``config`` is either a list of monitor names (the cluster is then
+    taken to be 'ceph') or a dict with a ``daemons`` list and an optional
+    ``cluster`` name.  ``quorum_status`` is polled on the first monitor's
+    host until the reported ``quorum_names`` equal the requested monitors
+    (order is ignored); the retry budget is the one handed to
+    ``contextutil.safe_while`` below.
+
+    :param ctx: Context
+    :param config: Configuration
+    """
+    if isinstance(config, dict):
+        mons = config['daemons']
+        cluster_name = config.get('cluster', 'ceph')
+    else:
+        assert isinstance(config, list)
+        mons = config
+        cluster_name = 'ceph'
+    firstmon = teuthology.get_first_mon(ctx, config, cluster_name)
+    (remote,) = ctx.cluster.only(firstmon).remotes.keys()
+    with contextutil.safe_while(sleep=10, tries=60,
+                                action='wait for monitor quorum') as proceed:
+        while proceed():
+            r = remote.run(
+                args=[
+                    'sudo',
+                    'ceph',
+                    # query the cluster that was resolved above rather than
+                    # whatever the CLI defaults to
+                    '--cluster', cluster_name,
+                    'quorum_status',
+                ],
+                stdout=StringIO(),
+                logger=log.getChild('quorum_status'),
+            )
+            j = json.loads(r.stdout.getvalue())
+            q = j.get('quorum_names', [])
+            log.debug('Quorum: %s', q)
+            if sorted(q) == sorted(mons):
+                break
+
+
+def created_pool(ctx, config):
+    """
+    Register newly created pools with the ceph-manager.
+
+    Every pool name in ``config`` that the manager of the 'ceph' cluster
+    does not track yet is added to its ``pools`` dictionary, mapped to the
+    value the manager reports for that pool's ``pg_num`` property.
+
+    :param ctx: Context
+    :param config: Iterable of pool names
+    """
+    for pool_name in config:
+        manager = ctx.managers['ceph']
+        if pool_name in manager.pools:
+            # already known, leave the recorded value untouched
+            continue
+        pg_num = manager.get_pool_property(pool_name, 'pg_num')
+        manager.pools[pool_name] = pg_num
+
+
+@contextlib.contextmanager
+def restart(ctx, config):
+    """
+    Restart ceph daemons.
+
+    For example::
+
+        tasks:
+        - ceph.restart: [all]
+
+    For example::
+
+        tasks:
+        - ceph.restart: [osd.0, mon.1, mds.*]
+
+    or::
+
+        tasks:
+        - ceph.restart:
+            daemons: [osd.0, mon.1]
+            wait-for-healthy: false
+            wait-for-osds-up: true
+
+    After all selected daemons have been restarted, every restarted OSD is
+    marked down through the manager of the cluster it belongs to. Then,
+    per affected cluster, the task optionally waits for health
+    (``wait-for-healthy``, default true) and for OSDs to be up
+    (``wait-for-osds-up``, default false).
+
+    :param ctx: Context
+    :param config: Configuration; ``None``, a list of roles, or a dict
+    """
+    if config is None:
+        config = {}
+    elif isinstance(config, list):
+        config = {'daemons': config}
+
+    daemons = ctx.daemons.resolve_role_list(config.get('daemons', None), CEPH_ROLE_TYPES, True)
+    clusters = set()
+    restarted_osds = []
+    for role in daemons:
+        cluster, type_, id_ = teuthology.split_role(role)
+        ctx.daemons.get_daemon(type_, id_, cluster).restart()
+        clusters.add(cluster)
+        # Remember numeric OSD ids together with their cluster. Parsing via
+        # split_role also covers cluster-prefixed roles such as
+        # "backup.osd.0", which a plain split('.') would misread.
+        if type_ == 'osd' and id_.isdigit():
+            restarted_osds.append((cluster, int(id_)))
+
+    # Mark the OSDs down only after every daemon has been restarted, using
+    # the manager of the OSD's own cluster rather than always 'ceph'.
+    for osd_cluster, osd_id in restarted_osds:
+        ctx.managers[osd_cluster].mark_down_osd(osd_id)
+
+    if config.get('wait-for-healthy', True):
+        for cluster in clusters:
+            healthy(ctx=ctx, config=dict(cluster=cluster))
+    if config.get('wait-for-osds-up', False):
+        for cluster in clusters:
+            wait_for_osds_up(ctx=ctx, config=dict(cluster=cluster))
+    yield
+
+
+@contextlib.contextmanager
+def stop(ctx, config):
+    """
+    Stop ceph daemons
+
+    For example::
+
+      tasks:
+      - ceph.stop: [mds.*]
+
+      tasks:
+      - ceph.stop: [osd.0, osd.2]
+
+      tasks:
+      - ceph.stop:
+          daemons: [osd.0, osd.2]
+
+    :param ctx: Context
+    :param config: ``None``, a list of roles, or a dict with a ``daemons`` list
+    """
+    # Accept the shorthand forms: a bare list of roles, or nothing at all.
+    if isinstance(config, list):
+        config = {'daemons': config}
+    elif config is None:
+        config = {}
+
+    role_spec = config.get('daemons', None)
+    selected_roles = ctx.daemons.resolve_role_list(role_spec, CEPH_ROLE_TYPES, True)
+    for role in selected_roles:
+        cluster, type_, id_ = teuthology.split_role(role)
+        daemon = ctx.daemons.get_daemon(type_, id_, cluster)
+        daemon.stop()
+
+    yield
+
+
+@contextlib.contextmanager
+def wait_for_failure(ctx, config):
+    """
+    Wait for a failure of a ceph daemon
+
+    For example::
+
+      tasks:
+      - ceph.wait_for_failure: [mds.*]
+
+      tasks:
+      - ceph.wait_for_failure: [osd.0, osd.2]
+
+      tasks:
+      - ceph.wait_for_failure:
+          daemons: [osd.0, osd.2]
+
+    Each selected daemon is waited on in turn; the wait is expected to raise.
+
+    :param ctx: Context
+    :param config: ``None``, a list of roles, or a dict with a ``daemons`` list
+    :raises RuntimeError: if waiting on a daemon returns without an error
+    """
+    if config is None:
+        config = {}
+    elif isinstance(config, list):
+        config = {'daemons': config}
+
+    daemons = ctx.daemons.resolve_role_list(config.get('daemons', None), CEPH_ROLE_TYPES, True)
+    for role in daemons:
+        cluster, type_, id_ = teuthology.split_role(role)
+        try:
+            ctx.daemons.get_daemon(type_, id_, cluster).wait()
+        except Exception as e:
+            # Only ordinary errors count as the expected failure; a bare
+            # except would also swallow KeyboardInterrupt/SystemExit.
+            log.info('Saw expected daemon failure.  Continuing.')
+            log.debug('failure of %s was: %s', role, e)
+        else:
+            raise RuntimeError('daemon %s did not fail' % role)
+
+    yield
+
+
+def validate_config(ctx, config):
+    """
+    Sanity-check the task configuration against the cluster layout.
+
+    Currently one rule is enforced: the osd roles placed on a single host
+    must all belong to the same cluster.
+
+    :param ctx: Context
+    :param config: Configuration (not inspected by the current check)
+    :raises exceptions.ConfigError: if a host carries osds of two clusters
+    """
+    # check for osds from multiple clusters on the same host
+    for roles_for_host in ctx.cluster.remotes.values():
+        seen_cluster = None
+        seen_role = None
+        for role in roles_for_host:
+            cluster_of_role, kind, _ = teuthology.split_role(role)
+            if kind == 'osd':
+                if seen_cluster and seen_cluster != cluster_of_role:
+                    raise exceptions.ConfigError(
+                        "Host should not have osds (%s and %s) from multiple clusters" % (
+                            seen_role, role))
+                seen_cluster = cluster_of_role
+                seen_role = role
+
+
+@contextlib.contextmanager
+def task(ctx, config):
+    """
+    Set up and tear down a Ceph cluster.
+
+    For example::
+
+        tasks:
+        - ceph:
+        - interactive:
+
+    You can also specify what branch to run::
+
+        tasks:
+        - ceph:
+            branch: foo
+
+    Or a tag::
+
+        tasks:
+        - ceph:
+            tag: v0.42.13
+
+    Or a sha1::
+
+        tasks:
+        - ceph:
+            sha1: 1376a5ab0c89780eab39ffbbe436f6a6092314ed
+
+    Or a local source dir::
+
+        tasks:
+        - ceph:
+            path: /home/sage/ceph
+
+    To capture code coverage data, use::
+
+        tasks:
+        - ceph:
+            coverage: true
+
+    To use btrfs, ext4, or xfs on the target's scratch disks, use::
+
+        tasks:
+        - ceph:
+            fs: xfs
+            mkfs_options: [-b,size=65536,-l,logdev=/dev/sdc1]
+            mount_options: [nobarrier, inode64]
+
+    Note, this will cause the task to check the /scratch_devs file on each node
+    for available devices.  If no such file is found, /dev/sdb will be used.
+
+    To run some daemons under valgrind, include their names
+    and the tool/args to use in a valgrind section::
+
+        tasks:
+        - ceph:
+            valgrind:
+              mds.1: --tool=memcheck
+              osd.1: [--tool=memcheck, --leak-check=no]
+
+    Those nodes which are using memcheck or valgrind will get
+    checked for bad results.
+
+    To adjust or modify config options, use::
+
+        tasks:
+        - ceph:
+            conf:
+              section:
+                key: value
+
+    For example::
+
+        tasks:
+        - ceph:
+            conf:
+              mds.0:
+                some option: value
+                other key: other value
+              client.0:
+                debug client: 10
+                debug ms: 1
+
+    By default, the cluster log is checked for errors and warnings,
+    and the run marked failed if any appear. You can ignore log
+    entries by giving a list of egrep compatible regexes, e.g.::
+
+        tasks:
+        - ceph:
+            log-whitelist: ['foo.*bar', 'bad message']
+
+    To run multiple ceph clusters, use multiple ceph tasks, and roles
+    with a cluster name prefix, e.g. cluster1.client.0. Roles with no
+    cluster use the default cluster name, 'ceph'. OSDs from separate
+    clusters must be on separate hosts. Clients and non-osd daemons
+    from multiple clusters may be colocated. For each cluster, add an
+    instance of the ceph task with the cluster name specified, e.g.::
+
+        roles:
+        - [mon.a, osd.0, osd.1]
+        - [backup.mon.a, backup.osd.0, backup.osd.1]
+        - [client.0, backup.client.0]
+        tasks:
+        - ceph:
+            cluster: ceph
+        - ceph:
+            cluster: backup
+
+    Two further keys are read directly by this function, both defaulting
+    to true: ``wait-for-healthy`` (wait for the cluster to become healthy
+    before yielding) and ``wait-for-scrub`` (scrub the pgs during teardown).
+
+    :param ctx: Context
+    :param config: Configuration; ``None`` or a dictionary
+
+    """
+    if config is None:
+        config = {}
+    assert isinstance(config, dict), \
+        "task ceph only supports a dictionary for configuration"
+
+    # Merge the 'ceph' section of the job-level overrides into this task's
+    # own configuration.
+    overrides = ctx.config.get('overrides', {})
+    teuthology.deep_merge(config, overrides.get('ceph', {}))
+
+    # The daemon registry is shared by all ceph task instances; whichever
+    # instance finds it missing is the first one and creates it.
+    first_ceph_cluster = False
+    if not hasattr(ctx, 'daemons'):
+        first_ceph_cluster = True
+        ctx.daemons = DaemonGroup()
+
+    testdir = teuthology.get_testdir(ctx)
+    if config.get('coverage'):
+        coverage_dir = '{tdir}/archive/coverage'.format(tdir=testdir)
+        log.info('Creating coverage directory...')
+        # Started on the whole cluster without blocking (wait=False); then
+        # wait for all of the started commands together.
+        run.wait(
+            ctx.cluster.run(
+                args=[
+                    'install', '-d', '-m0755', '--',
+                    coverage_dir,
+                ],
+                wait=False,
+            )
+        )
+
+    if 'cluster' not in config:
+        config['cluster'] = 'ceph'
+
+    validate_config(ctx, config)
+
+    # Each entry is a zero-argument callable producing a context manager;
+    # they are handed to contextutil.nested() below in this order.
+    subtasks = []
+    if first_ceph_cluster:
+        # these tasks handle general log setup and parsing on all hosts,
+        # so they should only be run once
+        subtasks = [
+            lambda: ceph_log(ctx=ctx, config=None),
+            lambda: valgrind_post(ctx=ctx, config=config),
+        ]
+
+    subtasks += [
+        # cluster() receives a trimmed-down config containing only the
+        # keys listed here, with their defaults filled in.
+        lambda: cluster(ctx=ctx, config=dict(
+            conf=config.get('conf', {}),
+            fs=config.get('fs', 'xfs'),
+            mkfs_options=config.get('mkfs_options', None),
+            mount_options=config.get('mount_options', None),
+            block_journal=config.get('block_journal', None),
+            tmpfs_journal=config.get('tmpfs_journal', None),
+            skip_mgr_daemons=config.get('skip_mgr_daemons', False),
+            log_whitelist=config.get('log-whitelist', []),
+            cpu_profile=set(config.get('cpu_profile', []),),
+            cluster=config['cluster'],
+        )),
+        lambda: run_daemon(ctx=ctx, config=config, type_='mon'),
+        lambda: run_daemon(ctx=ctx, config=config, type_='mgr'),
+        lambda: crush_setup(ctx=ctx, config=config),
+        lambda: run_daemon(ctx=ctx, config=config, type_='osd'),
+        lambda: create_rbd_pool(ctx=ctx, config=config),
+        lambda: cephfs_setup(ctx=ctx, config=config),
+        lambda: run_daemon(ctx=ctx, config=config, type_='mds'),
+    ]
+
+    with contextutil.nested(*subtasks):
+        first_mon = teuthology.get_first_mon(ctx, config, config['cluster'])
+        (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys()
+        # One CephManager per cluster name, stored on the shared context so
+        # other tasks can look it up via ctx.managers[<cluster>].
+        if not hasattr(ctx, 'managers'):
+            ctx.managers = {}
+        ctx.managers[config['cluster']] = CephManager(
+            mon,
+            ctx=ctx,
+            logger=log.getChild('ceph_manager.' + config['cluster']),
+            cluster=config['cluster'],
+        )
+
+        try:
+            if config.get('wait-for-healthy', True):
+                healthy(ctx=ctx, config=dict(cluster=config['cluster']))
+
+            # Hand control to the tasks nested inside this one.
+            yield
+        finally:
+            # Teardown runs whether or not the nested tasks raised.
+            if config.get('wait-for-scrub', True):
+                osd_scrub_pgs(ctx, config)
+
+            # stop logging health to clog during shutdown, or else we generate
+            # a bunch of scary messages unrelated to our actual run.
+            firstmon = teuthology.get_first_mon(ctx, config, config['cluster'])
+            (mon0_remote,) = ctx.cluster.only(firstmon).remotes.keys()
+            mon0_remote.run(
+                args=[
+                    'sudo',
+                    'ceph',
+                    '--cluster', config['cluster'],
+                    'tell',
+                    'mon.*',
+                    'injectargs',
+                    '--',
+                    '--no-mon-health-to-clog',
+                ]
+            )
diff --git a/src/ceph/qa/tasks/ceph_client.py b/src/ceph/qa/tasks/ceph_client.py
new file mode 100644
index 0000000..3ca90b7
--- /dev/null
+++ b/src/ceph/qa/tasks/ceph_client.py
@@ -0,0 +1,42 @@
+"""
+Set up client keyring
+"""
+import logging
+
+from teuthology import misc as teuthology
+from teuthology.orchestra import run
+
+log = logging.getLogger(__name__)
+
+def create_keyring(ctx, cluster_name):
+    """
+    Generate a keyring for every client role of the given cluster.
+
+    On each host carrying client roles, ``ceph-authtool`` creates
+    ``/etc/ceph/<cluster>.<name>.keyring`` with a fresh key, and the file is
+    then made world-readable (mode 0644).
+
+    :param ctx: Context
+    :param cluster_name: Name of the ceph cluster whose clients get keyrings
+    """
+    log.info('Setting up client nodes...')
+    clients = ctx.cluster.only(teuthology.is_type('client', cluster_name))
+    coverage_dir = '{tdir}/archive/coverage'.format(
+        tdir=teuthology.get_testdir(ctx))
+    for remote, roles_for_host in clients.remotes.iteritems():
+        client_roles = teuthology.cluster_roles_of_type(
+            roles_for_host, 'client', cluster_name)
+        for role in client_roles:
+            name = teuthology.ceph_role(role)
+            keyring_path = '/etc/ceph/{0}.{1}.keyring'.format(cluster_name, name)
+            generate_key = [
+                'sudo',
+                'adjust-ulimits',
+                'ceph-coverage',
+                coverage_dir,
+                'ceph-authtool',
+                '--create-keyring',
+                '--gen-key',
+                # TODO this --name= is not really obeyed, all unknown "types" are munged to "client"
+                '--name={name}'.format(name=name),
+                keyring_path,
+            ]
+            make_readable = ['sudo', 'chmod', '0644', keyring_path]
+            # chmod only runs when key generation succeeded (shell '&&')
+            remote.run(args=generate_key + [run.Raw('&&')] + make_readable)
diff --git a/src/ceph/qa/tasks/ceph_deploy.py b/src/ceph/qa/tasks/ceph_deploy.py
new file mode 100644
index 0000000..38fbe43
--- /dev/null
+++ b/src/ceph/qa/tasks/ceph_deploy.py
@@ -0,0 +1,862 @@
+"""
+Execute ceph-deploy as a task
+"""
+from cStringIO import StringIO
+
+import contextlib
+import os
+import time
+import logging
+import traceback
+
+from teuthology import misc as teuthology
+from teuthology import contextutil
+from teuthology.config import config as teuth_config
+from teuthology.task import install as install_fn
+from teuthology.orchestra import run
+from tasks.cephfs.filesystem import Filesystem
+from teuthology.misc import wait_until_healthy
+
+log = logging.getLogger(__name__)
+
+
+@contextlib.contextmanager
+def download_ceph_deploy(ctx, config):
+    """
+    Clone ceph-deploy onto the admin node and bootstrap it.
+
+    The repository is cloned from ``teuth_config.ceph_git_base_url`` at the
+    branch given by ``ceph-deploy-branch`` (default ``master``) into
+    ``<testdir>/ceph-deploy``, and its ``bootstrap`` script is run. When
+    ``python_version`` is configured it must be ``2`` or ``3``; the matching
+    Python package plus ``python-virtualenv`` is installed on every node
+    first, and the version is passed to ``bootstrap`` as its argument.
+    The checkout is removed again when the context exits.
+
+    :param ctx: Context
+    :param config: Configuration dictionary
+    :raises ValueError: if ``python_version`` is not a supported version
+    """
+    # use mon.a for ceph_admin
+    (ceph_admin,) = ctx.cluster.only('mon.a').remotes.iterkeys()
+
+    wants_python = 'python_version' in config
+    if wants_python:
+        py_ver = str(config['python_version'])
+        supported_versions = ['2', '3']
+        if py_ver not in supported_versions:
+            raise ValueError("python_version must be: {}, not {}".format(
+                ' or '.join(supported_versions), py_ver
+            ))
+
+        log.info("Installing Python")
+        system_type = teuthology.get_system_type(ceph_admin)
+
+        # Package name and installer differ between rpm-based systems and
+        # everything else (handled with apt-get).
+        if system_type == 'rpm':
+            package = 'python34' if py_ver == '3' else 'python'
+            installer = ['sudo', 'yum', '-y', 'install']
+        else:
+            package = 'python3' if py_ver == '3' else 'python'
+            installer = ['sudo', 'apt-get', '-y', '--force-yes', 'install']
+        ctx.cluster.run(args=installer + [package, 'python-virtualenv'])
+
+    log.info('Downloading ceph-deploy...')
+    testdir = teuthology.get_testdir(ctx)
+    ceph_deploy_branch = config.get('ceph-deploy-branch', 'master')
+    checkout_dir = '{tdir}/ceph-deploy'.format(tdir=testdir)
+
+    ceph_admin.run(
+        args=[
+            'git', 'clone', '-b', ceph_deploy_branch,
+            teuth_config.ceph_git_base_url + 'ceph-deploy.git',
+            checkout_dir,
+        ],
+    )
+
+    bootstrap_cmd = ['cd', checkout_dir, run.Raw('&&'), './bootstrap']
+    if wants_python:
+        bootstrap_cmd.append(py_ver)
+    ceph_admin.run(args=bootstrap_cmd)
+
+    try:
+        yield
+    finally:
+        log.info('Removing ceph-deploy ...')
+        ceph_admin.run(args=['rm', '-rf', checkout_dir])
+
+
+def is_healthy(ctx, config):
+    """
+    Wait until a Ceph cluster is healthy.
+
+    Runs ``ceph health`` on the first monitor's host every 10 seconds, up
+    to 90 times, and returns as soon as the first word of the output is
+    ``HEALTH_OK``. Empty output is treated as "not healthy yet". If the
+    attempts are exhausted, ``ceph report`` is run once for diagnostics
+    before failing.
+
+    :param ctx: Context
+    :param config: Configuration, passed through to the first-mon lookup
+    :raises RuntimeError: if HEALTH_OK is never reported
+    """
+    testdir = teuthology.get_testdir(ctx)
+    ceph_admin = teuthology.get_first_mon(ctx, config)
+    (remote,) = ctx.cluster.only(ceph_admin).remotes.keys()
+    max_tries = 90  # 90 tries * 10 secs --> 15 minutes
+    for _ in range(max_tries):
+        r = remote.run(
+            args=[
+                'cd',
+                '{tdir}'.format(tdir=testdir),
+                run.Raw('&&'),
+                'sudo', 'ceph',
+                'health',
+            ],
+            stdout=StringIO(),
+            logger=log.getChild('health'),
+        )
+        out = r.stdout.getvalue()
+        log.info('Ceph health: %s', out.rstrip('\n'))
+        # split() yields an empty list for blank output; guard before
+        # indexing so that case retries instead of raising IndexError.
+        words = out.split(None, 1)
+        if words and words[0] == 'HEALTH_OK':
+            return
+        time.sleep(10)
+
+    msg = "ceph health was unable to get 'HEALTH_OK' after waiting 15 minutes"
+    # Dump the full cluster report to the log before giving up.
+    remote.run(
+        args=[
+            'cd',
+            '{tdir}'.format(tdir=testdir),
+            run.Raw('&&'),
+            'sudo', 'ceph',
+            'report',
+        ],
+    )
+    raise RuntimeError(msg)
+
+
+def get_nodes_using_role(ctx, target_role):
+    """
+    Extract the names of nodes that match a given role from a cluster, and modify the
+    cluster's service IDs to match the resulting node-based naming scheme that ceph-deploy
+    uses, such that if "mon.a" is on host "foo23", it'll be renamed to "mon.foo23".
+
+    Side effects: ``ctx.cluster.remotes`` is replaced by the renamed mapping
+    and ``ctx.cluster.mapped_role`` is set to the mapping for this call
+    (original role -> renamed role).
+
+    :param ctx: Context
+    :param target_role: Role type to look for, e.g. ``'mon'`` or ``'mds'``
+    :returns: One entry per matching role: the fully qualified host name for
+              ``'mon'``, the short host name for every other role type
+    """
+
+    # Nodes containing a service of the specified role
+    nodes_of_interest = []
+
+    # Prepare a modified version of cluster.remotes with ceph-deploy-ized names
+    modified_remotes = {}
+    ceph_deploy_mapped = dict()
+    role_prefix = "{0}.".format(target_role)
+    for _remote, roles_for_host in ctx.cluster.remotes.items():
+        modified_remotes[_remote] = []
+        for svc_id in roles_for_host:
+            if svc_id.startswith(role_prefix):
+                # "user@host.domain" -> "host.domain" -> "host". Deriving
+                # the short name from the fqdn keeps working when the
+                # remote has no "user@" part or the user name has a dot.
+                fqdn = str(_remote).split('@')[-1]
+                nodename = fqdn.split('.')[0]
+                if target_role == 'mon':
+                    nodes_of_interest.append(fqdn)
+                else:
+                    nodes_of_interest.append(nodename)
+                mapped_role = "{0}.{1}".format(target_role, nodename)
+                modified_remotes[_remote].append(mapped_role)
+                # keep dict of mapped role for later use by tasks
+                # eg. mon.a => mon.node1
+                ceph_deploy_mapped[svc_id] = mapped_role
+            else:
+                modified_remotes[_remote].append(svc_id)
+
+    ctx.cluster.remotes = modified_remotes
+    ctx.cluster.mapped_role = ceph_deploy_mapped
+
+    return nodes_of_interest
+
+
+def get_dev_for_osd(ctx, config):
+    """
+    Get a list of all osd device names.
+
+    For every host, the scratch devices are assigned to its osd roles in
+    order. Without ``separate_journal_disk`` each entry is
+    ``(short_hostname, device)``; with it, devices are consumed in pairs
+    and each entry is ``(short_hostname, data_device, journal_device)``.
+    Device names are given without their directory part.
+
+    :param ctx: Context
+    :param config: Configuration dictionary
+    :returns: List of tuples as described above
+    """
+    osd_devs = []
+    for remote, roles_for_host in ctx.cluster.remotes.iteritems():
+        shortname = remote.name.split('@')[-1].split('.')[0]
+        devs = teuthology.get_scratch_devices(remote)
+        num_osds = len(list(teuthology.roles_of_type(roles_for_host, 'osd')))
+
+        if config.get('separate_journal_disk') is None:
+            # one scratch device per osd
+            assert num_osds <= len(devs), 'fewer disks than osds ' + shortname
+            for dev in devs[:num_osds]:
+                osd_devs.append((shortname, dev.split('/')[-1]))
+            continue
+
+        # two scratch devices per osd: even slots hold data, odd slots
+        # hold the matching journal
+        num_devs_reqd = 2 * num_osds
+        assert num_devs_reqd <= len(devs), \
+            'fewer data and journal disks than required ' + shortname
+        data_devs = devs[0:num_devs_reqd:2]
+        journal_devs = devs[1:num_devs_reqd:2]
+        for data_dev, journal_dev in zip(data_devs, journal_devs):
+            osd_devs.append((shortname,
+                             data_dev.split('/')[-1],
+                             journal_dev.split('/')[-1]))
+    return osd_devs
+
+
+def get_all_nodes(ctx, config):
+    """
+    Return a string of node names separated by blanks.
+
+    The names are taken from the keys of ``ctx.config['targets']``; any
+    ``user@`` prefix and everything after the first dot of the host name
+    are dropped.
+
+    :param ctx: Context
+    :param config: Configuration (unused)
+    :returns: Short host names joined by single spaces ('' if no targets)
+    """
+    short_names = []
+    # Only the target names (the dict keys) are needed here.
+    for target in ctx.config['targets']:
+        host = target.split('@')[-1]
+        short_names.append(host.split('.')[0])
+    return " ".join(short_names)
+
+
+@contextlib.contextmanager
+def build_ceph_cluster(ctx, config):
+    """Build a ceph cluster"""
+
+    # Expect to find ceph_admin on the first mon by ID, same place that the download task
+    # puts it.  Remember this here, because subsequently IDs will change from those in
+    # the test config to those that ceph-deploy invents.
+
+    (ceph_admin,) = ctx.cluster.only('mon.a').remotes.iterkeys()
+
+    def execute_ceph_deploy(cmd):
+        """Remotely execute a ceph_deploy command"""
+        return ceph_admin.run(
+            args=[
+                'cd',
+                '{tdir}/ceph-deploy'.format(tdir=testdir),
+                run.Raw('&&'),
+                run.Raw(cmd),
+            ],
+            check_status=False,
+        ).exitstatus
+
+    try:
+        log.info('Building ceph cluster using ceph-deploy...')
+        testdir = teuthology.get_testdir(ctx)
+        ceph_branch = None
+        if config.get('branch') is not None:
+            cbranch = config.get('branch')
+            for var, val in cbranch.iteritems():
+                ceph_branch = '--{var}={val}'.format(var=var, val=val)
+        all_nodes = get_all_nodes(ctx, config)
+        mds_nodes = get_nodes_using_role(ctx, 'mds')
+        mds_nodes = " ".join(mds_nodes)
+        mon_node = get_nodes_using_role(ctx, 'mon')
+        mon_nodes = " ".join(mon_node)
+        # skip mgr based on config item
+        # this is needed when test uses latest code to install old ceph
+        # versions
+        skip_mgr = config.get('skip-mgr', False)
+        if not skip_mgr:
+            mgr_nodes = get_nodes_using_role(ctx, 'mgr')
+            mgr_nodes = " ".join(mgr_nodes)
+        new_mon = './ceph-deploy new' + " " + mon_nodes
+        if not skip_mgr:
+            mgr_create = './ceph-deploy mgr create' + " " + mgr_nodes
+        mon_hostname = mon_nodes.split(' ')[0]
+        mon_hostname = str(mon_hostname)
+        gather_keys = './ceph-deploy gatherkeys' + " " + mon_hostname
+        deploy_mds = './ceph-deploy mds create' + " " + mds_nodes
+        no_of_osds = 0
+
+        if mon_nodes is None:
+            raise RuntimeError("no monitor nodes in the config file")
+
+        estatus_new = execute_ceph_deploy(new_mon)
+        if estatus_new != 0:
+            raise RuntimeError("ceph-deploy: new command failed")
+
+        log.info('adding config inputs...')
+        testdir = teuthology.get_testdir(ctx)
+        conf_path = '{tdir}/ceph-deploy/ceph.conf'.format(tdir=testdir)
+
+        if config.get('conf') is not None:
+            confp = config.get('conf')
+            for section, keys in confp.iteritems():
+                lines = '[{section}]\n'.format(section=section)
+                teuthology.append_lines_to_file(ceph_admin, conf_path, lines,
+                                                sudo=True)
+                for key, value in keys.iteritems():
+                    log.info("[%s] %s = %s" % (section, key, value))
+                    lines = '{key} = {value}\n'.format(key=key, value=value)
+                    teuthology.append_lines_to_file(
+                        ceph_admin, conf_path, lines, sudo=True)
+
+        # install ceph
+        dev_branch = ctx.config['branch']
+        branch = '--dev={branch}'.format(branch=dev_branch)
+        if ceph_branch:
+            option = ceph_branch
+        else:
+            option = branch
+        install_nodes = './ceph-deploy install ' + option + " " + all_nodes
+        estatus_install = execute_ceph_deploy(install_nodes)
+        if estatus_install != 0:
+            raise RuntimeError("ceph-deploy: Failed to install ceph")
+        # install ceph-test package too
+        install_nodes2 = './ceph-deploy install --tests ' + option + \
+                         " " + all_nodes
+        estatus_install = execute_ceph_deploy(install_nodes2)
+        if estatus_install != 0:
+            raise RuntimeError("ceph-deploy: Failed to install ceph-test")
+
+        mon_create_nodes = './ceph-deploy mon create-initial'
+        # If the following fails, it is OK, it might just be that the monitors
+        # are taking way more than a minute/monitor to form quorum, so lets
+        # try the next block which will wait up to 15 minutes to gatherkeys.
+        execute_ceph_deploy(mon_create_nodes)
+
+        # create-keys is explicit now
+        # http://tracker.ceph.com/issues/16036
+        mons = ctx.cluster.only(teuthology.is_type('mon'))
+        for remote in mons.remotes.iterkeys():
+            remote.run(args=['sudo', 'ceph-create-keys', '--cluster', 'ceph',
+                             '--id', remote.shortname])
+
+        estatus_gather = execute_ceph_deploy(gather_keys)
+
+        if not skip_mgr:
+            execute_ceph_deploy(mgr_create)
+
+        if mds_nodes:
+            estatus_mds = execute_ceph_deploy(deploy_mds)
+            if estatus_mds != 0:
+                raise RuntimeError("ceph-deploy: Failed to deploy mds")
+
+        if config.get('test_mon_destroy') is not None:
+            for d in range(1, len(mon_node)):
+                mon_destroy_nodes = './ceph-deploy mon destroy' + \
+                    " " + mon_node[d]
+                estatus_mon_d = execute_ceph_deploy(mon_destroy_nodes)
+                if estatus_mon_d != 0:
+                    raise RuntimeError("ceph-deploy: Failed to delete monitor")
+
+        node_dev_list = get_dev_for_osd(ctx, config)
+        for d in node_dev_list:
+            node = d[0]
+            for disk in d[1:]:
+                zap = './ceph-deploy disk zap ' + node + ':' + disk
+                estatus = execute_ceph_deploy(zap)
+                if estatus != 0:
+                    raise RuntimeError("ceph-deploy: Failed to zap osds")
+            osd_create_cmd = './ceph-deploy osd create '
+            # first check for filestore, default is bluestore with ceph-deploy
+            if config.get('filestore') is not None:
+                osd_create_cmd += '--filestore '
+            elif config.get('bluestore') is not None:
+                osd_create_cmd += '--bluestore '
+            if config.get('dmcrypt') is not None:
+                osd_create_cmd += '--dmcrypt '
+            osd_create_cmd += ":".join(d)
+            estatus_osd = execute_ceph_deploy(osd_create_cmd)
+            if estatus_osd == 0:
+                log.info('successfully created osd')
+                no_of_osds += 1
+            else:
+                raise RuntimeError("ceph-deploy: Failed to create osds")
+
+        if config.get('wait-for-healthy', True) and no_of_osds >= 2:
+            is_healthy(ctx=ctx, config=None)
+
+            log.info('Setting up client nodes...')
+            conf_path = '/etc/ceph/ceph.conf'
+            admin_keyring_path = '/etc/ceph/ceph.client.admin.keyring'
+            first_mon = teuthology.get_first_mon(ctx, config)
+            (mon0_remote,) = ctx.cluster.only(first_mon).remotes.keys()
+            conf_data = teuthology.get_file(
+                remote=mon0_remote,
+                path=conf_path,
+                sudo=True,
+            )
+            admin_keyring = teuthology.get_file(
+                remote=mon0_remote,
+                path=admin_keyring_path,
+                sudo=True,
+            )
+
+            clients = ctx.cluster.only(teuthology.is_type('client'))
+            for remot, roles_for_host in clients.remotes.iteritems():
+                for id_ in teuthology.roles_of_type(roles_for_host, 'client'):
+                    client_keyring = \
+                        '/etc/ceph/ceph.client.{id}.keyring'.format(id=id_)
+                    mon0_remote.run(
+                        args=[
+                            'cd',
+                            '{tdir}'.format(tdir=testdir),
+                            run.Raw('&&'),
+                            'sudo', 'bash', '-c',
+                            run.Raw('"'), 'ceph',
+                            'auth',
+                            'get-or-create',
+                            'client.{id}'.format(id=id_),
+                            'mds', 'allow',
+                            'mon', 'allow *',
+                            'osd', 'allow *',
+                            run.Raw('>'),
+                            client_keyring,
+                            run.Raw('"'),
+                        ],
+                    )
+                    key_data = teuthology.get_file(
+                        remote=mon0_remote,
+                        path=client_keyring,
+                        sudo=True,
+                    )
+                    teuthology.sudo_write_file(
+                        remote=remot,
+                        path=client_keyring,
+                        data=key_data,
+                        perms='0644'
+                    )
+                    teuthology.sudo_write_file(
+                        remote=remot,
+                        path=admin_keyring_path,
+                        data=admin_keyring,
+                        perms='0644'
+                    )
+                    teuthology.sudo_write_file(
+                        remote=remot,
+                        path=conf_path,
+                        data=conf_data,
+                        perms='0644'
+                    )
+
+            if mds_nodes:
+                log.info('Configuring CephFS...')
+                Filesystem(ctx, create=True)
+        elif not config.get('only_mon'):
+            raise RuntimeError(
+                "The cluster is NOT operational due to insufficient OSDs")
+        yield
+
+    except Exception:
+        log.info(
+            "Error encountered, logging exception before tearing down ceph-deploy")
+        log.info(traceback.format_exc())
+        raise
+    finally:
+        if config.get('keep_running'):
+            return
+        log.info('Stopping ceph...')
+        ctx.cluster.run(args=['sudo', 'stop', 'ceph-all', run.Raw('||'),
+                              'sudo', 'service', 'ceph', 'stop', run.Raw('||'),
+                              'sudo', 'systemctl', 'stop', 'ceph.target'])
+
+        # Are you really not running anymore?
+        # try first with the init tooling
+        # ignoring the status so this becomes informational only
+        ctx.cluster.run(
+            args=[
+                'sudo', 'status', 'ceph-all', run.Raw('||'),
+                'sudo', 'service', 'ceph', 'status', run.Raw('||'),
+                'sudo', 'systemctl', 'status', 'ceph.target'],
+            check_status=False)
+
+        # and now just check for the processes themselves, as if upstart/sysvinit
+        # is lying to us. Ignore errors if the grep fails
+        ctx.cluster.run(args=['sudo', 'ps', 'aux', run.Raw('|'),
+                              'grep', '-v', 'grep', run.Raw('|'),
+                              'grep', 'ceph'], check_status=False)
+
+        if ctx.archive is not None:
+            # archive mon data, too
+            log.info('Archiving mon data...')
+            path = os.path.join(ctx.archive, 'data')
+            os.makedirs(path)
+            mons = ctx.cluster.only(teuthology.is_type('mon'))
+            for remote, roles in mons.remotes.iteritems():
+                for role in roles:
+                    if role.startswith('mon.'):
+                        teuthology.pull_directory_tarball(
+                            remote,
+                            '/var/lib/ceph/mon',
+                            path + '/' + role + '.tgz')
+
+            log.info('Compressing logs...')
+            run.wait(
+                ctx.cluster.run(
+                    args=[
+                        'sudo',
+                        'find',
+                        '/var/log/ceph',
+                        '-name',
+                        '*.log',
+                        '-print0',
+                        run.Raw('|'),
+                        'sudo',
+                        'xargs',
+                        '-0',
+                        '--no-run-if-empty',
+                        '--',
+                        'gzip',
+                        '--',
+                    ],
+                    wait=False,
+                ),
+            )
+
+            log.info('Archiving logs...')
+            path = os.path.join(ctx.archive, 'remote')
+            os.makedirs(path)
+            for remote in ctx.cluster.remotes.iterkeys():
+                sub = os.path.join(path, remote.shortname)
+                os.makedirs(sub)
+                teuthology.pull_directory(remote, '/var/log/ceph',
+                                          os.path.join(sub, 'log'))
+
+        # Prevent these from being undefined if the try block fails
+        all_nodes = get_all_nodes(ctx, config)
+        purge_nodes = './ceph-deploy purge' + " " + all_nodes
+        purgedata_nodes = './ceph-deploy purgedata' + " " + all_nodes
+
+        log.info('Purging package...')
+        execute_ceph_deploy(purge_nodes)
+        log.info('Purging data...')
+        execute_ceph_deploy(purgedata_nodes)
+
+
+@contextlib.contextmanager
+def cli_test(ctx, config):
+    """
+    Exercise the most commonly used ceph-deploy cli commands and make
+    sure they all work, including startup through the init system.
+
+    Before yielding this creates a new cluster config, installs the
+    mon/mgr/osd/cli packages, creates the initial monitor, pushes the
+    admin keys, zaps and prepares three scratch devices, polls
+    ``ceph health`` until it reports HEALTH_OK and finally installs and
+    creates an rgw.  After the yield it stops ceph, unmounts the three
+    devices, purges packages and data and removes the working directory.
+
+    :param ctx: Context
+    :param config: Configuration; ``rhbuild`` switches to the packaged
+                   ceph-deploy and ``conf`` holds sections/keys that are
+                   appended to the generated ceph.conf
+    :raises RuntimeError: if an osd node has fewer than 3 scratch
+                          devices or a ceph-deploy command fails
+    """
+    log.info('Ceph-deploy Test')
+    if config is None:
+        config = {}
+    test_branch = ''
+    conf_dir = teuthology.get_testdir(ctx) + "/cdtest"
+
+    def execute_cdeploy(admin, cmd, path):
+        """
+        Execute a ceph-deploy command from inside conf_dir.
+
+        Uses ``{path}/ceph-deploy/ceph-deploy`` when path is set and the
+        bare ``ceph-deploy`` command otherwise.
+
+        :raises RuntimeError: if the command exits with a non-zero status
+        """
+        args = ['cd', conf_dir, run.Raw(';')]
+        if path:
+            args.append('{path}/ceph-deploy/ceph-deploy'.format(path=path))
+        else:
+            args.append('ceph-deploy')
+        args.append(run.Raw(cmd))
+        ec = admin.run(args=args, check_status=False).exitstatus
+        if ec != 0:
+            raise RuntimeError(
+                "failed during ceph-deploy cmd: {cmd} , ec={ec}".format(cmd=cmd, ec=ec))
+
+    if config.get('rhbuild'):
+        path = None
+    else:
+        path = teuthology.get_testdir(ctx)
+        # test on branch from config eg: wip-* , master or next etc
+        # packages for all distro's should exist for wip*
+        if ctx.config.get('branch'):
+            branch = ctx.config.get('branch')
+            test_branch = ' --dev={branch} '.format(branch=branch)
+    mons = ctx.cluster.only(teuthology.is_type('mon'))
+    # the working dir is created on every mon; the mon visited last ends
+    # up being the admin node used for the rest of the test
+    for node, role in mons.remotes.iteritems():
+        admin = node
+        admin.run(args=['mkdir', conf_dir], check_status=False)
+        nodename = admin.shortname
+    system_type = teuthology.get_system_type(admin)
+    if config.get('rhbuild'):
+        admin.run(args=['sudo', 'yum', 'install', 'ceph-deploy', '-y'])
+    log.info('system type is %s', system_type)
+    osds = ctx.cluster.only(teuthology.is_type('osd'))
+
+    # NOTE: `remote` and `devs` keep the values of the last osd node
+    # visited and are used below for the health checks and the cleanup
+    for remote, roles in osds.remotes.iteritems():
+        devs = teuthology.get_scratch_devices(remote)
+        log.info("roles %s", roles)
+        if (len(devs) < 3):
+            log.error(
+                'Test needs minimum of 3 devices, only found %s',
+                str(devs))
+            raise RuntimeError("Needs minimum of 3 devices ")
+
+    conf_path = '{conf_dir}/ceph.conf'.format(conf_dir=conf_dir)
+    new_cmd = 'new ' + nodename
+    execute_cdeploy(admin, new_cmd, path)
+    if config.get('conf') is not None:
+        confp = config.get('conf')
+        for section, keys in confp.iteritems():
+            lines = '[{section}]\n'.format(section=section)
+            teuthology.append_lines_to_file(admin, conf_path, lines,
+                                            sudo=True)
+            for key, value in keys.iteritems():
+                log.info("[%s] %s = %s" % (section, key, value))
+                lines = '{key} = {value}\n'.format(key=key, value=value)
+                teuthology.append_lines_to_file(admin, conf_path, lines,
+                                                sudo=True)
+    new_mon_install = 'install {branch} --mon '.format(
+        branch=test_branch) + nodename
+    new_mgr_install = 'install {branch} --mgr '.format(
+        branch=test_branch) + nodename
+    new_osd_install = 'install {branch} --osd '.format(
+        branch=test_branch) + nodename
+    new_admin = 'install {branch} --cli '.format(branch=test_branch) + nodename
+    create_initial = 'mon create-initial '
+    # either use create-keys or push command
+    push_keys = 'admin ' + nodename
+    execute_cdeploy(admin, new_mon_install, path)
+    execute_cdeploy(admin, new_mgr_install, path)
+    execute_cdeploy(admin, new_osd_install, path)
+    execute_cdeploy(admin, new_admin, path)
+    execute_cdeploy(admin, create_initial, path)
+    execute_cdeploy(admin, push_keys, path)
+
+    for i in range(3):
+        zap_disk = 'disk zap ' + "{n}:{d}".format(n=nodename, d=devs[i])
+        prepare = 'osd prepare ' + "{n}:{d}".format(n=nodename, d=devs[i])
+        execute_cdeploy(admin, zap_disk, path)
+        execute_cdeploy(admin, prepare, path)
+
+    log.info("list files for debugging purpose to check file permissions")
+    admin.run(args=['ls', run.Raw('-lt'), conf_dir])
+    remote.run(args=['sudo', 'ceph', '-s'], check_status=False)
+    r = remote.run(args=['sudo', 'ceph', 'health'], stdout=StringIO())
+    out = r.stdout.getvalue()
+    log.info('Ceph health: %s', out.rstrip('\n'))
+    log.info("Waiting for cluster to become healthy")
+    with contextutil.safe_while(sleep=10, tries=6,
+                                action='check health') as proceed:
+        while proceed():
+            r = remote.run(args=['sudo', 'ceph', 'health'], stdout=StringIO())
+            out = r.stdout.getvalue()
+            status = out.split(None, 1)
+            # empty output carries no verdict: keep polling rather than
+            # dying with an IndexError on status[0]
+            if status and status[0] == 'HEALTH_OK':
+                break
+    rgw_install = 'install {branch} --rgw {node}'.format(
+        branch=test_branch,
+        node=nodename,
+    )
+    rgw_create = 'rgw create ' + nodename
+    execute_cdeploy(admin, rgw_install, path)
+    execute_cdeploy(admin, rgw_create, path)
+    log.info('All ceph-deploy cli tests passed')
+    try:
+        yield
+    finally:
+        log.info("cleaning up")
+        ctx.cluster.run(args=['sudo', 'stop', 'ceph-all', run.Raw('||'),
+                              'sudo', 'service', 'ceph', 'stop', run.Raw('||'),
+                              'sudo', 'systemctl', 'stop', 'ceph.target'],
+                        check_status=False)
+        time.sleep(4)
+        for i in range(3):
+            # the first partition of each prepared device is unmounted
+            umount_dev = "{d}1".format(d=devs[i])
+            remote.run(args=['sudo', 'umount', run.Raw(umount_dev)])
+        cmd = 'purge ' + nodename
+        execute_cdeploy(admin, cmd, path)
+        cmd = 'purgedata ' + nodename
+        execute_cdeploy(admin, cmd, path)
+        log.info("Removing temporary dir")
+        admin.run(
+            args=[
+                'rm',
+                run.Raw('-rf'),
+                run.Raw(conf_dir)],
+            check_status=False)
+        if config.get('rhbuild'):
+            admin.run(args=['sudo', 'yum', 'remove', 'ceph-deploy', '-y'])
+
+
+@contextlib.contextmanager
+def single_node_test(ctx, config):
+    """
+    Run the ceph-deploy cli test against a single node.
+
+    Task usage::
+
+        - ceph-deploy.single_node_test: null
+
+        #rhbuild testing
+        - ceph-deploy.single_node_test:
+            rhbuild: 1.2.3
+
+    With ``rhbuild`` set only cli_test is run; otherwise the utilities
+    are shipped and ceph-deploy is downloaded first.  Overrides found
+    under ``overrides: ceph-deploy`` are merged into config.
+
+    :param ctx: Context
+    :param config: Configuration (may be None)
+    """
+    log.info("Testing ceph-deploy on single node")
+    config = {} if config is None else config
+    teuthology.deep_merge(
+        config, ctx.config.get('overrides', {}).get('ceph-deploy', {}))
+
+    if config.get('rhbuild'):
+        log.info("RH Build, Skip Download")
+        stages = [
+            lambda: cli_test(ctx=ctx, config=config),
+        ]
+    else:
+        stages = [
+            lambda: install_fn.ship_utilities(ctx=ctx, config=None),
+            lambda: download_ceph_deploy(ctx=ctx, config=config),
+            lambda: cli_test(ctx=ctx, config=config),
+        ]
+
+    # the stages are entered in order and unwound in reverse by nested()
+    with contextutil.nested(*stages):
+        yield
+
+
+@contextlib.contextmanager
+def upgrade(ctx, config):
+    """
+    Upgrade using ceph-deploy
+    eg:
+      ceph-deploy.upgrade:
+         # to upgrade to specific branch, use
+         branch:
+            stable: jewel
+         # to setup mgr node, use
+         setup-mgr-node: True
+         # to wait for cluster to be healthy after all upgrade, use
+         wait-for-healthy: True
+         roles: (upgrades the below roles serially)
+            mon.a
+            mon.b
+            osd.0
+
+    Each node holding one of the listed roles gets
+    ``./ceph-deploy install <branch> <node>`` run from the admin node
+    (the node holding mon.a), followed by a restart of ceph.target on
+    the upgraded node.
+
+    :param ctx: Context
+    :param config: Configuration, see the example above
+    """
+    roles = config.get('roles')
+    # get the roles that are mapped as per ceph-deploy
+    # roles are mapped for mon/mds eg: mon.a  => mon.host_short_name
+    mapped_role = ctx.cluster.mapped_role
+    if config.get('branch'):
+        branch = config.get('branch')
+        # only the first key/value pair of the branch dict is used
+        (var, val) = branch.items()[0]
+        ceph_branch = '--{var}={val}'.format(var=var, val=val)
+    else:
+        # default to master
+        ceph_branch = '--dev=master'
+    # get the node used for initial deployment which is mon.a
+    mon_a = mapped_role.get('mon.a')
+    (ceph_admin,) = ctx.cluster.only(mon_a).remotes.iterkeys()
+    testdir = teuthology.get_testdir(ctx)
+    deploy_dir = '{tdir}/ceph-deploy'.format(tdir=testdir)
+    install_cmd = './ceph-deploy install ' + ceph_branch
+    for role in roles:
+        # check if this role is mapped (mon or mds)
+        if mapped_role.get(role):
+            role = mapped_role.get(role)
+        remotes_and_roles = ctx.cluster.only(role).remotes
+        for remote in remotes_and_roles.iterkeys():
+            nodename = remote.shortname
+            # build the command per node: appending to a shared string
+            # would re-run the install on every node handled so far
+            cmd = install_cmd + ' ' + nodename
+            log.info("Upgrading ceph on  %s", nodename)
+            ceph_admin.run(
+                args=[
+                    'cd',
+                    deploy_dir,
+                    run.Raw('&&'),
+                    run.Raw(cmd),
+                ],
+            )
+            # restart all ceph services, ideally upgrade should but it does not
+            remote.run(
+                args=[
+                    'sudo', 'systemctl', 'restart', 'ceph.target'
+                ]
+            )
+            ceph_admin.run(args=['sudo', 'ceph', '-s'])
+
+    if config.get('setup-mgr-node', None):
+        # workaround for http://tracker.ceph.com/issues/20950
+        # write the correct mgr key to disk
+        mons = ctx.cluster.only(teuthology.is_type('mon'))
+        for remote in mons.remotes.iterkeys():
+            remote.run(
+                args=[
+                    run.Raw('sudo ceph auth get client.bootstrap-mgr'),
+                    run.Raw('|'),
+                    run.Raw('sudo tee'),
+                    run.Raw('/var/lib/ceph/bootstrap-mgr/ceph.keyring')
+                ]
+            )
+
+        mgr_nodes = get_nodes_using_role(ctx, 'mgr')
+        mgr_nodes = " ".join(mgr_nodes)
+        mgr_install = './ceph-deploy install --mgr ' + ceph_branch + " " + mgr_nodes
+        mgr_create = './ceph-deploy mgr create' + " " + mgr_nodes
+        # install mgr
+        ceph_admin.run(
+            args=[
+                'cd',
+                deploy_dir,
+                run.Raw('&&'),
+                run.Raw(mgr_install),
+            ],
+        )
+        # create mgr
+        ceph_admin.run(
+            args=[
+                'cd',
+                deploy_dir,
+                run.Raw('&&'),
+                run.Raw(mgr_create),
+            ],
+        )
+        ceph_admin.run(args=['sudo', 'ceph', '-s'])
+    if config.get('wait-for-healthy', None):
+        wait_until_healthy(ctx, ceph_admin, use_sudo=True)
+    yield
+
+
+@contextlib.contextmanager
+def task(ctx, config):
+    """
+    Deploy a Ceph cluster with ceph-deploy and tear it down afterwards.
+
+    Ships the utilities, downloads ceph-deploy and builds the cluster,
+    in that order.  Overrides found under ``overrides: ceph-deploy`` are
+    merged into config first.
+
+    For example::
+
+        tasks:
+        - install:
+             extras: yes
+        - ssh_keys:
+        - ceph-deploy:
+             branch:
+                stable: bobtail
+             mon_initial_members: 1
+             ceph-deploy-branch: my-ceph-deploy-branch
+             only_mon: true
+             keep_running: true
+             # either choose bluestore or filestore, default is bluestore
+             bluestore: True
+             # or
+             filestore: True
+             # skip install of mgr for old release using below flag
+             skip-mgr: True  ( default is False )
+
+        tasks:
+        - install:
+             extras: yes
+        - ssh_keys:
+        - ceph-deploy:
+             branch:
+                dev: master
+             conf:
+                mon:
+                   debug mon = 20
+
+        tasks:
+        - install:
+             extras: yes
+        - ssh_keys:
+        - ceph-deploy:
+             branch:
+                testing:
+             dmcrypt: yes
+             separate_journal_disk: yes
+
+    :param ctx: Context
+    :param config: dict of options (or None); ``branch``, when given,
+                   must itself be a dict
+    """
+    config = {} if config is None else config
+
+    assert isinstance(config, dict), \
+        "task ceph-deploy only supports a dictionary for configuration"
+
+    teuthology.deep_merge(
+        config, ctx.config.get('overrides', {}).get('ceph-deploy', {}))
+
+    branch = config.get('branch')
+    if branch is not None:
+        assert isinstance(branch, dict), 'branch must be a dictionary'
+
+    log.info('task ceph-deploy with config ' + str(config))
+
+    stages = (
+        lambda: install_fn.ship_utilities(ctx=ctx, config=None),
+        lambda: download_ceph_deploy(ctx=ctx, config=config),
+        lambda: build_ceph_cluster(ctx=ctx, config=config),
+    )
+    with contextutil.nested(*stages):
+        yield
diff --git a/src/ceph/qa/tasks/ceph_fuse.py b/src/ceph/qa/tasks/ceph_fuse.py
new file mode 100644
index 0000000..c9d8354
--- /dev/null
+++ b/src/ceph/qa/tasks/ceph_fuse.py
@@ -0,0 +1,145 @@
+"""
+Ceph FUSE client task
+"""
+
+import contextlib
+import logging
+
+from teuthology import misc as teuthology
+from cephfs.fuse_mount import FuseMount
+
+log = logging.getLogger(__name__)
+
+
+def get_client_configs(ctx, config):
+    """
+    Build the per-client configuration map for the ceph-fuse task.
+
+    A missing config selects every client role of the cluster and a list
+    config selects the named clients; in both cases each client maps to
+    None.  The ``ceph-fuse`` section of the global overrides is then
+    merged into the result.
+
+    :param ctx: Context instance
+    :param config: configuration for this task (None, list or dict)
+    :return: dict of client name to config or to None
+    """
+    if config is None:
+        client_ids = teuthology.all_roles_of_type(ctx.cluster, 'client')
+        config = dict.fromkeys(
+            ['client.{id}'.format(id=id_) for id_ in client_ids])
+    elif isinstance(config, list):
+        config = dict.fromkeys(config)
+
+    fuse_overrides = ctx.config.get('overrides', {}).get('ceph-fuse', {})
+    teuthology.deep_merge(config, fuse_overrides)
+
+    return config
+
+
+@contextlib.contextmanager
+def task(ctx, config):
+    """
+    Mount ``ceph-fuse`` clients for the duration of the task and unmount
+    them afterwards.
+
+    The config is optional; without one every client is mounted.  A list
+    config names the clients to operate on, which lets you e.g. set up
+    one client with ``ceph-fuse`` and another with ``kclient``.
+
+    Example that mounts all clients::
+
+        tasks:
+        - ceph:
+        - ceph-fuse:
+        - interactive:
+
+    Example that uses both ``kclient`` and ``ceph-fuse``::
+
+        tasks:
+        - ceph:
+        - ceph-fuse: [client.0]
+        - kclient: [client.1]
+        - interactive:
+
+    Example that enables valgrind::
+
+        tasks:
+        - ceph:
+        - ceph-fuse:
+            client.0:
+              valgrind: [--tool=memcheck, --leak-check=full, --show-reachable=yes]
+        - interactive:
+
+    Example that stops an already-mounted client::
+
+        tasks:
+            - ceph:
+            - ceph-fuse: [client.0]
+            - ... do something that requires the FS mounted ...
+            - ceph-fuse:
+                client.0:
+                    mounted: false
+            - ... do something that requires the FS unmounted ...
+
+    Example that adds more generous wait time for mount (for virtual
+    machines)::
+
+        tasks:
+        - ceph:
+        - ceph-fuse:
+            client.0:
+              mount_wait: 60 # default is 0, do not wait before checking /sys/
+              mount_timeout: 120 # default is 30, give up if /sys/ is not populated
+        - interactive:
+
+    :param ctx: Context
+    :param config: Configuration
+    """
+    log.info('Mounting ceph-fuse clients...')
+
+    testdir = teuthology.get_testdir(ctx)
+    config = get_client_configs(ctx, config)
+
+    # Only keys naming a client are mount candidates, default is all clients
+    client_roles = [name for name in config.keys() if 'client.' in name]
+    clients = list(teuthology.get_clients(ctx=ctx, roles=client_roles))
+
+    # Mounts registered on the context by earlier tasks, keyed by client id
+    all_mounts = getattr(ctx, 'mounts', {})
+    mounted_by_me = {}
+
+    for id_, remote in clients:
+        client_config = config.get("client.%s" % id_)
+        if client_config is None:
+            client_config = {}
+
+        if id_ in all_mounts:
+            # A non-FuseMount here is a bad config, e.g. ceph-fuse and
+            # kcephfs requested for the same client
+            assert isinstance(all_mounts[id_], FuseMount)
+        else:
+            all_mounts[id_] = FuseMount(client_config, testdir, id_, remote)
+
+        if not config.get("disabled", False) and client_config.get('mounted', True):
+            mounted_by_me[id_] = all_mounts[id_]
+
+    ctx.mounts = all_mounts
+
+    # Start every requested mount first, then wait for all of them
+    for fuse_mount in mounted_by_me.values():
+        fuse_mount.mount()
+
+    for fuse_mount in mounted_by_me.values():
+        fuse_mount.wait_until_mounted()
+
+    # Pre-existing clients we were not asked to mount get unmounted
+    for client_id in set(all_mounts) - set(mounted_by_me):
+        stale_mount = all_mounts[client_id]
+        if stale_mount.is_mounted():
+            stale_mount.umount_wait()
+
+    try:
+        yield all_mounts
+    finally:
+        log.info('Unmounting ceph-fuse clients...')
+
+        for fuse_mount in mounted_by_me.values():
+            # An inner context might already have unmounted it
+            if fuse_mount.is_mounted():
+                fuse_mount.umount_wait()
diff --git a/src/ceph/qa/tasks/ceph_manager.py b/src/ceph/qa/tasks/ceph_manager.py
new file mode 100644
index 0000000..5a89f23
--- /dev/null
+++ b/src/ceph/qa/tasks/ceph_manager.py
@@ -0,0 +1,2592 @@
+"""
+ceph manager -- Thrasher and CephManager objects
+"""
+from cStringIO import StringIO
+from functools import wraps
+import contextlib
+import random
+import signal
+import time
+import gevent
+import base64
+import json
+import logging
+import threading
+import traceback
+import os
+from teuthology import misc as teuthology
+from tasks.scrub import Scrubber
+from util.rados import cmd_erasure_code_profile
+from util import get_remote
+from teuthology.contextutil import safe_while
+from teuthology.orchestra.remote import Remote
+from teuthology.orchestra import run
+from teuthology.exceptions import CommandFailedError
+
+# Use subprocess.DEVNULL where the interpreter provides it; otherwise fall
+# back to a read/write handle on os.devnull kept open at module level.
+try:
+    from subprocess import DEVNULL # py3k
+except ImportError:
+    DEVNULL = open(os.devnull, 'r+')
+
+# Default destination used by write_conf() for the cluster configuration.
+DEFAULT_CONF_PATH = '/etc/ceph/ceph.conf'
+
+# Module-level logger named after this module.
+log = logging.getLogger(__name__)
+
+
+def write_conf(ctx, conf_path=DEFAULT_CONF_PATH, cluster='ceph'):
+    """
+    Push the in-memory configuration of a cluster out through
+    ``ctx.cluster.run``.
+
+    The configuration is serialized once and fed to the stdin of a remote
+    command that creates /etc/ceph (mode 0755), copies stdin into
+    conf_path and sets the file to mode 0644.
+
+    :param ctx: Context
+    :param conf_path: destination path of the configuration file
+    :param cluster: name of the ceph cluster whose conf is written
+    """
+    serialized = StringIO()
+    ctx.ceph[cluster].conf.write(serialized)
+    serialized.seek(0)
+
+    # remote one-liner that copies its stdin into the file named by argv[1]
+    copy_stdin = ('import shutil, sys; '
+                  'shutil.copyfileobj(sys.stdin, file(sys.argv[1], "wb"))')
+    cmd = ['sudo', 'mkdir', '-p', '/etc/ceph', run.Raw('&&')]
+    cmd += ['sudo', 'chmod', '0755', '/etc/ceph', run.Raw('&&')]
+    cmd += ['sudo', 'python', '-c', copy_stdin, conf_path, run.Raw('&&')]
+    cmd += ['sudo', 'chmod', '0644', conf_path]
+
+    # start without waiting so the same data can be fed to every stdin
+    procs = ctx.cluster.run(args=cmd, stdin=run.PIPE, wait=False)
+    teuthology.feed_many_stdins_and_close(serialized, procs)
+    run.wait(procs)
+
+
+def mount_osd_data(ctx, remote, cluster, osd):
+    """
+    Mount the data device of an OSD on its remote.
+
+    The device, filesystem type and mount options are looked up in
+    ``ctx.disk_config`` under the role ``<cluster>.osd.<osd>``; for the
+    cluster named 'ceph' the short role ``osd.<osd>`` is preferred when
+    present.  Nothing is mounted when the remote or the role has no
+    entry.
+
+    :param ctx: Context
+    :param remote: Remote site
+    :param cluster: name of ceph cluster
+    :param osd: Osd name
+    """
+    log.debug('Mounting data for osd.{o} on {r}'.format(o=osd, r=remote))
+    disk_config = ctx.disk_config
+    if remote not in disk_config.remote_to_roles_to_dev:
+        return
+    roles_to_dev = disk_config.remote_to_roles_to_dev[remote]
+
+    role = "{0}.osd.{1}".format(cluster, osd)
+    alt_role = role if cluster != 'ceph' else "osd.{0}".format(osd)
+    if alt_role in roles_to_dev:
+        role = alt_role
+    if role not in roles_to_dev:
+        return
+
+    dev = roles_to_dev[role]
+    mount_options = disk_config.\
+        remote_to_roles_to_dev_mount_options[remote][role]
+    fstype = disk_config.remote_to_roles_to_dev_fstype[remote][role]
+    mnt = os.path.join('/var/lib/ceph/osd', '{0}-{1}'.format(cluster, osd))
+
+    # the two fragments need an explicit ', ' between them, otherwise the
+    # message runs together as "cluster: <name>mountpoint: ..."
+    # NOTE(review): the "dev:" field is filled with remote.name, not with
+    # the device path - confirm which one is intended
+    log.info('Mounting osd.{o}: dev: {n}, cluster: {c}, '
+             'mountpoint: {p}, type: {t}, options: {v}'.format(
+                 o=osd, n=remote.name, p=mnt, t=fstype, v=mount_options,
+                 c=cluster))
+
+    remote.run(
+        args=[
+            'sudo',
+            'mount',
+            '-t', fstype,
+            '-o', ','.join(mount_options),
+            dev,
+            mnt,
+        ]
+        )
+
+
+class Thrasher:
+    """
+    Object used to thrash Ceph
+    """
+    def __init__(self, manager, config, logger=None):
+        """
+        Snapshot the osd state of the cluster, read the thrashing options
+        and start the thrashing via gevent.spawn.
+
+        Waits for the cluster to be clean first, then injects
+        ``mon_osd_down_out_interval = 0`` into all mons (remembering the
+        previous value in ``saved_options``) before do_thrash is spawned.
+
+        :param manager: cluster manager used to inspect and drive the
+                        cluster
+        :param config: dict of thrasher options; None means no options
+        :param logger: optional logger; without one messages are printed
+        """
+        self.ceph_manager = manager
+        self.cluster = manager.cluster
+        self.ceph_manager.wait_for_clean()
+        osd_status = self.ceph_manager.get_osd_status()
+        self.in_osds = osd_status['in']
+        self.live_osds = osd_status['live']
+        self.out_osds = osd_status['out']
+        self.dead_osds = osd_status['dead']
+        self.stopping = False
+        self.logger = logger
+        # normalise a missing config before the first lookup; doing it
+        # after the .get() calls below would be too late to help
+        self.config = config if config is not None else dict()
+        self.revive_timeout = self.config.get("revive_timeout", 360)
+        self.pools_to_fix_pgp_num = set()
+        if self.config.get('powercycle'):
+            self.revive_timeout += 120
+        self.clean_wait = self.config.get('clean_wait', 0)
+        self.minin = self.config.get("min_in", 4)
+        self.chance_move_pg = self.config.get('chance_move_pg', 1.0)
+        self.sighup_delay = self.config.get('sighup_delay')
+        self.optrack_toggle_delay = self.config.get('optrack_toggle_delay')
+        self.dump_ops_enable = self.config.get('dump_ops_enable')
+        self.noscrub_toggle_delay = self.config.get('noscrub_toggle_delay')
+        self.chance_thrash_cluster_full = self.config.get('chance_thrash_cluster_full', .05)
+        self.chance_thrash_pg_upmap = self.config.get('chance_thrash_pg_upmap', 1.0)
+        # read the option's own key; fall back to chance_thrash_pg_upmap
+        # so configs that only set that key keep their behaviour
+        self.chance_thrash_pg_upmap_items = self.config.get(
+            'chance_thrash_pg_upmap_items', self.chance_thrash_pg_upmap)
+        self.random_eio = self.config.get('random_eio')
+        self.chance_force_recovery = self.config.get('chance_force_recovery', 0.3)
+
+        # in_osds/out_osds are lists: count them, multiplying an int by
+        # their concatenation would yield a repeated list, not a limit
+        num_osds = len(self.in_osds) + len(self.out_osds)
+        self.max_pgs = self.config.get("max_pgs_per_pool_osd", 1200) * num_osds
+        if self.logger is not None:
+            self.log = lambda x: self.logger.info(x)
+        else:
+            def tmp(x):
+                """
+                Implement log behavior
+                """
+                print(x)
+            self.log = tmp
+        # prevent monitor from auto-marking things out while thrasher runs
+        self.saved_options = []
+        # assuming that the default settings do not vary from one daemon to
+        # another
+        first_mon = teuthology.get_first_mon(manager.ctx, self.config).split('.')
+        opts = [('mon', 'mon_osd_down_out_interval', 0)]
+        for service, opt, new_value in opts:
+            old_value = manager.get_config(first_mon[0],
+                                           first_mon[1],
+                                           opt)
+            self.saved_options.append((service, opt, old_value))
+            self._set_config(service, '*', opt, new_value)
+        # initialize ceph_objectstore_tool property - must be done before
+        # do_thrash is spawned - http://tracker.ceph.com/issues/18799
+        if (self.config.get('powercycle') or
+            not self.cmd_exists_on_osds("ceph-objectstore-tool") or
+            self.config.get('disable_objectstore_tool_tests', False)):
+            self.ceph_objectstore_tool = False
+            self.test_rm_past_intervals = False
+            if self.config.get('powercycle'):
+                self.log("Unable to test ceph-objectstore-tool, "
+                         "powercycle testing")
+            else:
+                self.log("Unable to test ceph-objectstore-tool, "
+                         "not available on all OSD nodes")
+        else:
+            self.ceph_objectstore_tool = \
+                self.config.get('ceph_objectstore_tool', True)
+            self.test_rm_past_intervals = \
+                self.config.get('test_rm_past_intervals', True)
+        # spawn do_thrash
+        self.thread = gevent.spawn(self.do_thrash)
+        # the optional helpers are only spawned when their option is set
+        if self.sighup_delay:
+            self.sighup_thread = gevent.spawn(self.do_sighup)
+        if self.optrack_toggle_delay:
+            self.optrack_toggle_thread = gevent.spawn(self.do_optrack_toggle)
+        # dump_ops_enable is compared against the string "true"
+        if self.dump_ops_enable == "true":
+            self.dump_ops_thread = gevent.spawn(self.do_dump_ops)
+        if self.noscrub_toggle_delay:
+            self.noscrub_toggle_thread = gevent.spawn(self.do_noscrub_toggle)
+
+    def _set_config(self, service_type, service_id, name, value):
+        """
+        Inject a config option into running daemons.
+
+        Issues ``tell <service_type>.<service_id> injectargs
+        '--<name> <value>'`` through the ceph manager.
+        """
+        target = '.'.join((service_type, service_id))
+        injected = '--{0} {1}'.format(name, value)
+        self.ceph_manager.raw_cluster_cmd(
+            '--', 'tell', target, 'injectargs', injected)
+
+
+    def cmd_exists_on_osds(self, cmd):
+        """
+        Check whether a command is available on every osd remote.
+
+        Runs ``type <cmd>`` once per distinct remote holding an osd role
+        of this cluster.
+
+        :param cmd: name of the command to look for
+        :return: False as soon as one remote reports a non-zero exit
+                 status, True otherwise
+        """
+        osd_cluster = self.ceph_manager.ctx.cluster.only(
+            teuthology.is_type('osd', self.cluster))
+        for host in list(set(osd_cluster.remotes.keys())):
+            probe = host.run(args=['type', cmd], wait=True,
+                             check_status=False, stdout=StringIO(),
+                             stderr=StringIO())
+            if probe.exitstatus != 0:
+                return False
+        return True
+
+    def kill_osd(self, osd=None, mark_down=False, mark_out=False):
+        """
+        :param osd: Osd to be killed.
+        :mark_down: Mark down if true.
+        :mark_out: Mark out if true.
+        """
+        if osd is None:
+            osd = random.choice(self.live_osds)
+        self.log("Killing osd %s, live_osds are %s" % (str(osd),
+                                                       str(self.live_osds)))
+        self.live_osds.remove(osd)
+        self.dead_osds.append(osd)
+        self.ceph_manager.kill_osd(osd)
+        if mark_down:
+            self.ceph_manager.mark_down_osd(osd)
+        if mark_out and osd in self.in_osds:
+            self.out_osd(osd)
+        if self.ceph_objectstore_tool:
+            self.log("Testing ceph-objectstore-tool on down osd")
+            remote = self.ceph_manager.find_remote('osd', osd)
+            FSPATH = self.ceph_manager.get_filepath()
+            JPATH = os.path.join(FSPATH, "journal")
+            exp_osd = imp_osd = osd
+            exp_remote = imp_remote = remote
+            # If an older osd is available we'll move a pg from there
+            if (len(self.dead_osds) > 1 and
+                    random.random() < self.chance_move_pg):
+                exp_osd = random.choice(self.dead_osds[:-1])
+                exp_remote = self.ceph_manager.find_remote('osd', exp_osd)
+            if ('keyvaluestore_backend' in
+                    self.ceph_manager.ctx.ceph[self.cluster].conf['osd']):
+                prefix = ("sudo adjust-ulimits ceph-objectstore-tool "
+                          "--data-path {fpath} --journal-path {jpath} "
+                          "--type keyvaluestore "
+                          "--log-file="
+                          "/var/log/ceph/objectstore_tool.\\$pid.log ".
+                          format(fpath=FSPATH, jpath=JPATH))
+            else:
+                prefix = ("sudo adjust-ulimits ceph-objectstore-tool "
+                          "--data-path {fpath} --journal-path {jpath} "
+                          "--log-file="
+                          "/var/log/ceph/objectstore_tool.\\$pid.log ".
+                          format(fpath=FSPATH, jpath=JPATH))
+            cmd = (prefix + "--op list-pgs").format(id=exp_osd)
+
+            # ceph-objectstore-tool might be temporarily absent during an 
+            # upgrade - see http://tracker.ceph.com/issues/18014
+            with safe_while(sleep=15, tries=40, action="type ceph-objectstore-tool") as proceed:
+                while proceed():
+                    proc = exp_remote.run(args=['type', 'ceph-objectstore-tool'], 
+                               wait=True, check_status=False, stdout=StringIO(),
+                               stderr=StringIO())
+                    if proc.exitstatus == 0:
+                        break
+                    log.debug("ceph-objectstore-tool binary not present, trying again")
+
+            # ceph-objectstore-tool might bogusly fail with "OSD has the store locked"
+            # see http://tracker.ceph.com/issues/19556
+            with safe_while(sleep=15, tries=40, action="ceph-objectstore-tool --op list-pgs") as proceed:
+                while proceed():
+                    proc = exp_remote.run(args=cmd, wait=True,
+                                          check_status=False,
+                                          stdout=StringIO(), stderr=StringIO())
+                    if proc.exitstatus == 0:
+                        break
+                    elif proc.exitstatus == 1 and proc.stderr == "OSD has the store locked":
+                        continue
+                    else:
+                        raise Exception("ceph-objectstore-tool: "
+                                        "exp list-pgs failure with status {ret}".
+                                        format(ret=proc.exitstatus))
+
+            pgs = proc.stdout.getvalue().split('\n')[:-1]
+            if len(pgs) == 0:
+                self.log("No PGs found for osd.{osd}".format(osd=exp_osd))
+                return
+            pg = random.choice(pgs)
+            exp_path = teuthology.get_testdir(self.ceph_manager.ctx)
+            exp_path = os.path.join(exp_path, '{0}.data'.format(self.cluster))
+            exp_path = os.path.join(exp_path,
+                                    "exp.{pg}.{id}".format(
+                                        pg=pg,
+                                        id=exp_osd))
+            # export
+            # Can't use new export-remove op since this is part of upgrade testing
+            cmd = prefix + "--op export --pgid {pg} --file {file}"
+            cmd = cmd.format(id=exp_osd, pg=pg, file=exp_path)
+            proc = exp_remote.run(args=cmd)
+            if proc.exitstatus:
+                raise Exception("ceph-objectstore-tool: "
+                                "export failure with status {ret}".
+                                format(ret=proc.exitstatus))
+            # remove
+            cmd = prefix + "--force --op remove --pgid {pg}"
+            cmd = cmd.format(id=exp_osd, pg=pg)
+            proc = exp_remote.run(args=cmd)
+            if proc.exitstatus:
+                raise Exception("ceph-objectstore-tool: "
+                                "remove failure with status {ret}".
+                                format(ret=proc.exitstatus))
+            # If there are at least 2 dead osds we might move the pg
+            if exp_osd != imp_osd:
+                # If pg isn't already on this osd, then we will move it there
+                cmd = (prefix + "--op list-pgs").format(id=imp_osd)
+                proc = imp_remote.run(args=cmd, wait=True,
+                                      check_status=False, stdout=StringIO())
+                if proc.exitstatus:
+                    raise Exception("ceph-objectstore-tool: "
+                                    "imp list-pgs failure with status {ret}".
+                                    format(ret=proc.exitstatus))
+                pgs = proc.stdout.getvalue().split('\n')[:-1]
+                if pg not in pgs:
+                    self.log("Moving pg {pg} from osd.{fosd} to osd.{tosd}".
+                             format(pg=pg, fosd=exp_osd, tosd=imp_osd))
+                    if imp_remote != exp_remote:
+                        # Copy export file to the other machine
+                        self.log("Transfer export file from {srem} to {trem}".
+                                 format(srem=exp_remote, trem=imp_remote))
+                        tmpexport = Remote.get_file(exp_remote, exp_path)
+                        Remote.put_file(imp_remote, tmpexport, exp_path)
+                        os.remove(tmpexport)
+                else:
+                    # Can't move the pg after all
+                    imp_osd = exp_osd
+                    imp_remote = exp_remote
+            # import
+            cmd = (prefix + "--op import --file {file}")
+            cmd = cmd.format(id=imp_osd, file=exp_path)
+            proc = imp_remote.run(args=cmd, wait=True, check_status=False,
+                                  stderr=StringIO())
+            if proc.exitstatus == 1:
+                bogosity = "The OSD you are using is older than the exported PG"
+                if bogosity in proc.stderr.getvalue():
+                    self.log("OSD older than exported PG"
+                             "...ignored")
+            elif proc.exitstatus == 10:
+                self.log("Pool went away before processing an import"
+                         "...ignored")
+            elif proc.exitstatus == 11:
+                self.log("Attempt to import an incompatible export"
+                         "...ignored")
+            elif proc.exitstatus:
+                raise Exception("ceph-objectstore-tool: "
+                                "import failure with status {ret}".
+                                format(ret=proc.exitstatus))
+            cmd = "rm -f {file}".format(file=exp_path)
+            exp_remote.run(args=cmd)
+            if imp_remote != exp_remote:
+                imp_remote.run(args=cmd)
+
+            # apply low split settings to each pool
+            for pool in self.ceph_manager.list_pools():
+                no_sudo_prefix = prefix[5:]
+                cmd = ("CEPH_ARGS='--filestore-merge-threshold 1 "
+                       "--filestore-split-multiple 1' sudo -E "
+                       + no_sudo_prefix + "--op apply-layout-settings --pool " + pool).format(id=osd)
+                proc = remote.run(args=cmd, wait=True, check_status=False, stderr=StringIO())
+                output = proc.stderr.getvalue()
+                if 'Couldn\'t find pool' in output:
+                    continue
+                if proc.exitstatus:
+                    raise Exception("ceph-objectstore-tool apply-layout-settings"
+                                    " failed with {status}".format(status=proc.exitstatus))
+
+    def rm_past_intervals(self, osd=None):
+        """
+        Run the ceph-objectstore-tool "rm-past-intervals" op against one
+        randomly chosen PG of an osd.
+
+        Does nothing unless self.test_rm_past_intervals is set, and returns
+        early (after logging) when the osd lists no PGs.
+
+        :param osd: Osd to find pg to remove past intervals; when None a
+                    random member of self.dead_osds is used
+        :raises Exception: if list-pgs or rm-past-intervals reports a
+                           non-zero exit status
+        """
+        if self.test_rm_past_intervals:
+            if osd is None:
+                osd = random.choice(self.dead_osds)
+            self.log("Use ceph_objectstore_tool to remove past intervals")
+            remote = self.ceph_manager.find_remote('osd', osd)
+            FSPATH = self.ceph_manager.get_filepath()
+            JPATH = os.path.join(FSPATH, "journal")
+            # The two prefixes differ only by "--type keyvaluestore", which
+            # is added when the cluster's osd conf section has a
+            # 'keyvaluestore_backend' entry.
+            if ('keyvaluestore_backend' in
+                    self.ceph_manager.ctx.ceph[self.cluster].conf['osd']):
+                prefix = ("sudo adjust-ulimits ceph-objectstore-tool "
+                          "--data-path {fpath} --journal-path {jpath} "
+                          "--type keyvaluestore "
+                          "--log-file="
+                          "/var/log/ceph/objectstore_tool.\\$pid.log ".
+                          format(fpath=FSPATH, jpath=JPATH))
+            else:
+                prefix = ("sudo adjust-ulimits ceph-objectstore-tool "
+                          "--data-path {fpath} --journal-path {jpath} "
+                          "--log-file="
+                          "/var/log/ceph/objectstore_tool.\\$pid.log ".
+                          format(fpath=FSPATH, jpath=JPATH))
+            # NOTE(review): the id= keyword below is not consumed here; the
+            # command text built in this method has no {id} placeholder.
+            # Presumably FSPATH itself contains one -- confirm against
+            # get_filepath().
+            cmd = (prefix + "--op list-pgs").format(id=osd)
+            # check_status=False so the exit status is inspected here and
+            # turned into a tool-specific error message.
+            proc = remote.run(args=cmd, wait=True,
+                              check_status=False, stdout=StringIO())
+            if proc.exitstatus:
+                raise Exception("ceph_objectstore_tool: "
+                                "exp list-pgs failure with status {ret}".
+                                format(ret=proc.exitstatus))
+            # One PG id per output line; [:-1] drops the empty string left
+            # after the trailing newline.
+            pgs = proc.stdout.getvalue().split('\n')[:-1]
+            if len(pgs) == 0:
+                self.log("No PGs found for osd.{osd}".format(osd=osd))
+                return
+            pg = random.choice(pgs)
+            cmd = (prefix + "--op rm-past-intervals --pgid {pg}").\
+                format(id=osd, pg=pg)
+            proc = remote.run(args=cmd)
+            if proc.exitstatus:
+                raise Exception("ceph_objectstore_tool: "
+                                "rm-past-intervals failure with status {ret}".
+                                format(ret=proc.exitstatus))
+
+    def blackhole_kill_osd(self, osd=None):
+        """
+        If all else fails, kill the osd.
+
+        Moves the osd from self.live_osds to self.dead_osds and then
+        delegates to ceph_manager.blackhole_kill_osd().
+
+        :param osd: Osd to be killed; when None a random member of
+                    self.live_osds is used.
+        """
+        if osd is None:
+            osd = random.choice(self.live_osds)
+        self.log("Blackholing and then killing osd %s, live_osds are %s" %
+                 (str(osd), str(self.live_osds)))
+        # Bookkeeping is updated before the manager call, so the osd is
+        # already recorded as dead while the kill is in progress.
+        self.live_osds.remove(osd)
+        self.dead_osds.append(osd)
+        self.ceph_manager.blackhole_kill_osd(osd)
+
+    def revive_osd(self, osd=None, skip_admin_check=False):
+        """
+        Revive the osd.
+
+        Asks the ceph manager to revive the osd, then moves it from
+        self.dead_osds to self.live_osds.  When random read-error injection
+        is enabled (self.random_eio > 0) and the revived osd is the one
+        chosen for injection (self.rerrosd), the filestore and bluestore
+        debug_random_read_err options are injected again.
+
+        :param osd: Osd to be revived; when None a random member of
+                    self.dead_osds is used.
+        :param skip_admin_check: passed through to ceph_manager.revive_osd().
+        """
+        if osd is None:
+            osd = random.choice(self.dead_osds)
+        self.log("Reviving osd %s" % (str(osd),))
+        self.ceph_manager.revive_osd(
+            osd,
+            self.revive_timeout,
+            skip_admin_check=skip_admin_check)
+        self.dead_osds.remove(osd)
+        self.live_osds.append(osd)
+        # Compare osd ids by value.  An identity test ('is') only succeeds
+        # when both names happen to refer to the very same object, so equal
+        # ids held in distinct objects would silently skip the re-injection.
+        if self.random_eio > 0 and osd == self.rerrosd:
+            self.ceph_manager.raw_cluster_cmd(
+                'tell', 'osd.'+str(self.rerrosd),
+                'injectargs', '--',
+                '--filestore_debug_random_read_err='+str(self.random_eio))
+            self.ceph_manager.raw_cluster_cmd(
+                'tell', 'osd.'+str(self.rerrosd),
+                'injectargs', '--',
+                '--bluestore_debug_random_read_err='+str(self.random_eio))
+
+
+    def out_osd(self, osd=None):
+        """
+        Mark the osd out
+
+        Calls ceph_manager.mark_out_osd() and then moves the osd from
+        self.in_osds to self.out_osds.
+
+        :param osd: Osd to be marked; when None a random member of
+                    self.in_osds is used.
+        """
+        if osd is None:
+            osd = random.choice(self.in_osds)
+        self.log("Removing osd %s, in_osds are: %s" %
+                 (str(osd), str(self.in_osds)))
+        # The cluster is told first; the local lists are only updated once
+        # that call has returned.
+        self.ceph_manager.mark_out_osd(osd)
+        self.in_osds.remove(osd)
+        self.out_osds.append(osd)
+
+    def in_osd(self, osd=None):
+        """
+        Mark the osd in
+
+        If the chosen osd is currently in self.dead_osds it is revived
+        instead, and this method returns without marking it in.
+
+        :param osd: Osd to be marked; when None a random member of
+                    self.out_osds is used.
+        """
+        if osd is None:
+            osd = random.choice(self.out_osds)
+        if osd in self.dead_osds:
+            # A dead osd is only revived here: it stays in self.out_osds
+            # and a later call is needed to actually mark it in.
+            return self.revive_osd(osd)
+        self.log("Adding osd %s" % (str(osd),))
+        self.out_osds.remove(osd)
+        self.in_osds.append(osd)
+        self.ceph_manager.mark_in_osd(osd)
+        self.log("Added osd %s" % (str(osd),))
+
+    def reweight_osd_or_by_util(self, osd=None):
+        """
+        Reweight an osd that is in, or reweight by utilization.
+
+        When an osd is given, or on a coin flip when it is not, a single
+        osd is reweighted to a random value in [0.1, 1.0].  Otherwise five
+        rounds of (test-)reweight-by-utilization are issued with randomly
+        chosen parameters.
+
+        :param osd: Osd to be reweighted; when None and the coin flip picks
+                    the single-osd path, a random member of self.in_osds.
+        """
+        if osd is not None or random.choice([True, False]):
+            if osd is None:
+                osd = random.choice(self.in_osds)
+            val = random.uniform(.1, 1.0)
+            self.log("Reweighting osd %s to %s" % (str(osd), str(val)))
+            self.ceph_manager.raw_cluster_cmd('osd', 'reweight',
+                                              str(osd), str(val))
+        else:
+            # do it several times, the option space is large
+            for i in range(5):
+                # 'type' selects the command name; 'overage' and
+                # 'max_change' are passed as its two positional arguments.
+                options = {
+                    'max_change': random.choice(['0.05', '1.0', '3.0']),
+                    'overage': random.choice(['110', '1000']),
+                    'type': random.choice([
+                        'reweight-by-utilization',
+                        'test-reweight-by-utilization']),
+                }
+                self.log("Reweighting by: %s"%(str(options),))
+                self.ceph_manager.raw_cluster_cmd(
+                    'osd',
+                    options['type'],
+                    options['overage'],
+                    options['max_change'])
+
+    def primary_affinity(self, osd=None):
+        """
+        Set a randomly chosen primary affinity on an osd.
+
+        Half of the time the affinity is a random float in [0, 1); the
+        other half it is either exactly 1 or exactly 0 (coin flip).
+
+        :param osd: Osd to adjust; when None a random member of
+                    self.in_osds is used.
+        """
+        target = random.choice(self.in_osds) if osd is None else osd
+        if random.random() >= .5:
+            affinity = random.random()
+        else:
+            # Second draw decides between the two extreme values.
+            affinity = 1 if random.random() >= .5 else 0
+        message = 'Setting osd %s primary_affinity to %f' % (str(target),
+                                                              affinity)
+        self.log(message)
+        self.ceph_manager.raw_cluster_cmd(
+            'osd', 'primary-affinity', str(target), str(affinity))
+
+    def thrash_cluster_full(self):
+        """
+        Set and unset cluster full condition
+
+        Lowers the full ratio to .001, waits one second, then sets it to
+        .95.
+        """
+        self.log('Setting full ratio to .001')
+        self.ceph_manager.raw_cluster_cmd('osd', 'set-full-ratio', '.001')
+        time.sleep(1)
+        # NOTE(review): .95 is hard-coded rather than read back from the
+        # cluster, so a different pre-existing ratio is not restored.
+        self.log('Setting full ratio back to .95')
+        self.ceph_manager.raw_cluster_cmd('osd', 'set-full-ratio', '.95')
+
+    def thrash_pg_upmap(self):
+        """
+        Install or remove random pg_upmap entries in OSDMap
+
+        With probability 0.7 a random PG is mapped onto a random set of
+        osds whose length is the pool's size; otherwise one existing
+        pg_upmap entry (if any) is removed.  A CommandFailedError from
+        either path is logged and ignored.
+        """
+        from random import shuffle
+        out = self.ceph_manager.raw_cluster_cmd('osd', 'dump', '-f', 'json-pretty')
+        j = json.loads(out)
+        self.log('j is %s' % j)
+        try:
+            if random.random() >= .3:
+                pgs = self.ceph_manager.get_pg_stats()
+                pg = random.choice(pgs)
+                pgid = str(pg['pgid'])
+                # The pool id is the part of the pgid before the first '.'.
+                poolid = int(pgid.split('.')[0])
+                sizes = [x['size'] for x in j['pools'] if x['pool'] == poolid]
+                if len(sizes) == 0:
+                    # The pool is not present in the osd dump taken above.
+                    return
+                n = sizes[0]
+                # '+' builds a new list, so shuffling does not reorder
+                # self.in_osds / self.out_osds.
+                osds = self.in_osds + self.out_osds
+                shuffle(osds)
+                osds = osds[0:n]
+                self.log('Setting %s to %s' % (pgid, osds))
+                cmd = ['osd', 'pg-upmap', pgid] + [str(x) for x in osds]
+                self.log('cmd %s' % cmd)
+                self.ceph_manager.raw_cluster_cmd(*cmd)
+            else:
+                m = j['pg_upmap']
+                if len(m) > 0:
+                    shuffle(m)
+                    pg = m[0]['pgid']
+                    self.log('Clearing pg_upmap on %s' % pg)
+                    self.ceph_manager.raw_cluster_cmd(
+                        'osd',
+                        'rm-pg-upmap',
+                        pg)
+                else:
+                    self.log('No pg_upmap entries; doing nothing')
+        except CommandFailedError:
+            # NOTE(review): this handler also covers the pg-upmap (set)
+            # branch above, so the message can be misleading.
+            self.log('Failed to rm-pg-upmap, ignoring')
+
+    def thrash_pg_upmap_items(self):
+        """
+        Install or remove random pg_upmap_items entries in OSDMap
+
+        With probability 0.7 a random PG is given a pg-upmap-items entry
+        built from up to 2 * pool-size randomly ordered osd ids; otherwise
+        one existing pg_upmap_items entry (if any) is removed.  A
+        CommandFailedError from either path is logged and ignored.
+        """
+        from random import shuffle
+        out = self.ceph_manager.raw_cluster_cmd('osd', 'dump', '-f', 'json-pretty')
+        j = json.loads(out)
+        self.log('j is %s' % j)
+        try:
+            if random.random() >= .3:
+                pgs = self.ceph_manager.get_pg_stats()
+                pg = random.choice(pgs)
+                pgid = str(pg['pgid'])
+                # The pool id is the part of the pgid before the first '.'.
+                poolid = int(pgid.split('.')[0])
+                sizes = [x['size'] for x in j['pools'] if x['pool'] == poolid]
+                if len(sizes) == 0:
+                    # The pool is not present in the osd dump taken above.
+                    return
+                n = sizes[0]
+                # '+' builds a new list, so shuffling does not reorder
+                # self.in_osds / self.out_osds.
+                osds = self.in_osds + self.out_osds
+                shuffle(osds)
+                # Twice the pool size: presumably the ids are consumed as
+                # (from, to) pairs by the command -- TODO confirm.
+                osds = osds[0:n*2]
+                self.log('Setting %s to %s' % (pgid, osds))
+                cmd = ['osd', 'pg-upmap-items', pgid] + [str(x) for x in osds]
+                self.log('cmd %s' % cmd)
+                self.ceph_manager.raw_cluster_cmd(*cmd)
+            else:
+                m = j['pg_upmap_items']
+                if len(m) > 0:
+                    shuffle(m)
+                    pg = m[0]['pgid']
+                    # NOTE(review): the two log messages in this branch say
+                    # "pg_upmap" although the entries handled here are
+                    # pg_upmap_items.
+                    self.log('Clearing pg_upmap on %s' % pg)
+                    self.ceph_manager.raw_cluster_cmd(
+                        'osd',
+                        'rm-pg-upmap-items',
+                        pg)
+                else:
+                    self.log('No pg_upmap entries; doing nothing')
+        except CommandFailedError:
+            # NOTE(review): this handler also covers the pg-upmap-items
+            # (set) branch above, so the message can be misleading.
+            self.log('Failed to rm-pg-upmap-items, ignoring')
+
+    def force_recovery(self):
+        """
+        Force recovery (or backfill, on a coin flip) on some PGs.
+
+        The PG ids come from ceph_manager.get_pgids_to_force(); nothing is
+        issued when that returns an empty result.
+        """
+        use_backfill = random.random() >= 0.5
+        pgids = self.ceph_manager.get_pgids_to_force(use_backfill)
+        if not pgids:
+            return
+        op = 'force-backfill' if use_backfill else 'force-recovery'
+        self.ceph_manager.raw_cluster_cmd('pg', op, *pgids)
+
+    def cancel_force_recovery(self):
+        """
+        Cancel forced recovery (or forced backfill, on a coin flip) on
+        some PGs.
+
+        The PG ids come from ceph_manager.get_pgids_to_cancel_force();
+        nothing is issued when that returns an empty result.
+        """
+        use_backfill = random.random() >= 0.5
+        pgids = self.ceph_manager.get_pgids_to_cancel_force(use_backfill)
+        if not pgids:
+            return
+        if use_backfill:
+            op = 'cancel-force-backfill'
+        else:
+            op = 'cancel-force-recovery'
+        self.ceph_manager.raw_cluster_cmd('pg', op, *pgids)
+
+    def force_cancel_recovery(self):
+        """
+        Randomly either force recovery or cancel a previous force.
+
+        force_recovery() is picked when the random draw is >= 0.4,
+        cancel_force_recovery() otherwise.
+        """
+        should_force = random.random() >= 0.4
+        chosen = (self.force_recovery if should_force
+                  else self.cancel_force_recovery)
+        chosen()
+
+    def all_up(self):
+        """
+        Revive every dead osd, then mark every out osd in.
+
+        Each loop runs until its list (self.dead_osds, then self.out_osds)
+        is empty.
+        """
+        while self.dead_osds:
+            self.log("reviving osd")
+            self.revive_osd()
+        while self.out_osds:
+            self.log("inning osd")
+            self.in_osd()
+
+    def all_up_in(self):
+        """
+        Bring all osds up and in, then reset weights.
+
+        After all_up(), every live osd gets its reweight value and its
+        primary affinity set back to 1.
+        """
+        self.all_up()
+        for osd in self.live_osds:
+            osd_id = str(osd)
+            for setting in ('reweight', 'primary-affinity'):
+                self.ceph_manager.raw_cluster_cmd('osd', setting,
+                                                  osd_id, str(1))
+
+    def do_join(self):
+        """
+        Break out of this Ceph loop
+
+        Sets self.stopping so the do_* loops terminate, then waits for the
+        main thrasher thread and for each optional helper that was enabled
+        through its config attribute.
+        """
+        self.stopping = True
+        self.thread.get()
+        if self.sighup_delay:
+            self.log("joining the do_sighup greenlet")
+            self.sighup_thread.get()
+        # NOTE(review): the remaining helpers are waited on with join()
+        # rather than get(); presumably these are gevent greenlets, where
+        # join() does not re-raise the helper's exception -- confirm.
+        if self.optrack_toggle_delay:
+            self.log("joining the do_optrack_toggle greenlet")
+            self.optrack_toggle_thread.join()
+        # dump_ops_enable is compared against the string "true", not a bool.
+        if self.dump_ops_enable == "true":
+            self.log("joining the do_dump_ops greenlet")
+            self.dump_ops_thread.join()
+        if self.noscrub_toggle_delay:
+            self.log("joining the do_noscrub_toggle greenlet")
+            self.noscrub_toggle_thread.join()
+
+    def grow_pool(self):
+        """
+        Increase the size of the pool
+
+        Asks ceph_manager.expand_pool() to grow a pool obtained from
+        ceph_manager.get_pool() by the 'pool_grow_by' config value
+        (default 10), passing self.max_pgs along.  When expand_pool()
+        returns a true value the pool is recorded in
+        self.pools_to_fix_pgp_num.
+        """
+        pool = self.ceph_manager.get_pool()
+        # NOTE(review): orig_pg_num is never read afterwards; only the
+        # call itself has any effect here.
+        orig_pg_num = self.ceph_manager.get_pool_pg_num(pool)
+        self.log("Growing pool %s" % (pool,))
+        if self.ceph_manager.expand_pool(pool,
+                                         self.config.get('pool_grow_by', 10),
+                                         self.max_pgs):
+            self.pools_to_fix_pgp_num.add(pool)
+
+    def fix_pgp_num(self, pool=None):
+        """
+        Bring a pool's pgp_num in line via ceph_manager.set_pool_pgpnum().
+
+        :param pool: pool to fix.  When None, a pool is obtained from
+                     ceph_manager.get_pool() and the change is not forced;
+                     an explicitly named pool is forced.
+
+        On success the pool is dropped from self.pools_to_fix_pgp_num.
+        """
+        force = pool is not None
+        if not force:
+            pool = self.ceph_manager.get_pool()
+        self.log("fixing pg num pool %s" % (pool,))
+        fixed = self.ceph_manager.set_pool_pgpnum(pool, force)
+        if fixed:
+            self.pools_to_fix_pgp_num.discard(pool)
+
+    def test_pool_min_size(self):
+        """
+        Kill and revive all osds except one.
+
+        Sequence: bring everything up and wait for recovery, pick one "in"
+        osd to spare, kill and mark out every other "in" osd, sleep for
+        'test_pool_min_size_time' seconds (default 10), kill and mark out
+        the spared osd too, then revive and mark in all the others and
+        wait for recovery again.  The spared osd is left dead and out.
+        """
+        self.log("test_pool_min_size")
+        self.all_up()
+        self.ceph_manager.wait_for_recovery(
+            timeout=self.config.get('timeout')
+            )
+        the_one = random.choice(self.in_osds)
+        # self.log is called with one pre-formatted string everywhere else;
+        # format here instead of passing the argument logging-style.
+        self.log("Killing everyone but %s" % (the_one,))
+        # Snapshot the victims first: out_osd() mutates self.in_osds.
+        to_kill = [osd for osd in self.in_osds if osd != the_one]
+        for osd in to_kill:
+            self.kill_osd(osd)
+        for osd in to_kill:
+            self.out_osd(osd)
+        time.sleep(self.config.get("test_pool_min_size_time", 10))
+        self.log("Killing %s" % (the_one,))
+        self.kill_osd(the_one)
+        self.out_osd(the_one)
+        self.log("Reviving everyone but %s" % (the_one,))
+        for osd in to_kill:
+            self.revive_osd(osd)
+        for osd in to_kill:
+            self.in_osd(osd)
+        self.log("Revived everyone but %s" % (the_one,))
+        self.log("Waiting for clean")
+        self.ceph_manager.wait_for_recovery(
+            timeout=self.config.get('timeout')
+            )
+
+    def inject_pause(self, conf_key, duration, check_after, should_be_down):
+        """
+        Pause injection testing. Check for osd being down when finished.
+
+        :param conf_key: name of the config option set on the chosen osd.
+        :param duration: value the option is set to; also used as a number
+                         of seconds in the final sleep.
+        :param check_after: seconds to wait before asserting the osd is
+                            down.
+        :param should_be_down: when false, the option is set and the
+                               method returns without any checks.
+        :raises AssertionError: if the osd is not down after check_after
+                                seconds, or still down at the end.
+        """
+        the_one = random.choice(self.live_osds)
+        self.log("inject_pause on {osd}".format(osd=the_one))
+        self.log(
+            "Testing {key} pause injection for duration {duration}".format(
+                key=conf_key,
+                duration=duration
+                ))
+        self.log(
+            "Checking after {after}, should_be_down={shouldbedown}".format(
+                after=check_after,
+                shouldbedown=should_be_down
+                ))
+        # conf_key is only known at run time, hence the ** expansion.
+        self.ceph_manager.set_config(the_one, **{conf_key: duration})
+        if not should_be_down:
+            return
+        time.sleep(check_after)
+        status = self.ceph_manager.get_osd_status()
+        assert the_one in status['down']
+        # Wait out the rest of the pause plus 20 seconds before expecting
+        # the osd to no longer be reported down.
+        time.sleep(duration - check_after + 20)
+        status = self.ceph_manager.get_osd_status()
+        assert not the_one in status['down']
+
+    def test_backfill_full(self):
+        """
+        Test backfills stopping when the replica fills up.
+
+        First, use injectfull admin command to simulate a now full
+        osd by setting it to 0 on all of the OSDs.
+
+        Second, on a random subset, set
+        osd_debug_skip_full_check_in_backfill_reservation to force
+        the more complicated check in do_scan to be exercised.
+
+        Then, verify that all backfillings stop.
+
+        Finally the debug option is reset to 'false' and the injected
+        full state is cleared ('injectfull none') on every live osd.
+
+        :raises AssertionError: if a 'backfilling' state is still reported
+                                after the polling loop.
+        """
+        self.log("injecting backfill full")
+        for i in self.live_osds:
+            self.ceph_manager.set_config(
+                i,
+                osd_debug_skip_full_check_in_backfill_reservation=
+                random.choice(['false', 'true']))
+            self.ceph_manager.osd_admin_socket(i, command=['injectfull', 'backfillfull'],
+                                     check_status=True, timeout=30, stdout=DEVNULL)
+        # Poll up to 30 times, one second apart, for backfilling to stop.
+        for i in range(30):
+            status = self.ceph_manager.compile_pg_status()
+            if 'backfilling' not in status.keys():
+                break
+            self.log(
+                "waiting for {still_going} backfillings".format(
+                    still_going=status.get('backfilling')))
+            time.sleep(1)
+        # The status is fetched once more, so this also covers the case
+        # where the loop above ran out of attempts.
+        assert('backfilling' not in self.ceph_manager.compile_pg_status().keys())
+        for i in self.live_osds:
+            self.ceph_manager.set_config(
+                i,
+                osd_debug_skip_full_check_in_backfill_reservation='false')
+            self.ceph_manager.osd_admin_socket(i, command=['injectfull', 'none'],
+                                     check_status=True, timeout=30, stdout=DEVNULL)
+
+    def test_map_discontinuity(self):
+        """
+        1) Allows the osds to recover
+        2) kills an osd
+        3) allows the remaining osds to recover
+        4) waits for some time
+        5) revives the osd
+        This sequence should cause the revived osd to have to handle
+        a map gap since the mons would have trimmed
+
+        The wait in step 4 is the 'map_discontinuity_sleep_time' config
+        value in seconds (default 40).
+        """
+        # Make sure more than self.minin osds are in before one is killed.
+        while len(self.in_osds) < (self.minin + 1):
+            self.in_osd()
+        self.log("Waiting for recovery")
+        self.ceph_manager.wait_for_all_osds_up(
+            timeout=self.config.get('timeout')
+            )
+        # now we wait 20s for the pg status to change, if it takes longer,
+        # the test *should* fail!
+        time.sleep(20)
+        self.ceph_manager.wait_for_clean(
+            timeout=self.config.get('timeout')
+            )
+
+        # now we wait 20s for the backfill replicas to hear about the clean
+        time.sleep(20)
+        self.log("Recovered, killing an osd")
+        self.kill_osd(mark_down=True, mark_out=True)
+        self.log("Waiting for clean again")
+        self.ceph_manager.wait_for_clean(
+            timeout=self.config.get('timeout')
+            )
+        self.log("Waiting for trim")
+        time.sleep(int(self.config.get("map_discontinuity_sleep_time", 40)))
+        # revive_osd() without an argument picks a random entry of
+        # self.dead_osds; NOTE(review): that is only guaranteed to be the
+        # osd killed above when no other osd is dead at this point.
+        self.revive_osd()
+
+    def choose_action(self):
+        """
+        Random action selector.
+
+        Builds a list of (callable, weight) pairs from the current osd
+        state and the thrasher config, then picks one callable with
+        probability proportional to its weight.
+
+        :returns: a zero-argument callable performing the chosen action,
+                  or None if the weighted walk falls off the end of the
+                  list (only possible when the random draw equals the
+                  total weight).
+        """
+        chance_down = self.config.get('chance_down', 0.4)
+        chance_test_min_size = self.config.get('chance_test_min_size', 0)
+        chance_test_backfill_full = \
+            self.config.get('chance_test_backfill_full', 0)
+        # An integer chance_down is interpreted as a percentage.
+        if isinstance(chance_down, int):
+            chance_down = float(chance_down) / 100
+        minin = self.minin
+        minout = self.config.get("min_out", 0)
+        minlive = self.config.get("min_live", 2)
+        mindead = self.config.get("min_dead", 0)
+
+        self.log('choose_action: min_in %d min_out '
+                 '%d min_live %d min_dead %d' %
+                 (minin, minout, minlive, mindead))
+        actions = []
+        if len(self.in_osds) > minin:
+            actions.append((self.out_osd, 1.0,))
+        if len(self.live_osds) > minlive and chance_down > 0:
+            actions.append((self.kill_osd, chance_down,))
+        if len(self.dead_osds) > 1:
+            actions.append((self.rm_past_intervals, 1.0,))
+        if len(self.out_osds) > minout:
+            actions.append((self.in_osd, 1.7,))
+        if len(self.dead_osds) > mindead:
+            actions.append((self.revive_osd, 1.0,))
+        if self.config.get('thrash_primary_affinity', True):
+            actions.append((self.primary_affinity, 1.0,))
+        actions.append((self.reweight_osd_or_by_util,
+                        self.config.get('reweight_osd', .5),))
+        actions.append((self.grow_pool,
+                        self.config.get('chance_pgnum_grow', 0),))
+        actions.append((self.fix_pgp_num,
+                        self.config.get('chance_pgpnum_fix', 0),))
+        actions.append((self.test_pool_min_size,
+                        chance_test_min_size,))
+        actions.append((self.test_backfill_full,
+                        chance_test_backfill_full,))
+        if self.chance_thrash_cluster_full > 0:
+            actions.append((self.thrash_cluster_full, self.chance_thrash_cluster_full,))
+        if self.chance_thrash_pg_upmap > 0:
+            actions.append((self.thrash_pg_upmap, self.chance_thrash_pg_upmap,))
+        if self.chance_thrash_pg_upmap_items > 0:
+            actions.append((self.thrash_pg_upmap_items, self.chance_thrash_pg_upmap_items,))
+        if self.chance_force_recovery > 0:
+            actions.append((self.force_cancel_recovery, self.chance_force_recovery))
+
+        for key in ['heartbeat_inject_failure', 'filestore_inject_stall']:
+            # Bind the current key as a default argument.  A plain closure
+            # would look 'key' up when the lambda is finally called, i.e.
+            # after this loop has finished, so every pause action would
+            # use the last key in the list.
+            for scenario in [
+                (lambda key=key:
+                 self.inject_pause(key,
+                                   self.config.get('pause_short', 3),
+                                   0,
+                                   False),
+                 self.config.get('chance_inject_pause_short', 1),),
+                (lambda key=key:
+                 self.inject_pause(key,
+                                   self.config.get('pause_long', 80),
+                                   self.config.get('pause_check_after', 70),
+                                   True),
+                 self.config.get('chance_inject_pause_long', 0),)]:
+                actions.append(scenario)
+
+        # Weighted pick: walk the list, subtracting each weight from the
+        # draw until it falls inside an action's share.
+        total = sum([y for (x, y) in actions])
+        val = random.uniform(0, total)
+        for (action, prob) in actions:
+            if val < prob:
+                return action
+            val -= prob
+        return None
+
+    def log_exc(func):
+        """
+        Decorator for methods taking only self: log the traceback of any
+        exception raised by the wrapped method through self.log, then
+        re-raise it unchanged.
+        """
+        @wraps(func)
+        def wrapper(self):
+            try:
+                return func(self)
+            except:
+                # Bare except: whatever was raised is logged and then
+                # propagated; nothing is swallowed here.
+                self.log(traceback.format_exc())
+                raise
+        return wrapper
+
+    @log_exc
+    def do_sighup(self):
+        """
+        Loops and sends signal.SIGHUP to a random live osd.
+
+        Loop delay is controlled by the config value sighup_delay.
+        The loop ends once self.stopping is set.
+        """
+        delay = float(self.sighup_delay)
+        self.log("starting do_sighup with a delay of {0}".format(delay))
+        while not self.stopping:
+            osd = random.choice(self.live_osds)
+            self.ceph_manager.signal_osd(osd, signal.SIGHUP, silent=True)
+            # NOTE(review): this loop uses time.sleep while the other
+            # do_* toggle loops in this class use gevent.sleep -- confirm
+            # this is intended (e.g. time is monkey-patched).
+            time.sleep(delay)
+
+    @log_exc
+    def do_optrack_toggle(self):
+        """
+        Flip osd_enable_op_tracker on all osds until told to stop.
+
+        The value alternates between "false" and "true" (starting with
+        "false") and is injected via 'tell osd.* injectargs'; the pause
+        between flips is the config value optrack_toggle_delay.
+        """
+        delay = float(self.optrack_toggle_delay)
+        osd_state = "true"
+        self.log("starting do_optrack_toggle with a delay of {0}".format(delay))
+        while not self.stopping:
+            osd_state = "false" if osd_state == "true" else "true"
+            tracker_arg = '--osd_enable_op_tracker=%s' % osd_state
+            self.ceph_manager.raw_cluster_cmd_result(
+                'tell', 'osd.*', 'injectargs', tracker_arg)
+            gevent.sleep(delay)
+
+    @log_exc
+    def do_dump_ops(self):
+        """
+        Keep dumping in-flight, blocked and historic ops on every live
+        osd until self.stopping is set.
+        """
+        dump_commands = ('dump_ops_in_flight',
+                         'dump_blocked_ops',
+                         'dump_historic_ops')
+        self.log("starting do_dump_ops")
+        while not self.stopping:
+            for osd in self.live_osds:
+                for dump in dump_commands:
+                    # Ignore errors because live_osds is in flux
+                    self.ceph_manager.osd_admin_socket(
+                        osd, command=[dump],
+                        check_status=False, timeout=30, stdout=DEVNULL)
+            # Zero-length sleep between passes over the osd list.
+            gevent.sleep(0)
+
+    @log_exc
+    def do_noscrub_toggle(self):
+        """
+        Loops and toggle noscrub flags
+
+        Loop delay is controlled by the config value noscrub_toggle_delay.
+
+        The flags cycle through four states, one step per iteration:
+        none -> noscrub -> both -> nodeep-scrub -> none.  Both flags are
+        unset once the loop has been stopped.
+        """
+        delay = float(self.noscrub_toggle_delay)
+        scrub_state = "none"
+        self.log("starting do_noscrub_toggle with a delay of {0}".format(delay))
+        while not self.stopping:
+            if scrub_state == "none":
+                self.ceph_manager.raw_cluster_cmd('osd', 'set', 'noscrub')
+                scrub_state = "noscrub"
+            elif scrub_state == "noscrub":
+                self.ceph_manager.raw_cluster_cmd('osd', 'set', 'nodeep-scrub')
+                scrub_state = "both"
+            elif scrub_state == "both":
+                self.ceph_manager.raw_cluster_cmd('osd', 'unset', 'noscrub')
+                scrub_state = "nodeep-scrub"
+            else:
+                # Only "nodeep-scrub" reaches this branch.
+                self.ceph_manager.raw_cluster_cmd('osd', 'unset', 'nodeep-scrub')
+                scrub_state = "none"
+            gevent.sleep(delay)
+        # Leave the cluster with both flags cleared, whatever state the
+        # loop stopped in.
+        self.ceph_manager.raw_cluster_cmd('osd', 'unset', 'noscrub')
+        self.ceph_manager.raw_cluster_cmd('osd', 'unset', 'nodeep-scrub')
+
+    @log_exc
+    def do_thrash(self):
+        """
+        Loop to select random actions to thrash ceph manager with.
+
+        Each iteration logs the osd lists, occasionally runs a "clean"
+        phase (revive down to max_dead, reset reweights, wait for
+        recovery or run test_map_discontinuity, optionally scrub), then
+        runs one randomly chosen action and sleeps op_delay seconds.
+
+        When stopped, it brings all osds up, turns off read-error
+        injection, fixes pgp_num on recorded pools, restores saved
+        config options and brings all osds up and in.
+        """
+        cleanint = self.config.get("clean_interval", 60)
+        scrubint = self.config.get("scrub_interval", -1)
+        maxdead = self.config.get("max_dead", 0)
+        delay = self.config.get("op_delay", 5)
+        # The first live osd is the target of random read-error injection.
+        self.rerrosd = self.live_osds[0]
+        if self.random_eio > 0:
+            self.ceph_manager.raw_cluster_cmd('tell', 'osd.'+str(self.rerrosd),
+                          'injectargs', '--', '--filestore_debug_random_read_err='+str(self.random_eio))
+            self.ceph_manager.raw_cluster_cmd('tell', 'osd.'+str(self.rerrosd),
+                          'injectargs', '--', '--bluestore_debug_random_read_err='+str(self.random_eio))
+        self.log("starting do_thrash")
+        while not self.stopping:
+            to_log = [str(x) for x in ["in_osds: ", self.in_osds,
+                                       "out_osds: ", self.out_osds,
+                                       "dead_osds: ", self.dead_osds,
+                                       "live_osds: ", self.live_osds]]
+            self.log(" ".join(to_log))
+            # Enter the clean phase with probability delay / cleanint per
+            # iteration.
+            if random.uniform(0, 1) < (float(delay) / cleanint):
+                while len(self.dead_osds) > maxdead:
+                    self.revive_osd()
+                for osd in self.in_osds:
+                    self.ceph_manager.raw_cluster_cmd('osd', 'reweight',
+                                                      str(osd), str(1))
+                if random.uniform(0, 1) < float(
+                        self.config.get('chance_test_map_discontinuity', 0)):
+                    self.test_map_discontinuity()
+                else:
+                    self.ceph_manager.wait_for_recovery(
+                        timeout=self.config.get('timeout')
+                        )
+                time.sleep(self.clean_wait)
+                if scrubint > 0:
+                    if random.uniform(0, 1) < (float(delay) / scrubint):
+                        self.log('Scrubbing while thrashing being performed')
+                        Scrubber(self.ceph_manager, self.config)
+            # NOTE(review): choose_action() can return None, in which
+            # case this call raises TypeError.
+            self.choose_action()()
+            time.sleep(delay)
+        self.all_up()
+        if self.random_eio > 0:
+            self.ceph_manager.raw_cluster_cmd('tell', 'osd.'+str(self.rerrosd),
+                          'injectargs', '--', '--filestore_debug_random_read_err=0.0')
+            self.ceph_manager.raw_cluster_cmd('tell', 'osd.'+str(self.rerrosd),
+                          'injectargs', '--', '--bluestore_debug_random_read_err=0.0')
+        # Iterate over a copy: fix_pgp_num() may discard entries from the
+        # set while this loop is running.
+        for pool in list(self.pools_to_fix_pgp_num):
+            if self.ceph_manager.get_pool_pg_num(pool) > 0:
+                self.fix_pgp_num(pool)
+        self.pools_to_fix_pgp_num.clear()
+        for service, opt, saved_value in self.saved_options:
+            self._set_config(service, '*', opt, saved_value)
+        self.saved_options = []
+        self.all_up_in()
+
+
+class ObjectStoreTool:
+    """
+    Build and run ceph-objectstore-tool command lines against one osd.
+
+    :param manager: CephManager used to resolve, kill and revive the osd
+    :param pool: pool name, used to resolve the object's primary and pg
+    :param osd: (keyword) osd id, or "primary" to use the primary osd of
+                object_name
+    :param object_name: (keyword) object the command operates on, if any
+    :param do_revive: (keyword) revive the osd once run() is done
+                      (default True)
+    """
+
+    def __init__(self, manager, pool, **kwargs):
+        self.manager = manager
+        self.pool = pool
+        self.osd = kwargs.get('osd', None)
+        self.object_name = kwargs.get('object_name', None)
+        self.do_revive = kwargs.get('do_revive', True)
+        if self.osd and self.pool and self.object_name:
+            if self.osd == "primary":
+                self.osd = self.manager.get_object_primary(self.pool,
+                                                           self.object_name)
+        # osd id 0 is a valid target, so only a missing osd is rejected
+        assert self.osd is not None
+        if self.object_name:
+            self.pgid = self.manager.get_object_pg_with_shard(self.pool,
+                                                              self.object_name,
+                                                              self.osd)
+        self.remote = self.manager.ctx.\
+            cluster.only('osd.{o}'.format(o=self.osd)).remotes.keys()[0]
+        path = self.manager.get_filepath().format(id=self.osd)
+        self.paths = ("--data-path {path} --journal-path {path}/journal".
+                      format(path=path))
+
+    def build_cmd(self, options, args, stdin):
+        """
+        Return the shell script text that runs ceph-objectstore-tool.
+
+        :param options: option string placed before args
+        :param args: argument string placed after the options
+        :param stdin: data to pipe into the tool, or a false value for none
+        :returns: one or two shell lines joined with a newline
+        """
+        lines = []
+        if self.object_name:
+            # look up the object's listing entry first and pass it to the
+            # real command as "$object"
+            lines.append("object=$(sudo adjust-ulimits ceph-objectstore-tool "
+                         "{paths} --pgid {pgid} --op list |"
+                         "grep '\"oid\":\"{name}\"')".
+                         format(paths=self.paths,
+                                pgid=self.pgid,
+                                name=self.object_name))
+            args = '"$object" ' + args
+            options += " --pgid {pgid}".format(pgid=self.pgid)
+        cmd = ("sudo adjust-ulimits ceph-objectstore-tool {paths} {options} {args}".
+               format(paths=self.paths,
+                      args=args,
+                      options=options))
+        if stdin:
+            # base64.encode() works on file objects; b64encode() is the
+            # string API, and its output is decoded again on the remote
+            cmd = ("echo {payload} | base64 --decode | {cmd}".
+                   format(payload=base64.b64encode(stdin),
+                          cmd=cmd))
+        lines.append(cmd)
+        return "\n".join(lines)
+
+    def run(self, options, args, stdin=None, stdout=None):
+        """
+        Kill the osd, run the tool on its remote, then optionally revive it.
+
+        :param options: see build_cmd()
+        :param args: see build_cmd()
+        :param stdin: data piped into the tool, if any
+        :param stdout: object collecting the tool's stdout (a new StringIO
+                       when None)
+        :raises Exception: with the collected stdout and stderr when the
+                           command exits non-zero
+        """
+        if stdout is None:
+            stdout = StringIO()
+        self.manager.kill_osd(self.osd)
+        cmd = self.build_cmd(options, args, stdin)
+        self.manager.log(cmd)
+        try:
+            proc = self.remote.run(args=['bash', '-e', '-x', '-c', cmd],
+                                   check_status=False,
+                                   stdout=stdout,
+                                   stderr=StringIO())
+            proc.wait()
+            if proc.exitstatus != 0:
+                self.manager.log("failed with " + str(proc.exitstatus))
+                error = proc.stdout.getvalue() + " " + proc.stderr.getvalue()
+                raise Exception(error)
+        finally:
+            # the osd was killed above; bring it back even on failure
+            if self.do_revive:
+                self.manager.revive_osd(self.osd)
+                self.manager.wait_till_osd_is_up(self.osd, 300)
+
+
+class CephManager:
+    """
+    Ceph manager object.
+    Contains several local functions that form a bulk of this module.
+
+    Note: this class has nothing to do with the Ceph daemon (ceph-mgr) of
+    the same name.
+    """
+
+    REPLICATED_POOL = 1
+    ERASURE_CODED_POOL = 3
+
+    def __init__(self, controller, ctx=None, config=None, logger=None,
+                 cluster='ceph'):
+        """
+        Set up the manager and record pg_num for every existing pool.
+
+        :param controller: remote whose run() executes the ceph commands
+        :param ctx: context handed to teuthology helpers
+        :param config: configuration dict (an empty dict when None)
+        :param logger: logger whose info() receives log messages; messages
+                       are printed when no logger is given
+        :param cluster: cluster name passed to ceph via --cluster
+        """
+        self.lock = threading.RLock()
+        self.controller = controller
+        self.ctx = ctx
+        self.cluster = cluster
+        self.next_pool_id = 0
+        self.config = dict() if config is None else config
+        if (logger):
+            self.log = lambda x: logger.info(x)
+        else:
+            def print_log(x):
+                """
+                implement log behavior.
+                """
+                print x
+            self.log = print_log
+        self.pools = {}
+        for pool_name in self.list_pools():
+            # we may race with a pool deletion; ignore failures here
+            try:
+                self.pools[pool_name] = self.get_pool_property(pool_name,
+                                                               'pg_num')
+            except CommandFailedError:
+                self.log('Failed to get pg_num from pool %s, ignoring' % pool_name)
+
+    def raw_cluster_cmd(self, *args):
+        """
+        Run "ceph --cluster <cluster> <args>" through self.controller,
+        wrapped in sudo, adjust-ulimits, ceph-coverage and a 120 second
+        timeout.
+
+        :param args: words appended to the ceph command line
+        :returns: what the command wrote to stdout
+        """
+        coverage_dir = '{tdir}/archive/coverage'.format(
+            tdir=teuthology.get_testdir(self.ctx))
+        prefix = ['sudo', 'adjust-ulimits', 'ceph-coverage', coverage_dir,
+                  'timeout', '120', 'ceph', '--cluster', self.cluster]
+        result = self.controller.run(
+            args=prefix + list(args),
+            stdout=StringIO(),
+            )
+        return result.stdout.getvalue()
+
+    def raw_cluster_cmd_result(self, *args):
+        """
+        Run "ceph --cluster <cluster> <args>" through self.controller
+        without checking its status.
+
+        :param args: words appended to the ceph command line
+        :returns: the exit status of the command
+        """
+        coverage_dir = '{tdir}/archive/coverage'.format(
+            tdir=teuthology.get_testdir(self.ctx))
+        prefix = ['sudo', 'adjust-ulimits', 'ceph-coverage', coverage_dir,
+                  'timeout', '120', 'ceph', '--cluster', self.cluster]
+        result = self.controller.run(
+            args=prefix + list(args),
+            check_status=False,
+            )
+        return result.exitstatus
+
+    def run_ceph_w(self):
+        """
+        Start "ceph -w" without waiting for it to finish.
+
+        :returns: the RemoteProcess, with stdout collected in a StringIO
+                  and stdin set to run.PIPE
+        """
+        watch_cmd = ["sudo", "daemon-helper", "kill",
+                     "ceph", '--cluster', self.cluster, "-w"]
+        return self.controller.run(args=watch_cmd,
+                                   wait=False,
+                                   stdout=StringIO(),
+                                   stdin=run.PIPE)
+
+    def flush_pg_stats(self, osds, no_wait=None, wait_for_mon=300):
+        """
+        Flush pg stats from a list of OSD ids, ensuring they are reflected
+        all the way to the monitor.  Luminous and later only.
+
+        :param osds: list of OSDs to flush
+        :param no_wait: list of OSDs not to wait for seq id. by default, we
+                        wait for all specified osds, but some of them could be
+                        moved out of osdmap, so we cannot get their updated
+                        stat seq from monitor anymore. in that case, you need
+                        to pass a blacklist.
+        :param wait_for_mon: wait for mon to be synced with mgr. 0 to disable
+                             it. (5 min by default).  The budget is shared
+                             by all osds, not granted per osd.
+        :raises Exception: when the budget runs out before the monitor
+                           reports the needed sequence number
+        """
+        # raw_cluster_cmd() returns the command's stdout as text; convert the
+        # sequence numbers so they are compared numerically, not as strings
+        seq = {osd: int(self.raw_cluster_cmd('tell', 'osd.%d' % osd,
+                                             'flush_pg_stats'))
+               for osd in osds}
+        if not wait_for_mon:
+            return
+        if no_wait is None:
+            no_wait = []
+        A_WHILE = 1
+        for osd, need in seq.iteritems():
+            if osd in no_wait:
+                continue
+            got = 0
+            while wait_for_mon > 0:
+                got = int(self.raw_cluster_cmd('osd', 'last-stat-seq',
+                                               'osd.%d' % osd))
+                self.log('need seq {need} got {got} for osd.{osd}'.format(
+                    need=need, got=got, osd=osd))
+                if got >= need:
+                    break
+                time.sleep(A_WHILE)
+                wait_for_mon -= A_WHILE
+            else:
+                # the while condition failed, i.e. the budget is used up
+                raise Exception('timed out waiting for mon to be updated with '
+                                'osd.{osd}: {got} < {need}'.
+                                format(osd=osd, got=got, need=need))
+
+    def flush_all_pg_stats(self):
+        """
+        Call flush_pg_stats() for ids 0 .. N-1, where N is the length of
+        get_osd_dump().
+        """
+        osd_count = len(self.get_osd_dump())
+        self.flush_pg_stats(range(osd_count))
+
+    def do_rados(self, remote, cmd, check_status=True):
+        """
+        Run "rados --cluster <cluster> <cmd>" on a remote and wait for it.
+
+        :param remote: object whose run() executes the command
+        :param cmd: list of words appended to the rados command line
+        :param check_status: passed through to remote.run()
+        :returns: whatever remote.run() returns
+        """
+        coverage_dir = '{tdir}/archive/coverage'.format(
+            tdir=teuthology.get_testdir(self.ctx))
+        rados_cmd = ['adjust-ulimits', 'ceph-coverage', coverage_dir,
+                     'rados', '--cluster', self.cluster]
+        rados_cmd.extend(cmd)
+        return remote.run(args=rados_cmd,
+                          wait=True,
+                          check_status=check_status)
+
+    def rados_write_objects(self, pool, num_objects, size,
+                            timelimit, threads, cleanup=False):
+        """
+        Run "rados bench <timelimit> write" against a pool.
+        The threads argument is not used yet.
+
+        :param pool: pool name (-p)
+        :param num_objects: value for --num-objects
+        :param size: value for -b
+        :param timelimit: bench duration argument
+        :param cleanup: when false, --no-cleanup is appended
+        :returns: the result of do_rados()
+        """
+        bench_args = ['-p', pool,
+                      '--num-objects', num_objects,
+                      '-b', size,
+                      'bench', timelimit,
+                      'write']
+        if not cleanup:
+            bench_args.append('--no-cleanup')
+        return self.do_rados(self.controller, [str(a) for a in bench_args])
+
+    def do_put(self, pool, obj, fname, namespace=None):
+        """
+        Run "rados put" without checking its status.
+
+        :param pool: pool name (-p)
+        :param obj: object name
+        :param fname: file to read the object data from
+        :param namespace: optional namespace (-N)
+        :returns: exit status of the rados command
+        """
+        ns_args = [] if namespace is None else ['-N', namespace]
+        rados_args = ['-p', pool] + ns_args + ['put', obj, fname]
+        proc = self.do_rados(self.controller, rados_args, check_status=False)
+        return proc.exitstatus
+
+    def do_get(self, pool, obj, fname='/dev/null', namespace=None):
+        """
+        Run "rados get" without checking its status.
+
+        :param pool: pool name (-p)
+        :param obj: object name
+        :param fname: file to write the object data to
+        :param namespace: optional namespace (-N)
+        :returns: exit status of the rados command
+        """
+        ns_args = [] if namespace is None else ['-N', namespace]
+        rados_args = ['-p', pool] + ns_args + ['get', obj, fname]
+        proc = self.do_rados(self.controller, rados_args, check_status=False)
+        return proc.exitstatus
+
+    def do_rm(self, pool, obj, namespace=None):
+        """
+        Run "rados rm" without checking its status.
+
+        :param pool: pool name (-p)
+        :param obj: object name
+        :param namespace: optional namespace (-N)
+        :returns: exit status of the rados command
+        """
+        ns_args = [] if namespace is None else ['-N', namespace]
+        rados_args = ['-p', pool] + ns_args + ['rm', obj]
+        proc = self.do_rados(self.controller, rados_args, check_status=False)
+        return proc.exitstatus
+
+    def osd_admin_socket(self, osd_id, command, check_status=True, timeout=0, stdout=None):
+        """
+        Shorthand for admin_socket() with service type 'osd'.
+
+        :returns: the result of admin_socket()
+        """
+        sink = StringIO() if stdout is None else stdout
+        return self.admin_socket('osd', osd_id, command, check_status,
+                                 timeout, sink)
+
+    def find_remote(self, service_type, service_id):
+        """
+        Look up the Remote of the host that runs a given service.
+
+        :param service_type: 'mds', 'osd', 'client'
+        :param service_id: The second part of a role, e.g. '0' for
+                           the role 'client.0'
+        :return: a Remote instance for the host where the
+                 requested role is placed
+        """
+        remote = get_remote(self.ctx, self.cluster, service_type, service_id)
+        return remote
+
+    def admin_socket(self, service_type, service_id,
+                     command, check_status=True, timeout=0, stdout=None):
+        """
+        Run a ceph command through a daemon's admin socket on the host
+        where that daemon runs.
+
+        :param service_type: daemon type used in the socket file name
+        :param service_id: daemon id used in the socket file name
+        :param command: a list of words to use as the command
+                        to the admin socket
+        :param check_status: passed through to remote.run()
+        :param timeout: value handed to the 'timeout' command wrapping ceph
+        :param stdout: object collecting stdout (a new StringIO when None)
+        :returns: the result of remote.run()
+        """
+        sink = StringIO() if stdout is None else stdout
+        coverage_dir = '{tdir}/archive/coverage'.format(
+            tdir=teuthology.get_testdir(self.ctx))
+        remote = self.find_remote(service_type, service_id)
+        asok_path = '/var/run/ceph/{cluster}-{type}.{id}.asok'.format(
+            cluster=self.cluster,
+            type=service_type,
+            id=service_id)
+        socket_cmd = ['sudo', 'adjust-ulimits', 'ceph-coverage', coverage_dir,
+                      'timeout', str(timeout),
+                      'ceph', '--cluster', self.cluster,
+                      '--admin-daemon', asok_path]
+        socket_cmd.extend(command)
+        return remote.run(args=socket_cmd,
+                          stdout=sink,
+                          wait=True,
+                          check_status=check_status)
+
+    def objectstore_tool(self, pool, options, args, **kwargs):
+        """
+        Create an ObjectStoreTool for this manager and run it.
+
+        :param kwargs: forwarded to the ObjectStoreTool constructor
+        :returns: the result of ObjectStoreTool.run()
+        """
+        tool = ObjectStoreTool(self, pool, **kwargs)
+        return tool.run(options, args)
+
+    def get_pgid(self, pool, pgnum):
+        """
+        Build the "<pool number>.<pg number>" id of a pg.
+
+        :param pool: pool name
+        :param pgnum: pg number
+        :returns: a string representing this pg.
+        """
+        return "{poolnum}.{pgnum}".format(poolnum=self.get_pool_num(pool),
+                                          pgnum=pgnum)
+
+    def get_pg_replica(self, pool, pgnum):
+        """
+        get replica for pool, pgnum (e.g. (data, 0)->0
+
+        :returns: the last osd id of the "acting" list reported by
+                  "ceph pg map", as an int
+        """
+        pg_str = self.get_pgid(pool, pgnum)
+        output = self.raw_cluster_cmd("pg", "map", pg_str, '--format=json')
+        # the first line of the output is not part of the JSON document
+        j = json.loads('\n'.join(output.split('\n')[1:]))
+        return int(j['acting'][-1])
+
+    def wait_for_pg_stats(func):
+        """
+        Decorator: call func again after a growing delay for as long as it
+        raises AssertionError; re-raise the last one when all tries failed.
+        """
+        # both osd_mon_report_interval_min and mgr_stats_period are 5 seconds
+        # by default, and take the faulty injection in ms into consideration,
+        # 12 seconds are more than enough
+        backoff = [1, 1, 2, 3, 5, 8, 13]
+
+        @wraps(func)
+        def retrying(self, *args, **kwargs):
+            last_error = None
+            for pause in backoff:
+                try:
+                    return func(self, *args, **kwargs)
+                except AssertionError as err:
+                    time.sleep(pause)
+                    last_error = err
+            raise last_error
+        return retrying
+
+    def get_pg_primary(self, pool, pgnum):
+        """
+        get primary for pool, pgnum (e.g. (data, 0)->0
+
+        :returns: the first osd id of the "acting" list reported by
+                  "ceph pg map", as an int
+        """
+        pg_str = self.get_pgid(pool, pgnum)
+        output = self.raw_cluster_cmd("pg", "map", pg_str, '--format=json')
+        # the first line of the output is not part of the JSON document
+        j = json.loads('\n'.join(output.split('\n')[1:]))
+        return int(j['acting'][0])
+
+    def get_pool_num(self, pool):
+        """
+        Return the 'pool' field of the pool's osd dump entry as an int
+        (e.g., data -> 2)
+        """
+        pool_entry = self.get_pool_dump(pool)
+        return int(pool_entry['pool'])
+
+    def list_pools(self):
+        """
+        Log the 'pools' section of the osd dump and return the pool names
+        as strings.
+        """
+        pool_entries = self.get_osd_dump_json()['pools']
+        self.log(pool_entries)
+        names = []
+        for entry in pool_entries:
+            names.append(str(entry['pool_name']))
+        return names
+
+    def clear_pools(self):
+        """
+        Call remove_pool() for every name returned by list_pools().
+        """
+        for pool_name in self.list_pools():
+            self.remove_pool(pool_name)
+
+    def kick_recovery_wq(self, osdnum):
+        """
+        Send "debug kick_recovery_wq 0" to the given osd.
+
+        :param osdnum: osd id (converted with int())
+        :returns: stdout of the ceph command
+        """
+        target = "osd.%d" % (int(osdnum),)
+        return self.raw_cluster_cmd('tell', target,
+                                    'debug', 'kick_recovery_wq', '0')
+
+    def wait_run_admin_socket(self, service_type,
+                              service_id, args=['version'], timeout=75, stdout=None):
+        """
+        If the admin_socket call succeeds, return.  Otherwise wait
+        five seconds and try again.
+
+        :param service_type: daemon type, passed to admin_socket()
+        :param service_id: daemon id, passed to admin_socket()
+        :param args: admin socket command words (never modified here)
+        :param timeout: give up once 5 * (failed tries) exceeds this value
+        :param stdout: object collecting stdout (a new StringIO when None)
+        :returns: the process of the first successful call
+        :raises Exception: when the retry budget is used up
+        """
+        if stdout is None:
+            stdout = StringIO()
+        tries = 0
+        while True:
+            proc = self.admin_socket(service_type, service_id,
+                                     args, check_status=False, stdout=stdout)
+            # compare by value: "is 0" tests object identity and only
+            # happens to work where small ints are cached
+            if proc.exitstatus == 0:
+                return proc
+            tries += 1
+            if (tries * 5) > timeout:
+                raise Exception('timed out waiting for admin_socket '
+                                'to appear after {type}.{id} restart'.
+                                format(type=service_type,
+                                       id=service_id))
+            self.log("waiting on admin_socket for {type}-{id}, "
+                     "{command}".format(type=service_type,
+                                        id=service_id,
+                                        command=args))
+            time.sleep(5)
+
+    def get_pool_dump(self, pool):
+        """
+        Return the osd dump entry whose 'pool_name' equals pool.
+        Fails an assertion when there is no such entry.
+        """
+        for entry in self.get_osd_dump_json()['pools']:
+            if entry['pool_name'] == pool:
+                return entry
+        assert False
+
+    def get_config(self, service_type, service_id, name):
+        """
+        Read one option from a daemon's "config show" output.
+
+        :param service_type: daemon type, passed to wait_run_admin_socket()
+        :param service_id: daemon id, passed to wait_run_admin_socket()
+        :param name: the option name
+        :returns: the value stored under name in the parsed JSON output
+        """
+        proc = self.wait_run_admin_socket(service_type, service_id,
+                                          ['config', 'show'])
+        settings = json.loads(proc.stdout.getvalue())
+        return settings[name]
+
+    def set_config(self, osdnum, **argdict):
+        """
+        Run "config set <option> <value>" on an osd's admin socket for
+        every keyword argument.
+
+        :param osdnum: osd number
+        :param argdict: dictionary containing values to set.
+        """
+        for option, value in argdict.iteritems():
+            command = ['config', 'set', str(option), str(value)]
+            self.wait_run_admin_socket('osd', osdnum, command)
+
+    def raw_cluster_status(self):
+        """
+        Return the parsed JSON output of "ceph status".
+        """
+        return json.loads(
+            self.raw_cluster_cmd('status', '--format=json-pretty'))
+
+    def raw_osd_status(self):
+        """
+        Return the raw text output of "ceph osd dump".
+        """
+        dump_text = self.raw_cluster_cmd('osd', 'dump')
+        return dump_text
+
+    def get_osd_status(self):
+        """
+        Group osd ids by state.
+
+        'in', 'out', 'up' and 'down' come from the "osd." lines of
+        raw_osd_status(); 'dead' and 'live' come from whether each osd
+        daemon in self.ctx.daemons is running.
+
+        :returns: dict with the keys 'in', 'out', 'up', 'down', 'dead',
+                  'live' (lists of int ids) and 'raw' (the osd lines)
+        """
+        osd_lines = [line for line in self.raw_osd_status().split('\n')
+                     if line.startswith('osd.') and
+                     (("up" in line) or ("down" in line))]
+        self.log(osd_lines)
+
+        def ids_marked(marker):
+            # lines look like "osd.<id> ..."; the id follows the 4-char prefix
+            return [int(line[4:].split()[0])
+                    for line in osd_lines if marker in line]
+
+        def osd_daemons():
+            return self.ctx.daemons.iter_daemons_of_role('osd', self.cluster)
+
+        in_osds = ids_marked(" in ")
+        out_osds = ids_marked(" out ")
+        up_osds = ids_marked(" up ")
+        down_osds = ids_marked(" down ")
+        dead_osds = [int(d.id_) for d in osd_daemons() if not d.running()]
+        live_osds = [int(d.id_) for d in osd_daemons() if d.running()]
+        return {'in': in_osds, 'out': out_osds, 'up': up_osds,
+                'down': down_osds, 'dead': dead_osds, 'live': live_osds,
+                'raw': osd_lines}
+
+    def get_num_pgs(self):
+        """
+        Log the cluster status and return its pgmap 'num_pgs' value.
+        """
+        cluster_status = self.raw_cluster_status()
+        self.log(cluster_status)
+        pgmap = cluster_status['pgmap']
+        return pgmap['num_pgs']
+
+    def create_erasure_code_profile(self, profile_name, profile):
+        """
+        Create an erasure code profile that can later be named when
+        creating an erasure coded pool.  The command words come from
+        cmd_erasure_code_profile(); the call is made under self.lock.
+        """
+        with self.lock:
+            profile_cmd = cmd_erasure_code_profile(profile_name, profile)
+            self.raw_cluster_cmd(*profile_cmd)
+
+    def create_pool_with_unique_name(self, pg_num=16,
+                                     erasure_code_profile_name=None,
+                                     min_size=None,
+                                     erasure_code_use_overwrites=False):
+        """
+        Create a pool named unique_pool_X, where X is a counter kept by
+        this manager, and return that name.  The other arguments are
+        passed on to create_pool().
+        """
+        with self.lock:
+            pool_name = "unique_pool_%s" % (str(self.next_pool_id),)
+            self.next_pool_id += 1
+            self.create_pool(
+                pool_name,
+                pg_num,
+                erasure_code_profile_name=erasure_code_profile_name,
+                min_size=min_size,
+                erasure_code_use_overwrites=erasure_code_use_overwrites)
+        return pool_name
+
+    @contextlib.contextmanager
+    def pool(self, pool_name, pg_num=16, erasure_code_profile_name=None):
+        """
+        Context manager that creates a pool on entry and removes it on exit.
+
+        :param pool_name: name of the pool
+        :param pg_num: initial number of pgs
+        :param erasure_code_profile_name: passed on to create_pool()
+        """
+        self.create_pool(pool_name, pg_num, erasure_code_profile_name)
+        try:
+            yield
+        finally:
+            # remove the pool even when the with-body raised
+            self.remove_pool(pool_name)
+
+    def create_pool(self, pool_name, pg_num=16,
+                    erasure_code_profile_name=None,
+                    min_size=None,
+                    erasure_code_use_overwrites=False):
+        """
+        Create a pool, apply the optional settings, enable the 'rados'
+        application on it and record its pg_num in self.pools.
+
+        :param pool_name: name of the pool being created.
+        :param pg_num: initial number of pgs.
+        :param erasure_code_profile_name: if set and !None create an
+                                          erasure coded pool using the profile
+        :param min_size: if not None, value for the pool's min_size
+        :param erasure_code_use_overwrites: if true, allow overwrites
+        """
+        with self.lock:
+            assert isinstance(pool_name, basestring)
+            assert isinstance(pg_num, int)
+            assert pool_name not in self.pools
+            self.log("creating pool_name %s" % (pool_name,))
+            create_cmd = ['osd', 'pool', 'create', pool_name, str(pg_num)]
+            if erasure_code_profile_name:
+                create_cmd += [str(pg_num), 'erasure',
+                               erasure_code_profile_name]
+            self.raw_cluster_cmd(*create_cmd)
+            pool_settings = []
+            if min_size is not None:
+                pool_settings.append(('min_size', str(min_size)))
+            if erasure_code_use_overwrites:
+                pool_settings.append(('allow_ec_overwrites', 'true'))
+            for key, value in pool_settings:
+                self.raw_cluster_cmd('osd', 'pool', 'set', pool_name,
+                                     key, value)
+            # the trailing "|| true" keeps a failure of this command from
+            # failing the call
+            self.raw_cluster_cmd(
+                'osd', 'pool', 'application', 'enable',
+                pool_name, 'rados', '--yes-i-really-mean-it',
+                run.Raw('||'), 'true')
+            self.pools[pool_name] = pg_num
+        time.sleep(1)
+
+    def add_pool_snap(self, pool_name, snap_name):
+        """
+        Take a pool snapshot ("osd pool mksnap").
+
+        :param pool_name: name of pool to snapshot
+        :param snap_name: name of snapshot to take
+        """
+        mksnap_cmd = ['osd', 'pool', 'mksnap', str(pool_name), str(snap_name)]
+        self.raw_cluster_cmd(*mksnap_cmd)
+
+    def remove_pool_snap(self, pool_name, snap_name):
+        """
+        Remove a pool snapshot ("osd pool rmsnap").
+
+        :param pool_name: name of the pool the snapshot belongs to
+        :param snap_name: name of snapshot to remove
+        """
+        rmsnap_cmd = ['osd', 'pool', 'rmsnap', str(pool_name), str(snap_name)]
+        self.raw_cluster_cmd(*rmsnap_cmd)
+
+    def remove_pool(self, pool_name):
+        """
+        Forget the pool in self.pools, then delete it with "rados rmpool".
+
+        :param pool_name: Pool to be removed
+        """
+        with self.lock:
+            assert isinstance(pool_name, basestring)
+            assert pool_name in self.pools
+            self.log("removing pool_name %s" % (pool_name,))
+            self.pools.pop(pool_name)
+            rmpool_cmd = ['rmpool', pool_name, pool_name,
+                          "--yes-i-really-really-mean-it"]
+            self.do_rados(self.controller, rmpool_cmd)
+
+    def get_pool(self):
+        """
+        Return the name of a randomly chosen pool from self.pools.
+        """
+        with self.lock:
+            pool_names = self.pools.keys()
+            return random.choice(pool_names)
+
+    def get_pool_pg_num(self, pool_name):
+        """
+        Return the pg count recorded for the pool in self.pools, or 0
+        when the pool is not recorded there.
+        """
+        with self.lock:
+            assert isinstance(pool_name, basestring)
+            return self.pools.get(pool_name, 0)
+
+    def get_pool_property(self, pool_name, prop):
+        """
+        Query a pool property with "osd pool get".
+
+        :param pool_name: pool
+        :param prop: property to be checked.
+        :returns: the second whitespace-separated word of the command
+                  output, as an int value.
+        """
+        with self.lock:
+            assert isinstance(pool_name, basestring)
+            assert isinstance(prop, basestring)
+            words = self.raw_cluster_cmd(
+                'osd', 'pool', 'get', pool_name, prop).split()
+            return int(words[1])
+
+    def set_pool_property(self, pool_name, prop, val):
+        """
+        Set a pool property with "osd pool set".
+
+        :param pool_name: pool
+        :param prop: property to be set.
+        :param val: value to set.
+
+        The command is repeated every two seconds while it exits with
+        status 11 (EAGAIN); any other exit status ends the loop.
+
+        :raises Exception: after more than 50 EAGAIN results
+        """
+        with self.lock:
+            assert isinstance(pool_name, basestring)
+            assert isinstance(prop, basestring)
+            assert isinstance(val, int)
+            eagain_count = 0
+            while self.raw_cluster_cmd_result(
+                    'osd', 'pool', 'set',
+                    pool_name, prop, str(val)) == 11:  # EAGAIN
+                eagain_count += 1
+                if eagain_count > 50:
+                    raise Exception('timed out getting EAGAIN '
+                                    'when setting pool property %s %s = %s' %
+                                    (pool_name, prop, val))
+                self.log('got EAGAIN setting pool property, '
+                         'waiting a few seconds...')
+                time.sleep(2)
+
+    def expand_pool(self, pool_name, by, max_pgs):
+        """
+        Raise pg_num of a pool by a given amount.
+
+        :param pool_name: pool recorded in self.pools
+        :param by: number of pgs to add
+        :param max_pgs: upper bound for the new pg count
+        :returns: False when pgs are still being created or the new count
+                  would exceed max_pgs, True after pg_num was set
+        """
+        with self.lock:
+            assert isinstance(pool_name, basestring)
+            assert isinstance(by, int)
+            assert pool_name in self.pools
+            if self.get_num_creating() > 0:
+                return False
+            target_pg_num = self.pools[pool_name] + by
+            if target_pg_num > max_pgs:
+                return False
+            self.log("increase pool size by %d" % (by,))
+            self.set_pool_property(pool_name, "pg_num", target_pg_num)
+            self.pools[pool_name] = target_pg_num
+            return True
+
+    def set_pool_pgpnum(self, pool_name, force):
+        """
+        Set pgp_num of a pool to the pg count recorded in self.pools.
+
+        :param pool_name: pool recorded in self.pools
+        :param force: when true, do not check for pgs still being created
+        :returns: False when skipped because pgs are being created,
+                  True otherwise
+        """
+        with self.lock:
+            assert isinstance(pool_name, basestring)
+            assert pool_name in self.pools
+            if not force and self.get_num_creating() > 0:
+                return False
+            recorded_pg_num = self.pools[pool_name]
+            self.set_pool_property(pool_name, 'pgp_num', recorded_pg_num)
+            return True
+
+    def list_pg_missing(self, pgid):
+        """
+        Collect the full "pg <pgid> list_missing" result, following the
+        'more' marker across pages.
+
+        :returns: the first page's dict with the 'objects' of all later
+                  pages appended and the 'more' key removed
+        """
+        merged = None
+        offset = {}
+        while True:
+            page = json.loads(
+                self.raw_cluster_cmd('--', 'pg', pgid, 'list_missing',
+                                     json.dumps(offset)))
+            if merged is None:
+                merged = page
+            else:
+                merged['objects'].extend(page['objects'])
+            if 'more' not in page or page['more'] == 0:
+                break
+            # continue after the last object of this page
+            offset = page['objects'][-1]['oid']
+        if 'more' in merged:
+            del merged['more']
+        return merged
+
+    def get_pg_stats(self):
+        """
+        Run "pg dump" and return the 'pg_stats' part of its JSON output.
+        """
+        dump_text = self.raw_cluster_cmd('pg', 'dump', '--format=json')
+        # everything up to the first newline is not part of the JSON
+        json_text = dump_text.partition('\n')[2]
+        return json.loads(json_text)['pg_stats']
+
+    def get_pgids_to_force(self, backfill):
+        """
+        Return the randomized list of PGs that can have their recovery/backfill forced
+
+        :param backfill: pick the backfill states when true, the recovery
+                         states otherwise
+        :returns: list of pgids; a pg that is already forced is never
+                  included
+        """
+        if backfill:
+            wanted = ['degraded', 'backfilling', 'backfill_wait']
+        else:
+            wanted = ['recovering', 'degraded', 'recovery_wait']
+        pgids = []
+        for pg in self.get_pg_stats():
+            states = pg['state'].split('+')
+            for candidate in wanted:
+                # one coin flip per (pg, wanted state) pair
+                if not random.random() > 0.5:
+                    continue
+                if 'forced_backfill' in states or 'forced_recovery' in states:
+                    continue
+                if candidate in states:
+                    pgids.append(pg['pgid'])
+                    break
+        return pgids
+
+    def get_pgids_to_cancel_force(self, backfill):
+        """
+        Return the randomized list of PGs whose recovery/backfill priority is forced
+
+        :param backfill: look for 'forced_backfill' when true, for
+                         'forced_recovery' otherwise
+        :returns: list of pgids
+        """
+        wanted = 'forced_backfill' if backfill else 'forced_recovery'
+        pgids = []
+        for pg in self.get_pg_stats():
+            states = pg['state'].split('+')
+            # the coin is only flipped for pgs in the wanted state
+            if wanted in states and random.random() > 0.5:
+                pgids.append(pg['pgid'])
+        return pgids
+
+    def compile_pg_status(self):
+        """
+        Count how many pgs are in each state.
+
+        :returns: dict mapping every '+'-separated state name to the
+                  number of pgs whose state contains it
+        """
+        histogram = {}
+        for pg in self.get_pg_stats():
+            for state in pg['state'].split('+'):
+                histogram[state] = histogram.get(state, 0) + 1
+        return histogram
+
+    @wait_for_pg_stats
+    def with_pg_state(self, pool, pgnum, check):
+        """
+        Assert that check(state) is true for the pg's current state;
+        a failed assertion is retried by wait_for_pg_stats.
+        """
+        pgid = self.get_pgid(pool, pgnum)
+        state = self.get_single_pg_stats(pgid)['state']
+        assert(check(state))
+
+    @wait_for_pg_stats
+    def with_pg(self, pool, pgnum, check):
+        """
+        Return check(stats) for the pg's current stats; an AssertionError
+        raised by check is retried by wait_for_pg_stats.
+        """
+        pgid = self.get_pgid(pool, pgnum)
+        return check(self.get_single_pg_stats(pgid))
+
+    def get_last_scrub_stamp(self, pool, pgnum):
+        """
+        Return the "last_scrub_stamp" entry of the pg's stats.
+        """
+        pgid = self.get_pgid(pool, pgnum)
+        return self.get_single_pg_stats(pgid)["last_scrub_stamp"]
+
+    def do_pg_scrub(self, pool, pgnum, stype):
+        """
+        Scrub pg and wait for scrubbing to finish
+
+        The request is considered done once the pg's last scrub stamp
+        differs from the one read at the start.
+
+        :param pool: pool name
+        :param pgnum: pg number
+        :param stype: word sent as "ceph pg <stype> <pgid>"; "repair" is
+                      treated specially when it has to be resent
+        """
+        init = self.get_last_scrub_stamp(pool, pgnum)
+        RESEND_TIMEOUT = 120    # Must be a multiple of SLEEP_TIME
+        FATAL_TIMEOUT = RESEND_TIMEOUT * 3
+        SLEEP_TIME = 10
+        timer = 0
+        # poll until the scrub stamp changes
+        while init == self.get_last_scrub_stamp(pool, pgnum):
+            assert timer < FATAL_TIMEOUT, "fatal timeout trying to " + stype
+            self.log("waiting for scrub type %s" % (stype,))
+            # timer is 0 on the first pass, so the request is sent right
+            # away and again every RESEND_TIMEOUT seconds of waiting
+            if (timer % RESEND_TIMEOUT) == 0:
+                self.raw_cluster_cmd('pg', stype, self.get_pgid(pool, pgnum))
+                # The first time in this loop is the actual request
+                if timer != 0 and stype == "repair":
+                    self.log("WARNING: Resubmitted a non-idempotent repair")
+            time.sleep(SLEEP_TIME)
+            timer += SLEEP_TIME
+
+    def wait_snap_trimming_complete(self, pool):
+        """
+        Wait for snap trimming on pool to end
+        """
+        POLL_PERIOD = 10
+        FATAL_TIMEOUT = 600
+        start = time.time()
+        poolnum = self.get_pool_num(pool)
+        poolnumstr = "%s." % (poolnum,)
+        while (True):
+            now = time.time()
+            if (now - start) > FATAL_TIMEOUT:
+                assert (now - start) < FATAL_TIMEOUT, \
+                    'failed to complete snap trimming before timeout'
+            all_stats = self.get_pg_stats()
+            trimming = False
+            for pg in all_stats:
+                if (poolnumstr in pg['pgid']) and ('snaptrim' in pg['state']):
+                    self.log("pg {pg} in trimming, state: {state}".format(
+                        pg=pg['pgid'],
+                        state=pg['state']))
+                    trimming = True
+            if not trimming:
+                break
+            self.log("{pool} still trimming, waiting".format(pool=pool))
+            time.sleep(POLL_PERIOD)
+
+    def get_single_pg_stats(self, pgid):
+        """
+        Return pg for the pgid specified.
+        """
+        all_stats = self.get_pg_stats()
+
+        for pg in all_stats:
+            if pg['pgid'] == pgid:
+                return pg
+
+        return None
+
+    def get_object_pg_with_shard(self, pool, name, osdid):
+        """
+        """
+        pool_dump = self.get_pool_dump(pool)
+        object_map = self.get_object_map(pool, name)
+        if pool_dump["type"] == CephManager.ERASURE_CODED_POOL:
+            shard = object_map['acting'].index(osdid)
+            return "{pgid}s{shard}".format(pgid=object_map['pgid'],
+                                           shard=shard)
+        else:
+            return object_map['pgid']
+
+    def get_object_primary(self, pool, name):
+        """
+        """
+        object_map = self.get_object_map(pool, name)
+        return object_map['acting_primary']
+
+    def get_object_map(self, pool, name):
+        """
+        osd map --format=json converted to a python object
+        :returns: the python object
+        """
+        out = self.raw_cluster_cmd('--format=json', 'osd', 'map', pool, name)
+        return json.loads('\n'.join(out.split('\n')[1:]))
+
+    def get_osd_dump_json(self):
+        """
+        osd dump --format=json converted to a python object
+        :returns: the python object
+        """
+        out = self.raw_cluster_cmd('osd', 'dump', '--format=json')
+        return json.loads('\n'.join(out.split('\n')[1:]))
+
+    def get_osd_dump(self):
+        """
+        Dump osds
+        :returns: all osds
+        """
+        return self.get_osd_dump_json()['osds']
+
+    def get_mgr_dump(self):
+        out = self.raw_cluster_cmd('mgr', 'dump', '--format=json')
+        return json.loads(out)
+
+    def get_stuck_pgs(self, type_, threshold):
+        """
+        :returns: stuck pg information from the cluster
+        """
+        out = self.raw_cluster_cmd('pg', 'dump_stuck', type_, str(threshold),
+                                   '--format=json')
+        return json.loads(out)
+
+    def get_num_unfound_objects(self):
+        """
+        Check cluster status to get the number of unfound objects
+        """
+        status = self.raw_cluster_status()
+        self.log(status)
+        return status['pgmap'].get('unfound_objects', 0)
+
+    def get_num_creating(self):
+        """
+        Find the number of pgs in creating mode.
+        """
+        pgs = self.get_pg_stats()
+        num = 0
+        for pg in pgs:
+            if 'creating' in pg['state']:
+                num += 1
+        return num
+
+    def get_num_active_clean(self):
+        """
+        Find the number of active and clean pgs.
+        """
+        pgs = self.get_pg_stats()
+        num = 0
+        for pg in pgs:
+            if (pg['state'].count('active') and
+                    pg['state'].count('clean') and
+                    not pg['state'].count('stale')):
+                num += 1
+        return num
+
+    def get_num_active_recovered(self):
+        """
+        Find the number of active and recovered pgs.
+        """
+        pgs = self.get_pg_stats()
+        num = 0
+        for pg in pgs:
+            if (pg['state'].count('active') and
+                    not pg['state'].count('recover') and
+                    not pg['state'].count('backfilling') and
+                    not pg['state'].count('stale')):
+                num += 1
+        return num
+
+    def get_is_making_recovery_progress(self):
+        """
+        Return whether there is recovery progress discernable in the
+        raw cluster status
+        """
+        status = self.raw_cluster_status()
+        kps = status['pgmap'].get('recovering_keys_per_sec', 0)
+        bps = status['pgmap'].get('recovering_bytes_per_sec', 0)
+        ops = status['pgmap'].get('recovering_objects_per_sec', 0)
+        return kps > 0 or bps > 0 or ops > 0
+
+    def get_num_active(self):
+        """
+        Find the number of active pgs.
+        """
+        pgs = self.get_pg_stats()
+        num = 0
+        for pg in pgs:
+            if pg['state'].count('active') and not pg['state'].count('stale'):
+                num += 1
+        return num
+
+    def get_num_down(self):
+        """
+        Find the number of pgs that are down.
+        """
+        pgs = self.get_pg_stats()
+        num = 0
+        for pg in pgs:
+            if ((pg['state'].count('down') and not
+                    pg['state'].count('stale')) or
+                (pg['state'].count('incomplete') and not
+                    pg['state'].count('stale'))):
+                num += 1
+        return num
+
+    def get_num_active_down(self):
+        """
+        Find the number of pgs that are either active or down.
+        """
+        pgs = self.get_pg_stats()
+        num = 0
+        for pg in pgs:
+            if ((pg['state'].count('active') and not
+                    pg['state'].count('stale')) or
+                (pg['state'].count('down') and not
+                    pg['state'].count('stale')) or
+                (pg['state'].count('incomplete') and not
+                    pg['state'].count('stale'))):
+                num += 1
+        return num
+
+    def is_clean(self):
+        """
+        True if all pgs are clean
+        """
+        return self.get_num_active_clean() == self.get_num_pgs()
+
+    def is_recovered(self):
+        """
+        True if all pgs have recovered
+        """
+        return self.get_num_active_recovered() == self.get_num_pgs()
+
+    def is_active_or_down(self):
+        """
+        True if all pgs are active or down
+        """
+        return self.get_num_active_down() == self.get_num_pgs()
+
+    def wait_for_clean(self, timeout=None):
+        """
+        Returns true when all pgs are clean.
+        """
+        self.log("waiting for clean")
+        start = time.time()
+        num_active_clean = self.get_num_active_clean()
+        while not self.is_clean():
+            if timeout is not None:
+                if self.get_is_making_recovery_progress():
+                    self.log("making progress, resetting timeout")
+                    start = time.time()
+                else:
+                    self.log("no progress seen, keeping timeout for now")
+                    if time.time() - start >= timeout:
+                        self.log('dumping pgs')
+                        out = self.raw_cluster_cmd('pg', 'dump')
+                        self.log(out)
+                        assert time.time() - start < timeout, \
+                            'failed to become clean before timeout expired'
+            cur_active_clean = self.get_num_active_clean()
+            if cur_active_clean != num_active_clean:
+                start = time.time()
+                num_active_clean = cur_active_clean
+            time.sleep(3)
+        self.log("clean!")
+
+    def are_all_osds_up(self):
+        """
+        Returns true if all osds are up.
+        """
+        x = self.get_osd_dump()
+        return (len(x) == sum([(y['up'] > 0) for y in x]))
+
+    def wait_for_all_osds_up(self, timeout=None):
+        """
+        When this exits, either the timeout has expired, or all
+        osds are up.
+        """
+        self.log("waiting for all up")
+        start = time.time()
+        while not self.are_all_osds_up():
+            if timeout is not None:
+                assert time.time() - start < timeout, \
+                    'timeout expired in wait_for_all_osds_up'
+            time.sleep(3)
+        self.log("all up!")
+
+    def pool_exists(self, pool):
+        if pool in self.list_pools():
+            return True
+        return False
+
+    def wait_for_pool(self, pool, timeout=300):
+        """
+        Wait for a pool to exist
+        """
+        self.log('waiting for pool %s to exist' % pool)
+        start = time.time()
+        while not self.pool_exists(pool):
+            if timeout is not None:
+                assert time.time() - start < timeout, \
+                    'timeout expired in wait_for_pool'
+            time.sleep(3)
+
+    def wait_for_pools(self, pools):
+        for pool in pools:
+            self.wait_for_pool(pool)
+
+    def is_mgr_available(self):
+        x = self.get_mgr_dump()
+        return x.get('available', False)
+
+    def wait_for_mgr_available(self, timeout=None):
+        self.log("waiting for mgr available")
+        start = time.time()
+        while not self.is_mgr_available():
+            if timeout is not None:
+                assert time.time() - start < timeout, \
+                    'timeout expired in wait_for_mgr_available'
+            time.sleep(3)
+        self.log("mgr available!")
+
+    def wait_for_recovery(self, timeout=None):
+        """
+        Check peering. When this exists, we have recovered.
+        """
+        self.log("waiting for recovery to complete")
+        start = time.time()
+        num_active_recovered = self.get_num_active_recovered()
+        while not self.is_recovered():
+            now = time.time()
+            if timeout is not None:
+                if self.get_is_making_recovery_progress():
+                    self.log("making progress, resetting timeout")
+                    start = time.time()
+                else:
+                    self.log("no progress seen, keeping timeout for now")
+                    if now - start >= timeout:
+			if self.is_recovered():
+			    break
+                        self.log('dumping pgs')
+                        out = self.raw_cluster_cmd('pg', 'dump')
+                        self.log(out)
+                        assert now - start < timeout, \
+                            'failed to recover before timeout expired'
+            cur_active_recovered = self.get_num_active_recovered()
+            if cur_active_recovered != num_active_recovered:
+                start = time.time()
+                num_active_recovered = cur_active_recovered
+            time.sleep(3)
+        self.log("recovered!")
+
+    def wait_for_active(self, timeout=None):
+        """
+        Check peering. When this exists, we are definitely active
+        """
+        self.log("waiting for peering to complete")
+        start = time.time()
+        num_active = self.get_num_active()
+        while not self.is_active():
+            if timeout is not None:
+                if time.time() - start >= timeout:
+                    self.log('dumping pgs')
+                    out = self.raw_cluster_cmd('pg', 'dump')
+                    self.log(out)
+                    assert time.time() - start < timeout, \
+                        'failed to recover before timeout expired'
+            cur_active = self.get_num_active()
+            if cur_active != num_active:
+                start = time.time()
+                num_active = cur_active
+            time.sleep(3)
+        self.log("active!")
+
+    def wait_for_active_or_down(self, timeout=None):
+        """
+        Check peering. When this exists, we are definitely either
+        active or down
+        """
+        self.log("waiting for peering to complete or become blocked")
+        start = time.time()
+        num_active_down = self.get_num_active_down()
+        while not self.is_active_or_down():
+            if timeout is not None:
+                if time.time() - start >= timeout:
+                    self.log('dumping pgs')
+                    out = self.raw_cluster_cmd('pg', 'dump')
+                    self.log(out)
+                    assert time.time() - start < timeout, \
+                        'failed to recover before timeout expired'
+            cur_active_down = self.get_num_active_down()
+            if cur_active_down != num_active_down:
+                start = time.time()
+                num_active_down = cur_active_down
+            time.sleep(3)
+        self.log("active or down!")
+
+    def osd_is_up(self, osd):
+        """
+        Wrapper for osd check
+        """
+        osds = self.get_osd_dump()
+        return osds[osd]['up'] > 0
+
+    def wait_till_osd_is_up(self, osd, timeout=None):
+        """
+        Loop waiting for osd.
+        """
+        self.log('waiting for osd.%d to be up' % osd)
+        start = time.time()
+        while not self.osd_is_up(osd):
+            if timeout is not None:
+                assert time.time() - start < timeout, \
+                    'osd.%d failed to come up before timeout expired' % osd
+            time.sleep(3)
+        self.log('osd.%d is up' % osd)
+
+    def is_active(self):
+        """
+        Wrapper to check if all pgs are active
+        """
+        return self.get_num_active() == self.get_num_pgs()
+
+    def wait_till_active(self, timeout=None):
+        """
+        Wait until all pgs are active.
+        """
+        self.log("waiting till active")
+        start = time.time()
+        while not self.is_active():
+            if timeout is not None:
+                if time.time() - start >= timeout:
+                    self.log('dumping pgs')
+                    out = self.raw_cluster_cmd('pg', 'dump')
+                    self.log(out)
+                    assert time.time() - start < timeout, \
+                        'failed to become active before timeout expired'
+            time.sleep(3)
+        self.log("active!")
+
+    def wait_till_pg_convergence(self, timeout=None):
+        start = time.time()
+        old_stats = None
+        active_osds = [osd['osd'] for osd in self.get_osd_dump()
+                       if osd['in'] and osd['up']]
+        while True:
+            # strictly speaking, no need to wait for mon. but due to the
+            # "ms inject socket failures" setting, the osdmap could be delayed,
+            # so mgr is likely to ignore the pg-stat messages with pgs serving
+            # newly created pools which is not yet known by mgr. so, to make sure
+            # the mgr is updated with the latest pg-stats, waiting for mon/mgr is
+            # necessary.
+            self.flush_pg_stats(active_osds)
+            new_stats = dict((stat['pgid'], stat['state'])
+                             for stat in self.get_pg_stats())
+            if old_stats == new_stats:
+                return old_stats
+            if timeout is not None:
+                assert time.time() - start < timeout, \
+                    'failed to reach convergence before %d secs' % timeout
+            old_stats = new_stats
+            # longer than mgr_stats_period
+            time.sleep(5 + 1)
+
+    def mark_out_osd(self, osd):
+        """
+        Wrapper to mark osd out.
+        """
+        self.raw_cluster_cmd('osd', 'out', str(osd))
+
+    def kill_osd(self, osd):
+        """
+        Kill osds by either power cycling (if indicated by the config)
+        or by stopping.
+
+        Three paths, chosen from self.config:
+        - 'powercycle': power off the osd's remote through its console
+        - 'bdev_inject_crash' and 'bdev_inject_crash_probability' both set:
+          with that probability inject a bdev crash and expect the daemon
+          to die, otherwise stop the daemon
+        - otherwise: stop the daemon
+
+        :param osd: integer osd id
+        :raises RuntimeError: when a crash was injected but waiting on the
+                              daemon did not raise
+        """
+        if self.config.get('powercycle'):
+            remote = self.find_remote('osd', osd)
+            self.log('kill_osd on osd.{o} '
+                     'doing powercycle of {s}'.format(o=osd, s=remote.name))
+            self._assert_ipmi(remote)
+            remote.console.power_off()
+        elif self.config.get('bdev_inject_crash') and self.config.get('bdev_inject_crash_probability'):
+            # the .5 default is never used: this branch is only entered when
+            # the probability key is present and truthy
+            if random.uniform(0, 1) < self.config.get('bdev_inject_crash_probability', .5):
+                self.raw_cluster_cmd(
+                    '--', 'tell', 'osd.%d' % osd,
+                    'injectargs',
+                    '--bdev-inject-crash %d' % self.config.get('bdev_inject_crash'),
+                )
+                try:
+                    self.ctx.daemons.get_daemon('osd', osd, self.cluster).wait()
+                except:
+                    # any exception from wait() is taken as proof that the
+                    # daemon went down, which is the outcome wanted here
+                    # NOTE(review): bare except also swallows
+                    # KeyboardInterrupt/SystemExit
+                    pass
+                else:
+                    # wait() returned normally, so the injected crash did
+                    # not take the daemon down
+                    raise RuntimeError('osd.%s did not fail' % osd)
+            else:
+                self.ctx.daemons.get_daemon('osd', osd, self.cluster).stop()
+        else:
+            self.ctx.daemons.get_daemon('osd', osd, self.cluster).stop()
+
+    @staticmethod
+    def _assert_ipmi(remote):
+        assert remote.console.has_ipmi_credentials, (
+            "powercycling requested but RemoteConsole is not "
+            "initialized.  Check ipmi config.")
+
+    def blackhole_kill_osd(self, osd):
+        """
+        Stop osd if nothing else works.
+        """
+        self.raw_cluster_cmd('--', 'tell', 'osd.%d' % osd,
+                             'injectargs',
+                             '--objectstore-blackhole')
+        time.sleep(2)
+        self.ctx.daemons.get_daemon('osd', osd, self.cluster).stop()
+
+    def revive_osd(self, osd, timeout=360, skip_admin_check=False):
+        """
+        Revive osds by either power cycling (if indicated by the config)
+        or by restarting.
+        """
+        if self.config.get('powercycle'):
+            remote = self.find_remote('osd', osd)
+            self.log('kill_osd on osd.{o} doing powercycle of {s}'.
+                     format(o=osd, s=remote.name))
+            self._assert_ipmi(remote)
+            remote.console.power_on()
+            if not remote.console.check_status(300):
+                raise Exception('Failed to revive osd.{o} via ipmi'.
+                                format(o=osd))
+            teuthology.reconnect(self.ctx, 60, [remote])
+            mount_osd_data(self.ctx, remote, self.cluster, str(osd))
+            self.make_admin_daemon_dir(remote)
+            self.ctx.daemons.get_daemon('osd', osd, self.cluster).reset()
+        self.ctx.daemons.get_daemon('osd', osd, self.cluster).restart()
+
+        if not skip_admin_check:
+            # wait for dump_ops_in_flight; this command doesn't appear
+            # until after the signal handler is installed and it is safe
+            # to stop the osd again without making valgrind leak checks
+            # unhappy.  see #5924.
+            self.wait_run_admin_socket('osd', osd,
+                                       args=['dump_ops_in_flight'],
+                                       timeout=timeout, stdout=DEVNULL)
+
+    def mark_down_osd(self, osd):
+        """
+        Cluster command wrapper
+        """
+        self.raw_cluster_cmd('osd', 'down', str(osd))
+
+    def mark_in_osd(self, osd):
+        """
+        Cluster command wrapper
+        """
+        self.raw_cluster_cmd('osd', 'in', str(osd))
+
+    def signal_osd(self, osd, sig, silent=False):
+        """
+        Wrapper to local get_daemon call which sends the given
+        signal to the given osd.
+        """
+        self.ctx.daemons.get_daemon('osd', osd,
+                                    self.cluster).signal(sig, silent=silent)
+
+    ## monitors
+    def signal_mon(self, mon, sig, silent=False):
+        """
+        Wrapper to local get_deamon call
+        """
+        self.ctx.daemons.get_daemon('mon', mon,
+                                    self.cluster).signal(sig, silent=silent)
+
+    def kill_mon(self, mon):
+        """
+        Kill the monitor by either power cycling (if the config says so),
+        or by doing a stop.
+        """
+        if self.config.get('powercycle'):
+            remote = self.find_remote('mon', mon)
+            self.log('kill_mon on mon.{m} doing powercycle of {s}'.
+                     format(m=mon, s=remote.name))
+            self._assert_ipmi(remote)
+            remote.console.power_off()
+        else:
+            self.ctx.daemons.get_daemon('mon', mon, self.cluster).stop()
+
+    def revive_mon(self, mon):
+        """
+        Restart by either power cycling (if the config says so),
+        or by doing a normal restart.
+        """
+        if self.config.get('powercycle'):
+            remote = self.find_remote('mon', mon)
+            self.log('revive_mon on mon.{m} doing powercycle of {s}'.
+                     format(m=mon, s=remote.name))
+            self._assert_ipmi(remote)
+            remote.console.power_on()
+            self.make_admin_daemon_dir(remote)
+        self.ctx.daemons.get_daemon('mon', mon, self.cluster).restart()
+
+    def revive_mgr(self, mgr):
+        """
+        Restart by either power cycling (if the config says so),
+        or by doing a normal restart.
+        """
+        if self.config.get('powercycle'):
+            remote = self.find_remote('mgr', mgr)
+            self.log('revive_mgr on mgr.{m} doing powercycle of {s}'.
+                     format(m=mgr, s=remote.name))
+            self._assert_ipmi(remote)
+            remote.console.power_on()
+            self.make_admin_daemon_dir(remote)
+        self.ctx.daemons.get_daemon('mgr', mgr, self.cluster).restart()
+
+    def get_mon_status(self, mon):
+        """
+        Extract all the monitor status information from the cluster
+        """
+        addr = self.ctx.ceph[self.cluster].conf['mon.%s' % mon]['mon addr']
+        out = self.raw_cluster_cmd('-m', addr, 'mon_status')
+        return json.loads(out)
+
+    def get_mon_quorum(self):
+        """
+        Extract monitor quorum information from the cluster
+        """
+        out = self.raw_cluster_cmd('quorum_status')
+        j = json.loads(out)
+        self.log('quorum_status is %s' % out)
+        return j['quorum']
+
+    def wait_for_mon_quorum_size(self, size, timeout=300):
+        """
+        Loop until quorum size is reached.
+        """
+        self.log('waiting for quorum size %d' % size)
+        start = time.time()
+        while not len(self.get_mon_quorum()) == size:
+            if timeout is not None:
+                assert time.time() - start < timeout, \
+                    ('failed to reach quorum size %d '
+                     'before timeout expired' % size)
+            time.sleep(3)
+        self.log("quorum is size %d" % size)
+
+    def get_mon_health(self, debug=False):
+        """
+        Extract all the monitor health information.
+        """
+        out = self.raw_cluster_cmd('health', '--format=json')
+        if debug:
+            self.log('health:\n{h}'.format(h=out))
+        return json.loads(out)
+
+    def get_mds_status(self, mds):
+        """
+        Run cluster commands for the mds in order to get mds information
+        """
+        out = self.raw_cluster_cmd('mds', 'dump', '--format=json')
+        j = json.loads(' '.join(out.splitlines()[1:]))
+        # collate; for dup ids, larger gid wins.
+        for info in j['info'].itervalues():
+            if info['name'] == mds:
+                return info
+        return None
+
+    def get_filepath(self):
+        """
+        Return path to osd data with {id} needing to be replaced
+        """
+        return '/var/lib/ceph/osd/' + self.cluster + '-{id}'
+
+    def make_admin_daemon_dir(self, remote):
+        """
+        Create /var/run/ceph directory on remote site.
+
+        :param ctx: Context
+        :param remote: Remote site
+        """
+        remote.run(args=['sudo',
+                         'install', '-d', '-m0777', '--', '/var/run/ceph', ], )
+
+
+def utility_task(name):
+    """
+    Generate ceph_manager subtask corresponding to ceph_manager
+    method name
+    """
+    def task(ctx, config):
+        if config is None:
+            config = {}
+        args = config.get('args', [])
+        kwargs = config.get('kwargs', {})
+        cluster = config.get('cluster', 'ceph')
+        fn = getattr(ctx.managers[cluster], name)
+        fn(*args, **kwargs)
+    return task
+
+# Module-level task entry points. Each one is a task(ctx, config) function
+# built by utility_task() that forwards config['args'] / config['kwargs']
+# to the manager method of the same name.
+revive_osd = utility_task("revive_osd")
+revive_mon = utility_task("revive_mon")
+kill_osd = utility_task("kill_osd")
+kill_mon = utility_task("kill_mon")
+create_pool = utility_task("create_pool")
+remove_pool = utility_task("remove_pool")
+wait_for_clean = utility_task("wait_for_clean")
+flush_all_pg_stats = utility_task("flush_all_pg_stats")
+set_pool_property = utility_task("set_pool_property")
+do_pg_scrub = utility_task("do_pg_scrub")
+wait_for_pool = utility_task("wait_for_pool")
+wait_for_pools = utility_task("wait_for_pools")
diff --git a/src/ceph/qa/tasks/ceph_objectstore_tool.py b/src/ceph/qa/tasks/ceph_objectstore_tool.py
new file mode 100644
index 0000000..9125773
--- /dev/null
+++ b/src/ceph/qa/tasks/ceph_objectstore_tool.py
@@ -0,0 +1,670 @@
+"""
+ceph_objectstore_tool - Simple test of ceph-objectstore-tool utility
+"""
+from cStringIO import StringIO
+import contextlib
+import logging
+import ceph_manager
+from teuthology import misc as teuthology
+import time
+import os
+import string
+from teuthology.orchestra import run
+import sys
+import tempfile
+import json
+from util.rados import (rados, create_replicated_pool, create_ec_pool)
+# from util.rados import (rados, create_ec_pool,
+#                               create_replicated_pool,
+#                               create_cache_pool)
+
+log = logging.getLogger(__name__)
+
+# Should get cluster name "ceph" from somewhere
+# and normal path from osd_data and osd_journal in conf
+FSPATH = "/var/lib/ceph/osd/ceph-{id}"
+JPATH = "/var/lib/ceph/osd/ceph-{id}/journal"
+
+
+def cod_setup_local_data(log, ctx, NUM_OBJECTS, DATADIR,
+                         BASE_NAME, DATALINECOUNT):
+    objects = range(1, NUM_OBJECTS + 1)
+    for i in objects:
+        NAME = BASE_NAME + "{num}".format(num=i)
+        LOCALNAME = os.path.join(DATADIR, NAME)
+
+        dataline = range(DATALINECOUNT)
+        fd = open(LOCALNAME, "w")
+        data = "This is the data for " + NAME + "\n"
+        for _ in dataline:
+            fd.write(data)
+        fd.close()
+
+
+def cod_setup_remote_data(log, ctx, remote, NUM_OBJECTS, DATADIR,
+                          BASE_NAME, DATALINECOUNT):
+
+    objects = range(1, NUM_OBJECTS + 1)
+    for i in objects:
+        NAME = BASE_NAME + "{num}".format(num=i)
+        DDNAME = os.path.join(DATADIR, NAME)
+
+        remote.run(args=['rm', '-f', DDNAME])
+
+        dataline = range(DATALINECOUNT)
+        data = "This is the data for " + NAME + "\n"
+        DATA = ""
+        for _ in dataline:
+            DATA += data
+        teuthology.write_file(remote, DDNAME, DATA)
+
+
+def cod_setup(log, ctx, remote, NUM_OBJECTS, DATADIR,
+              BASE_NAME, DATALINECOUNT, POOL, db, ec):
+    ERRORS = 0
+    log.info("Creating {objs} objects in pool".format(objs=NUM_OBJECTS))
+
+    objects = range(1, NUM_OBJECTS + 1)
+    for i in objects:
+        NAME = BASE_NAME + "{num}".format(num=i)
+        DDNAME = os.path.join(DATADIR, NAME)
+
+        proc = rados(ctx, remote, ['-p', POOL, 'put', NAME, DDNAME],
+                     wait=False)
+        # proc = remote.run(args=['rados', '-p', POOL, 'put', NAME, DDNAME])
+        ret = proc.wait()
+        if ret != 0:
+            log.critical("Rados put failed with status {ret}".
+                         format(ret=proc.exitstatus))
+            sys.exit(1)
+
+        db[NAME] = {}
+
+        keys = range(i)
+        db[NAME]["xattr"] = {}
+        for k in keys:
+            if k == 0:
+                continue
+            mykey = "key{i}-{k}".format(i=i, k=k)
+            myval = "val{i}-{k}".format(i=i, k=k)
+            proc = remote.run(args=['rados', '-p', POOL, 'setxattr',
+                                    NAME, mykey, myval])
+            ret = proc.wait()
+            if ret != 0:
+                log.error("setxattr failed with {ret}".format(ret=ret))
+                ERRORS += 1
+            db[NAME]["xattr"][mykey] = myval
+
+        # Erasure coded pools don't support omap
+        if ec:
+            continue
+
+        # Create omap header in all objects but REPobject1
+        if i != 1:
+            myhdr = "hdr{i}".format(i=i)
+            proc = remote.run(args=['rados', '-p', POOL, 'setomapheader',
+                                    NAME, myhdr])
+            ret = proc.wait()
+            if ret != 0:
+                log.critical("setomapheader failed with {ret}".format(ret=ret))
+                ERRORS += 1
+            db[NAME]["omapheader"] = myhdr
+
+        db[NAME]["omap"] = {}
+        for k in keys:
+            if k == 0:
+                continue
+            mykey = "okey{i}-{k}".format(i=i, k=k)
+            myval = "oval{i}-{k}".format(i=i, k=k)
+            proc = remote.run(args=['rados', '-p', POOL, 'setomapval',
+                                    NAME, mykey, myval])
+            ret = proc.wait()
+            if ret != 0:
+                log.critical("setomapval failed with {ret}".format(ret=ret))
+            db[NAME]["omap"][mykey] = myval
+
+    return ERRORS
+
+
+def get_lines(filename):
+    tmpfd = open(filename, "r")
+    line = True
+    lines = []
+    while line:
+        line = tmpfd.readline().rstrip('\n')
+        if line:
+            lines += [line]
+    tmpfd.close()
+    os.unlink(filename)
+    return lines
+
+
+@contextlib.contextmanager
+def task(ctx, config):
+    """
+    Run ceph_objectstore_tool test
+
+    The config should be as follows::
+
+        ceph_objectstore_tool:
+          objects: 20 # <number of objects>
+          pgnum: 12
+
+    The whole test (one replicated pool, one erasure coded pool) runs
+    before the context manager yields; an AssertionError is raised when
+    any error was counted.
+    """
+
+    if config is None:
+        config = {}
+    assert isinstance(config, dict), \
+        'ceph_objectstore_tool task only accepts a dict for configuration'
+
+    log.info('Beginning ceph_objectstore_tool...')
+
+    log.debug(config)
+    log.debug(ctx)
+    # pick one client remote to drive the rados commands from
+    clients = ctx.cluster.only(teuthology.is_type('client'))
+    assert len(clients.remotes) > 0, 'Must specify at least 1 client'
+    (cli_remote, _) = clients.remotes.popitem()
+    log.debug(cli_remote)
+
+    # clients = dict(teuthology.get_clients(ctx=ctx, roles=config.keys()))
+    # client = clients.popitem()
+    # log.info(client)
+    osds = ctx.cluster.only(teuthology.is_type('osd'))
+    log.info("OSDS")
+    log.info(osds)
+    log.info(osds.remotes)
+
+    manager = ctx.managers['ceph']
+    # wait (without a timeout) until the 'up' osd list is as long as the
+    # 'raw' one, then until the 'in' list is as long as the 'up' one
+    while (len(manager.get_osd_status()['up']) !=
+           len(manager.get_osd_status()['raw'])):
+        time.sleep(10)
+    while (len(manager.get_osd_status()['in']) !=
+           len(manager.get_osd_status()['up'])):
+        time.sleep(10)
+    # NOTE(review): these flags are set but never unset in this function
+    manager.raw_cluster_cmd('osd', 'set', 'noout')
+    manager.raw_cluster_cmd('osd', 'set', 'nodown')
+
+    PGNUM = config.get('pgnum', 12)
+    log.info("pgnum: {num}".format(num=PGNUM))
+
+    ERRORS = 0
+
+    # first pass: replicated pool
+    REP_POOL = "rep_pool"
+    REP_NAME = "REPobject"
+    create_replicated_pool(cli_remote, REP_POOL, PGNUM)
+    ERRORS += test_objectstore(ctx, config, cli_remote, REP_POOL, REP_NAME)
+
+    # second pass: erasure coded pool
+    EC_POOL = "ec_pool"
+    EC_NAME = "ECobject"
+    create_ec_pool(cli_remote, EC_POOL, 'default', PGNUM)
+    ERRORS += test_objectstore(ctx, config, cli_remote,
+                               EC_POOL, EC_NAME, ec=True)
+
+    if ERRORS == 0:
+        log.info("TEST PASSED")
+    else:
+        log.error("TEST FAILED WITH {errcount} ERRORS".format(errcount=ERRORS))
+
+    assert ERRORS == 0
+
+    # nothing is set up for nested tasks; the yield only satisfies the
+    # context-manager protocol
+    try:
+        yield
+    finally:
+        log.info('Ending ceph_objectstore_tool')
+
+
+def test_objectstore(ctx, config, cli_remote, REP_POOL, REP_NAME, ec=False):
+    """
+    Exercise ceph-objectstore-tool against every OSD holding PGs of one pool.
+
+    Objects are created in the pool, all OSDs are stopped, and then the
+    tool's list, get-bytes/set-bytes (replicated pools only), list-attrs,
+    get-attr/set-attr, info, log, export, remove and import operations are
+    run on the OSD hosts.  If export, remove and import all succeeded, the
+    OSDs are revived and every object is read back through rados and
+    compared with its reference file.
+
+    Despite the parameter names, the pool may be replicated or erasure
+    coded; the pool type is read from the pool dump.
+
+    :param ctx: teuthology run context (provides the cluster and managers)
+    :param config: task configuration dict; ``objects`` sets the number of
+                   objects to create (default 10)
+    :param cli_remote: client remote used for the rados operations
+    :param REP_POOL: name of the pool under test
+    :param REP_NAME: base name of the objects; the object number is appended
+    :param ec: forwarded unchanged to cod_setup()
+    :returns: number of errors detected
+    :raises Exception: if the pool type is neither replicated nor erasure
+                       coded
+    """
+    manager = ctx.managers['ceph']
+
+    osds = ctx.cluster.only(teuthology.is_type('osd'))
+
+    TEUTHDIR = teuthology.get_testdir(ctx)
+    DATADIR = os.path.join(TEUTHDIR, "ceph.data")
+    DATALINECOUNT = 10000
+    ERRORS = 0
+    NUM_OBJECTS = config.get('objects', 10)
+    log.info("objects: {num}".format(num=NUM_OBJECTS))
+
+    pool_dump = manager.get_pool_dump(REP_POOL)
+    REPID = pool_dump['pool']
+
+    log.debug("repid={num}".format(num=REPID))
+
+    # Filled in by cod_setup(); keyed by object name.  This function adds a
+    # "pg2json" entry per object once the objects have been listed.
+    db = {}
+
+    LOCALDIR = tempfile.mkdtemp("cod")
+
+    cod_setup_local_data(log, ctx, NUM_OBJECTS, LOCALDIR,
+                         REP_NAME, DATALINECOUNT)
+    # The reference data is needed on the client and on every OSD host;
+    # the set avoids setting up a host twice when they coincide.
+    allremote = set([cli_remote])
+    allremote.update(osds.remotes.keys())
+    for remote in allremote:
+        cod_setup_remote_data(log, ctx, remote, NUM_OBJECTS, DATADIR,
+                              REP_NAME, DATALINECOUNT)
+
+    ERRORS += cod_setup(log, ctx, cli_remote, NUM_OBJECTS, DATADIR,
+                        REP_NAME, DATALINECOUNT, REP_POOL, db, ec)
+
+    # Map each acting OSD id to the PG ids (with an "s<shard>" suffix for
+    # erasure coded pools) of this pool that it holds.
+    pgs = {}
+    pgid_prefix = str(REPID) + "."
+    for stats in manager.get_pg_stats():
+        if not stats["pgid"].startswith(pgid_prefix):
+            continue
+        if pool_dump["type"] == ceph_manager.CephManager.REPLICATED_POOL:
+            for osd in stats["acting"]:
+                pgs.setdefault(osd, []).append(stats["pgid"])
+        elif pool_dump["type"] == ceph_manager.CephManager.ERASURE_CODED_POOL:
+            for shard, osd in enumerate(stats["acting"]):
+                pgs.setdefault(osd, []).append("{pgid}s{shard}".
+                                               format(pgid=stats["pgid"],
+                                                      shard=shard))
+        else:
+            raise Exception("{pool} has an unexpected type {type}".
+                            format(pool=REP_POOL, type=pool_dump["type"]))
+
+    log.info(pgs)
+    log.info(db)
+
+    # The tool is run below with every OSD stopped.
+    for osd in manager.get_osd_status()['up']:
+        manager.kill_osd(osd)
+    time.sleep(5)
+
+    pgswithobjects = set()
+    objsinpg = {}
+
+    # NOTE(review): FSPATH and JPATH are module-level templates defined
+    # outside this function; since every use of prefix is later formatted
+    # with id=<osd id>, they presumably carry an escaped "{id}" placeholder.
+    prefix = ("sudo ceph-objectstore-tool "
+              "--data-path {fpath} "
+              "--journal-path {jpath} ").format(fpath=FSPATH, jpath=JPATH)
+
+    def osds_with_pgs():
+        """Yield (remote, osdid) for each "osd.<id>" role that holds PGs."""
+        for remote in osds.remotes.iterkeys():
+            for role in osds.remotes[remote]:
+                if not role.startswith("osd."):
+                    continue
+                osdid = int(role.split('.')[1])
+                if osdid in pgs:
+                    yield remote, osdid
+
+    def object_cmd(osdid, pg, json_spec, *op_args):
+        """Build the argument list for a per-object tool invocation."""
+        cmd = (prefix + "--pgid {pg}").format(id=osdid, pg=pg).split()
+        # The JSON object spec is single-quoted and passed as run.Raw rather
+        # than being whitespace-split like the rest of the command line.
+        cmd.append(run.Raw("'{json}'".format(json=json_spec)))
+        cmd.extend(op_args)
+        return cmd
+
+    # Test --op list and generate json for all objects
+    log.info("Test --op list by generating json for all objects")
+    for remote in osds.remotes.iterkeys():
+        log.debug(remote)
+        log.debug(osds.remotes[remote])
+        for role in osds.remotes[remote]:
+            if not role.startswith("osd."):
+                continue
+            osdid = int(role.split('.')[1])
+            log.info("process osd.{id} on {remote}".
+                     format(id=osdid, remote=remote))
+            cmd = (prefix + "--op list").format(id=osdid)
+            proc = remote.run(args=cmd.split(), check_status=False,
+                              stdout=StringIO())
+            if proc.exitstatus != 0:
+                log.error("Bad exit status {ret} from --op list request".
+                          format(ret=proc.exitstatus))
+                ERRORS += 1
+                continue
+            # Each non-empty output line is a JSON [pgid, object] pair.
+            for pgline in proc.stdout.getvalue().splitlines():
+                if not pgline:
+                    continue
+                (pg, obj) = json.loads(pgline)
+                name = obj['oid']
+                if name in db:
+                    pgswithobjects.add(pg)
+                    objsinpg.setdefault(pg, []).append(name)
+                    db[name].setdefault("pg2json",
+                                        {})[pg] = json.dumps(obj)
+
+    log.info(db)
+    log.info(pgswithobjects)
+    log.info(objsinpg)
+
+    if pool_dump["type"] == ceph_manager.CephManager.REPLICATED_POOL:
+        # Test get-bytes
+        log.info("Test get-bytes and set-bytes")
+        GETNAME = os.path.join(DATADIR, "get")
+        SETNAME = os.path.join(DATADIR, "set")
+        for basename in db.keys():
+            refpath = os.path.join(DATADIR, basename)
+
+            for remote, osdid in osds_with_pgs():
+                for pg, JSON in db[basename]["pg2json"].iteritems():
+                    if pg not in pgs[osdid]:
+                        continue
+
+                    # 1. get-bytes must reproduce the reference file.
+                    cmd = object_cmd(osdid, pg, JSON, "get-bytes", GETNAME)
+                    proc = remote.run(args=cmd, check_status=False)
+                    if proc.exitstatus != 0:
+                        remote.run(args=['rm', '-f', GETNAME])
+                        log.error("Bad exit status {ret}".
+                                  format(ret=proc.exitstatus))
+                        ERRORS += 1
+                        continue
+                    # check_status=False so that a mismatch is counted as a
+                    # test error (and the temp file is still removed).
+                    proc = remote.run(args=['diff', '-q', refpath, GETNAME],
+                                      check_status=False)
+                    if proc.exitstatus != 0:
+                        log.error("Data from get-bytes differ")
+                        ERRORS += 1
+                    remote.run(args=['rm', '-f', GETNAME])
+
+                    # 2. set-bytes with new content ...
+                    data = ("put-bytes going into {file}\n".
+                            format(file=refpath))
+                    teuthology.write_file(remote, SETNAME, data)
+                    cmd = object_cmd(osdid, pg, JSON, "set-bytes", SETNAME)
+                    proc = remote.run(args=cmd, check_status=False)
+                    proc.wait()
+                    if proc.exitstatus != 0:
+                        log.info("set-bytes failed for object {obj} "
+                                 "in pg {pg} osd.{id} ret={ret}".
+                                 format(obj=basename, pg=pg,
+                                        id=osdid, ret=proc.exitstatus))
+                        ERRORS += 1
+
+                    # 3. ... which get-bytes must then return verbatim.
+                    cmd = object_cmd(osdid, pg, JSON, "get-bytes", "-")
+                    proc = remote.run(args=cmd, check_status=False,
+                                      stdout=StringIO())
+                    proc.wait()
+                    if proc.exitstatus != 0:
+                        log.error("get-bytes after "
+                                  "set-bytes ret={ret}".
+                                  format(ret=proc.exitstatus))
+                        ERRORS += 1
+                    elif data != proc.stdout.getvalue():
+                        log.error("Data inconsistent after "
+                                  "set-bytes, got:")
+                        log.error(proc.stdout.getvalue())
+                        ERRORS += 1
+
+                    # 4. Restore the original content for the later checks.
+                    cmd = object_cmd(osdid, pg, JSON, "set-bytes", refpath)
+                    proc = remote.run(args=cmd, check_status=False)
+                    proc.wait()
+                    if proc.exitstatus != 0:
+                        log.info("set-bytes failed for object {obj} "
+                                 "in pg {pg} osd.{id} ret={ret}".
+                                 format(obj=basename, pg=pg,
+                                        id=osdid, ret=proc.exitstatus))
+                        ERRORS += 1
+
+    log.info("Test list-attrs get-attr")
+    for basename in db.keys():
+        for remote, osdid in osds_with_pgs():
+            for pg, JSON in db[basename]["pg2json"].iteritems():
+                if pg not in pgs[osdid]:
+                    continue
+                cmd = object_cmd(osdid, pg, JSON, "list-attrs")
+                proc = remote.run(args=cmd, check_status=False,
+                                  stdout=StringIO(), stderr=StringIO())
+                proc.wait()
+                if proc.exitstatus != 0:
+                    log.error("Bad exit status {ret}".
+                              format(ret=proc.exitstatus))
+                    ERRORS += 1
+                    continue
+                keys = proc.stdout.getvalue().split()
+                # Work on a copy: entries are popped as they are matched so
+                # that whatever remains was never listed by the tool.
+                values = dict(db[basename]["xattr"])
+
+                for key in keys:
+                    # These attributes are not among the expected xattrs
+                    # recorded in db, so they are not compared here.
+                    if key in ("_", "snapset", "hinfo_key"):
+                        continue
+                    key = key.strip("_")
+                    if key not in values:
+                        log.error("The key {key} should be present".
+                                  format(key=key))
+                        ERRORS += 1
+                        continue
+                    exp = values.pop(key)
+                    cmd = object_cmd(osdid, pg, JSON, "get-attr", "_" + key)
+                    proc = remote.run(args=cmd, check_status=False,
+                                      stdout=StringIO())
+                    proc.wait()
+                    if proc.exitstatus != 0:
+                        log.error("get-attr failed with {ret}".
+                                  format(ret=proc.exitstatus))
+                        ERRORS += 1
+                        continue
+                    val = proc.stdout.getvalue()
+                    if exp != val:
+                        log.error("For key {key} got value {got} "
+                                  "instead of {expected}".
+                                  format(key=key, got=val,
+                                         expected=exp))
+                        ERRORS += 1
+                if "hinfo_key" in keys:
+                    # Round-trip hinfo_key through set-attr: save it,
+                    # overwrite it with a placeholder, verify, restore the
+                    # saved value and verify again.  bash -e aborts on the
+                    # first failing step.
+                    cmd_prefix = prefix.format(id=osdid)
+                    cmd = """
+      expected=$({prefix} --pgid {pg} '{json}' get-attr {key} | base64)
+      echo placeholder | {prefix} --pgid {pg} '{json}' set-attr {key} -
+      test $({prefix} --pgid {pg} '{json}' get-attr {key}) = placeholder
+      echo $expected | base64 --decode | \
+         {prefix} --pgid {pg} '{json}' set-attr {key} -
+      test $({prefix} --pgid {pg} '{json}' get-attr {key} | base64) = $expected
+                            """.format(prefix=cmd_prefix, pg=pg, json=JSON,
+                                       key="hinfo_key")
+                    log.debug(cmd)
+                    proc = remote.run(args=['bash', '-e', '-x',
+                                            '-c', cmd],
+                                      check_status=False,
+                                      stdout=StringIO(),
+                                      stderr=StringIO())
+                    proc.wait()
+                    if proc.exitstatus != 0:
+                        log.error("failed with " +
+                                  str(proc.exitstatus))
+                        log.error(proc.stdout.getvalue() + " " +
+                                  proc.stderr.getvalue())
+                        ERRORS += 1
+
+                # NOTE(review): leftover expected keys are logged but not
+                # counted in ERRORS -- confirm whether that is intended.
+                if len(values) != 0:
+                    log.error("Not all keys found, remaining keys:")
+                    log.error(values)
+
+    log.info("Test pg info")
+    for remote, osdid in osds_with_pgs():
+        for pg in pgs[osdid]:
+            cmd = ((prefix + "--op info --pgid {pg}").
+                   format(id=osdid, pg=pg).split())
+            proc = remote.run(args=cmd, check_status=False,
+                              stdout=StringIO())
+            proc.wait()
+            if proc.exitstatus != 0:
+                log.error("Failure of --op info command with {ret}".
+                          format(ret=proc.exitstatus))
+                ERRORS += 1
+                continue
+            info = proc.stdout.getvalue()
+            if str(pg) not in info:
+                log.error("Bad data from info: {info}".format(info=info))
+                ERRORS += 1
+
+    log.info("Test pg logging")
+    for remote, osdid in osds_with_pgs():
+        for pg in pgs[osdid]:
+            cmd = ((prefix + "--op log --pgid {pg}").
+                   format(id=osdid, pg=pg).split())
+            proc = remote.run(args=cmd, check_status=False,
+                              stdout=StringIO())
+            proc.wait()
+            if proc.exitstatus != 0:
+                log.error("Getting log failed for pg {pg} "
+                          "from osd.{id} with {ret}".
+                          format(pg=pg, id=osdid, ret=proc.exitstatus))
+                ERRORS += 1
+                continue
+            # A PG log must mention "modify" exactly when the PG holds at
+            # least one of the test objects.
+            HASOBJ = pg in pgswithobjects
+            MODOBJ = "modify" in proc.stdout.getvalue()
+            if HASOBJ != MODOBJ:
+                log.error("Bad log for pg {pg} from osd.{id}".
+                          format(pg=pg, id=osdid))
+                MSG = "" if HASOBJ else "NOT "
+                log.error("Log should {msg}have a modify entry".
+                          format(msg=MSG))
+                ERRORS += 1
+
+    log.info("Test pg export")
+    EXP_ERRORS = 0
+    for remote, osdid in osds_with_pgs():
+        for pg in pgs[osdid]:
+            fpath = os.path.join(DATADIR, "osd{id}.{pg}".
+                                 format(id=osdid, pg=pg))
+
+            cmd = ((prefix + "--op export --pgid {pg} --file {file}").
+                   format(id=osdid, pg=pg, file=fpath))
+            proc = remote.run(args=cmd, check_status=False,
+                              stdout=StringIO())
+            proc.wait()
+            if proc.exitstatus != 0:
+                log.error("Exporting failed for pg {pg} "
+                          "on osd.{id} with {ret}".
+                          format(pg=pg, id=osdid, ret=proc.exitstatus))
+                EXP_ERRORS += 1
+
+    ERRORS += EXP_ERRORS
+
+    log.info("Test pg removal")
+    RM_ERRORS = 0
+    for remote, osdid in osds_with_pgs():
+        for pg in pgs[osdid]:
+            cmd = ((prefix + "--force --op remove --pgid {pg}").
+                   format(pg=pg, id=osdid))
+            proc = remote.run(args=cmd, check_status=False,
+                              stdout=StringIO())
+            proc.wait()
+            if proc.exitstatus != 0:
+                log.error("Removing failed for pg {pg} "
+                          "on osd.{id} with {ret}".
+                          format(pg=pg, id=osdid, ret=proc.exitstatus))
+                RM_ERRORS += 1
+
+    ERRORS += RM_ERRORS
+
+    IMP_ERRORS = 0
+    # Importing only makes sense if every PG was exported and then removed.
+    if EXP_ERRORS == 0 and RM_ERRORS == 0:
+        log.info("Test pg import")
+
+        for remote, osdid in osds_with_pgs():
+            for pg in pgs[osdid]:
+                fpath = os.path.join(DATADIR, "osd{id}.{pg}".
+                                     format(id=osdid, pg=pg))
+
+                cmd = ((prefix + "--op import --file {file}").
+                       format(id=osdid, file=fpath))
+                proc = remote.run(args=cmd, check_status=False,
+                                  stdout=StringIO())
+                proc.wait()
+                if proc.exitstatus != 0:
+                    log.error("Import failed from {file} with {ret}".
+                              format(file=fpath, ret=proc.exitstatus))
+                    IMP_ERRORS += 1
+    else:
+        log.warning("SKIPPING IMPORT TESTS DUE TO PREVIOUS FAILURES")
+
+    ERRORS += IMP_ERRORS
+
+    if EXP_ERRORS == 0 and RM_ERRORS == 0 and IMP_ERRORS == 0:
+        log.info("Restarting OSDs....")
+        # The OSDs still appear to be up because nodown is set
+        for osd in manager.get_osd_status()['up']:
+            manager.revive_osd(osd)
+        # Wait for health?
+        time.sleep(5)
+        # Let scrub after test runs verify consistency of all copies
+        log.info("Verify replicated import data")
+        TESTNAME = os.path.join(DATADIR, "gettest")
+        for i in range(1, NUM_OBJECTS + 1):
+            NAME = REP_NAME + "{num}".format(num=i)
+            REFNAME = os.path.join(DATADIR, NAME)
+
+            proc = rados(ctx, cli_remote,
+                         ['-p', REP_POOL, 'get', NAME, TESTNAME], wait=False)
+
+            ret = proc.wait()
+            if ret != 0:
+                log.error("After import, rados get failed with {ret}".
+                          format(ret=proc.exitstatus))
+                ERRORS += 1
+                continue
+
+            cmd = "diff -q {gettest} {ref}".format(gettest=TESTNAME,
+                                                   ref=REFNAME)
+            proc = cli_remote.run(args=cmd, check_status=False)
+            proc.wait()
+            if proc.exitstatus != 0:
+                log.error("Data comparison failed for {obj}".format(obj=NAME))
+                ERRORS += 1
+
+    return ERRORS
diff --git a/src/ceph/qa/tasks/ceph_test_case.py b/src/ceph/qa/tasks/ceph_test_case.py
new file mode 100644
index 0000000..5767df4
--- /dev/null
+++ b/src/ceph/qa/tasks/ceph_test_case.py
@@ -0,0 +1,150 @@
+
+import unittest
+import time
+import logging
+
+from teuthology.orchestra.run import CommandFailedError
+
+log = logging.getLogger(__name__)
+
+
+class CephTestCase(unittest.TestCase):
+    """
+    For test tasks that want to define a structured set of
+    tests implemented in python.  Subclass this with appropriate
+    helpers for the subsystem you're testing.
+    """
+
+    # Environment references.  These default to None here; the methods
+    # below read ceph_cluster.mon_manager, so that one must be populated
+    # before any test runs.
+    mounts = None
+    fs = None
+    recovery_fs = None
+    ceph_cluster = None
+    mds_cluster = None
+    mgr_cluster = None
+    ctx = None
+
+    mon_manager = None
+
+    def setUp(self):
+        """Record the start of the test in the Ceph cluster log."""
+        self.ceph_cluster.mon_manager.raw_cluster_cmd("log",
+            "Starting test {0}".format(self.id()))
+
+    def tearDown(self):
+        """Record the end of the test in the Ceph cluster log."""
+        self.ceph_cluster.mon_manager.raw_cluster_cmd("log",
+            "Ended test {0}".format(self.id()))
+
+    def assert_cluster_log(self, expected_pattern, invert_match=False, timeout=10):
+        """
+        Context manager.  Assert that during execution, or up to
+        ``5 + timeout`` seconds later, the Ceph cluster log emits a message
+        matching the expected pattern.
+
+        :param expected_pattern: a string that you expect to see in the log output
+        :param invert_match: if True, assert that the pattern is NOT seen
+        :param timeout: extra seconds (on top of a fixed 5) to wait on exit
+                        when no match has been observed yet
+        :raises AssertionError: on exit, if the expectation is not met
+        """
+
+        ceph_manager = self.ceph_cluster.mon_manager
+
+        class ContextManager(object):
+            def match(self):
+                found = expected_pattern in self.watcher_process.stdout.getvalue()
+                if invert_match:
+                    return not found
+
+                return found
+
+            def __enter__(self):
+                self.watcher_process = ceph_manager.run_ceph_w()
+
+            def __exit__(self, exc_type, exc_val, exc_tb):
+                if not self.watcher_process.finished:
+                    # Check if we got an early match, wait a bit if we didn't
+                    # NOTE(review): on an early match the watcher's stdin is
+                    # not closed here -- confirm it is cleaned up elsewhere.
+                    if self.match():
+                        return
+                    else:
+                        log.debug("No log hits yet, waiting...")
+                        # Default monc tick interval is 10s, so wait that long and
+                        # then some grace
+                        time.sleep(5 + timeout)
+
+                self.watcher_process.stdin.close()
+                try:
+                    self.watcher_process.wait()
+                except CommandFailedError:
+                    pass
+
+                if not self.match():
+                    log.error("Log output: \n{0}\n".format(self.watcher_process.stdout.getvalue()))
+                    raise AssertionError("Expected log message not found: '{0}'".format(expected_pattern))
+
+        return ContextManager()
+
+    def wait_for_health(self, pattern, timeout):
+        """
+        Wait until 'ceph health' contains messages matching the pattern
+
+        The pattern matches if it is a substring of any check's summary
+        message, or if it equals one of the check codes.
+
+        :param pattern: string to look for
+        :param timeout: seconds to wait before giving up
+        :raises RuntimeError: if the pattern is not seen in time
+        """
+        def seen_health_warning():
+            health = self.ceph_cluster.mon_manager.get_mon_health()
+            checks = health['checks']
+            codes = list(checks)
+            summary_strings = [check['summary']['message']
+                               for check in checks.values()]
+            if len(summary_strings) == 0:
+                log.debug("Not expected number of summary strings ({0})".format(summary_strings))
+                return False
+
+            if any(pattern in ss for ss in summary_strings):
+                return True
+            if pattern in codes:
+                return True
+
+            log.debug("Not found expected summary strings yet ({0})".format(summary_strings))
+            return False
+
+        self.wait_until_true(seen_health_warning, timeout)
+
+    def wait_for_health_clear(self, timeout):
+        """
+        Wait until `ceph health` returns no messages
+
+        :param timeout: seconds to wait before giving up
+        :raises RuntimeError: if health checks are still present in time
+        """
+        def is_clear():
+            health = self.ceph_cluster.mon_manager.get_mon_health()
+            return len(health['checks']) == 0
+
+        self.wait_until_true(is_clear, timeout)
+
+    def wait_until_equal(self, get_fn, expect_val, timeout, reject_fn=None):
+        """
+        Poll get_fn() every 5 seconds until it returns expect_val.
+
+        :param get_fn: callable returning the current value
+        :param expect_val: value to wait for
+        :param timeout: seconds after which to give up
+        :param reject_fn: optional callable; if it returns true for a
+                          polled value, waiting is aborted immediately
+        :raises RuntimeError: on timeout, or when reject_fn rejects a value
+        """
+        period = 5
+        elapsed = 0
+        while True:
+            val = get_fn()
+            if val == expect_val:
+                # Log here: nothing after the loop is ever reached.
+                log.debug("wait_until_equal: success")
+                return
+            elif reject_fn and reject_fn(val):
+                raise RuntimeError("wait_until_equal: forbidden value {0} seen".format(val))
+            else:
+                if elapsed >= timeout:
+                    raise RuntimeError("Timed out after {0} seconds waiting for {1} (currently {2})".format(
+                        elapsed, expect_val, val
+                    ))
+                else:
+                    log.debug("wait_until_equal: {0} != {1}, waiting...".format(val, expect_val))
+                time.sleep(period)
+                elapsed += period
+
+    def wait_until_true(self, condition, timeout):
+        """
+        Poll condition() every 5 seconds until it returns a true value.
+
+        :param condition: zero-argument callable
+        :param timeout: seconds after which to give up
+        :raises RuntimeError: if the condition is still false at timeout
+        """
+        period = 5
+        elapsed = 0
+        while True:
+            if condition():
+                log.debug("wait_until_true: success in {0}s".format(elapsed))
+                return
+            else:
+                if elapsed >= timeout:
+                    raise RuntimeError("Timed out after {0}s".format(elapsed))
+                else:
+                    log.debug("wait_until_true: waiting...")
+                time.sleep(period)
+                elapsed += period
+
+
diff --git a/src/ceph/qa/tasks/cephfs/__init__.py b/src/ceph/qa/tasks/cephfs/__init__.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/src/ceph/qa/tasks/cephfs/__init__.py
diff --git a/src/ceph/qa/tasks/cephfs/cephfs_test_case.py b/src/ceph/qa/tasks/cephfs/cephfs_test_case.py
new file mode 100644
index 0000000..801d0d3
--- /dev/null
+++ b/src/ceph/qa/tasks/cephfs/cephfs_test_case.py
@@ -0,0 +1,315 @@
+import json
+import logging
+from unittest import case
+from tasks.ceph_test_case import CephTestCase
+import os
+import re
+from StringIO import StringIO
+
+from tasks.cephfs.fuse_mount import FuseMount
+
+from teuthology.orchestra import run
+from teuthology.orchestra.run import CommandFailedError
+
+
+log = logging.getLogger(__name__)
+
+
+def for_teuthology(f):
+    """
+    Decorator: tag the wrapped function with an "is_for_teuthology" attribute
+    set to True and hand the very same function object back.
+    """
+    setattr(f, "is_for_teuthology", True)
+    return f
+
+
+def needs_trimming(f):
+    """
+    Decorator: mark fn as requiring a client capable of trimming its cache
+    (i.e. for ceph-fuse this means it needs to be able to run as root,
+    currently).  Sets a "needs_trimming" attribute to True on the function
+    and returns the same function object.
+    """
+    setattr(f, "needs_trimming", True)
+    return f
+
+
+class CephFSTestCase(CephTestCase):
+    """
+    Test case for Ceph FS, requires caller to populate Filesystem and Mounts,
+    into the fs, mount_a, mount_b class attributes (setting mount_b is optional)
+
+    Handles resetting the cluster under test between tests.
+    """
+
+    # FIXME weird explicit naming
+    mount_a = None
+    mount_b = None
+    recovery_mount = None
+
+    # Declarative test requirements: subclasses should override these to indicate
+    # their special needs.  If not met, tests will be skipped.
+    CLIENTS_REQUIRED = 1
+    MDSS_REQUIRED = 1
+    REQUIRE_KCLIENT_REMOTE = False
+    REQUIRE_ONE_CLIENT_REMOTE = False
+    REQUIRE_MEMSTORE = False
+
+    # Whether to create the default filesystem during setUp
+    REQUIRE_FILESYSTEM = True
+
+    # requires REQUIRE_FILESYSTEM = True
+    REQUIRE_RECOVERY_FILESYSTEM = False
+
+    # Names of MDS config settings to read in setUp; each is stored on the
+    # test instance as a float attribute of the same name.
+    LOAD_SETTINGS = []
+
+    def setUp(self):
+        """
+        Check the declarative requirements (skipping the test if unmet), then
+        reset the cluster: unmount clients, delete all filesystems, clear the
+        OSD blacklist and stray client auth entities, and (optionally)
+        recreate the filesystem(s) and remount the required clients.
+        """
+        super(CephFSTestCase, self).setUp()
+
+        if len(self.mds_cluster.mds_ids) < self.MDSS_REQUIRED:
+            raise case.SkipTest("Only have {0} MDSs, require {1}".format(
+                len(self.mds_cluster.mds_ids), self.MDSS_REQUIRED
+            ))
+
+        if len(self.mounts) < self.CLIENTS_REQUIRED:
+            raise case.SkipTest("Only have {0} clients, require {1}".format(
+                len(self.mounts), self.CLIENTS_REQUIRED
+            ))
+
+        if self.REQUIRE_KCLIENT_REMOTE:
+            if not isinstance(self.mounts[0], FuseMount) or not isinstance(self.mounts[1], FuseMount):
+                # kclient kill() power cycles nodes, so requires clients to each be on
+                # their own node
+                if self.mounts[0].client_remote.hostname == self.mounts[1].client_remote.hostname:
+                    raise case.SkipTest("kclient clients must be on separate nodes")
+
+        if self.REQUIRE_ONE_CLIENT_REMOTE:
+            if self.mounts[0].client_remote.hostname in self.mds_cluster.get_mds_hostnames():
+                raise case.SkipTest("Require first client to be on separate server from MDSs")
+
+        if self.REQUIRE_MEMSTORE:
+            objectstore = self.mds_cluster.get_config("osd_objectstore", "osd")
+            if objectstore != "memstore":
+                # You certainly *could* run this on a real OSD, but you don't want to sit
+                # here for hours waiting for the test to fill up a 1TB drive!
+                raise case.SkipTest("Require `memstore` OSD backend to simulate full drives")
+
+        # Create friendly mount_a, mount_b attrs
+        for i in range(0, self.CLIENTS_REQUIRED):
+            setattr(self, "mount_{0}".format(chr(ord('a') + i)), self.mounts[i])
+
+        self.mds_cluster.clear_firewall()
+
+        # Unmount all clients, we are about to blow away the filesystem
+        for mount in self.mounts:
+            if mount.is_mounted():
+                mount.umount_wait(force=True)
+
+        # To avoid any issues with e.g. unlink bugs, we destroy and recreate
+        # the filesystem rather than just doing a rm -rf of files
+        self.mds_cluster.mds_stop()
+        self.mds_cluster.mds_fail()
+        self.mds_cluster.delete_all_filesystems()
+        self.fs = None # is now invalid!
+        self.recovery_fs = None
+
+        # In case the previous filesystem had filled up the RADOS cluster, wait for that
+        # flag to pass.
+        osd_mon_report_interval_max = int(self.mds_cluster.get_config("osd_mon_report_interval_max", service_type='osd'))
+        self.wait_until_true(lambda: not self.mds_cluster.is_full(),
+                             timeout=osd_mon_report_interval_max * 5)
+
+        # In case anything is in the OSD blacklist list, clear it out.  This is to avoid
+        # the OSD map changing in the background (due to blacklist expiry) while tests run.
+        try:
+            self.mds_cluster.mon_manager.raw_cluster_cmd("osd", "blacklist", "clear")
+        except CommandFailedError:
+            # Fallback for older Ceph cluster
+            blacklist = json.loads(self.mds_cluster.mon_manager.raw_cluster_cmd("osd",
+                                  "dump", "--format=json-pretty"))['blacklist']
+            log.info("Removing {0} blacklist entries".format(len(blacklist)))
+            # Only the address (the key) is needed to remove an entry
+            for addr in blacklist:
+                self.mds_cluster.mon_manager.raw_cluster_cmd("osd", "blacklist", "rm", addr)
+
+        client_mount_ids = [m.client_id for m in self.mounts]
+        # In case the test changes the IDs of clients, stash them so that we can
+        # reset in tearDown
+        self._original_client_ids = client_mount_ids
+        log.info(client_mount_ids)
+
+        # In case there were any extra auth identities around from a previous
+        # test, delete them
+        for entry in self.auth_list():
+            # Split on the first dot only: an entity ID may itself contain
+            # dots, which would otherwise break the two-way unpacking.
+            ent_type, ent_id = entry['entity'].split(".", 1)
+            if ent_type == "client" and ent_id not in client_mount_ids and ent_id != "admin":
+                self.mds_cluster.mon_manager.raw_cluster_cmd("auth", "del", entry['entity'])
+
+        if self.REQUIRE_FILESYSTEM:
+            self.fs = self.mds_cluster.newfs(create=True)
+            self.fs.mds_restart()
+
+            # In case some test messed with auth caps, reset them
+            for client_id in client_mount_ids:
+                self.mds_cluster.mon_manager.raw_cluster_cmd_result(
+                    'auth', 'caps', "client.{0}".format(client_id),
+                    'mds', 'allow',
+                    'mon', 'allow r',
+                    'osd', 'allow rw pool={0}'.format(self.fs.get_data_pool_name()))
+
+            # wait for mds restart to complete...
+            self.fs.wait_for_daemons()
+
+            # Mount the requested number of clients
+            for i in range(0, self.CLIENTS_REQUIRED):
+                self.mounts[i].mount()
+                self.mounts[i].wait_until_mounted()
+
+        if self.REQUIRE_RECOVERY_FILESYSTEM:
+            if not self.REQUIRE_FILESYSTEM:
+                raise case.SkipTest("Recovery filesystem requires a primary filesystem as well")
+            self.fs.mon_manager.raw_cluster_cmd('fs', 'flag', 'set',
+                                                'enable_multiple', 'true',
+                                                '--yes-i-really-mean-it')
+            self.recovery_fs = self.mds_cluster.newfs(name="recovery_fs", create=False)
+            self.recovery_fs.set_metadata_overlay(True)
+            self.recovery_fs.set_data_pool_name(self.fs.get_data_pool_name())
+            self.recovery_fs.create()
+            self.recovery_fs.getinfo(refresh=True)
+            self.recovery_fs.mds_restart()
+            self.recovery_fs.wait_for_daemons()
+
+        # Load any config settings of interest
+        for setting in self.LOAD_SETTINGS:
+            setattr(self, setting, float(self.fs.mds_asok(
+                ['config', 'get', setting], self.mds_cluster.mds_ids[0]
+            )[setting]))
+
+        # (subsys, key) pairs written via set_conf(), cleared again in tearDown
+        self.configs_set = set()
+
+    def tearDown(self):
+        """
+        Undo per-test state: clear firewall rules, tear down every mount,
+        restore the client IDs stashed in setUp, and remove any ceph.conf
+        settings that were applied through set_conf().
+        """
+        super(CephFSTestCase, self).tearDown()
+
+        self.mds_cluster.clear_firewall()
+        for m in self.mounts:
+            m.teardown()
+
+        for i, m in enumerate(self.mounts):
+            m.client_id = self._original_client_ids[i]
+
+        for subsys, key in self.configs_set:
+            self.mds_cluster.clear_ceph_conf(subsys, key)
+
+    def set_conf(self, subsys, key, value):
+        """
+        Set a ceph.conf value for the duration of the test; the (subsys, key)
+        pair is remembered so that tearDown can clear it again.
+        """
+        self.configs_set.add((subsys, key))
+        self.mds_cluster.set_ceph_conf(subsys, key, value)
+
+    def auth_list(self):
+        """
+        Convenience wrapper on "ceph auth ls"
+        """
+        return json.loads(self.mds_cluster.mon_manager.raw_cluster_cmd(
+            "auth", "ls", "--format=json-pretty"
+        ))['auth_dump']
+
+    def assert_session_count(self, expected, ls_data=None, mds_id=None):
+        """
+        Assert that `expected` sessions are not in the 'killing' state.
+
+        :param ls_data: pre-fetched "session ls" output; fetched from the MDS
+                        admin socket when None
+        :param mds_id: MDS to query when ls_data has to be fetched
+        """
+        if ls_data is None:
+            ls_data = self.fs.mds_asok(['session', 'ls'], mds_id=mds_id)
+
+        alive_count = len([s for s in ls_data if s['state'] != 'killing'])
+
+        self.assertEqual(expected, alive_count, "Expected {0} sessions, found {1}".format(
+            expected, alive_count
+        ))
+
+    def assert_session_state(self, client_id,  expected_state):
+        """
+        Assert the state of the session with the given id; a session that is
+        not listed at all is treated as having state None.
+        """
+        self.assertEqual(
+            self._session_by_id(
+                self.fs.mds_asok(['session', 'ls'])).get(client_id, {'state': None})['state'],
+            expected_state)
+
+    def get_session_data(self, client_id):
+        """
+        Return the "session ls" entry for the given client id.
+
+        :raises KeyError: if no session with that id is listed
+        """
+        # _session_by_id() expects a session list, not a client id, so go
+        # through get_session() which fetches the list first.
+        return self.get_session(client_id)
+
+    def _session_list(self):
+        """
+        Return the "session ls" entries whose state is neither 'stale' nor
+        'closed'.
+        """
+        ls_data = self.fs.mds_asok(['session', 'ls'])
+        ls_data = [s for s in ls_data if s['state'] not in ['stale', 'closed']]
+        return ls_data
+
+    def get_session(self, client_id, session_ls=None):
+        """
+        Return the session entry for client_id, looking in session_ls if
+        given or in freshly fetched "session ls" output otherwise.
+
+        :raises KeyError: if no session with that id is listed
+        """
+        if session_ls is None:
+            session_ls = self.fs.mds_asok(['session', 'ls'])
+
+        return self._session_by_id(session_ls)[client_id]
+
+    def _session_by_id(self, session_ls):
+        """
+        Index a list of session entries by their 'id' field.
+        """
+        return {s['id']: s for s in session_ls}
+
+    def wait_for_daemon_start(self, daemon_ids=None):
+        """
+        Wait until all the daemons appear in the FSMap, either assigned
+        MDS ranks or in the list of standbys
+
+        :param daemon_ids: daemon names to wait for, defaults to all MDS ids
+        :raises RuntimeError: re-raised from wait_until_true after 30 seconds
+        """
+        def get_daemon_names():
+            return [info['name'] for info in self.mds_cluster.status().get_all()]
+
+        if daemon_ids is None:
+            daemon_ids = self.mds_cluster.mds_ids
+
+        try:
+            self.wait_until_true(
+                lambda: set(daemon_ids).issubset(get_daemon_names()),
+                timeout=30
+            )
+        except RuntimeError:
+            log.warning("Timeout waiting for daemons {0}, while we have {1}".format(
+                daemon_ids, get_daemon_names()
+            ))
+            raise
+
+    def assert_mds_crash(self, daemon_id):
+        """
+        Assert that the a particular MDS daemon crashes (block until
+        it does)
+
+        :raises AssertionError: if the daemon process exits without
+                                CommandFailedError being raised
+        """
+        try:
+            self.mds_cluster.mds_daemons[daemon_id].proc.wait()
+        except CommandFailedError as e:
+            log.info("MDS '{0}' crashed with status {1} as expected".format(daemon_id, e.exitstatus))
+            self.mds_cluster.mds_daemons[daemon_id].proc = None
+
+            # Go remove the coredump from the crash, otherwise teuthology.internal.coredump will
+            # catch it later and treat it as a failure.
+            p = self.mds_cluster.mds_daemons[daemon_id].remote.run(args=[
+                "sudo", "sysctl", "-n", "kernel.core_pattern"], stdout=StringIO())
+            core_pattern = p.stdout.getvalue().strip()
+            if os.path.dirname(core_pattern):  # Non-default core_pattern with a directory in it
+                # We have seen a core_pattern that looks like it's from teuthology's coredump
+                # task, so proceed to clear out the core file
+                log.info("Clearing core from pattern: {0}".format(core_pattern))
+
+                # Determine the PID of the crashed MDS by inspecting the MDSMap, it had
+                # to talk to the mons to get assigned a rank to reach the point of crashing
+                addr = self.mds_cluster.mon_manager.get_mds_status(daemon_id)['addr']
+                pid_str = addr.split("/")[1]
+                log.info("Determined crasher PID was {0}".format(pid_str))
+
+                # Substitute PID into core_pattern to get a glob
+                core_glob = core_pattern.replace("%p", pid_str)
+                core_glob = re.sub("%[a-z]", "*", core_glob)  # Match all for all other % tokens
+
+                # Verify that we see the expected single coredump matching the expected pattern
+                ls_proc = self.mds_cluster.mds_daemons[daemon_id].remote.run(args=[
+                    "sudo", "ls", run.Raw(core_glob)
+                ], stdout=StringIO())
+                cores = [f for f in ls_proc.stdout.getvalue().strip().split("\n") if f]
+                log.info("Enumerated cores: {0}".format(cores))
+                self.assertEqual(len(cores), 1)
+
+                log.info("Found core file {0}, deleting it".format(cores[0]))
+
+                self.mds_cluster.mds_daemons[daemon_id].remote.run(args=[
+                    "sudo", "rm", "-f", cores[0]
+                ])
+            else:
+                log.info("No core_pattern directory set, nothing to clear (internal.coredump not enabled?)")
+
+        else:
+            raise AssertionError("MDS daemon '{0}' did not crash as expected".format(daemon_id))
diff --git a/src/ceph/qa/tasks/cephfs/filesystem.py b/src/ceph/qa/tasks/cephfs/filesystem.py
new file mode 100644
index 0000000..9638fd5
--- /dev/null
+++ b/src/ceph/qa/tasks/cephfs/filesystem.py
@@ -0,0 +1,1213 @@
+
+from StringIO import StringIO
+import json
+import logging
+from gevent import Greenlet
+import os
+import time
+import datetime
+import re
+import errno
+import random
+
+from teuthology.exceptions import CommandFailedError
+from teuthology import misc
+from teuthology.nuke import clear_firewall
+from teuthology.parallel import parallel
+from tasks.ceph_manager import write_conf
+from tasks import ceph_manager
+
+
+log = logging.getLogger(__name__)
+
+
+DAEMON_WAIT_TIMEOUT = 120
+ROOT_INO = 1
+
+
+class ObjectNotFound(Exception):
+    """
+    Exception carrying the name of an object that could not be found; the
+    name is included in the string form of the exception.
+    """
+    def __init__(self, object_name):
+        self._object_name = object_name
+
+    def __str__(self):
+        template = "Object not found: '{0}'"
+        return template.format(self._object_name)
+
+class FSStatus(object):
+    """
+    Operations on a snapshot of the FSMap.
+
+    The map is fetched once, in the constructor, by running
+    "fs dump --format=json" through the given mon manager.
+    """
+    def __init__(self, mon_manager):
+        self.mon = mon_manager
+        self.map = json.loads(self.mon.raw_cluster_cmd("fs", "dump", "--format=json"))
+
+    def __str__(self):
+        return json.dumps(self.map, indent = 2, sort_keys = True)
+
+    # Expose the fsmap for manual inspection.
+    def __getitem__(self, key):
+        """
+        Get a field from the fsmap.
+        """
+        return self.map[key]
+
+    def get_filesystems(self):
+        """
+        Iterator for all filesystems.
+        """
+        for fs in self.map['filesystems']:
+            yield fs
+
+    def get_all(self):
+        """
+        Iterator for all the mds_info components in the FSMap.
+
+        Standbys are yielded first, then the daemons listed in each
+        filesystem's mdsmap.
+        """
+        for info in self.get_standbys():
+            yield info
+        for fs in self.get_filesystems():
+            for info in fs['mdsmap']['info'].values():
+                yield info
+
+    def get_standbys(self):
+        """
+        Iterator for all standbys.
+        """
+        for info in self.map['standbys']:
+            yield info
+
+    def get_fsmap(self, fscid):
+        """
+        Get the fsmap for the given FSCID.
+
+        If fscid is None the first filesystem in the map is returned.
+
+        :raises RuntimeError: if no filesystem matches
+        """
+        for fs in self.get_filesystems():
+            if fscid is None or fs['id'] == fscid:
+                return fs
+        raise RuntimeError("FSCID {0} not in map".format(fscid))
+
+    def get_fsmap_byname(self, name):
+        """
+        Get the fsmap for the given file system name.
+
+        If name is None the first filesystem in the map is returned.
+
+        :raises RuntimeError: if no filesystem matches
+        """
+        for fs in self.get_filesystems():
+            if name is None or fs['mdsmap']['fs_name'] == name:
+                return fs
+        raise RuntimeError("FS {0} not in map".format(name))
+
+    def get_replays(self, fscid):
+        """
+        Get the standby:replay MDS for the given FSCID.
+        """
+        fs = self.get_fsmap(fscid)
+        for info in fs['mdsmap']['info'].values():
+            if info['state'] == 'up:standby-replay':
+                yield info
+
+    def get_ranks(self, fscid):
+        """
+        Get the ranks for the given FSCID.
+
+        Only daemons whose 'rank' is non-negative are yielded.
+        """
+        fs = self.get_fsmap(fscid)
+        for info in fs['mdsmap']['info'].values():
+            if info['rank'] >= 0:
+                yield info
+
+    def get_rank(self, fscid, rank):
+        """
+        Get the rank for the given FSCID.
+
+        :raises RuntimeError: if the filesystem has no daemon with that rank
+        """
+        for info in self.get_ranks(fscid):
+            if info['rank'] == rank:
+                return info
+        raise RuntimeError("FSCID {0} has no rank {1}".format(fscid, rank))
+
+    def get_mds(self, name):
+        """
+        Get the info for the given MDS name, or None if it is not in the map.
+        """
+        for info in self.get_all():
+            if info['name'] == name:
+                return info
+        return None
+
+    def get_mds_addr(self, name):
+        """
+        Return the instance addr as a string, like "10.214.133.138:6807/10825"
+
+        :raises RuntimeError: if the MDS is not in the map
+        """
+        info = self.get_mds(name)
+        if info:
+            return info['addr']
+        else:
+            log.warning(json.dumps(list(self.get_all()), indent=2))  # dump for debugging
+            raise RuntimeError("MDS id '{0}' not found in map".format(name))
+
+class CephCluster(object):
+    """
+    Handle on the cluster described by a teuthology context: owns a
+    CephManager bound to the first monitor's remote, and offers helpers for
+    reading daemon config over the admin socket and editing ceph.conf.
+    """
+    @property
+    def admin_remote(self):
+        # The remote hosting the first mon; exactly one remote is expected,
+        # the tuple unpacking fails otherwise.
+        mon_role = misc.get_first_mon(self._ctx, None)
+        mon_remotes = self._ctx.cluster.only(mon_role).remotes
+        (remote,) = mon_remotes.iterkeys()
+        return remote
+
+    def __init__(self, ctx):
+        self._ctx = ctx
+        manager_log = log.getChild('ceph_manager')
+        self.mon_manager = ceph_manager.CephManager(self.admin_remote, ctx=ctx, logger=manager_log)
+
+    def get_config(self, key, service_type=None):
+        """
+        Get config from mon by default, or a specific service if caller asks for it
+
+        The value is read from the admin socket of the first daemon (in
+        sorted order of ID) of the chosen service type.
+        """
+        if service_type is None:
+            service_type = 'mon'
+
+        candidates = sorted(misc.all_roles_of_type(self._ctx.cluster, service_type))
+        service_id = candidates[0]
+        response = self.json_asok(['config', 'get', key], service_type, service_id)
+        return response[key]
+
+    def set_ceph_conf(self, subsys, key, value):
+        """
+        Set conf[subsys][key] = value in the context's ceph config (creating
+        the section if needed) and write the config out.
+        """
+        conf = self._ctx.ceph['ceph'].conf
+        if subsys not in conf:
+            conf[subsys] = {}
+        conf[subsys][key] = value
+        # XXX because we don't have the ceph task's config object, if they
+        # used a different config path this won't work.
+        write_conf(self._ctx)
+
+    def clear_ceph_conf(self, subsys, key):
+        """
+        Remove conf[subsys][key] from the context's ceph config and write the
+        config out.
+        """
+        section = self._ctx.ceph['ceph'].conf[subsys]
+        del section[key]
+        write_conf(self._ctx)
+
+    def json_asok(self, command, service_type, service_id):
+        """
+        Run an admin socket command against the given daemon and return its
+        output parsed as JSON, or None if the output was blank.
+        """
+        proc = self.mon_manager.admin_socket(service_type, service_id, command)
+        raw = proc.stdout.getvalue()
+        log.info("_json_asok output: {0}".format(raw))
+        if not raw.strip():
+            return None
+        return json.loads(raw)
+
+
+class MDSCluster(CephCluster):
+    """
+    Collective operations on all the MDS daemons in the Ceph cluster.  These
+    daemons may be in use by various Filesystems.
+
+    For the benefit of pre-multi-filesystem tests, this class is also
+    a parent of Filesystem.  The correct way to use MDSCluster going forward is
+    as a separate instance outside of your (multiple) Filesystem instances.
+    """
+    def __init__(self, ctx):
+        super(MDSCluster, self).__init__(ctx)
+
+        self.mds_ids = list(misc.all_roles_of_type(ctx.cluster, 'mds'))
+
+        if len(self.mds_ids) == 0:
+            raise RuntimeError("This task requires at least one MDS")
+
+        if hasattr(self._ctx, "daemons"):
+            # Presence of 'daemons' attribute implies ceph task rather than ceph_deploy task
+            self.mds_daemons = {
+                mds_id: self._ctx.daemons.get_daemon('mds', mds_id)
+                for mds_id in self.mds_ids
+            }
+
+    def _one_or_all(self, mds_id, cb, in_parallel=True):
+        """
+        Call a callback for a single named MDS, or for all.
+
+        Note that the parallelism here isn't for performance, it's to avoid being overly kind
+        to the cluster by waiting a graceful ssh-latency of time between doing things, and to
+        avoid being overly kind by executing them in a particular order.  However, some actions
+        don't cope with being done in parallel, so it's optional (`in_parallel`)
+
+        :param mds_id: MDS daemon name, or None
+        :param cb: Callback taking single argument of MDS daemon name
+        :param in_parallel: whether to invoke callbacks concurrently (else one after the other)
+        """
+        if mds_id is None:
+            if in_parallel:
+                with parallel() as p:
+                    for each_id in self.mds_ids:
+                        p.spawn(cb, each_id)
+            else:
+                for each_id in self.mds_ids:
+                    cb(each_id)
+        else:
+            cb(mds_id)
+
+    def get_config(self, key, service_type=None):
+        """
+        get_config specialization of service_type="mds"
+
+        For any other service type this defers to the parent class.  For
+        "mds" the value is read from a randomly chosen running MDS daemon.
+
+        :raises ValueError: if service_type is "mds" and no MDS daemon is running
+        """
+        if service_type != "mds":
+            return super(MDSCluster, self).get_config(key, service_type)
+
+        # Some tests stop MDS daemons, don't send commands to a dead one:
+        running = [mds_id for mds_id in self.mds_daemons
+                   if self.mds_daemons[mds_id].running()]
+        if not running:
+            # ValueError is what sampling from an empty population raised before
+            raise ValueError("No running MDS daemon to read config '{0}' from".format(key))
+        service_id = random.choice(running)
+        return self.json_asok(['config', 'get', key], service_type, service_id)[key]
+
+    def mds_stop(self, mds_id=None):
+        """
+        Stop the MDS daemon process(es).  If it held a rank, that rank
+        will eventually go laggy.
+        """
+        self._one_or_all(mds_id, lambda id_: self.mds_daemons[id_].stop())
+
+    def mds_fail(self, mds_id=None):
+        """
+        Inform MDSMonitor of the death of the daemon process(es).  If it held
+        a rank, that rank will be relinquished.
+        """
+        self._one_or_all(mds_id, lambda id_: self.mon_manager.raw_cluster_cmd("mds", "fail", id_))
+
+    def mds_restart(self, mds_id=None):
+        """
+        Restart the named MDS daemon process, or all of them if mds_id is None.
+        """
+        self._one_or_all(mds_id, lambda id_: self.mds_daemons[id_].restart())
+
+    def mds_fail_restart(self, mds_id=None):
+        """
+        Variation on restart that includes marking MDSs as failed, so that doing this
+        operation followed by waiting for healthy daemon states guarantees that they
+        have gone down and come up, rather than potentially seeing the healthy states
+        that existed before the restart.
+        """
+        def _fail_restart(id_):
+            self.mds_daemons[id_].stop()
+            self.mon_manager.raw_cluster_cmd("mds", "fail", id_)
+            self.mds_daemons[id_].restart()
+
+        self._one_or_all(mds_id, _fail_restart)
+
+    def newfs(self, name='cephfs', create=True):
+        """
+        Build a Filesystem object on this cluster's context.
+        """
+        return Filesystem(self._ctx, name=name, create=create)
+
+    def status(self):
+        """
+        Return a fresh FSStatus snapshot of the FSMap.
+        """
+        return FSStatus(self.mon_manager)
+
+    def delete_all_filesystems(self):
+        """
+        Remove all filesystems that exist, and any pools in use by them.
+        """
+        pools = json.loads(self.mon_manager.raw_cluster_cmd("osd", "dump", "--format=json-pretty"))['pools']
+        # Map pool ID -> pool name, since the mdsmap refers to pools by ID
+        pool_id_name = {pool['pool']: pool['pool_name'] for pool in pools}
+
+        # mark cluster down for each fs to prevent churn during deletion
+        status = self.status()
+        for fs in status.get_filesystems():
+            self.mon_manager.raw_cluster_cmd("fs", "set", fs['mdsmap']['fs_name'], "cluster_down", "true")
+
+        # get a new copy as actives may have since changed
+        status = self.status()
+        for fs in status.get_filesystems():
+            mdsmap = fs['mdsmap']
+            metadata_pool = pool_id_name[mdsmap['metadata_pool']]
+
+            for gid in mdsmap['up'].values():
+                self.mon_manager.raw_cluster_cmd('mds', 'fail', str(gid))
+
+            self.mon_manager.raw_cluster_cmd('fs', 'rm', mdsmap['fs_name'], '--yes-i-really-mean-it')
+            self.mon_manager.raw_cluster_cmd('osd', 'pool', 'delete',
+                                             metadata_pool, metadata_pool,
+                                             '--yes-i-really-really-mean-it')
+            for data_pool in mdsmap['data_pools']:
+                data_pool = pool_id_name[data_pool]
+                try:
+                    self.mon_manager.raw_cluster_cmd('osd', 'pool', 'delete',
+                                                     data_pool, data_pool,
+                                                     '--yes-i-really-really-mean-it')
+                except CommandFailedError as e:
+                    # EBUSY: this data pool is used by two metadata pools,
+                    # let the 2nd pass delete it
+                    if e.exitstatus != errno.EBUSY:
+                        raise
+
+    def get_standby_daemons(self):
+        """
+        Return the set of names of the daemons currently listed as standbys.
+        """
+        return set([s['name'] for s in self.status().get_standbys()])
+
+    def get_mds_hostnames(self):
+        """
+        Return the distinct hostnames of the remotes hosting MDS daemons.
+        """
+        result = set()
+        for mds_id in self.mds_ids:
+            mds_remote = self.mon_manager.find_remote('mds', mds_id)
+            result.add(mds_remote.hostname)
+
+        return list(result)
+
+    def set_clients_block(self, blocked, mds_id=None):
+        """
+        Block (using iptables) client communications to this MDS.  Be careful: if
+        other services are running on this MDS, or other MDSs try to talk to this
+        MDS, their communications may also be blocked as collateral damage.
+
+        :param blocked: True to add the REJECT rules, False to delete them
+        :param mds_id: Optional ID of MDS to block, default to all
+        :return:
+        """
+        da_flag = "-A" if blocked else "-D"
+
+        def set_block(_mds_id):
+            remote = self.mon_manager.find_remote('mds', _mds_id)
+            status = self.status()
+
+            addr = status.get_mds_addr(_mds_id)
+            # addr looks like "<ip>:<port>/<instance>"; only the port is needed
+            match = re.match("(.+):(.+)/(.+)", addr)
+            if match is None:
+                raise RuntimeError("Unexpected MDS address format '{0}'".format(addr))
+            port_str = match.group(2)
+
+            remote.run(
+                args=["sudo", "iptables", da_flag, "OUTPUT", "-p", "tcp", "--sport", port_str, "-j", "REJECT", "-m",
+                      "comment", "--comment", "teuthology"])
+            remote.run(
+                args=["sudo", "iptables", da_flag, "INPUT", "-p", "tcp", "--dport", port_str, "-j", "REJECT", "-m",
+                      "comment", "--comment", "teuthology"])
+
+        self._one_or_all(mds_id, set_block, in_parallel=False)
+
+    def clear_firewall(self):
+        """
+        Delegate to teuthology.nuke.clear_firewall for this context.
+        """
+        clear_firewall(self._ctx)
+
+    def get_mds_info(self, mds_id):
+        """
+        Return the FSMap info for the named MDS, or None if it is not in the map.
+        """
+        return FSStatus(self.mon_manager).get_mds(mds_id)
+
+    def is_full(self):
+        """
+        Return whether 'full' appears in the 'flags' field of "osd dump".
+        """
+        flags = json.loads(self.mon_manager.raw_cluster_cmd("osd", "dump", "--format=json-pretty"))['flags']
+        return 'full' in flags
+
+    def is_pool_full(self, pool_name):
+        """
+        Return whether the named pool carries the 'full' flag in "osd dump".
+
+        :raises RuntimeError: if no pool of that name exists
+        """
+        pools = json.loads(self.mon_manager.raw_cluster_cmd("osd", "dump", "--format=json-pretty"))['pools']
+        for pool in pools:
+            if pool['pool_name'] == pool_name:
+                return 'full' in pool['flags_names'].split(",")
+
+        raise RuntimeError("Pool not found '{0}'".format(pool_name))
+
+class Filesystem(MDSCluster):
+    """
+    This object is for driving a CephFS filesystem.  The MDS daemons driven by
+    MDSCluster may be shared with other Filesystems.
+    """
+    def __init__(self, ctx, fscid=None, name=None, create=False,
+                 ec_profile=None):
+        """
+        :param ctx: teuthology context
+        :param fscid: ID of an existing filesystem to attach to; must not be
+                      combined with `name`
+        :param name: filesystem name
+        :param create: when a name is given, create the filesystem unless
+                       legacy_configured() reports true
+        :param ec_profile: erasure code profile settings, stored for create()
+        :raises RuntimeError: if both name and fscid are given
+        """
+        super(Filesystem, self).__init__(ctx)
+
+        self.name = name
+        self.ec_profile = ec_profile
+        self.id = None
+        self.metadata_pool_name = None
+        self.metadata_overlay = False
+        self.data_pool_name = None
+        self.data_pools = None
+
+        # Remember the first client role and the remote it lives on
+        client_roles = list(misc.all_roles_of_type(self._ctx.cluster, 'client'))
+        self.client_id = client_roles[0]
+        first_client = "client.{0}".format(self.client_id)
+        self.client_remote = list(misc.get_clients(ctx=ctx, roles=[first_client]))[0][1]
+
+        if name is not None:
+            if fscid is not None:
+                raise RuntimeError("cannot specify fscid when creating fs")
+            if create and not self.legacy_configured():
+                self.create()
+        elif fscid is not None:
+            self.id = fscid
+            self.getinfo(refresh = True)
+
+        # Stash a reference to the first created filesystem on ctx, so
+        # that if someone drops to the interactive shell they can easily
+        # poke our methods.
+        if not hasattr(self._ctx, "filesystem"):
+            self._ctx.filesystem = self
+
+    def getinfo(self, refresh = False):
+        """
+        Look this filesystem up in a fresh FSMap snapshot (by ID if known,
+        else by name, else expecting exactly one filesystem to exist), update
+        self.id / self.name from it and refresh the pool names.
+
+        :return: the FSStatus snapshot that was used
+        :raises RuntimeError: if neither ID nor name is known and the map
+                              does not hold exactly one filesystem
+        """
+        snapshot = self.status()
+        if self.id is not None:
+            entry = snapshot.get_fsmap(self.id)
+        elif self.name is not None:
+            entry = snapshot.get_fsmap_byname(self.name)
+        else:
+            candidates = list(snapshot.get_filesystems())
+            if not candidates:
+                raise RuntimeError("no file system available")
+            if len(candidates) > 1:
+                raise RuntimeError("more than one file system available")
+            entry = candidates[0]
+        self.id = entry['id']
+        self.name = entry['mdsmap']['fs_name']
+        self.get_pool_names(status = snapshot, refresh = refresh)
+        return snapshot
+
+    def set_metadata_overlay(self, overlay):
+        """
+        Record whether create() should make this filesystem with a metadata
+        overlay.  Only allowed while the filesystem has no ID yet.
+
+        :raises RuntimeError: if self.id is already set
+        """
+        already_identified = self.id is not None
+        if already_identified:
+            raise RuntimeError("cannot specify fscid when configuring overlay")
+        self.metadata_overlay = overlay
+
+    def deactivate(self, rank):
+        """
+        Issue "mds deactivate <fscid>:<rank>" for a rank of this filesystem.
+
+        :raises RuntimeError: for a negative rank, or for rank 0
+        """
+        if rank < 0:
+            raise RuntimeError("invalid rank")
+        if rank == 0:
+            raise RuntimeError("cannot deactivate rank 0")
+        target = "%d:%d" % (self.id, rank)
+        self.mon_manager.raw_cluster_cmd("mds", "deactivate", target)
+
+    def set_max_mds(self, max_mds):
+        """
+        Run "fs set <name> max_mds <max_mds>" for this filesystem.
+        """
+        wanted = "%d" % max_mds
+        self.mon_manager.raw_cluster_cmd("fs", "set", self.name, "max_mds", wanted)
+
+    def set_allow_dirfrags(self, yes):
+        """
+        Run "fs set <name> allow_dirfrags <value> --yes-i-really-mean-it",
+        where <value> is the lower-cased string form of `yes`.
+        """
+        flag_value = str(yes).lower()
+        self.mon_manager.raw_cluster_cmd(
+            "fs", "set", self.name, "allow_dirfrags", flag_value,
+            '--yes-i-really-mean-it')
+
+    def get_pgs_per_fs_pool(self):
+        """
+        Calculate how many PGs to use when creating a pool, in order to avoid raising any
+        health warnings about mon_pg_warn_min_per_osd
+
+        The result is that setting multiplied by the number of 'osd' roles in
+        the cluster.
+
+        :return: an integer number of PGs
+        """
+        osd_roles = list(misc.all_roles_of_type(self._ctx.cluster, 'osd'))
+        min_pgs_per_osd = int(self.get_config('mon_pg_warn_min_per_osd'))
+        return min_pgs_per_osd * len(osd_roles)
+
+    def create(self):
+        """
+        Create the filesystem and its pools on the cluster.
+
+        Defaults are filled in for a missing name ("cephfs") and pool names
+        ("<name>_metadata" / "<name>_data").  The metadata pool is always
+        created.  With metadata_overlay set, the data pool is not created
+        here and "fs new" is run with --allow-dangerous-metadata-overlay;
+        otherwise a replicated or (when ec_profile is set) erasure coded
+        data pool is created first.  Finally the object's info is refreshed
+        via getinfo().
+        """
+        if self.name is None:
+            self.name = "cephfs"
+        if self.metadata_pool_name is None:
+            self.metadata_pool_name = "{0}_metadata".format(self.name)
+        if self.data_pool_name is None:
+            data_pool_name = "{0}_data".format(self.name)
+        else:
+            data_pool_name = self.data_pool_name
+
+        log.info("Creating filesystem '{0}'".format(self.name))
+
+        # The cluster command takes the PG count as a string argument
+        pg_count = str(self.get_pgs_per_fs_pool())
+
+        self.mon_manager.raw_cluster_cmd('osd', 'pool', 'create',
+                                         self.metadata_pool_name, pg_count)
+        if self.metadata_overlay:
+            self.mon_manager.raw_cluster_cmd('fs', 'new',
+                                             self.name, self.metadata_pool_name, data_pool_name,
+                                             '--allow-dangerous-metadata-overlay')
+        else:
+            if self.ec_profile:
+                log.info("EC profile is %s", self.ec_profile)
+                # The profile is registered under the data pool's name
+                cmd = ['osd', 'erasure-code-profile', 'set', data_pool_name]
+                cmd.extend(self.ec_profile)
+                self.mon_manager.raw_cluster_cmd(*cmd)
+                self.mon_manager.raw_cluster_cmd(
+                    'osd', 'pool', 'create',
+                    data_pool_name, pg_count, 'erasure',
+                    data_pool_name)
+                self.mon_manager.raw_cluster_cmd(
+                    'osd', 'pool', 'set',
+                    data_pool_name, 'allow_ec_overwrites', 'true')
+            else:
+                self.mon_manager.raw_cluster_cmd(
+                    'osd', 'pool', 'create',
+                    data_pool_name, pg_count)
+            self.mon_manager.raw_cluster_cmd('fs', 'new',
+                                             self.name, self.metadata_pool_name, data_pool_name)
+        self.check_pool_application(self.metadata_pool_name)
+        self.check_pool_application(data_pool_name)
+        # Turn off spurious standby count warnings from modifying max_mds in tests.
+        try:
+            self.mon_manager.raw_cluster_cmd('fs', 'set', self.name, 'standby_count_wanted', '0')
+        except CommandFailedError as e:
+            # EINVAL: standby_count_wanted not available prior to luminous
+            # (upgrade tests would fail otherwise)
+            if e.exitstatus != errno.EINVAL:
+                raise
+
+        self.getinfo(refresh = True)
+
+        
+    def check_pool_application(self, pool_name):
+        osd_map = self.mon_manager.get_osd_dump_json()
+        for pool in osd_map['pools']:
+            if pool['pool_name'] == pool_name:
+                if "application_metadata" in pool:
+                    if not "cephfs" in pool['application_metadata']:
+                        raise RuntimeError("Pool %p does not name cephfs as application!".\
+                                           format(pool_name))
+        
+
+    def __del__(self):
+        if getattr(self._ctx, "filesystem", None) == self:
+            delattr(self._ctx, "filesystem")
+
+    def exists(self):
+        """
+        Whether a filesystem exists in the mon's filesystem list
+        """
+        fs_list = json.loads(self.mon_manager.raw_cluster_cmd('fs', 'ls', '--format=json-pretty'))
+        return self.name in [fs['name'] for fs in fs_list]
+
+    def legacy_configured(self):
+        """
+        Check if a legacy (i.e. pre "fs new") filesystem configuration is present.  If this is
+        the case, the caller should avoid using Filesystem.create
+
+        As a side effect, self.metadata_pool_name is set to 'metadata' when a
+        pool of that name is found.
+
+        :return: True if a pool called 'metadata' exists, or if the pool
+                 listing failed with exit status 22; False otherwise
+        """
+        try:
+            out_text = self.mon_manager.raw_cluster_cmd('--format=json-pretty', 'osd', 'lspools')
+            pools = json.loads(out_text)
+            # A pool literally named 'metadata' is taken as the sign of a
+            # legacy configuration
+            metadata_pool_exists = 'metadata' in [p['poolname'] for p in pools]
+            if metadata_pool_exists:
+                self.metadata_pool_name = 'metadata'
+        except CommandFailedError as e:
+            # For use in upgrade tests, Ceph cuttlefish and earlier don't support
+            # structured output (--format) from the CLI.
+            if e.exitstatus == 22:
+                # Cannot inspect the pools, so the legacy pool is assumed
+                # to be there
+                metadata_pool_exists = True
+            else:
+                raise
+
+        return metadata_pool_exists
+
+    def _df(self):
+        return json.loads(self.mon_manager.raw_cluster_cmd("df", "--format=json-pretty"))
+
+    def get_mds_map(self):
+        return self.status().get_fsmap(self.id)['mdsmap']
+
+    def add_data_pool(self, name):
+        self.mon_manager.raw_cluster_cmd('osd', 'pool', 'create', name, self.get_pgs_per_fs_pool().__str__())
+        self.mon_manager.raw_cluster_cmd('fs', 'add_data_pool', self.name, name)
+        self.get_pool_names(refresh = True)
+        for poolid, fs_name in self.data_pools.items():
+            if name == fs_name:
+                return poolid
+        raise RuntimeError("could not get just created pool '{0}'".format(name))
+
+    def get_pool_names(self, refresh = False, status = None):
+        if refresh or self.metadata_pool_name is None or self.data_pools is None:
+            if status is None:
+                status = self.status()
+            fsmap = status.get_fsmap(self.id)
+
+            osd_map = self.mon_manager.get_osd_dump_json()
+            id_to_name = {}
+            for p in osd_map['pools']:
+                id_to_name[p['pool']] = p['pool_name']
+
+            self.metadata_pool_name = id_to_name[fsmap['mdsmap']['metadata_pool']]
+            self.data_pools = {}
+            for data_pool in fsmap['mdsmap']['data_pools']:
+                self.data_pools[data_pool] = id_to_name[data_pool]
+
+    def get_data_pool_name(self, refresh = False):
+        if refresh or self.data_pools is None:
+            self.get_pool_names(refresh = True)
+        assert(len(self.data_pools) == 1)
+        return self.data_pools.values()[0]
+
+    def get_data_pool_id(self, refresh = False):
+        """
+        Don't call this if you have multiple data pools
+        :return: integer
+        """
+        if refresh or self.data_pools is None:
+            self.get_pool_names(refresh = True)
+        assert(len(self.data_pools) == 1)
+        return self.data_pools.keys()[0]
+
+    def get_data_pool_names(self, refresh = False):
+        if refresh or self.data_pools is None:
+            self.get_pool_names(refresh = True)
+        return self.data_pools.values()
+
+    def get_metadata_pool_name(self):
+        """
+        Return the currently stored metadata pool name (may be None if it
+        has not been set or looked up yet); no cluster query is made here
+        """
+        return self.metadata_pool_name
+
+    def set_data_pool_name(self, name):
+        if self.id is not None:
+            raise RuntimeError("can't set filesystem name if its fscid is set")
+        self.data_pool_name = name
+
+    def get_namespace_id(self):
+        """
+        Return self.id (presumably the fscid of this filesystem -- confirm
+        against the code that assigns it)
+        """
+        return self.id
+
+    def get_pool_df(self, pool_name):
+        """
+        Return a dict like:
+        {u'bytes_used': 0, u'max_avail': 83848701, u'objects': 0, u'kb_used': 0}
+        """
+        for pool_df in self._df()['pools']:
+            if pool_df['name'] == pool_name:
+                return pool_df['stats']
+
+        raise RuntimeError("Pool name '{0}' not found".format(pool_name))
+
+    def get_usage(self):
+        return self._df()['stats']['total_used_bytes']
+
+    def are_daemons_healthy(self):
+        """
+        Return true if all daemons are in one of active, standby, standby-replay, and
+        at least max_mds daemons are in 'active'.
+
+        Unlike most of Filesystem, this function is tolerant of new-style `fs`
+        commands being missing, because we are part of the ceph installation
+        process during upgrade suites, so must fall back to old style commands
+        when we get an EINVAL on a new style command.
+
+        :return: True if healthy as described above and every active daemon
+                 also reports up:active over its admin socket, else False
+        """
+
+        active_count = 0
+        try:
+            mds_map = self.get_mds_map()
+        except CommandFailedError as cfe:
+            # Old version, fall back to non-multi-fs commands
+            if cfe.exitstatus == errno.EINVAL:
+                mds_map = json.loads(
+                        self.mon_manager.raw_cluster_cmd('mds', 'dump', '--format=json'))
+            else:
+                raise
+
+        log.info("are_daemons_healthy: mds map: {0}".format(mds_map))
+
+        # First pass: any daemon in an unexpected state makes us unhealthy
+        # straight away; count the active ones as we go.
+        for mds_id, mds_status in mds_map['info'].items():
+            if mds_status['state'] not in ["up:active", "up:standby", "up:standby-replay"]:
+                log.warning("Unhealthy mds state {0}:{1}".format(mds_id, mds_status['state']))
+                return False
+            elif mds_status['state'] == 'up:active':
+                active_count += 1
+
+        log.info("are_daemons_healthy: {0}/{1}".format(
+            active_count, mds_map['max_mds']
+        ))
+
+        if active_count >= mds_map['max_mds']:
+            # The MDSMap says these guys are active, but let's check they really are
+            for mds_id, mds_status in mds_map['info'].items():
+                if mds_status['state'] == 'up:active':
+                    try:
+                        # Ask the daemon itself, addressed by name rather
+                        # than by its key in the map
+                        daemon_status = self.mds_asok(["status"], mds_id=mds_status['name'])
+                    except CommandFailedError as cfe:
+                        if cfe.exitstatus == errno.EINVAL:
+                            # Old version, can't do this check
+                            continue
+                        else:
+                            # MDS not even running
+                            return False
+
+                    if daemon_status['state'] != 'up:active':
+                        # MDS hasn't taken the latest map yet
+                        return False
+
+            return True
+        else:
+            # Not enough active daemons yet
+            return False
+
+    def get_daemon_names(self, state=None):
+        """
+        Return MDS daemon names of those daemons in the given state
+        :param state:
+        :return:
+        """
+        status = self.get_mds_map()
+        result = []
+        for mds_status in sorted(status['info'].values(), lambda a, b: cmp(a['rank'], b['rank'])):
+            if mds_status['state'] == state or state is None:
+                result.append(mds_status['name'])
+
+        return result
+
+    def get_active_names(self):
+        """
+        Return MDS daemon names of those daemons holding ranks
+        in state up:active
+
+        :return: list of strings like ['a', 'b'], sorted by rank
+        """
+        return self.get_daemon_names("up:active")
+
+    def get_all_mds_rank(self):
+        status = self.get_mds_map()
+        result = []
+        for mds_status in sorted(status['info'].values(), lambda a, b: cmp(a['rank'], b['rank'])):
+            if mds_status['rank'] != -1 and mds_status['state'] != 'up:standby-replay':
+                result.append(mds_status['rank'])
+
+        return result
+
+    def get_rank_names(self):
+        """
+        Return MDS daemon names of those daemons holding a rank,
+        sorted by rank.  This includes e.g. up:replay/reconnect
+        as well as active, but does not include standby or
+        standby-replay.
+        """
+        status = self.get_mds_map()
+        result = []
+        for mds_status in sorted(status['info'].values(), lambda a, b: cmp(a['rank'], b['rank'])):
+            if mds_status['rank'] != -1 and mds_status['state'] != 'up:standby-replay':
+                result.append(mds_status['name'])
+
+        return result
+
+    def wait_for_daemons(self, timeout=None):
+        """
+        Wait until all daemons are healthy
+        :return:
+        """
+
+        if timeout is None:
+            timeout = DAEMON_WAIT_TIMEOUT
+
+        elapsed = 0
+        while True:
+            if self.are_daemons_healthy():
+                return
+            else:
+                time.sleep(1)
+                elapsed += 1
+
+            if elapsed > timeout:
+                raise RuntimeError("Timed out waiting for MDS daemons to become healthy")
+
+    def get_lone_mds_id(self):
+        """
+        Get a single MDS ID: the only one if there is only one
+        configured, else the only one currently holding a rank,
+        else raise an error.
+        """
+        if len(self.mds_ids) != 1:
+            alive = self.get_rank_names()
+            if len(alive) == 1:
+                return alive[0]
+            else:
+                raise ValueError("Explicit MDS argument required when multiple MDSs in use")
+        else:
+            return self.mds_ids[0]
+
+    def recreate(self):
+        """
+        Delete all filesystems and create this one again from scratch
+        """
+        log.info("Creating new filesystem")
+        self.delete_all_filesystems()
+        # Forget the old id before creating the filesystem again
+        self.id = None
+        self.create()
+
+    def put_metadata_object_raw(self, object_id, infile):
+        """
+        Save an object to the metadata pool
+        """
+        temp_bin_path = infile
+        self.client_remote.run(args=[
+            'sudo', os.path.join(self._prefix, 'rados'), '-p', self.metadata_pool_name, 'put', object_id, temp_bin_path
+        ])
+
+    def get_metadata_object_raw(self, object_id):
+        """
+        Retrieve an object from the metadata pool and store it in a file.
+        """
+        temp_bin_path = '/tmp/' + object_id + '.bin'
+
+        self.client_remote.run(args=[
+            'sudo', os.path.join(self._prefix, 'rados'), '-p', self.metadata_pool_name, 'get', object_id, temp_bin_path
+        ])
+
+        return temp_bin_path
+
+    def get_metadata_object(self, object_type, object_id):
+        """
+        Retrieve an object from the metadata pool, pass it through
+        ceph-dencoder to dump it to JSON, and return the decoded object.
+
+        :param object_type: type name handed to `ceph-dencoder type`
+        :param object_id: name of the object in the metadata pool
+        :return: the parsed JSON dump
+        :raises TypeError, ValueError: re-raised if the dencoder output is
+                                       not valid JSON
+        """
+        # NB fixed path: every call overwrites the same file on client_remote
+        temp_bin_path = '/tmp/out.bin'
+
+        self.client_remote.run(args=[
+            'sudo', os.path.join(self._prefix, 'rados'), '-p', self.metadata_pool_name, 'get', object_id, temp_bin_path
+        ])
+
+        stdout = StringIO()
+        self.client_remote.run(args=[
+            'sudo', os.path.join(self._prefix, 'ceph-dencoder'), 'type', object_type, 'import', temp_bin_path, 'decode', 'dump_json'
+        ], stdout=stdout)
+        dump_json = stdout.getvalue().strip()
+        try:
+            dump = json.loads(dump_json)
+        except (TypeError, ValueError):
+            # Log the offending text before letting the error propagate
+            log.error("Failed to decode JSON: '{0}'".format(dump_json))
+            raise
+
+        return dump
+
+    def get_journal_version(self):
+        """
+        Read the JournalPointer and Journal::Header objects to learn the version of
+        encoding in use.
+        """
+        journal_pointer_object = '400.00000000'
+        journal_pointer_dump = self.get_metadata_object("JournalPointer", journal_pointer_object)
+        journal_ino = journal_pointer_dump['journal_pointer']['front']
+
+        journal_header_object = "{0:x}.00000000".format(journal_ino)
+        journal_header_dump = self.get_metadata_object('Journaler::Header', journal_header_object)
+
+        version = journal_header_dump['journal_header']['stream_format']
+        log.info("Read journal version {0}".format(version))
+
+        return version
+
+    def mds_asok(self, command, mds_id=None):
+        if mds_id is None:
+            mds_id = self.get_lone_mds_id()
+
+        return self.json_asok(command, 'mds', mds_id)
+
+    def read_cache(self, path, depth=None):
+        cmd = ["dump", "tree", path]
+        if depth is not None:
+            cmd.append(depth.__str__())
+        result = self.mds_asok(cmd)
+        if len(result) == 0:
+            raise RuntimeError("Path not found in cache: {0}".format(path))
+
+        return result
+
+    def wait_for_state(self, goal_state, reject=None, timeout=None, mds_id=None, rank=None):
+        """
+        Block until the MDS reaches a particular state, or a failure condition
+        is met.
+
+        When there are multiple MDSs, succeed when exactly one MDS is in the
+        goal state, or fail when any MDS is in the reject state.
+
+        :param goal_state: Return once the MDS is in this state
+        :param reject: Fail if the MDS enters this state before the goal state
+        :param timeout: Fail if this many seconds pass before reaching goal
+        :param mds_id: If given (and rank is not), watch only this daemon
+        :param rank: If given, watch only the daemon holding this rank; takes
+                     precedence over mds_id
+        :return: number of seconds waited (elapsed wall-clock time as
+                 computed from time.time())
+        """
+
+        started_at = time.time()
+        while True:
+            # Fetch a fresh status on every iteration
+            status = self.status()
+            if rank is not None:
+                mds_info = status.get_rank(self.id, rank)
+                current_state = mds_info['state'] if mds_info else None
+                log.info("Looked up MDS state for mds.{0}: {1}".format(rank, current_state))
+            elif mds_id is not None:
+                # mds_info is None if no daemon with this ID exists in the map
+                mds_info = status.get_mds(mds_id)
+                current_state = mds_info['state'] if mds_info else None
+                log.info("Looked up MDS state for {0}: {1}".format(mds_id, current_state))
+            else:
+                # In general, look for a single MDS
+                states = [m['state'] for m in status.get_ranks(self.id)]
+                # True only when exactly one rank is in the goal state
+                if [s for s in states if s == goal_state] == [goal_state]:
+                    current_state = goal_state
+                elif reject in states:
+                    current_state = reject
+                else:
+                    current_state = None
+                log.info("mapped states {0} to {1}".format(states, current_state))
+
+            elapsed = time.time() - started_at
+            if current_state == goal_state:
+                log.info("reached state '{0}' in {1}s".format(current_state, elapsed))
+                return elapsed
+            elif reject is not None and current_state == reject:
+                raise RuntimeError("MDS in reject state {0}".format(current_state))
+            elif timeout is not None and elapsed > timeout:
+                log.error("MDS status at timeout: {0}".format(status.get_fsmap(self.id)))
+                raise RuntimeError(
+                    "Reached timeout after {0} seconds waiting for state {1}, while in state {2}".format(
+                        elapsed, goal_state, current_state
+                    ))
+            else:
+                # Not there yet: poll again in a second
+                time.sleep(1)
+
+    def _read_data_xattr(self, ino_no, xattr_name, type, pool):
+        """
+        Read an xattr of the 0th data object of an inode and decode it with
+        ceph-dencoder.
+
+        :param ino_no: integer inode number
+        :param xattr_name: string name of the xattr
+        :param type: type name handed to `ceph-dencoder type`
+        :param pool: name of data pool or None to use get_data_pool_name()
+        :return: the parsed JSON dump of the xattr
+        :raises ObjectNotFound: if the rados getxattr command fails (for
+                                whatever reason)
+        """
+        # Commands are run on the host of the first MDS
+        mds_id = self.mds_ids[0]
+        remote = self.mds_daemons[mds_id].remote
+        if pool is None:
+            pool = self.get_data_pool_name()
+
+        obj_name = "{0:x}.00000000".format(ino_no)
+
+        args = [
+            os.path.join(self._prefix, "rados"), "-p", pool, "getxattr", obj_name, xattr_name
+        ]
+        try:
+            proc = remote.run(
+                args=args,
+                stdout=StringIO())
+        except CommandFailedError as e:
+            log.error(e.__str__())
+            raise ObjectNotFound(obj_name)
+
+        data = proc.stdout.getvalue()
+
+        # Feed the raw xattr value to ceph-dencoder on stdin ("import -")
+        p = remote.run(
+            args=[os.path.join(self._prefix, "ceph-dencoder"), "type", type, "import", "-", "decode", "dump_json"],
+            stdout=StringIO(),
+            stdin=data
+        )
+
+        return json.loads(p.stdout.getvalue().strip())
+
+    def _write_data_xattr(self, ino_no, xattr_name, data, pool=None):
+        """
+        Write to an xattr of the 0th data object of an inode.  Will
+        succeed whether the object and/or xattr already exist or not.
+
+        :param ino_no: integer inode number
+        :param xattr_name: string name of the xattr
+        :param data: byte array data to write to the xattr
+        :param pool: name of data pool or None to use primary data pool
+        :return: None
+        """
+        remote = self.mds_daemons[self.mds_ids[0]].remote
+        if pool is None:
+            pool = self.get_data_pool_name()
+
+        obj_name = "{0:x}.00000000".format(ino_no)
+        args = [
+            os.path.join(self._prefix, "rados"), "-p", pool, "setxattr",
+            obj_name, xattr_name, data
+        ]
+        remote.run(
+            args=args,
+            stdout=StringIO())
+
+    def read_backtrace(self, ino_no, pool=None):
+        """
+        Read the backtrace from the data pool, return a dict in the format
+        given by inode_backtrace_t::dump, which is something like:
+
+        ::
+
+            rados -p cephfs_data getxattr 10000000002.00000000 parent > out.bin
+            ceph-dencoder type inode_backtrace_t import out.bin decode dump_json
+
+            { "ino": 1099511627778,
+              "ancestors": [
+                    { "dirino": 1,
+                      "dname": "blah",
+                      "version": 11}],
+              "pool": 1,
+              "old_pools": []}
+
+        :param ino_no: integer inode number whose "parent" xattr is read
+        :param pool: name of pool to read backtrace from.  If omitted, FS must have only
+                     one data pool and that will be used.
+        """
+        return self._read_data_xattr(ino_no, "parent", "inode_backtrace_t", pool)
+
+    def read_layout(self, ino_no, pool=None):
+        """
+        Read 'layout' xattr of an inode and parse the result, returning a dict like:
+        ::
+            {
+                "stripe_unit": 4194304,
+                "stripe_count": 1,
+                "object_size": 4194304,
+                "pool_id": 1,
+                "pool_ns": "",
+            }
+
+        :param ino_no: integer inode number whose "layout" xattr is read
+        :param pool: name of pool to read the layout from.  If omitted, FS must have only
+                     one data pool and that will be used.
+        """
+        return self._read_data_xattr(ino_no, "layout", "file_layout_t", pool)
+
+    def _enumerate_data_objects(self, ino, size):
+        """
+        Get the list of expected data objects for a range, and the list of objects
+        that really exist.
+
+        :return a tuple of two lists of strings (expected, actual)
+        """
+        stripe_size = 1024 * 1024 * 4
+
+        size = max(stripe_size, size)
+
+        want_objects = [
+            "{0:x}.{1:08x}".format(ino, n)
+            for n in range(0, ((size - 1) / stripe_size) + 1)
+        ]
+
+        exist_objects = self.rados(["ls"], pool=self.get_data_pool_name()).split("\n")
+
+        return want_objects, exist_objects
+
+    def data_objects_present(self, ino, size):
+        """
+        Check that *all* the expected data objects for an inode are present in the data pool
+        """
+
+        want_objects, exist_objects = self._enumerate_data_objects(ino, size)
+        missing = set(want_objects) - set(exist_objects)
+
+        if missing:
+            log.info("Objects missing (ino {0}, size {1}): {2}".format(
+                ino, size, missing
+            ))
+            return False
+        else:
+            log.info("All objects for ino {0} size {1} found".format(ino, size))
+            return True
+
+    def data_objects_absent(self, ino, size):
+        want_objects, exist_objects = self._enumerate_data_objects(ino, size)
+        present = set(want_objects) & set(exist_objects)
+
+        if present:
+            log.info("Objects not absent (ino {0}, size {1}): {2}".format(
+                ino, size, present
+            ))
+            return False
+        else:
+            log.info("All objects for ino {0} size {1} are absent".format(ino, size))
+            return True
+
+    def dirfrag_exists(self, ino, frag):
+        try:
+            self.rados(["stat", "{0:x}.{1:08x}".format(ino, frag)])
+        except CommandFailedError as e:
+            return False
+        else:
+            return True
+
+    def rados(self, args, pool=None, namespace=None, stdin_data=None):
+        """
+        Call into the `rados` CLI from an MDS
+        """
+
+        if pool is None:
+            pool = self.get_metadata_pool_name()
+
+        # Doesn't matter which MDS we use to run rados commands, they all
+        # have access to the pools
+        mds_id = self.mds_ids[0]
+        remote = self.mds_daemons[mds_id].remote
+
+        # NB we could alternatively use librados pybindings for this, but it's a one-liner
+        # using the `rados` CLI
+        args = ([os.path.join(self._prefix, "rados"), "-p", pool] +
+                (["--namespace", namespace] if namespace else []) +
+                args)
+        p = remote.run(
+            args=args,
+            stdin=stdin_data,
+            stdout=StringIO())
+        return p.stdout.getvalue().strip()
+
+    def list_dirfrag(self, dir_ino):
+        """
+        Read the named object and return the list of omap keys
+
+        :return a list of 0 or more strings
+        """
+
+        dirfrag_obj_name = "{0:x}.00000000".format(dir_ino)
+
+        try:
+            key_list_str = self.rados(["listomapkeys", dirfrag_obj_name])
+        except CommandFailedError as e:
+            log.error(e.__str__())
+            raise ObjectNotFound(dirfrag_obj_name)
+
+        return key_list_str.split("\n") if key_list_str else []
+
+    def erase_metadata_objects(self, prefix):
+        """
+        For all objects in the metadata pool matching the prefix,
+        erase them.
+
+        This O(N) with the number of objects in the pool, so only suitable
+        for use on toy test filesystems.
+        """
+        all_objects = self.rados(["ls"]).split("\n")
+        matching_objects = [o for o in all_objects if o.startswith(prefix)]
+        for o in matching_objects:
+            self.rados(["rm", o])
+
+    def erase_mds_objects(self, rank):
+        """
+        Erase all the per-MDS objects for a particular rank.  This includes
+        inotable, sessiontable, journal
+        """
+
+        def obj_prefix(multiplier):
+            """
+            MDS object naming conventions like rank 1's
+            journal is at 201.***
+            """
+            return "%x." % (multiplier * 0x100 + rank)
+
+        # MDS_INO_LOG_OFFSET
+        self.erase_metadata_objects(obj_prefix(2))
+        # MDS_INO_LOG_BACKUP_OFFSET
+        self.erase_metadata_objects(obj_prefix(3))
+        # MDS_INO_LOG_POINTER_OFFSET
+        self.erase_metadata_objects(obj_prefix(4))
+        # MDSTables & SessionMap
+        self.erase_metadata_objects("mds{rank:d}_".format(rank=rank))
+
+    @property
+    def _prefix(self):
+        """
+        Prefix that is joined (os.path.join) in front of the names of the
+        command line tools we run, e.g. rados and ceph-dencoder.  Override
+        this to set a different one; the default "" leaves the bare tool name.
+        """
+        return ""
+
+    def _run_tool(self, tool, args, rank=None, quiet=False):
+        # Tests frequently have [client] configuration that jacks up
+        # the objecter log level (unlikely to be interesting here)
+        # and does not set the mds log level (very interesting here)
+        if quiet:
+            base_args = [os.path.join(self._prefix, tool), '--debug-mds=1', '--debug-objecter=1']
+        else:
+            base_args = [os.path.join(self._prefix, tool), '--debug-mds=4', '--debug-objecter=1']
+
+        if rank is not None:
+            base_args.extend(["--rank", "%d" % rank])
+
+        t1 = datetime.datetime.now()
+        r = self.tool_remote.run(
+            args=base_args + args,
+            stdout=StringIO()).stdout.getvalue().strip()
+        duration = datetime.datetime.now() - t1
+        log.info("Ran {0} in time {1}, result:\n{2}".format(
+            base_args + args, duration, r
+        ))
+        return r
+
+    @property
+    def tool_remote(self):
+        """
+        An arbitrary remote to use when invoking recovery tools.  Use an MDS host because
+        it'll definitely have keys with perms to access cephfs metadata pool.  This is public
+        so that tests can use this remote to go get locally written output files from the tools.
+        """
+        mds_id = self.mds_ids[0]
+        return self.mds_daemons[mds_id].remote
+
+    def journal_tool(self, args, rank=None, quiet=False):
+        """
+        Invoke cephfs-journal-tool with the passed arguments, and return its stdout
+
+        :param args: list of arguments for the tool
+        :param rank: if not None, passed on to the tool with --rank
+        :param quiet: if True, run with a lower mds debug level
+        """
+        return self._run_tool("cephfs-journal-tool", args, rank, quiet)
+
+    def table_tool(self, args, quiet=False):
+        """
+        Invoke cephfs-table-tool with the passed arguments, and return its stdout
+
+        :param args: list of arguments for the tool
+        :param quiet: if True, run with a lower mds debug level
+        """
+        # No --rank option is passed for this tool
+        return self._run_tool("cephfs-table-tool", args, None, quiet)
+
+    def data_scan(self, args, quiet=False, worker_count=1):
+        """
+        Invoke cephfs-data-scan with the passed arguments, and return its stdout
+
+        :param worker_count: if greater than 1, multiple workers will be run
+                             in parallel and the return value will be None
+        """
+
+        workers = []
+
+        for n in range(0, worker_count):
+            if worker_count > 1:
+                # data-scan args first token is a command, followed by args to it.
+                # insert worker arguments after the command.
+                cmd = args[0]
+                worker_args = [cmd] + ["--worker_n", n.__str__(), "--worker_m", worker_count.__str__()] + args[1:]
+            else:
+                worker_args = args
+
+            workers.append(Greenlet.spawn(lambda wargs=worker_args:
+                                          self._run_tool("cephfs-data-scan", wargs, None, quiet)))
+
+        for w in workers:
+            w.get()
+
+        if worker_count == 1:
+            return workers[0].value
+        else:
+            return None
diff --git a/src/ceph/qa/tasks/cephfs/fuse_mount.py b/src/ceph/qa/tasks/cephfs/fuse_mount.py
new file mode 100644
index 0000000..8d8410c
--- /dev/null
+++ b/src/ceph/qa/tasks/cephfs/fuse_mount.py
@@ -0,0 +1,428 @@
+
+from StringIO import StringIO
+import json
+import time
+import logging
+from textwrap import dedent
+
+from teuthology import misc
+from teuthology.contextutil import MaxWhileTries
+from teuthology.orchestra import run
+from teuthology.orchestra.run import CommandFailedError
+from .mount import CephFSMount
+
+log = logging.getLogger(__name__)
+
+
+class FuseMount(CephFSMount):
+    def __init__(self, client_config, test_dir, client_id, client_remote):
+        """
+        :param client_config: per-client settings mapping; a falsy value is
+                              replaced by an empty dict
+        :param test_dir: handed to CephFSMount.__init__
+        :param client_id: handed to CephFSMount.__init__
+        :param client_remote: handed to CephFSMount.__init__
+        """
+        super(FuseMount, self).__init__(test_dir, client_id, client_remote)
+
+        self.client_config = client_config if client_config else {}
+        # Process handle of the running ceph-fuse, set by _mount()
+        self.fuse_daemon = None
+        # Entry number under /sys/fs/fuse/connections, set by _mount()
+        self._fuse_conn = None
+
+    def mount(self, mount_path=None, mount_fs_name=None):
+        """
+        Start ceph-fuse on the mountpoint, cleaning up if the attempt fails.
+
+        :param mount_path: if not None, passed as --client_mountpoint
+        :param mount_fs_name: if not None, passed as --client_mds_namespace
+        :raises RuntimeError: re-raised from _mount() after a forced unmount
+        """
+        try:
+            return self._mount(mount_path, mount_fs_name)
+        except RuntimeError:
+            # Catch exceptions by the mount() logic (i.e. not remote command
+            # failures) and ensure the mount is not left half-up.
+            # Otherwise we might leave a zombie mount point that causes
+            # anyone traversing cephtest/ to get hung up on.
+            log.warn("Trying to clean up after failed mount")
+            self.umount_wait(force=True)
+            raise
+
+    def _mount(self, mount_path, mount_fs_name):
+        """
+        Create the mountpoint, launch ceph-fuse in the foreground under
+        daemon-helper, and wait for a new entry to show up in
+        /sys/fs/fuse/connections.
+
+        Reads these client_config keys: 'coverage', 'valgrind',
+        'mount_wait' (default 0) and 'mount_timeout' (default 30).
+
+        :param mount_path: if not None, passed as --client_mountpoint
+        :param mount_fs_name: if not None, passed as --client_mds_namespace
+        :raises RuntimeError: if no new fuse connection appears before the
+                              timeout, or if the set of new connections does
+                              not contain exactly one entry
+        """
+        log.info("Client client.%s config is %s" % (self.client_id, self.client_config))
+
+        # daemon-helper's signal argument: 'term' under coverage/valgrind,
+        # 'kill' otherwise
+        daemon_signal = 'kill'
+        if self.client_config.get('coverage') or self.client_config.get('valgrind') is not None:
+            daemon_signal = 'term'
+
+        log.info('Mounting ceph-fuse client.{id} at {remote} {mnt}...'.format(
+            id=self.client_id, remote=self.client_remote, mnt=self.mountpoint))
+
+        self.client_remote.run(
+            args=[
+                'mkdir',
+                '--',
+                self.mountpoint,
+            ],
+        )
+
+        # Wrapper prefix; the ceph-fuse command line is appended further down
+        run_cmd = [
+            'sudo',
+            'adjust-ulimits',
+            'ceph-coverage',
+            '{tdir}/archive/coverage'.format(tdir=self.test_dir),
+            'daemon-helper',
+            daemon_signal,
+        ]
+
+        fuse_cmd = ['ceph-fuse', "-f"]
+
+        if mount_path is not None:
+            fuse_cmd += ["--client_mountpoint={0}".format(mount_path)]
+
+        if mount_fs_name is not None:
+            fuse_cmd += ["--client_mds_namespace={0}".format(mount_fs_name)]
+
+        fuse_cmd += [
+            '--name', 'client.{id}'.format(id=self.client_id),
+            # TODO ceph-fuse doesn't understand dash dash '--',
+            self.mountpoint,
+        ]
+
+        if self.client_config.get('valgrind') is not None:
+            run_cmd = misc.get_valgrind_args(
+                self.test_dir,
+                'client.{id}'.format(id=self.client_id),
+                run_cmd,
+                self.client_config.get('valgrind'),
+            )
+
+        run_cmd.extend(fuse_cmd)
+
+        def list_connections():
+            # Return the entries of /sys/fs/fuse/connections as a list of
+            # ints, or [] when the listing fails or is empty.
+            self.client_remote.run(
+                args=["sudo", "mount", "-t", "fusectl", "/sys/fs/fuse/connections", "/sys/fs/fuse/connections"],
+                check_status=False
+            )
+            p = self.client_remote.run(
+                args=["ls", "/sys/fs/fuse/connections"],
+                stdout=StringIO(),
+                check_status=False
+            )
+            if p.exitstatus != 0:
+                return []
+
+            ls_str = p.stdout.getvalue().strip()
+            if ls_str:
+                return [int(n) for n in ls_str.split("\n")]
+            else:
+                return []
+
+        # Before starting ceph-fuse process, note the contents of
+        # /sys/fs/fuse/connections
+        pre_mount_conns = list_connections()
+        log.info("Pre-mount connections: {0}".format(pre_mount_conns))
+
+        proc = self.client_remote.run(
+            args=run_cmd,
+            logger=log.getChild('ceph-fuse.{id}'.format(id=self.client_id)),
+            stdin=run.PIPE,
+            wait=False,
+        )
+        self.fuse_daemon = proc
+
+        # Wait for the connection reference to appear in /sys
+        mount_wait = self.client_config.get('mount_wait', 0)
+        if mount_wait > 0:
+            log.info("Fuse mount waits {0} seconds before checking /sys/".format(mount_wait))
+            time.sleep(mount_wait)            
+        timeout = int(self.client_config.get('mount_timeout', 30))
+        waited = 0
+
+        # Poll once per second until the connection list grows
+        post_mount_conns = list_connections()
+        while len(post_mount_conns) <= len(pre_mount_conns):
+            if self.fuse_daemon.finished:
+                # Did mount fail?  Raise the CommandFailedError instead of
+                # hitting the "failed to populate /sys/" timeout
+                self.fuse_daemon.wait()
+            time.sleep(1)
+            waited += 1
+            if waited > timeout:
+                raise RuntimeError("Fuse mount failed to populate /sys/ after {0} seconds".format(
+                    waited
+                ))
+            else:
+                post_mount_conns = list_connections()
+
+        log.info("Post-mount connections: {0}".format(post_mount_conns))
+
+        # Record our fuse connection number so that we can use it when
+        # forcing an unmount
+        new_conns = list(set(post_mount_conns) - set(pre_mount_conns))
+        if len(new_conns) == 0:
+            raise RuntimeError("New fuse connection directory not found ({0})".format(new_conns))
+        elif len(new_conns) > 1:
+            raise RuntimeError("Unexpectedly numerous fuse connections {0}".format(new_conns))
+        else:
+            self._fuse_conn = new_conns[0]
+
+    def is_mounted(self):
+        proc = self.client_remote.run(
+            args=[
+                'stat',
+                '--file-system',
+                '--printf=%T\n',
+                '--',
+                self.mountpoint,
+            ],
+            stdout=StringIO(),
+            stderr=StringIO(),
+            wait=False
+        )
+        try:
+            proc.wait()
+        except CommandFailedError:
+            if ("endpoint is not connected" in proc.stderr.getvalue()
+            or "Software caused connection abort" in proc.stderr.getvalue()):
+                # This happens is fuse is killed without unmount
+                log.warn("Found stale moutn point at {0}".format(self.mountpoint))
+                return True
+            else:
+                # This happens if the mount directory doesn't exist
+                log.info('mount point does not exist: %s', self.mountpoint)
+                return False
+
+        fstype = proc.stdout.getvalue().rstrip('\n')
+        if fstype == 'fuseblk':
+            log.info('ceph-fuse is mounted on %s', self.mountpoint)
+            return True
+        else:
+            log.debug('ceph-fuse not mounted, got fs type {fstype!r}'.format(
+                fstype=fstype))
+            return False
+
+    def wait_until_mounted(self):
+        """
+        Check to make sure that fuse is mounted on mountpoint.  If not,
+        sleep for 5 seconds and check again.
+
+        There is no upper bound on the number of retries; the loop only
+        stops early through the assertion on the ceph-fuse process.  Once
+        mounted, the mountpoint is chmod'ed to 1777.
+        """
+
+        while not self.is_mounted():
+            # Even if it's not mounted, it should at least
+            # be running: catch simple failures where it has terminated.
+            assert not self.fuse_daemon.poll()
+
+            time.sleep(5)
+
+        # Now that we're mounted, set permissions so that the rest of the test will have
+        # unrestricted access to the filesystem mount.
+        self.client_remote.run(
+            args=['sudo', 'chmod', '1777', self.mountpoint])
+
+    def _mountpoint_exists(self):
+        """
+        Return True if ``ls -d`` on the mountpoint exits with status 0.
+        """
+        return self.client_remote.run(args=["ls", "-d", self.mountpoint], check_status=False).exitstatus == 0
+
+    def umount(self):
+        """
+        Unmount with ``fusermount -u``, escalating if that fails.
+
+        On failure: dump lsof/ps output, abort the recorded fuse connection
+        through /sys/fs/fuse/connections/<n>/abort, then run
+        ``umount -l -f``.  A failure of that last command is only re-raised
+        if the mount point still reports as mounted.
+
+        Asserts at the end that the mount point is no longer mounted.
+        """
+        try:
+            log.info('Running fusermount -u on {name}...'.format(name=self.client_remote.name))
+            self.client_remote.run(
+                args=[
+                    'sudo',
+                    'fusermount',
+                    '-u',
+                    self.mountpoint,
+                ],
+            )
+        except run.CommandFailedError:
+            log.info('Failed to unmount ceph-fuse on {name}, aborting...'.format(name=self.client_remote.name))
+
+            # Diagnostics only: list open files and the process tree
+            self.client_remote.run(args=[
+                'sudo',
+                run.Raw('PATH=/usr/sbin:$PATH'),
+                'lsof',
+                run.Raw(';'),
+                'ps',
+                'auxf',
+            ])
+
+            # abort the fuse mount, killing all hung processes
+            if self._fuse_conn:
+                self.run_python(dedent("""
+                import os
+                path = "/sys/fs/fuse/connections/{0}/abort"
+                if os.path.exists(path):
+                    open(path, "w").write("1")
+                """).format(self._fuse_conn))
+                self._fuse_conn = None
+
+            stderr = StringIO()
+            try:
+                # make sure its unmounted
+                self.client_remote.run(
+                    args=[
+                        'sudo',
+                        'umount',
+                        '-l',
+                        '-f',
+                        self.mountpoint,
+                    ],
+                    stderr=stderr
+                )
+            except CommandFailedError:
+                # A failing umount is acceptable when nothing is mounted any more
+                if self.is_mounted():
+                    raise
+
+        assert not self.is_mounted()
+        self._fuse_conn = None
+
+    def umount_wait(self, force=False, require_clean=False):
+        """
+        :param force: Complete cleanly even if the MDS is offline
+        """
+        if force:
+            assert not require_clean  # mutually exclusive
+
+            # When we expect to be forcing, kill the ceph-fuse process directly.
+            # This should avoid hitting the more aggressive fallback killing
+            # in umount() which can affect other mounts too.
+            self.fuse_daemon.stdin.close()
+
+            # However, we will still hit the aggressive wait if there is an ongoing
+            # mount -o remount (especially if the remount is stuck because MDSs
+            # are unavailable)
+
+        self.umount()
+
+        try:
+            if self.fuse_daemon:
+                # Permit a timeout, so that we do not block forever
+                run.wait([self.fuse_daemon], 900)
+        except MaxWhileTries:
+            log.error("process failed to terminate after unmount.  This probably"
+                      "indicates a bug within ceph-fuse.")
+            raise
+        except CommandFailedError:
+            if require_clean:
+                raise
+
+        self.cleanup()
+
+    def cleanup(self):
+        """
+        Remove the mount point.
+
+        Prerequisite: the client is not mounted.
+        """
+        stderr = StringIO()
+        try:
+            self.client_remote.run(
+                args=[
+                    'rmdir',
+                    '--',
+                    self.mountpoint,
+                ],
+                stderr=stderr
+            )
+        except CommandFailedError:
+            if "No such file or directory" in stderr.getvalue():
+                pass
+            else:
+                raise
+
+    def kill(self):
+        """
+        Terminate the client without removing the mount point.
+
+        Closes the stdin of the ceph-fuse process handle and waits for it;
+        a CommandFailedError from the wait is ignored.
+        """
+        self.fuse_daemon.stdin.close()
+        try:
+            self.fuse_daemon.wait()
+        except CommandFailedError:
+            pass
+
+    def kill_cleanup(self):
+        """
+        Follow up ``kill`` to get to a clean unmounted state.
+
+        Runs umount() and then cleanup().
+        """
+        self.umount()
+        self.cleanup()
+
+    def teardown(self):
+        """
+        Whatever the state of the mount, get it gone.
+
+        Calls the parent teardown, unmounts, stops ceph-fuse if it is still
+        running, and finally removes the mount point with ``rm -rf``.
+        """
+        super(FuseMount, self).teardown()
+
+        self.umount()
+
+        # Stop the daemon if it outlived the unmount; its exit status is ignored
+        if self.fuse_daemon and not self.fuse_daemon.finished:
+            self.fuse_daemon.stdin.close()
+            try:
+                self.fuse_daemon.wait()
+            except CommandFailedError:
+                pass
+
+        # Indiscriminate, unlike the touchier cleanup()
+        self.client_remote.run(
+            args=[
+                'rm',
+                '-rf',
+                self.mountpoint,
+            ],
+        )
+
+    def _asok_path(self):
+        """
+        Return the glob pattern used to locate this client's admin socket.
+        """
+        return "/var/run/ceph/ceph-client.{0}.*.asok".format(self.client_id)
+
+    @property
+    def _prefix(self):
+        """
+        String prepended to the 'ceph' executable name in admin_socket();
+        empty here.
+        """
+        return ""
+
+    def admin_socket(self, args):
+        """
+        Run an admin socket command against this client and return the
+        decoded JSON reply.
+
+        The socket path is resolved on the remote host by a small Python 2
+        script: for a glob pattern it picks the first match whose embedded
+        pid has a /proc entry.
+
+        :param args: list of command words appended after
+                     ``ceph --admin-daemon <socket>``
+        :return: the command's stdout parsed with json.loads
+        """
+        pyscript = """
+import glob
+import re
+import os
+import subprocess
+
+def find_socket(client_name):
+        asok_path = "{asok_path}"
+        files = glob.glob(asok_path)
+
+        # Given a non-glob path, it better be there
+        if "*" not in asok_path:
+            assert(len(files) == 1)
+            return files[0]
+
+        for f in files:
+                pid = re.match(".*\.(\d+)\.asok$", f).group(1)
+                if os.path.exists("/proc/{{0}}".format(pid)):
+                        return f
+        raise RuntimeError("Client socket {{0}} not found".format(client_name))
+
+print find_socket("{client_name}")
+""".format(
+            asok_path=self._asok_path(),
+            client_name="client.{0}".format(self.client_id))
+
+        # Find the admin socket
+        p = self.client_remote.run(args=[
+            'python', '-c', pyscript
+        ], stdout=StringIO())
+        asok_path = p.stdout.getvalue().strip()
+        log.info("Found client admin socket at {0}".format(asok_path))
+
+        # Send the requested command to the admin socket
+        p = self.client_remote.run(
+            args=['sudo', self._prefix + 'ceph', '--admin-daemon', asok_path] + args,
+            stdout=StringIO())
+        return json.loads(p.stdout.getvalue())
+
+    def get_global_id(self):
+        """
+        Look up the CephFS client ID for this mount
+
+        :return: the 'id' field of the admin socket 'mds_sessions' reply
+        """
+
+        return self.admin_socket(['mds_sessions'])['id']
+
+    def get_osd_epoch(self):
+        """
+        Return 2-tuple of osd_epoch, osd_epoch_barrier
+        """
+        status = self.admin_socket(['status'])
+        return status['osd_epoch'], status['osd_epoch_barrier']
+
+    def get_dentry_count(self):
+        """
+        Return 2-tuple of dentry_count, dentry_pinned_count
+        """
+        status = self.admin_socket(['status'])
+        return status['dentry_count'], status['dentry_pinned_count']
+
+    def set_cache_size(self, size):
+        """
+        Set client_cache_size through the admin socket.
+
+        :param size: new value; converted with str() before being sent
+        :return: the decoded admin socket reply
+        """
+        return self.admin_socket(['config', 'set', 'client_cache_size', str(size)])
diff --git a/src/ceph/qa/tasks/cephfs/kernel_mount.py b/src/ceph/qa/tasks/cephfs/kernel_mount.py
new file mode 100644
index 0000000..bfa1ac6
--- /dev/null
+++ b/src/ceph/qa/tasks/cephfs/kernel_mount.py
@@ -0,0 +1,267 @@
+from StringIO import StringIO
+import json
+import logging
+from textwrap import dedent
+from teuthology.orchestra.run import CommandFailedError
+from teuthology import misc
+
+from teuthology.orchestra import remote as orchestra_remote
+from teuthology.orchestra import run
+from teuthology.contextutil import MaxWhileTries
+from .mount import CephFSMount
+
+log = logging.getLogger(__name__)
+
+
+# Timeout handed to run.wait() for the rmdir that follows a kernel umount
+# (presumably seconds -- confirm against teuthology's run.wait)
+UMOUNT_TIMEOUT = 300
+
+
+class KernelMount(CephFSMount):
+    def __init__(self, mons, test_dir, client_id, client_remote,
+                 ipmi_user, ipmi_password, ipmi_domain):
+        """
+        :param mons: monitor addresses; joined with ',' to build the mount
+                     source in mount()
+        :param test_dir: handed to CephFSMount.__init__
+        :param client_id: handed to CephFSMount.__init__
+        :param client_remote: handed to CephFSMount.__init__
+        :param ipmi_user: handed to getRemoteConsole() in kill()/kill_cleanup()
+        :param ipmi_password: handed to getRemoteConsole() in kill()/kill_cleanup()
+        :param ipmi_domain: handed to getRemoteConsole() in kill()/kill_cleanup()
+        """
+        super(KernelMount, self).__init__(test_dir, client_id, client_remote)
+        self.mons = mons
+
+        # Tracked locally: set by mount(), cleared by umount()/kill()
+        self.mounted = False
+        self.ipmi_user = ipmi_user
+        self.ipmi_password = ipmi_password
+        self.ipmi_domain = ipmi_domain
+
+    def write_secret_file(self, remote, role, keyring, filename):
+        """
+        Stash the keyring in the filename specified.
+
+        Runs ``ceph-authtool --name=<role> --print-key <keyring>`` on the
+        remote and redirects its output to ``filename``.
+
+        :param remote: remote on which the command is run
+        :param role: entity name given to --name
+        :param keyring: path of the keyring to read
+        :param filename: path the key is written to
+        """
+        remote.run(
+            args=[
+                'adjust-ulimits',
+                'ceph-coverage',
+                '{tdir}/archive/coverage'.format(tdir=self.test_dir),
+                'ceph-authtool',
+                '--name={role}'.format(role=role),
+                '--print-key',
+                keyring,
+                run.Raw('>'),
+                filename,
+            ],
+        )
+
+    def mount(self, mount_path=None, mount_fs_name=None):
+        """
+        Mount CephFS with the kernel client via /sbin/mount.ceph.
+
+        Writes the client's key to a secret file, creates the mountpoint,
+        mounts, chmods the mountpoint to 1777 and marks this object as
+        mounted.
+
+        :param mount_path: path within the filesystem to mount; defaults
+                           to "/"
+        :param mount_fs_name: if not None, added as the mds_namespace option
+        """
+        log.info('Mounting kclient client.{id} at {remote} {mnt}...'.format(
+            id=self.client_id, remote=self.client_remote, mnt=self.mountpoint))
+
+        keyring = self.get_keyring_path()
+        secret = '{tdir}/ceph.data/client.{id}.secret'.format(tdir=self.test_dir, id=self.client_id)
+        self.write_secret_file(self.client_remote, 'client.{id}'.format(id=self.client_id),
+                               keyring, secret)
+
+        self.client_remote.run(
+            args=[
+                'mkdir',
+                '--',
+                self.mountpoint,
+            ],
+        )
+
+        if mount_path is None:
+            mount_path = "/"
+
+        opts = 'name={id},secretfile={secret},norequire_active_mds'.format(id=self.client_id,
+                                                      secret=secret)
+
+        if mount_fs_name is not None:
+            opts += ",mds_namespace={0}".format(mount_fs_name)
+
+        # Mount source has the form "<mon>,<mon>,...:<mount_path>"
+        self.client_remote.run(
+            args=[
+                'sudo',
+                'adjust-ulimits',
+                'ceph-coverage',
+                '{tdir}/archive/coverage'.format(tdir=self.test_dir),
+                '/sbin/mount.ceph',
+                '{mons}:{mount_path}'.format(mons=','.join(self.mons), mount_path=mount_path),
+                self.mountpoint,
+                '-v',
+                '-o',
+                opts
+            ],
+        )
+
+        self.client_remote.run(
+            args=['sudo', 'chmod', '1777', self.mountpoint])
+
+        self.mounted = True
+
+    def umount(self, force=False):
+        log.debug('Unmounting client client.{id}...'.format(id=self.client_id))
+
+        cmd=['sudo', 'umount', self.mountpoint]
+        if force:
+            cmd.append('-f')
+
+        try:
+            self.client_remote.run(args=cmd)
+        except Exception as e:
+            self.client_remote.run(args=[
+                'sudo',
+                run.Raw('PATH=/usr/sbin:$PATH'),
+                'lsof',
+                run.Raw(';'),
+                'ps', 'auxf',
+            ])
+            raise e
+
+        rproc = self.client_remote.run(
+            args=[
+                'rmdir',
+                '--',
+                self.mountpoint,
+            ],
+            wait=False
+        )
+        run.wait([rproc], UMOUNT_TIMEOUT)
+        self.mounted = False
+
+    def cleanup(self):
+        """
+        No-op for the kernel client: umount() already removes the mount
+        point directory.
+        """
+        pass
+
+    def umount_wait(self, force=False, require_clean=False):
+        """
+        Unlike the fuse client, the kernel client's umount is immediate
+
+        Returns straight away when not mounted.
+
+        :param force: passed to umount(); additionally, if the umount fails
+                      with CommandFailedError or MaxWhileTries, fall back
+                      to kill() followed by kill_cleanup() instead of
+                      raising
+        :param require_clean: not used by this implementation
+        """
+        if not self.is_mounted():
+            return
+
+        try:
+            self.umount(force)
+        except (CommandFailedError, MaxWhileTries):
+            if not force:
+                raise
+
+            self.kill()
+            self.kill_cleanup()
+
+        self.mounted = False
+
+    def is_mounted(self):
+        """
+        Return the locally tracked ``mounted`` flag; the remote host is
+        not queried.
+        """
+        return self.mounted
+
+    def wait_until_mounted(self):
+        """
+        Unlike the fuse client, the kernel client is up and running as soon
+        as the initial mount() function returns.
+
+        So this only asserts that the ``mounted`` flag is set.
+        """
+        assert self.mounted
+
+    def teardown(self):
+        """
+        Run the parent teardown, then unmount if still flagged as mounted.
+        """
+        super(KernelMount, self).teardown()
+        if self.mounted:
+            self.umount()
+
+    def kill(self):
+        """
+        The Ceph kernel client doesn't have a mechanism to kill itself (doing
+        that inside the kernel would be weird anyway), so we reboot the whole node
+        to get the same effect.
+
+        We use IPMI to reboot, because we don't want the client to send any
+        releases of capabilities.
+
+        This method only powers the node off; kill_cleanup() powers it
+        back on.
+        """
+
+        con = orchestra_remote.getRemoteConsole(self.client_remote.hostname,
+                                                self.ipmi_user,
+                                                self.ipmi_password,
+                                                self.ipmi_domain)
+        con.power_off()
+
+        self.mounted = False
+
+    def kill_cleanup(self):
+        """
+        Follow up kill(): power the node back on, wait for it to become
+        reachable again and remove the mount point directory.
+
+        Asserts that the mount is not flagged as mounted.
+        """
+        assert not self.mounted
+
+        con = orchestra_remote.getRemoteConsole(self.client_remote.hostname,
+                                                self.ipmi_user,
+                                                self.ipmi_password,
+                                                self.ipmi_domain)
+        con.power_on()
+
+        # Wait for node to come back up after reboot
+        misc.reconnect(None, 300, [self.client_remote])
+
+        # Remove mount directory
+        self.client_remote.run(
+            args=[
+                'rmdir',
+                '--',
+                self.mountpoint,
+            ],
+        )
+
+    def _find_debug_dir(self):
+        """
+        Find the debugfs folder for this mount
+
+        A Python 2 script on the remote walks /sys/kernel/debug/ceph/*,
+        reads each directory's mds_sessions file and takes the second
+        whitespace-separated token of its second line (quotes stripped) as
+        the client id.
+
+        :return: the directory recorded for ``self.client_id``
+        :raises KeyError: if no directory was recorded for this client id
+        """
+        pyscript = dedent("""
+            import glob
+            import os
+            import json
+
+            def get_id_to_dir():
+                result = {}
+                for dir in glob.glob("/sys/kernel/debug/ceph/*"):
+                    mds_sessions_lines = open(os.path.join(dir, "mds_sessions")).readlines()
+                    client_id = mds_sessions_lines[1].split()[1].strip('"')
+
+                    result[client_id] = dir
+                return result
+
+            print json.dumps(get_id_to_dir())
+            """)
+
+        p = self.client_remote.run(args=[
+            'sudo', 'python', '-c', pyscript
+        ], stdout=StringIO())
+        client_id_to_dir = json.loads(p.stdout.getvalue())
+
+        try:
+            return client_id_to_dir[self.client_id]
+        except KeyError:
+            log.error("Client id '{0}' debug dir not found (clients seen were: {1})".format(
+                self.client_id, ",".join(client_id_to_dir.keys())
+            ))
+            raise
+
+    def _read_debug_file(self, filename):
+        """
+        Return the contents of a file in this mount's debugfs folder.
+
+        The file is read on the remote by a Python 2 ``print`` statement,
+        so the returned text has one extra trailing newline.
+
+        :param filename: file name relative to the debugfs folder
+        """
+        debug_dir = self._find_debug_dir()
+
+        pyscript = dedent("""
+            import os
+
+            print open(os.path.join("{debug_dir}", "{filename}")).read()
+            """).format(debug_dir=debug_dir, filename=filename)
+
+        p = self.client_remote.run(args=[
+            'sudo', 'python', '-c', pyscript
+        ], stdout=StringIO())
+        return p.stdout.getvalue()
+
+    def get_global_id(self):
+        """
+        Look up the CephFS client ID for this mount, using debugfs.
+        """
+
+        assert self.mounted
+
+        mds_sessions = self._read_debug_file("mds_sessions")
+        lines = mds_sessions.split("\n")
+        return int(lines[0].split()[1])
+
+    def get_osd_epoch(self):
+        """
+        Return 2-tuple of osd_epoch, osd_epoch_barrier
+        """
+        osd_map = self._read_debug_file("osdmap")
+        lines = osd_map.split("\n")
+        first_line_tokens = lines[0].split()
+        epoch, barrier = int(first_line_tokens[1]), int(first_line_tokens[3])
+
+        return epoch, barrier
diff --git a/src/ceph/qa/tasks/cephfs/mount.py b/src/ceph/qa/tasks/cephfs/mount.py
new file mode 100644
index 0000000..4f96e6c
--- /dev/null
+++ b/src/ceph/qa/tasks/cephfs/mount.py
@@ -0,0 +1,627 @@
+from contextlib import contextmanager
+import json
+import logging
+import datetime
+import time
+from textwrap import dedent
+import os
+from StringIO import StringIO
+from teuthology.orchestra import run
+from teuthology.orchestra.run import CommandFailedError, ConnectionLostError
+
+log = logging.getLogger(__name__)
+
+
+class CephFSMount(object):
+    def __init__(self, test_dir, client_id, client_remote):
+        """
+        :param test_dir: Global teuthology test dir
+        :param client_id: Client ID, the 'foo' in client.foo
+        :param client_remote: Remote instance for the host where client will run
+        """
+
+        self.test_dir = test_dir
+        self.client_id = client_id
+        self.client_remote = client_remote
+        # Directory name under test_dir that the mountpoint property uses
+        self.mountpoint_dir_name = 'mnt.{id}'.format(id=self.client_id)
+
+        # File names used by create_files() and check_files()
+        self.test_files = ['a', 'b', 'c']
+
+        # Remote processes started by the *_background helpers
+        self.background_procs = []
+
+    @property
+    def mountpoint(self):
+        """
+        Path of the mount point: ``mountpoint_dir_name`` joined onto
+        ``test_dir``.
+        """
+        return os.path.join(
+            self.test_dir, '{dir_name}'.format(dir_name=self.mountpoint_dir_name))
+
+    def is_mounted(self):
+        """
+        Return whether the client is mounted.  Must be overridden by
+        subclasses; the base implementation raises NotImplementedError.
+        """
+        raise NotImplementedError()
+
+    def mount(self, mount_path=None, mount_fs_name=None):
+        """
+        Mount the filesystem at the mountpoint.  Must be overridden by
+        subclasses; the base implementation raises NotImplementedError.
+
+        :param mount_path: optional path within the filesystem to mount
+        :param mount_fs_name: optional name of the filesystem to mount
+        """
+        raise NotImplementedError()
+
+    def umount(self):
+        """
+        Unmount the client.  Must be overridden by subclasses; the base
+        implementation raises NotImplementedError.
+        """
+        raise NotImplementedError()
+
+    def umount_wait(self, force=False, require_clean=False):
+        """
+        Unmount and wait for the unmount to complete.  Must be overridden
+        by subclasses; the base implementation raises NotImplementedError.
+
+        :param force: Expect that the mount will not shutdown cleanly: kill
+                      it hard.
+        :param require_clean: Wait for the Ceph client associated with the
+                              mount (e.g. ceph-fuse) to terminate, and
+                              raise if it doesn't do so cleanly.
+        :return:
+        """
+        raise NotImplementedError()
+
+    def kill_cleanup(self):
+        """
+        Clean up after kill().  Must be overridden by subclasses; the base
+        implementation raises NotImplementedError.
+        """
+        raise NotImplementedError()
+
+    def kill(self):
+        """
+        Terminate the client abruptly.  Must be overridden by subclasses;
+        the base implementation raises NotImplementedError.
+        """
+        raise NotImplementedError()
+
+    def cleanup(self):
+        """
+        Clean up after an unmount.  Must be overridden by subclasses; the
+        base implementation raises NotImplementedError.
+        """
+        raise NotImplementedError()
+
+    def wait_until_mounted(self):
+        """
+        Block until the client is mounted.  Must be overridden by
+        subclasses; the base implementation raises NotImplementedError.
+        """
+        raise NotImplementedError()
+
+    def get_keyring_path(self):
+        """
+        Return the path of this client's keyring under /etc/ceph.
+        """
+        return '/etc/ceph/ceph.client.{id}.keyring'.format(id=self.client_id)
+
+    @property
+    def config_path(self):
+        """
+        Path to ceph.conf: override this if you're not a normal systemwide ceph install
+        :return: string
+        """
+        return "/etc/ceph/ceph.conf"
+
+    @contextmanager
+    def mounted(self):
+        """
+        A context manager, from an initially unmounted state, to mount
+        this, yield, and then unmount and clean up.
+
+        NOTE(review): KernelMount assigns a boolean instance attribute
+        named ``mounted``, which hides this context manager on KernelMount
+        instances.
+        """
+        self.mount()
+        self.wait_until_mounted()
+        try:
+            yield
+        finally:
+            # Runs even when the body of the with-block raised
+            self.umount_wait()
+
+    def create_files(self):
+        assert(self.is_mounted())
+
+        for suffix in self.test_files:
+            log.info("Creating file {0}".format(suffix))
+            self.client_remote.run(args=[
+                'sudo', 'touch', os.path.join(self.mountpoint, suffix)
+            ])
+
+    def check_files(self):
+        assert(self.is_mounted())
+
+        for suffix in self.test_files:
+            log.info("Checking file {0}".format(suffix))
+            r = self.client_remote.run(args=[
+                'sudo', 'ls', os.path.join(self.mountpoint, suffix)
+            ], check_status=False)
+            if r.exitstatus != 0:
+                raise RuntimeError("Expected file {0} not found".format(suffix))
+
+    def create_destroy(self):
+        """
+        Create a uniquely named file in the mount point and delete it again.
+
+        The file name is the current timestamp followed by the client id.
+        Asserts that the client is mounted.
+        """
+        assert(self.is_mounted())
+
+        filename = "{0} {1}".format(datetime.datetime.now(), self.client_id)
+        log.debug("Creating test file {0}".format(filename))
+        self.client_remote.run(args=[
+            'sudo', 'touch', os.path.join(self.mountpoint, filename)
+        ])
+        log.debug("Deleting test file {0}".format(filename))
+        self.client_remote.run(args=[
+            'sudo', 'rm', '-f', os.path.join(self.mountpoint, filename)
+        ])
+
+    def _run_python(self, pyscript):
+        """
+        Start ``python -c pyscript`` on the client remote as root under
+        daemon-helper, without waiting for it to finish.
+
+        :param pyscript: source text of the script to run
+        :return: the process handle; its stdout is collected in a StringIO
+        """
+        return self.client_remote.run(args=[
+            'sudo', 'adjust-ulimits', 'daemon-helper', 'kill', 'python', '-c', pyscript
+        ], wait=False, stdin=run.PIPE, stdout=StringIO())
+
+    def run_python(self, pyscript):
+        """
+        Run a Python script on the client remote and wait for it.
+
+        :param pyscript: source text of the script to run
+        :return: the script's stdout with surrounding whitespace stripped
+        """
+        p = self._run_python(pyscript)
+        p.wait()
+        return p.stdout.getvalue().strip()
+
+    def run_shell(self, args, wait=True):
+        """
+        Run a command as root with the mount point as working directory.
+
+        :param args: list of command words, appended after
+                     ``cd <mountpoint> && sudo``
+        :param wait: handed through to client_remote.run
+        :return: the process handle; stdout and stderr are collected in
+                 StringIO buffers
+        """
+        args = ["cd", self.mountpoint, run.Raw('&&'), "sudo"] + args
+        return self.client_remote.run(args=args, stdout=StringIO(),
+                                      stderr=StringIO(), wait=wait)
+
+    def open_no_data(self, basename):
+        """
+        A pure metadata operation
+
+        Opens ``basename`` inside the mount point for writing from a remote
+        Python process, writes nothing, and waits for that process to exit.
+        Asserts that the client is mounted.
+
+        :param basename: file name relative to the mount point
+        """
+        assert(self.is_mounted())
+
+        path = os.path.join(self.mountpoint, basename)
+
+        p = self._run_python(dedent(
+            """
+            f = open("{path}", 'w')
+            """.format(path=path)
+        ))
+        p.wait()
+
+    def open_background(self, basename="background_file"):
+        """
+        Open a file for writing, then block such that the client
+        will hold a capability.
+
+        Don't return until the remote process has got as far as opening
+        the file, then return the RemoteProcess instance.
+
+        The process is also appended to ``background_procs``.  Asserts that
+        the client is mounted.
+
+        :param basename: file name relative to the mount point
+        """
+        assert(self.is_mounted())
+
+        path = os.path.join(self.mountpoint, basename)
+
+        # The second write is deliberately left unflushed before the
+        # script goes into its endless sleep loop.
+        pyscript = dedent("""
+            import time
+
+            f = open("{path}", 'w')
+            f.write('content')
+            f.flush()
+            f.write('content2')
+            while True:
+                time.sleep(1)
+            """).format(path=path)
+
+        rproc = self._run_python(pyscript)
+        self.background_procs.append(rproc)
+
+        # This wait would not be sufficient if the file had already
+        # existed, but it's simple and in practice users of open_background
+        # are not using it on existing files.
+        self.wait_for_visible(basename)
+
+        return rproc
+
+    def wait_for_visible(self, basename="background_file", timeout=30):
+        i = 0
+        while i < timeout:
+            r = self.client_remote.run(args=[
+                'sudo', 'ls', os.path.join(self.mountpoint, basename)
+            ], check_status=False)
+            if r.exitstatus == 0:
+                log.debug("File {0} became visible from {1} after {2}s".format(
+                    basename, self.client_id, i))
+                return
+            else:
+                time.sleep(1)
+                i += 1
+
+        raise RuntimeError("Timed out after {0}s waiting for {1} to become visible from {2}".format(
+            i, basename, self.client_id))
+
+    def lock_background(self, basename="background_file", do_flock=True):
+        """
+        Open and lock files for writing, hold the locks in a background process
+
+        The remote script takes a POSIX write lock (F_SETLK) on
+        "<basename>-2" and, when ``do_flock`` is set, first takes a
+        non-blocking exclusive flock on "<basename>-1".  It then sleeps
+        forever.  Asserts that the client is mounted.
+
+        :param basename: file name prefix relative to the mount point
+        :param do_flock: whether to also take the flock on "<basename>-1"
+        :return: the process handle, which is also appended to
+                 ``background_procs``
+        """
+        assert(self.is_mounted())
+
+        path = os.path.join(self.mountpoint, basename)
+
+        script_builder = """
+            import time
+            import fcntl
+            import struct"""
+        if do_flock:
+            script_builder += """
+            f1 = open("{path}-1", 'w')
+            fcntl.flock(f1, fcntl.LOCK_EX | fcntl.LOCK_NB)"""
+        script_builder += """
+            f2 = open("{path}-2", 'w')
+            lockdata = struct.pack('hhllhh', fcntl.F_WRLCK, 0, 0, 0, 0, 0)
+            fcntl.fcntl(f2, fcntl.F_SETLK, lockdata)
+            while True:
+                time.sleep(1)
+            """
+
+        pyscript = dedent(script_builder).format(path=path)
+
+        log.info("lock_background file {0}".format(basename))
+        rproc = self._run_python(pyscript)
+        self.background_procs.append(rproc)
+        return rproc
+
+    def lock_and_release(self, basename="background_file"):
+        """
+        Start a remote script that takes a blocking exclusive flock on
+        "<basename>-1" and a POSIX write lock on "<basename>-2", then
+        exits.
+
+        Asserts that the client is mounted.  The process is not waited for
+        and is not added to ``background_procs``.
+
+        :param basename: file name prefix relative to the mount point
+        :return: the process handle
+        """
+        assert(self.is_mounted())
+
+        path = os.path.join(self.mountpoint, basename)
+
+        script = """
+            import time
+            import fcntl
+            import struct
+            f1 = open("{path}-1", 'w')
+            fcntl.flock(f1, fcntl.LOCK_EX)
+            f2 = open("{path}-2", 'w')
+            lockdata = struct.pack('hhllhh', fcntl.F_WRLCK, 0, 0, 0, 0, 0)
+            fcntl.fcntl(f2, fcntl.F_SETLK, lockdata)
+            """
+        pyscript = dedent(script).format(path=path)
+
+        log.info("lock_and_release file {0}".format(basename))
+        return self._run_python(pyscript)
+
+    def check_filelock(self, basename="background_file", do_flock=True):
+        """
+        Check from this client that the locks on "<basename>-1" (flock,
+        only when ``do_flock`` is set) and "<basename>-2" (POSIX lock) are
+        held by someone else.
+
+        The remote script tries to take each lock without blocking.  The
+        ``else`` clauses belong to the ``try`` statements: succeeding in
+        taking a lock raises RuntimeError in the script.
+
+        NOTE(review): an IOError whose errno is not EAGAIN is silently
+        ignored by the script.
+
+        Asserts that the client is mounted.
+
+        :param basename: file name prefix relative to the mount point
+        :param do_flock: whether to check the flock on "<basename>-1"
+        """
+        assert(self.is_mounted())
+
+        path = os.path.join(self.mountpoint, basename)
+
+        script_builder = """
+            import fcntl
+            import errno
+            import struct"""
+        if do_flock:
+            script_builder += """
+            f1 = open("{path}-1", 'r')
+            try:
+                fcntl.flock(f1, fcntl.LOCK_EX | fcntl.LOCK_NB)
+            except IOError, e:
+                if e.errno == errno.EAGAIN:
+                    pass
+            else:
+                raise RuntimeError("flock on file {path}-1 not found")"""
+        script_builder += """
+            f2 = open("{path}-2", 'r')
+            try:
+                lockdata = struct.pack('hhllhh', fcntl.F_WRLCK, 0, 0, 0, 0, 0)
+                fcntl.fcntl(f2, fcntl.F_SETLK, lockdata)
+            except IOError, e:
+                if e.errno == errno.EAGAIN:
+                    pass
+            else:
+                raise RuntimeError("posix lock on file {path}-2 not found")
+            """
+        pyscript = dedent(script_builder).format(path=path)
+
+        log.info("check lock on file {0}".format(basename))
+        self.client_remote.run(args=[
+            'sudo', 'python', '-c', pyscript
+        ])
+
+    def write_background(self, basename="background_file", loop=False):
+        """
+        Start a background process that writes to a file under the mountpoint.
+
+        The remote script opens the file (creating it if needed), writes the
+        string 'content' and sleeps one second.  With loop=False it then
+        closes the file and ends; with loop=True it keeps writing once per
+        second.
+
+        :param basename: file name, joined onto self.mountpoint
+        :param loop: if True, keep writing in an endless loop
+        :return: the process object returned by _run_python; it is also
+                 appended to self.background_procs
+        """
+        assert(self.is_mounted())
+
+        path = os.path.join(self.mountpoint, basename)
+
+        # str(loop) puts the literal True/False into the script's
+        # "if not {loop}" test.
+        # NOTE(review): the script only catches IOError; under Python 2
+        # os.write() reports failures as OSError, so a failed write would
+        # presumably end the script with a traceback -- confirm intended.
+        pyscript = dedent("""
+            import os
+            import time
+
+            fd = os.open("{path}", os.O_RDWR | os.O_CREAT, 0644)
+            try:
+                while True:
+                    os.write(fd, 'content')
+                    time.sleep(1)
+                    if not {loop}:
+                        break
+            except IOError, e:
+                pass
+            os.close(fd)
+            """).format(path=path, loop=str(loop))
+
+        rproc = self._run_python(pyscript)
+        self.background_procs.append(rproc)
+        return rproc
+
+    def write_n_mb(self, filename, n_mb, seek=0, wait=True):
+        """
+        Write the requested number of megabytes of random data to a file
+        using dd (1M blocks, fdatasync on completion).
+
+        :param filename: passed to dd as of=<filename>
+        :param n_mb: number of 1M blocks to write (dd count=)
+        :param seek: number of 1M blocks to skip in the output (dd seek=)
+        :param wait: forwarded to run_shell
+        :return: whatever run_shell returns
+        """
+        assert self.is_mounted()
+
+        dd_cmd = ["dd", "if=/dev/urandom"]
+        dd_cmd.append("of={0}".format(filename))
+        dd_cmd.extend(["bs=1M", "conv=fdatasync"])
+        dd_cmd.append("count={0}".format(n_mb))
+        dd_cmd.append("seek={0}".format(seek))
+
+        return self.run_shell(dd_cmd, wait=wait)
+
+    def write_test_pattern(self, filename, size):
+        """
+        Write a deterministic byte pattern of the given length to a file.
+
+        Byte number i of the file is chr(zlib.crc32(str(i)) & 7), which is
+        the same formula validate_test_pattern checks against.
+
+        :param filename: path joined onto self.mountpoint
+        :param size: number of bytes to write
+        :return: whatever run_python returns
+        """
+        log.info("Writing {0} bytes to {1}".format(size, filename))
+        # Note: format() is applied to the raw template before dedent().
+        return self.run_python(dedent("""
+            import zlib
+            path = "{path}"
+            f = open(path, 'w')
+            for i in range(0, {size}):
+                val = zlib.crc32("%s" % i) & 7
+                f.write(chr(val))
+            f.close()
+        """.format(
+            path=os.path.join(self.mountpoint, filename),
+            size=size
+        )))
+
+    def validate_test_pattern(self, filename, size):
+        """
+        Check that a file holds the pattern produced by write_test_pattern.
+
+        The remote script raises RuntimeError if the file length differs
+        from `size`, or if any byte i differs from
+        chr(zlib.crc32(str(i)) & 7).
+
+        :param filename: path joined onto self.mountpoint
+        :param size: expected file length in bytes
+        :return: whatever run_python returns
+        """
+        log.info("Validating {0} bytes from {1}".format(size, filename))
+        # Doubled braces ({{0}}) survive the outer format() call and become
+        # format fields of the script itself.
+        return self.run_python(dedent("""
+            import zlib
+            path = "{path}"
+            f = open(path, 'r')
+            bytes = f.read()
+            f.close()
+            if len(bytes) != {size}:
+                raise RuntimeError("Bad length {{0}} vs. expected {{1}}".format(
+                    len(bytes), {size}
+                ))
+            for i, b in enumerate(bytes):
+                val = zlib.crc32("%s" % i) & 7
+                if b != chr(val):
+                    raise RuntimeError("Bad data at offset {{0}}".format(i))
+        """.format(
+            path=os.path.join(self.mountpoint, filename),
+            size=size
+        )))
+
+    def open_n_background(self, fs_path, count):
+        """
+        Open N files for writing, hold them open in a background process
+
+        The files are named "<fs_path>_0" ... "<fs_path>_<count-1>"; the
+        parent directory is created first if it does not exist.  The script
+        then sleeps in an endless loop with the handles still open.
+
+        :param fs_path: Path relative to CephFS root, e.g. "foo/bar"
+        :param count: number of files to open
+        :return: a RemoteProcess
+        """
+        assert(self.is_mounted())
+
+        abs_path = os.path.join(self.mountpoint, fs_path)
+
+        # Doubled braces ({{0}}) survive the outer format() call and become
+        # format fields of the script itself.
+        pyscript = dedent("""
+            import sys
+            import time
+            import os
+
+            n = {count}
+            abs_path = "{abs_path}"
+
+            if not os.path.exists(os.path.dirname(abs_path)):
+                os.makedirs(os.path.dirname(abs_path))
+
+            handles = []
+            for i in range(0, n):
+                fname = "{{0}}_{{1}}".format(abs_path, i)
+                handles.append(open(fname, 'w'))
+
+            while True:
+                time.sleep(1)
+            """).format(abs_path=abs_path, count=count)
+
+        rproc = self._run_python(pyscript)
+        self.background_procs.append(rproc)
+        return rproc
+
+    def create_n_files(self, fs_path, count, sync=False):
+        """
+        Create N small files, each containing the string 'content'.
+
+        The files are named "<fs_path>_0" ... "<fs_path>_<count-1>"; the
+        parent directory is created first if it does not exist.
+
+        :param fs_path: path prefix, joined onto self.mountpoint
+        :param count: number of files to create
+        :param sync: if True, flush and fsync each file before closing it
+        :return: None
+        """
+        assert(self.is_mounted())
+
+        abs_path = os.path.join(self.mountpoint, fs_path)
+
+        # str(sync) puts the literal True/False into the script's
+        # "if {sync}" test.
+        pyscript = dedent("""
+            import sys
+            import time
+            import os
+
+            n = {count}
+            abs_path = "{abs_path}"
+
+            if not os.path.exists(os.path.dirname(abs_path)):
+                os.makedirs(os.path.dirname(abs_path))
+
+            for i in range(0, n):
+                fname = "{{0}}_{{1}}".format(abs_path, i)
+                h = open(fname, 'w')
+                h.write('content')
+                if {sync}:
+                    h.flush()
+                    os.fsync(h.fileno())
+                h.close()
+            """).format(abs_path=abs_path, count=count, sync=str(sync))
+
+        self.run_python(pyscript)
+
+    def teardown(self):
+        """
+        Stop every process recorded in self.background_procs (through
+        _kill_background) and then reset the list to empty.
+        """
+        for p in self.background_procs:
+            log.info("Terminating background process")
+            self._kill_background(p)
+
+        self.background_procs = []
+
+    def _kill_background(self, p):
+        """
+        Stop a background process by closing its stdin and waiting for it.
+
+        Nothing is done when the process has no stdin.  A CommandFailedError
+        or ConnectionLostError raised by the wait is ignored.
+
+        :param p: process object with `stdin` and `wait()`
+        """
+        if not p.stdin:
+            return
+
+        p.stdin.close()
+        try:
+            p.wait()
+        except (CommandFailedError, ConnectionLostError):
+            # Failures while waiting for the process are deliberately ignored.
+            pass
+
+    def kill_background(self, p):
+        """
+        For a process that was returned by one of the _background member functions,
+        kill it hard.
+
+        Stops the process via _kill_background and then removes it from
+        self.background_procs.
+
+        NOTE(review): presumably `p` must currently be in
+        self.background_procs, otherwise the remove() would fail -- confirm.
+
+        :param p: process object previously returned by a *_background method
+        """
+        self._kill_background(p)
+        self.background_procs.remove(p)
+
+    def get_global_id(self):
+        """
+        Not implemented here: always raises NotImplementedError.
+
+        NOTE(review): presumably overridden by concrete mount classes to
+        return the client's global ID -- confirm against subclasses.
+        """
+        raise NotImplementedError()
+
+    def get_osd_epoch(self):
+        """
+        Not implemented here: always raises NotImplementedError.
+
+        NOTE(review): presumably overridden by concrete mount classes to
+        report the client's OSD map epoch -- confirm against subclasses.
+        """
+        raise NotImplementedError()
+
+    def stat(self, fs_path, wait=True):
+        """
+        stat a file, and return the result as a dictionary like this:
+        {
+          "st_ctime": 1414161137.0,
+          "st_mtime": 1414161137.0,
+          "st_nlink": 33,
+          "st_gid": 0,
+          "st_dev": 16777218,
+          "st_size": 1190,
+          "st_ino": 2,
+          "st_uid": 0,
+          "st_mode": 16877,
+          "st_atime": 1431520593.0
+        }
+
+        Raises exception on absent file.
+
+        :param fs_path: path joined onto self.mountpoint
+        :param wait: if True, wait for the script and return the parsed
+                     dictionary; if False, return the process object
+                     without waiting
+        """
+        abs_path = os.path.join(self.mountpoint, fs_path)
+
+        # On OSError the script exits with the errno as its exit status;
+        # otherwise it prints the selected attributes as JSON on stdout.
+        pyscript = dedent("""
+            import os
+            import stat
+            import json
+            import sys
+
+            try:
+                s = os.stat("{path}")
+            except OSError as e:
+                sys.exit(e.errno)
+
+            attrs = ["st_mode", "st_ino", "st_dev", "st_nlink", "st_uid", "st_gid", "st_size", "st_atime", "st_mtime", "st_ctime"]
+            print json.dumps(
+                dict([(a, getattr(s, a)) for a in attrs]),
+                indent=2)
+            """).format(path=abs_path)
+        proc = self._run_python(pyscript)
+        if wait:
+            proc.wait()
+            return json.loads(proc.stdout.getvalue().strip())
+        else:
+            return proc
+
+    def touch(self, fs_path):
+        """
+        Create a dentry if it doesn't already exist.  This python
+        implementation exists because the usual command line tool doesn't
+        pass through error codes like EIO.
+
+        The script opens the file in "w" mode and closes it again.  On any
+        IOError it exits with status errno.EIO, regardless of the errno
+        actually reported.
+
+        :param fs_path: path joined onto self.mountpoint
+        :return: None; the script is waited on before returning
+        """
+        abs_path = os.path.join(self.mountpoint, fs_path)
+        pyscript = dedent("""
+            import sys
+            import errno
+
+            try:
+                f = open("{path}", "w")
+                f.close()
+            except IOError as e:
+                sys.exit(errno.EIO)
+            """).format(path=abs_path)
+        proc = self._run_python(pyscript)
+        proc.wait()
+
+    def path_to_ino(self, fs_path, follow_symlinks=True):
+        """
+        Return the inode number of a path, as reported by a remote script.
+
+        :param fs_path: path joined onto self.mountpoint
+        :param follow_symlinks: if True use os.stat, otherwise os.lstat
+        :return: the st_ino value as an int
+        """
+        abs_path = os.path.join(self.mountpoint, fs_path)
+
+        # The two variants of the script differ only in which os function
+        # they call.
+        stat_call = "stat" if follow_symlinks else "lstat"
+        pyscript = dedent("""
+            import os
+            import stat
+
+            print os.{stat_call}("{path}").st_ino
+            """).format(stat_call=stat_call, path=abs_path)
+
+        ino_proc = self._run_python(pyscript)
+        ino_proc.wait()
+        ino_text = ino_proc.stdout.getvalue().strip()
+        return int(ino_text)
+
+    def path_to_nlink(self, fs_path):
+        """
+        Return the link count (st_nlink) of a path, as reported by a remote
+        os.stat call.
+
+        :param fs_path: path joined onto self.mountpoint
+        :return: the st_nlink value as an int
+        """
+        target = os.path.join(self.mountpoint, fs_path)
+
+        template = dedent("""
+            import os
+            import stat
+
+            print os.stat("{path}").st_nlink
+            """)
+
+        nlink_proc = self._run_python(template.format(path=target))
+        nlink_proc.wait()
+        nlink_text = nlink_proc.stdout.getvalue().strip()
+        return int(nlink_text)
+
+    def ls(self, path=None):
+        """
+        Wrap ls: return a list of strings, one per output line.
+
+        :param path: optional argument passed to ls; omitted when falsy
+        :return: list of entries, [] when ls prints nothing
+        """
+        cmd = ["ls", path] if path else ["ls"]
+        listing = self.run_shell(cmd).stdout.getvalue().strip()
+
+        # Splitting an empty string would give [''] rather than [], so the
+        # empty case is handled separately.
+        if not listing:
+            return []
+        return listing.split("\n")
+
+    def setfattr(self, path, key, val):
+        """
+        Set an extended attribute by running the setfattr tool.
+
+        :param path: relative to mount point
+        :param key: xattr name (setfattr -n)
+        :param val: xattr value (setfattr -v)
+        :return: None
+        """
+        setfattr_cmd = ["setfattr"]
+        setfattr_cmd += ["-n", key]
+        setfattr_cmd += ["-v", val]
+        setfattr_cmd.append(path)
+        self.run_shell(setfattr_cmd)
+
+    def getfattr(self, path, attr):
+        """
+        Read one named xattr of a file by running the getfattr tool.
+
+        A failed command counts as "attribute missing" only when it exited
+        with status 1 and its stderr mentions "No such attribute"; any other
+        failure is re-raised.
+
+        :param path: file to query
+        :param attr: xattr name (getfattr -n)
+        :return: the command's stdout as a string, or None if the attribute
+                 is not found
+        """
+        getfattr_cmd = ["getfattr", "--only-values", "-n", attr, path]
+        p = self.run_shell(getfattr_cmd, wait=False)
+        try:
+            p.wait()
+        except CommandFailedError as e:
+            attr_missing = (e.exitstatus == 1 and
+                            "No such attribute" in p.stderr.getvalue())
+            if not attr_missing:
+                raise
+            return None
+
+        return p.stdout.getvalue()
+
+    def df(self):
+        """
+        Wrap df: return a dict of usage fields in bytes
+
+        Runs "df -B1 ." through run_shell and parses the second line of its
+        output (the first line is the column header).
+
+        :return: dict with integer "total", "used" and "available" values
+        """
+
+        p = self.run_shell(["df", "-B1", "."])
+        lines = p.stdout.getvalue().strip().split("\n")
+        # Log the raw output before parsing, so it is still on record if the
+        # parsing below fails.
+        log.warning(lines)
+
+        # Columns are: filesystem, total, used, available, ...  The
+        # filesystem name itself is not needed.
+        total, used, avail = lines[1].split()[1:4]
+
+        return {
+            "total": int(total),
+            "used": int(used),
+            "available": int(avail)
+        }
diff --git a/src/ceph/qa/tasks/cephfs/test_auto_repair.py b/src/ceph/qa/tasks/cephfs/test_auto_repair.py
new file mode 100644
index 0000000..c0aa2e4
--- /dev/null
+++ b/src/ceph/qa/tasks/cephfs/test_auto_repair.py
@@ -0,0 +1,90 @@
+
+"""
+Exercise the MDS's auto repair functions
+"""
+
+import logging
+import time
+
+from teuthology.orchestra.run import CommandFailedError
+from tasks.cephfs.cephfs_test_case import CephFSTestCase
+
+
+log = logging.getLogger(__name__)
+
+
+# Arbitrary timeouts for operations involving restarting
+# an MDS or waiting for it to come up
+# NOTE(review): this constant is not referenced anywhere else in this file;
+# confirm whether it is still needed.
+MDS_RESTART_GRACE = 60
+
+
+class TestMDSAutoRepair(CephFSTestCase):
+    """
+    Tests for the MDS repairing a missing backtrace and for its behaviour
+    while forced into read-only mode.
+    """
+    def test_backtrace_repair(self):
+        """
+        MDS should verify/fix backtrace on fetch dirfrag
+        """
+
+        self.mount_a.run_shell(["mkdir", "testdir1"])
+        self.mount_a.run_shell(["touch", "testdir1/testfile"])
+        # Name of the directory's first dirfrag object: <ino in hex>.00000000
+        dir_objname = "{:x}.00000000".format(self.mount_a.path_to_ino("testdir1"))
+
+        # drop inodes caps
+        self.mount_a.umount_wait()
+
+        # flush journal entries to dirfrag objects, and expire journal
+        self.fs.mds_asok(['flush', 'journal'])
+
+        # Restart the MDS to drop the metadata cache (because we expired the journal,
+        # nothing gets replayed into cache on restart)
+        self.fs.mds_stop()
+        self.fs.mds_fail_restart()
+        self.fs.wait_for_daemons()
+
+        # remove testdir1's backtrace
+        self.fs.rados(["rmxattr", dir_objname, "parent"])
+
+        # readdir (fetch dirfrag) should fix testdir1's backtrace
+        self.mount_a.mount()
+        self.mount_a.wait_until_mounted()
+        self.mount_a.run_shell(["ls", "testdir1"])
+
+        # flush journal entries to dirfrag objects
+        self.fs.mds_asok(['flush', 'journal'])
+
+        # check if backtrace exists
+        self.fs.rados(["getxattr", dir_objname, "parent"])
+
+    def test_mds_readonly(self):
+        """
+        Test that the MDS behaves correctly when it is read-only
+        """
+        # operation should succeed when MDS is not readonly
+        self.mount_a.run_shell(["touch", "test_file1"])
+        writer = self.mount_a.write_background(loop=True)
+
+        time.sleep(10)
+        self.assertFalse(writer.finished)
+
+        # force MDS to read-only mode
+        self.fs.mds_asok(['force_readonly'])
+        time.sleep(10)
+
+        # touching test file should fail
+        with self.assertRaises(CommandFailedError):
+            self.mount_a.run_shell(["touch", "test_file1"])
+
+        # background writer also should fail
+        self.assertTrue(writer.finished)
+
+        # The MDS should report its readonly health state to the mon
+        self.wait_for_health("MDS_READ_ONLY", timeout=30)
+
+        # restart mds to make it writable
+        self.fs.mds_fail_restart()
+        self.fs.wait_for_daemons()
+
+        self.wait_for_health_clear(timeout=30)
diff --git a/src/ceph/qa/tasks/cephfs/test_backtrace.py b/src/ceph/qa/tasks/cephfs/test_backtrace.py
new file mode 100644
index 0000000..af246a1
--- /dev/null
+++ b/src/ceph/qa/tasks/cephfs/test_backtrace.py
@@ -0,0 +1,78 @@
+
+from tasks.cephfs.cephfs_test_case import CephFSTestCase
+
+
+class TestBacktrace(CephFSTestCase):
+    """
+    Checks on the backtrace and layout metadata stored with file objects.
+    """
+    def test_backtrace(self):
+        """
+        That the 'parent' and 'layout' xattrs on the head objects of files
+        are updated correctly.
+        """
+
+        def ancestor_names(bt):
+            # dentry names recorded in a backtrace, in stored order
+            return [anc['dname'] for anc in bt['ancestors']]
+
+        def flush_journal():
+            self.fs.mds_asok(["flush", "journal"])
+
+        default_object_size = 4194304
+        bigger_object_size = 8388608
+
+        old_data_pool_name = self.fs.get_data_pool_name()
+        old_pool_id = self.fs.get_data_pool_id()
+
+        # Create a file for subsequent checks
+        self.mount_a.run_shell(["mkdir", "parent_a"])
+        self.mount_a.run_shell(["touch", "parent_a/alpha"])
+        file_ino = self.mount_a.path_to_ino("parent_a/alpha")
+
+        # That backtrace and layout are written after initial flush
+        flush_journal()
+        backtrace = self.fs.read_backtrace(file_ino)
+        self.assertEqual(['alpha', 'parent_a'], ancestor_names(backtrace))
+        layout = self.fs.read_layout(file_ino)
+        expected_layout = {
+            "stripe_unit": default_object_size,
+            "stripe_count": 1,
+            "object_size": default_object_size,
+            "pool_id": old_pool_id,
+            "pool_ns": "",
+        }
+        self.assertDictEqual(layout, expected_layout)
+        self.assertEqual(backtrace['pool'], old_pool_id)
+
+        # That backtrace is written after parentage changes
+        self.mount_a.run_shell(["mkdir", "parent_b"])
+        self.mount_a.run_shell(["mv", "parent_a/alpha", "parent_b/alpha"])
+
+        flush_journal()
+        backtrace = self.fs.read_backtrace(file_ino)
+        self.assertEqual(['alpha', 'parent_b'], ancestor_names(backtrace))
+
+        # Create a new data pool
+        new_pool_name = "data_new"
+        new_pool_id = self.fs.add_data_pool(new_pool_name)
+
+        # That an object which has switched pools gets its backtrace updated
+        self.mount_a.setfattr("./parent_b/alpha",
+                              "ceph.file.layout.pool", new_pool_name)
+        flush_journal()
+        bt_in_old_pool = self.fs.read_backtrace(file_ino, pool=old_data_pool_name)
+        self.assertEqual(bt_in_old_pool['pool'], new_pool_id)
+        bt_in_new_pool = self.fs.read_backtrace(file_ino, pool=new_pool_name)
+        self.assertEqual(bt_in_new_pool['pool'], new_pool_id)
+        layout_in_new_pool = self.fs.read_layout(file_ino, pool=new_pool_name)
+        self.assertEqual(layout_in_new_pool['pool_id'], new_pool_id)
+        self.assertEqual(layout_in_new_pool['pool_ns'], '')
+
+        # That subsequent linkage changes are only written to new pool backtrace
+        self.mount_a.run_shell(["mkdir", "parent_c"])
+        self.mount_a.run_shell(["mv", "parent_b/alpha", "parent_c/alpha"])
+        flush_journal()
+        bt_in_old_pool = self.fs.read_backtrace(file_ino, pool=old_data_pool_name)
+        self.assertEqual(['alpha', 'parent_b'], ancestor_names(bt_in_old_pool))
+        bt_in_new_pool = self.fs.read_backtrace(file_ino, pool=new_pool_name)
+        self.assertEqual(['alpha', 'parent_c'], ancestor_names(bt_in_new_pool))
+
+        # That layout is written to new pool after change to other field in layout
+        self.mount_a.setfattr("./parent_c/alpha",
+                              "ceph.file.layout.object_size", "8388608")
+
+        flush_journal()
+        layout_in_new_pool = self.fs.read_layout(file_ino, pool=new_pool_name)
+        self.assertEqual(layout_in_new_pool['object_size'], bigger_object_size)
+
+        # ...but not to the old pool: the old pool's backtrace points to the new pool, and that's enough,
+        # we don't update the layout in all the old pools whenever it changes
+        layout_in_old_pool = self.fs.read_layout(file_ino, pool=old_data_pool_name)
+        self.assertEqual(layout_in_old_pool['object_size'], default_object_size)
diff --git a/src/ceph/qa/tasks/cephfs/test_cap_flush.py b/src/ceph/qa/tasks/cephfs/test_cap_flush.py
new file mode 100644
index 0000000..1cd102f
--- /dev/null
+++ b/src/ceph/qa/tasks/cephfs/test_cap_flush.py
@@ -0,0 +1,64 @@
+
+import os
+import time
+from textwrap import dedent
+from unittest import SkipTest
+from tasks.cephfs.fuse_mount import FuseMount
+from tasks.cephfs.cephfs_test_case import CephFSTestCase, for_teuthology
+
+class TestCapFlush(CephFSTestCase):
+    """
+    Test that a client's cap flush survives an MDS restart during replay.
+    """
+    @for_teuthology
+    def test_replay_create(self):
+        """
+        MDS starts to handle client caps when it enters clientreplay stage.
+        When handling a client cap in clientreplay stage, it's possible that
+        corresponding inode does not exist because the client request which
+        creates inode hasn't been replayed.
+        """
+
+        if not isinstance(self.mount_a, FuseMount):
+            raise SkipTest("Require FUSE client to inject client release failure")
+
+        dir_path = os.path.join(self.mount_a.mountpoint, "testdir")
+        # Create the directory, make it mode 0777 and fsync it
+        py_script = dedent("""
+            import os
+            os.mkdir("{0}")
+            fd = os.open("{0}", os.O_RDONLY)
+            os.fchmod(fd, 0777)
+            os.fsync(fd)
+            """).format(dir_path)
+        self.mount_a.run_python(py_script)
+
+        self.fs.mds_asok(["flush", "journal"])
+
+        # client will only get unsafe replay
+        self.fs.mds_asok(["config", "set", "mds_log_pause", "1"])
+
+        file_name = "testfile"
+        file_path = dir_path + "/" + file_name
+
+        # Create a file and modify its mode. ceph-fuse will mark Ax cap dirty
+        # (the script first switches to uid/gid 65534)
+        py_script = dedent("""
+            import os
+            os.chdir("{0}")
+            os.setgid(65534)
+            os.setuid(65534)
+            fd = os.open("{1}", os.O_CREAT | os.O_RDWR, 0644)
+            os.fchmod(fd, 0640)
+            """).format(dir_path, file_name)
+        self.mount_a.run_python(py_script)
+
+        # Modify file mode by different user. ceph-fuse will send a setattr request
+        self.mount_a.run_shell(["chmod", "600", file_path], wait=False)
+
+        time.sleep(10)
+
+        # Restart mds. Client will re-send the unsafe request and cap flush
+        self.fs.mds_stop()
+        self.fs.mds_fail_restart()
+        self.fs.wait_for_daemons()
+
+        # '-c' and '%a' are separate arguments; the original relied on an
+        # accidental implicit string concatenation ('-c' '%a' -> '-c%a').
+        mode = self.mount_a.run_shell(['stat', '-c', '%a', file_path]).stdout.getvalue().strip()
+        # If the cap flush get dropped, mode should be 0644.
+        # (Ax cap stays in dirty state, which prevents setattr reply from updating file mode)
+        self.assertEqual(mode, "600")
diff --git a/src/ceph/qa/tasks/cephfs/test_client_limits.py b/src/ceph/qa/tasks/cephfs/test_client_limits.py
new file mode 100644
index 0000000..cb5e3a4
--- /dev/null
+++ b/src/ceph/qa/tasks/cephfs/test_client_limits.py
@@ -0,0 +1,239 @@
+
+"""
+Exercise the MDS's behaviour when clients and the MDCache reach or
+exceed the limits of how many caps/inodes they should hold.
+"""
+
+import logging
+from textwrap import dedent
+from unittest import SkipTest
+from teuthology.orchestra.run import CommandFailedError
+from tasks.cephfs.cephfs_test_case import CephFSTestCase, needs_trimming
+from tasks.cephfs.fuse_mount import FuseMount
+import os
+
+
+log = logging.getLogger(__name__)
+
+
+# Arbitrary timeouts for operations involving restarting
+# an MDS or waiting for it to come up
+# NOTE(review): not referenced anywhere else in this file; confirm whether
+# it is still needed.
+MDS_RESTART_GRACE = 60
+
+# Hardcoded values from Server::recall_client_state
+# NOTE(review): neither constant is referenced in this file; the tests read
+# mds_min_caps_per_client / mds_max_ratio_caps_per_client from the MDS
+# config instead -- confirm whether these are stale.
+CAP_RECALL_RATIO = 0.8
+CAP_RECALL_MIN = 100
+
+
+class TestClientLimits(CephFSTestCase):
+    """
+    Tests of MDS health warnings and cap/cache trimming when clients hold
+    more caps or cache entries than the configured limits allow.
+    """
+    REQUIRE_KCLIENT_REMOTE = True
+    CLIENTS_REQUIRED = 2
+
+    def _test_client_pin(self, use_subdir, open_files):
+        """
+        When a client pins an inode in its cache, for example because the file is held open,
+        it should reject requests from the MDS to trim these caps.  The MDS should complain
+        to the user that it is unable to enforce its cache size limits because of this
+        objectionable client.
+
+        :param use_subdir: whether to put test files in a subdir or use root
+        :param open_files: number of files the client holds open; the MDS
+                           cache size is set to half of this
+        """
+
+        # Floor division: the cache size is an integer number of inodes
+        cache_size = open_files // 2
+
+        self.set_conf('mds', 'mds cache size', cache_size)
+        self.fs.mds_fail_restart()
+        self.fs.wait_for_daemons()
+
+        mds_min_caps_per_client = int(self.fs.get_config("mds_min_caps_per_client"))
+        self.assertTrue(open_files >= mds_min_caps_per_client)
+        mds_max_ratio_caps_per_client = float(self.fs.get_config("mds_max_ratio_caps_per_client"))
+
+        mount_a_client_id = self.mount_a.get_global_id()
+        path = "subdir/mount_a" if use_subdir else "mount_a"
+        open_proc = self.mount_a.open_n_background(path, open_files)
+
+        # Client should now hold:
+        # `open_files` caps for the open files
+        # 1 cap for root
+        # 1 cap for subdir
+        self.wait_until_equal(lambda: self.get_session(mount_a_client_id)['num_caps'],
+                              open_files + (2 if use_subdir else 1),
+                              timeout=600,
+                              reject_fn=lambda x: x > open_files + 2)
+
+        # MDS should not be happy about that, as the client is failing to comply
+        # with the SESSION_RECALL messages it is being sent
+        mds_recall_state_timeout = float(self.fs.get_config("mds_recall_state_timeout"))
+        self.wait_for_health("MDS_CLIENT_RECALL", mds_recall_state_timeout+10)
+
+        # We can also test that the MDS health warning for oversized
+        # cache is functioning as intended.
+        self.wait_for_health("MDS_CACHE_OVERSIZED",
+                mds_recall_state_timeout + 10)
+
+        # When the client closes the files, it should retain only as many caps as allowed
+        # under the SESSION_RECALL policy
+        log.info("Terminating process holding files open")
+        open_proc.stdin.close()
+        try:
+            open_proc.wait()
+        except CommandFailedError:
+            # We killed it, so it raises an error
+            pass
+
+        # The remaining caps should comply with the numbers sent from MDS in SESSION_RECALL message,
+        # which depend on the caps outstanding, cache size and overall ratio
+        recall_expected_value = int((1.0-mds_max_ratio_caps_per_client)*(open_files+2))
+        def expected_caps():
+            # True once the cap count is at the minimum or within 5% of the
+            # expected post-recall value; dropping below the minimum is fatal
+            num_caps = self.get_session(mount_a_client_id)['num_caps']
+            if num_caps < mds_min_caps_per_client:
+                raise RuntimeError("client caps fell below min!")
+            elif num_caps == mds_min_caps_per_client:
+                return True
+            elif recall_expected_value*.95 <= num_caps <= recall_expected_value*1.05:
+                return True
+            else:
+                return False
+
+        self.wait_until_true(expected_caps, timeout=60)
+
+    @needs_trimming
+    def test_client_pin_root(self):
+        self._test_client_pin(False, 400)
+
+    @needs_trimming
+    def test_client_pin(self):
+        self._test_client_pin(True, 800)
+
+    @needs_trimming
+    def test_client_pin_mincaps(self):
+        self._test_client_pin(True, 200)
+
+    def test_client_release_bug(self):
+        """
+        When a client has a bug (which we will simulate) preventing it from releasing caps,
+        the MDS should notice that releases are not being sent promptly, and generate a health
+        metric to that effect.
+        """
+
+        # The debug hook to inject the failure only exists in the fuse client
+        if not isinstance(self.mount_a, FuseMount):
+            raise SkipTest("Require FUSE client to inject client release failure")
+
+        self.set_conf('client.{0}'.format(self.mount_a.client_id), 'client inject release failure', 'true')
+        self.mount_a.teardown()
+        self.mount_a.mount()
+        self.mount_a.wait_until_mounted()
+        mount_a_client_id = self.mount_a.get_global_id()
+
+        # Client A creates a file.  He will hold the write caps on the file, and later (simulated bug) fail
+        # to comply with the MDSs request to release that cap
+        self.mount_a.run_shell(["touch", "file1"])
+
+        # Client B tries to write to the file that client A created
+        rproc = self.mount_b.write_background("file1")
+
+        # After mds_revoke_cap_timeout, we should see a health warning (extra lag from
+        # MDS beacon period)
+        mds_revoke_cap_timeout = float(self.fs.get_config("mds_revoke_cap_timeout"))
+        self.wait_for_health("MDS_CLIENT_LATE_RELEASE", mds_revoke_cap_timeout + 10)
+
+        # Client B should still be stuck
+        self.assertFalse(rproc.finished)
+
+        # Kill client A
+        self.mount_a.kill()
+        self.mount_a.kill_cleanup()
+
+        # Client B should complete
+        self.fs.mds_asok(['session', 'evict', "%s" % mount_a_client_id])
+        rproc.wait()
+
+    def test_client_oldest_tid(self):
+        """
+        When a client does not advance its oldest tid, the MDS should notice that
+        and generate health warnings.
+        """
+
+        # num of requests client issues
+        max_requests = 1000
+
+        # The debug hook to inject the failure only exists in the fuse client
+        if not isinstance(self.mount_a, FuseMount):
+            raise SkipTest("Require FUSE client to inject client release failure")
+
+        self.set_conf('client', 'client inject fixed oldest tid', 'true')
+        self.mount_a.teardown()
+        self.mount_a.mount()
+        self.mount_a.wait_until_mounted()
+
+        self.fs.mds_asok(['config', 'set', 'mds_max_completed_requests', '{0}'.format(max_requests)])
+
+        # Create lots of files
+        self.mount_a.create_n_files("testdir/file1", max_requests + 100)
+
+        # Create a few files synchronously. This makes sure previous requests are completed
+        self.mount_a.create_n_files("testdir/file2", 5, True)
+
+        # Wait for the health warnings. Assume mds can handle 10 request per second at least
+        self.wait_for_health("MDS_CLIENT_OLDEST_TID", max_requests // 10)
+
+    def _test_client_cache_size(self, mount_subdir):
+        """
+        check if client invalidate kernel dcache according to its cache size config
+
+        :param mount_subdir: if True, remount the client on /subdir first
+        """
+
+        # The debug hook to inject the failure only exists in the fuse client
+        if not isinstance(self.mount_a, FuseMount):
+            raise SkipTest("Require FUSE client to inject client release failure")
+
+        if mount_subdir:
+            # fuse assigns a fix inode number (1) to root inode. But in mounting into
+            # subdir case, the actual inode number of root is not 1. This mismatch
+            # confuses fuse_lowlevel_notify_inval_entry() when invalidating dentries
+            # in root directory.
+            self.mount_a.run_shell(["mkdir", "subdir"])
+            self.mount_a.umount_wait()
+            self.set_conf('client', 'client mountpoint', '/subdir')
+            self.mount_a.mount()
+            self.mount_a.wait_until_mounted()
+            root_ino = self.mount_a.path_to_ino(".")
+            self.assertEqual(root_ino, 1)
+
+        dir_path = os.path.join(self.mount_a.mountpoint, "testdir")
+
+        mkdir_script = dedent("""
+            import os
+            os.mkdir("{path}")
+            for n in range(0, {num_dirs}):
+                os.mkdir("{path}/dir{{0}}".format(n))
+            """)
+
+        num_dirs = 1000
+        self.mount_a.run_python(mkdir_script.format(path=dir_path, num_dirs=num_dirs))
+        self.mount_a.run_shell(["sync"])
+
+        dentry_count, dentry_pinned_count = self.mount_a.get_dentry_count()
+        self.assertGreaterEqual(dentry_count, num_dirs)
+        self.assertGreaterEqual(dentry_pinned_count, num_dirs)
+
+        # Floor division: the cache size is an integer number of dentries
+        cache_size = num_dirs // 10
+        self.mount_a.set_cache_size(cache_size)
+
+        def trimmed():
+            # True once both dentry counts are within the new cache size
+            dentry_count, dentry_pinned_count = self.mount_a.get_dentry_count()
+            log.info("waiting, dentry_count, dentry_pinned_count: {0}, {1}".format(
+                dentry_count, dentry_pinned_count
+            ))
+            if dentry_count > cache_size or dentry_pinned_count > cache_size:
+                return False
+
+            return True
+
+        self.wait_until_true(trimmed, 30)
+
+    @needs_trimming
+    def test_client_cache_size(self):
+        self._test_client_cache_size(False)
+        self._test_client_cache_size(True)
diff --git a/src/ceph/qa/tasks/cephfs/test_client_recovery.py b/src/ceph/qa/tasks/cephfs/test_client_recovery.py
new file mode 100644
index 0000000..fd58c14
--- /dev/null
+++ b/src/ceph/qa/tasks/cephfs/test_client_recovery.py
@@ -0,0 +1,474 @@
+
+"""
+Teuthology task for exercising CephFS client recovery
+"""
+
+import logging
+from textwrap import dedent
+import time
+import distutils.version as version
+import re
+import os
+
+from teuthology.orchestra.run import CommandFailedError, ConnectionLostError
+from tasks.cephfs.cephfs_test_case import CephFSTestCase
+from teuthology.packaging import get_package_version
+
+
+log = logging.getLogger(__name__)
+
+
+# Arbitrary timeouts for operations involving restarting
+# an MDS or waiting for it to come up
+MDS_RESTART_GRACE = 60
+
+
+class TestClientNetworkRecovery(CephFSTestCase):
+    """
+    Client behaviour across a temporary loss of connectivity.
+    """
+    REQUIRE_KCLIENT_REMOTE = True
+    REQUIRE_ONE_CLIENT_REMOTE = True
+    CLIENTS_REQUIRED = 2
+
+    # NOTE(review): presumably CephFSTestCase fills in the attributes named
+    # here before each test runs -- confirm in the base class
+    LOAD_SETTINGS = ["mds_session_timeout", "mds_reconnect_timeout", "ms_max_backoff"]
+
+    # Environment references
+    mds_session_timeout = None
+    mds_reconnect_timeout = None
+    ms_max_backoff = None
+
+    def test_network_death(self):
+        """
+        Simulate software freeze or temporary network failure.
+
+        Check that the client blocks I/O during failure, and completes
+        I/O after failure.
+        """
+
+        # We only need one client
+        self.mount_b.umount_wait()
+
+        # Initially our one client session should be visible
+        client_id = self.mount_a.get_global_id()
+        ls_data = self._session_list()
+        self.assert_session_count(1, ls_data)
+        self.assertEqual(ls_data[0]['id'], client_id)
+        self.assert_session_state(client_id, "open")
+
+        # ...and capable of doing I/O without blocking
+        self.mount_a.create_files()
+
+        # ...but if we turn off the network
+        self.fs.set_clients_block(True)
+
+        # ...and try and start an I/O
+        write_blocked = self.mount_a.write_background()
+
+        # ...then it should block
+        self.assertFalse(write_blocked.finished)
+        # The session is still "open" at first; it only turns "stale" after
+        # the sleep below
+        self.assert_session_state(client_id, "open")
+        time.sleep(self.mds_session_timeout * 1.5)  # Long enough for MDS to consider session stale
+        self.assertFalse(write_blocked.finished)
+        self.assert_session_state(client_id, "stale")
+
+        # ...until we re-enable I/O
+        self.fs.set_clients_block(False)
+
+        # ...when it should complete promptly
+        # (allow up to twice ms_max_backoff; the elapsed time is only logged)
+        a = time.time()
+        self.wait_until_true(lambda: write_blocked.finished, self.ms_max_backoff * 2)
+        write_blocked.wait()  # Already know we're finished, wait() to raise exception on errors
+        recovery_time = time.time() - a
+        log.info("recovery time: {0}".format(recovery_time))
+        self.assert_session_state(client_id, "open")
+
+
+class TestClientRecovery(CephFSTestCase):
+    REQUIRE_KCLIENT_REMOTE = True
+    CLIENTS_REQUIRED = 2
+
+    LOAD_SETTINGS = ["mds_session_timeout", "mds_reconnect_timeout", "ms_max_backoff"]
+
+    # Environment references
+    mds_session_timeout = None
+    mds_reconnect_timeout = None
+    ms_max_backoff = None
+
+    def test_basic(self):
+        """
+        Two clients come up healthy, see each other's files, and both show
+        up in the session listing.
+        """
+        # Files written through mount_a must check out locally and, after
+        # mount_a has been unmounted, through mount_b as well
+        writer, reader = self.mount_a, self.mount_b
+        writer.create_files()
+        writer.check_files()
+        writer.umount_wait()
+
+        reader.check_files()
+
+        writer.mount()
+        writer.wait_until_mounted()
+
+        # The admin socket interface should be reporting exactly our two
+        # sessions
+        sessions = self._session_list()
+        self.assert_session_count(2, sessions)
+
+        reported_ids = {session['id'] for session in sessions}
+        expected_ids = {self.mount_a.get_global_id(), self.mount_b.get_global_id()}
+        self.assertSetEqual(reported_ids, expected_ids)
+
+    def test_restart(self):
+        """
+        After an MDS restart, both clients reconnect and continue to
+        handle I/O.
+        """
+        self.fs.mds_fail_restart()
+        self.fs.wait_for_state('up:active', timeout=MDS_RESTART_GRACE)
+
+        # mount_a first, then mount_b
+        for mount in (self.mount_a, self.mount_b):
+            mount.create_destroy()
+
+    def _session_num_caps(self, client_id):
+        """
+        Return, as an int, the 'num_caps' field of client_id's session in
+        the MDS 'session ls' output.
+        """
+        sessions = self._session_by_id(self.fs.mds_asok(['session', 'ls']))
+        # An unknown client_id falls back to a num_caps of None, which int()
+        # then rejects with a TypeError
+        session = sessions.get(client_id, {'num_caps': None})
+        return int(session['num_caps'])
+
+    def test_reconnect_timeout(self):
+        """
+        That an MDS restarted while one client has gone away spends more
+        than half of mds_reconnect_timeout in reconnect before going active,
+        and that the absent client can mount again afterwards.
+        """
+        # Reconnect timeout
+        # =================
+        # Check that if I stop an MDS and a client goes away, the MDS waits
+        # for the reconnect period
+        self.fs.mds_stop()
+        self.fs.mds_fail()
+
+        # Record the client's id before it goes away; it is needed below to
+        # look up the session it leaves behind
+        mount_a_client_id = self.mount_a.get_global_id()
+        self.mount_a.umount_wait(force=True)
+
+        self.fs.mds_restart()
+
+        self.fs.wait_for_state('up:reconnect', reject='up:active', timeout=MDS_RESTART_GRACE)
+        # Check that the MDS locally reports its state correctly
+        status = self.fs.mds_asok(['status'])
+        self.assertIn("reconnect_status", status)
+
+        ls_data = self._session_list()
+        self.assert_session_count(2, ls_data)
+
+        # The session for the dead client should have the 'reconnect' flag set
+        self.assertTrue(self.get_session(mount_a_client_id)['reconnecting'])
+
+        # Wait for the reconnect state to clear, this should take the
+        # reconnect timeout period.
+        # NOTE(review): wait_for_state's return value is treated here as the
+        # time spent waiting -- confirm against its definition
+        in_reconnect_for = self.fs.wait_for_state('up:active', timeout=self.mds_reconnect_timeout * 2)
+        # Check that the period we waited to enter active is within a factor
+        # of two of the reconnect timeout.
+        self.assertGreater(in_reconnect_for, self.mds_reconnect_timeout / 2,
+                           "Should have been in reconnect phase for {0} but only took {1}".format(
+                               self.mds_reconnect_timeout, in_reconnect_for
+                           ))
+
+        # Only the surviving client's session should remain
+        self.assert_session_count(1)
+
+        # Check that the client that timed out during reconnect can
+        # mount again and do I/O
+        self.mount_a.mount()
+        self.mount_a.wait_until_mounted()
+        self.mount_a.create_destroy()
+
+        self.assert_session_count(2)
+
+    def test_reconnect_eviction(self):
+        """
+        That evicting a dead client's session while the MDS is in reconnect
+        lets the MDS go active in under half of mds_reconnect_timeout.
+        """
+        # Eviction during reconnect
+        # =========================
+        # Grab the id while the client is still alive; it is needed for the
+        # eviction below
+        mount_a_client_id = self.mount_a.get_global_id()
+
+        self.fs.mds_stop()
+        self.fs.mds_fail()
+
+        # The mount goes away while the MDS is offline
+        self.mount_a.kill()
+
+        self.fs.mds_restart()
+
+        # Enter reconnect phase
+        self.fs.wait_for_state('up:reconnect', reject='up:active', timeout=MDS_RESTART_GRACE)
+        self.assert_session_count(2)
+
+        # Evict the stuck client
+        self.fs.mds_asok(['session', 'evict', "%s" % mount_a_client_id])
+        self.assert_session_count(1)
+
+        # Observe that we proceed to active phase without waiting full reconnect timeout
+        # NOTE(review): wait_for_state's return value is treated here as the
+        # time spent waiting -- confirm against its definition
+        evict_til_active = self.fs.wait_for_state('up:active', timeout=MDS_RESTART_GRACE)
+        # Once we evict the troublemaker, the reconnect phase should complete
+        # in well under the reconnect timeout.
+        self.assertLess(evict_til_active, self.mds_reconnect_timeout * 0.5,
+                        "reconnect did not complete soon enough after eviction, took {0}".format(
+                            evict_til_active
+                        ))
+
+        # We killed earlier so must clean up before trying to use again
+        self.mount_a.kill_cleanup()
+
+        # Bring the client back
+        self.mount_a.mount()
+        self.mount_a.wait_until_mounted()
+        self.mount_a.create_destroy()
+
+    def test_stale_caps(self):
+        """
+        That a capability held by a dead client is handed over to a waiting
+        client after approximately mds_session_timeout (between half and
+        twice that value).
+        """
+        # Capability release from stale session
+        # =====================================
+        cap_holder = self.mount_a.open_background()
+
+        # Wait for the file to be visible from another client, indicating
+        # that mount_a has completed its network ops
+        self.mount_b.wait_for_visible()
+
+        # Simulate client death
+        self.mount_a.kill()
+
+        try:
+            # Now, after mds_session_timeout seconds, the waiter should
+            # complete their operation when the MDS marks the holder's
+            # session stale.
+            cap_waiter = self.mount_b.write_background()
+            a = time.time()
+            cap_waiter.wait()
+            b = time.time()
+
+            # Should have succeeded
+            self.assertEqual(cap_waiter.exitstatus, 0)
+
+            # How long the second client's write was held up
+            cap_waited = b - a
+            log.info("cap_waiter waited {0}s".format(cap_waited))
+            self.assertTrue(self.mds_session_timeout / 2.0 <= cap_waited <= self.mds_session_timeout * 2.0,
+                            "Capability handover took {0}, expected approx {1}".format(
+                                cap_waited, self.mds_session_timeout
+                            ))
+
+            # Closing stdin and then reaping the holder; errors from the
+            # killed client are expected and ignored
+            cap_holder.stdin.close()
+            try:
+                cap_holder.wait()
+            except (CommandFailedError, ConnectionLostError):
+                # We killed it (and possibly its node), so it raises an error
+                pass
+        finally:
+            # teardown() doesn't quite handle this case cleanly, so help it out
+            self.mount_a.kill_cleanup()
+
+        # Restore mount_a so the test leaves both clients mounted
+        self.mount_a.mount()
+        self.mount_a.wait_until_mounted()
+
+    def test_evicted_caps(self):
+        """
+        That evicting a dead client's session hands its capability to a
+        waiting client in less than half of mds_session_timeout.
+        """
+        # Eviction while holding a capability
+        # ===================================
+
+        # Take out a write capability on a file on client A,
+        # and then immediately kill it.
+        cap_holder = self.mount_a.open_background()
+        mount_a_client_id = self.mount_a.get_global_id()
+
+        # Wait for the file to be visible from another client, indicating
+        # that mount_a has completed its network ops
+        self.mount_b.wait_for_visible()
+
+        # Simulate client death
+        self.mount_a.kill()
+
+        try:
+            # The waiter should get stuck waiting for the capability
+            # held on the MDS by the now-dead client A
+            cap_waiter = self.mount_b.write_background()
+            time.sleep(5)
+            self.assertFalse(cap_waiter.finished)
+
+            self.fs.mds_asok(['session', 'evict', "%s" % mount_a_client_id])
+            # Now, because I evicted the old holder of the capability, it should
+            # immediately get handed over to the waiter
+            a = time.time()
+            cap_waiter.wait()
+            b = time.time()
+            # Time from the eviction until the waiter's write finished
+            cap_waited = b - a
+            log.info("cap_waiter waited {0}s".format(cap_waited))
+            # This is the check that it happened 'now' rather than waiting
+            # for the session timeout
+            self.assertLess(cap_waited, self.mds_session_timeout / 2.0,
+                            "Capability handover took {0}, expected less than {1}".format(
+                                cap_waited, self.mds_session_timeout / 2.0
+                            ))
+
+            # Closing stdin and then reaping the holder; errors from the
+            # killed client are expected and ignored
+            cap_holder.stdin.close()
+            try:
+                cap_holder.wait()
+            except (CommandFailedError, ConnectionLostError):
+                # We killed it (and possibly its node), so it raises an error
+                pass
+        finally:
+            # Clean up after the kill() above even if an assertion failed
+            self.mount_a.kill_cleanup()
+
+        # Restore mount_a so the test leaves both clients mounted
+        self.mount_a.mount()
+        self.mount_a.wait_until_mounted()
+
+    def test_trim_caps(self):
+        """
+        A client should trim its capabilities when it reconnects to a
+        restarted MDS.
+        """
+        file_count = 500
+
+        # Create lots of files through mount_a...
+        for index in range(file_count):
+            self.mount_a.run_shell(["touch", "f{0}".format(index)])
+
+        # ...and list them through mount_b to populate its cache
+        self.mount_b.run_shell(["ls", "-l"])
+
+        b_client_id = self.mount_b.get_global_id()
+        caps_before = self._session_num_caps(b_client_id)
+        self.assertGreaterEqual(caps_before, file_count)
+
+        # Restart MDS. client should trim its cache when reconnecting to the MDS
+        self.fs.mds_fail_restart()
+        self.fs.wait_for_state('up:active', timeout=MDS_RESTART_GRACE)
+
+        caps_after = self._session_num_caps(b_client_id)
+        failure_msg = "should have less than {0} capabilities, have {1}".format(
+            file_count, caps_after
+        )
+        self.assertLess(caps_after, file_count, failure_msg)
+
+    def _is_flockable(self):
+        """
+        Report whether the "fuse" package on both clients' remotes is at
+        version 2.9 or later, logging the outcome either way.
+
+        :return: True if both versions are >= 2.9, False otherwise
+        """
+        a_raw = get_package_version(self.mount_a.client_remote, "fuse")
+        b_raw = get_package_version(self.mount_b.client_remote, "fuse")
+
+        # Only the leading run of digits and dots takes part in the
+        # comparison; whatever follows it in the version string is ignored
+        leading_version = re.compile(r"[0-9\.]+")
+        a_match = leading_version.match(a_raw)
+        self.assertTrue(a_match)
+        b_match = leading_version.match(b_raw)
+        self.assertTrue(b_match)
+
+        a_parsed = version.StrictVersion(a_match.group())
+        b_parsed = version.StrictVersion(b_match.group())
+        required = version.StrictVersion("2.9")
+
+        if not (a_parsed >= required and b_parsed >= required):
+            log.info("not testing flock locks, machines have versions {av} and {bv}".format(
+                av=a_raw, bv=b_raw))
+            return False
+
+        log.info("flock locks are available")
+        return True
+
+    def test_filelock(self):
+        """
+        Check that file lock doesn't get lost after an MDS restart
+        """
+
+        # flock is only exercised when _is_flockable() says both clients
+        # support it
+        flockable = self._is_flockable()
+        lock_holder = self.mount_a.lock_background(do_flock=flockable)
+
+        # Check the lock from mount_b once before the restart...
+        self.mount_b.wait_for_visible("background_file-2")
+        self.mount_b.check_filelock(do_flock=flockable)
+
+        self.fs.mds_fail_restart()
+        self.fs.wait_for_state('up:active', timeout=MDS_RESTART_GRACE)
+
+        # ...and again after it
+        self.mount_b.check_filelock(do_flock=flockable)
+
+        # Tear down the background process
+        lock_holder.stdin.close()
+        try:
+            lock_holder.wait()
+        except (CommandFailedError, ConnectionLostError):
+            # We killed it, so it raises an error
+            pass
+
+    def test_filelock_eviction(self):
+        """
+        Check that file lock held by evicted client is given to
+        waiting client.
+        """
+        if not self._is_flockable():
+            self.skipTest("flock is not available")
+
+        # mount_a takes the lock in the background and keeps holding it
+        lock_holder = self.mount_a.lock_background()
+        self.mount_b.wait_for_visible("background_file-2")
+        self.mount_b.check_filelock()
+
+        # mount_b now tries to take the same lock
+        lock_taker = self.mount_b.lock_and_release()
+        # Check the taker is waiting (doesn't get it immediately)
+        time.sleep(2)
+        self.assertFalse(lock_holder.finished)
+        self.assertFalse(lock_taker.finished)
+
+        try:
+            mount_a_client_id = self.mount_a.get_global_id()
+            self.fs.mds_asok(['session', 'evict', "%s" % mount_a_client_id])
+
+            # Evicting mount_a should let mount_b's attempt to take the lock
+            # succeed
+            self.wait_until_true(lambda: lock_taker.finished, timeout=10)
+        finally:
+            # teardown() doesn't quite handle this case cleanly, so help it out
+            self.mount_a.kill()
+            self.mount_a.kill_cleanup()
+
+        # Bring the client back
+        self.mount_a.mount()
+        self.mount_a.wait_until_mounted()
+
+    def test_dir_fsync(self):
+        """
+        Run the fsync visibility check with the fsync issued on the
+        directory's file descriptor.
+        """
+        # Indented with spaces like the rest of the file: a tab here mixes
+        # tabs and spaces, which Python 3 rejects with a TabError
+        self._test_fsync(True)
+
+    def test_create_fsync(self):
+        """
+        Run the fsync visibility check with the fsync issued on the newly
+        created child file.
+        """
+        # Indented with spaces like the rest of the file: a tab here mixes
+        # tabs and spaces, which Python 3 rejects with a TabError
+        self._test_fsync(False)
+
+    def _test_fsync(self, dirfsync):
+        """
+        That calls to fsync guarantee visibility of metadata to another
+        client immediately after the fsyncing client dies.
+
+        :param dirfsync: if True the script run on client A fsyncs the
+                         directory's fd, otherwise it fsyncs the child file
+        """
+
+        # Leave this guy out until he's needed
+        self.mount_b.umount_wait()
+
+        # Create dir + child dentry on client A, and fsync the dir
+        # (or the child file instead, when dirfsync is False)
+        path = os.path.join(self.mount_a.mountpoint, "subdir")
+        self.mount_a.run_python(
+            dedent("""
+                import os
+                import time
+
+                path = "{path}"
+
+                print "Starting creation..."
+                start = time.time()
+
+                os.mkdir(path)
+                dfd = os.open(path, os.O_DIRECTORY)
+
+                fd = open(os.path.join(path, "childfile"), "w")
+                print "Finished creation in {{0}}s".format(time.time() - start)
+
+                print "Starting fsync..."
+                start = time.time()
+                if {dirfsync}:
+                    os.fsync(dfd)
+                else:
+                    os.fsync(fd)
+                print "Finished fsync in {{0}}s".format(time.time() - start)
+            """.format(path=path,dirfsync=str(dirfsync)))
+        )
+
+        # Immediately kill the MDS and then client A
+        self.fs.mds_stop()
+        self.fs.mds_fail()
+        self.mount_a.kill()
+        self.mount_a.kill_cleanup()
+
+        # Restart the MDS.  Wait for it to come up, it'll have to time out in clientreplay
+        self.fs.mds_restart()
+        log.info("Waiting for reconnect...")
+        self.fs.wait_for_state("up:reconnect")
+        log.info("Waiting for active...")
+        # The usual restart grace is extended by mds_reconnect_timeout here
+        self.fs.wait_for_state("up:active", timeout=MDS_RESTART_GRACE + self.mds_reconnect_timeout)
+        log.info("Reached active...")
+
+        # Is the child dentry visible from mount B?
+        self.mount_b.mount()
+        self.mount_b.wait_until_mounted()
+        self.mount_b.run_shell(["ls", "subdir/childfile"])
diff --git a/src/ceph/qa/tasks/cephfs/test_config_commands.py b/src/ceph/qa/tasks/cephfs/test_config_commands.py
new file mode 100644
index 0000000..ce0619f
--- /dev/null
+++ b/src/ceph/qa/tasks/cephfs/test_config_commands.py
@@ -0,0 +1,63 @@
+
+from unittest import case
+from tasks.cephfs.cephfs_test_case import CephFSTestCase
+from tasks.cephfs.fuse_mount import FuseMount
+
+
+class TestConfigCommands(CephFSTestCase):
+    """
+    Exercise the runtime config modification operations on daemons and
+    clients, which otherwise see little use.
+    """
+
+    CLIENTS_REQUIRED = 1
+    MDSS_REQUIRED = 1
+
+    def test_client_config(self):
+        """
+        That a FUSE client accepts an admin socket "config set" and reports
+        the new value back through "config get".
+        """
+        if not isinstance(self.mount_a, FuseMount):
+            raise case.SkipTest("Test only applies to FUSE clients")
+
+        option, value = "client_cache_size", "123"
+        self.mount_a.admin_socket(['config', 'set', option, value])
+        reported = self.mount_a.admin_socket(['config', 'get', option])
+        self.assertEqual(reported[option], value)
+
+        self.mount_a.write_n_mb("file.bin", 1)
+
+        # Implicitly asserting that things don't have lockdep error in shutdown
+        self.mount_a.umount_wait(require_clean=True)
+        self.fs.mds_stop()
+
+    def test_mds_config_asok(self):
+        """
+        That the MDS accepts an admin socket "config set" and reports the
+        new value back through "config get".
+        """
+        option, value = "mds_max_purge_ops", "123"
+        self.fs.mds_asok(['config', 'set', option, value])
+        reported = self.fs.mds_asok(['config', 'get', option])
+        self.assertEqual(reported[option], value)
+
+        # Implicitly asserting that things don't have lockdep error in shutdown
+        self.mount_a.umount_wait(require_clean=True)
+        self.fs.mds_stop()
+
+    def test_mds_config_tell(self):
+        """
+        That a setting injected into the MDS with "tell ... injectargs" is
+        visible through the admin socket "config get".
+        """
+        option, value = "mds_max_purge_ops", "123"
+
+        target = "mds.{0}".format(self.fs.get_lone_mds_id())
+        injected = "--{0}={1}".format(option, value)
+        self.fs.mon_manager.raw_cluster_cmd("tell", target, "injectargs", injected)
+
+        # Read it back with asok because there is no `tell` equivalent
+        reported = self.fs.mds_asok(['config', 'get', option])
+        self.assertEqual(reported[option], value)
+
+        # Implicitly asserting that things don't have lockdep error in shutdown
+        self.mount_a.umount_wait(require_clean=True)
+        self.fs.mds_stop()
diff --git a/src/ceph/qa/tasks/cephfs/test_damage.py b/src/ceph/qa/tasks/cephfs/test_damage.py
new file mode 100644
index 0000000..380b49c
--- /dev/null
+++ b/src/ceph/qa/tasks/cephfs/test_damage.py
@@ -0,0 +1,548 @@
+import json
+import logging
+import errno
+import re
+from teuthology.contextutil import MaxWhileTries
+from teuthology.exceptions import CommandFailedError
+from teuthology.orchestra.run import wait
+from tasks.cephfs.fuse_mount import FuseMount
+from tasks.cephfs.cephfs_test_case import CephFSTestCase, for_teuthology
+
+DAMAGED_ON_START = "damaged_on_start"
+DAMAGED_ON_LS = "damaged_on_ls"
+CRASHED = "server crashed"
+NO_DAMAGE = "no damage"
+FAILED_CLIENT = "client failed"
+FAILED_SERVER = "server failed"
+
+# An EIO in response to a stat from the client
+EIO_ON_LS = "eio"
+
+# An EIO, but nothing in damage table (not ever what we expect)
+EIO_NO_DAMAGE = "eio without damage entry"
+
+
+log = logging.getLogger(__name__)
+
+
+class TestDamage(CephFSTestCase):
+    def _simple_workload_write(self):
+        """
+        Through mount_a, create "subdir", write "subdir/sixmegs" with
+        write_n_mb(..., 6), and return the result of stat on that file.
+        """
+        target = "subdir/sixmegs"
+        self.mount_a.run_shell(["mkdir", "subdir"])
+        self.mount_a.write_n_mb(target, 6)
+        return self.mount_a.stat(target)
+
+    def is_marked_damaged(self, rank):
+        """
+        Whether `rank` appears under 'damaged' in the current MDS map.
+        """
+        damaged_ranks = self.fs.get_mds_map()['damaged']
+        return rank in damaged_ranks
+
+    @for_teuthology #459s
+    def test_object_deletion(self):
+        """
+        That the MDS has a clean 'damaged' response to loss of any single metadata object
+        """
+
+        self._simple_workload_write()
+
+        # Hmm, actually it would be nice to permute whether the metadata pool
+        # state contains sessions or not, but for the moment close this session
+        # to avoid waiting through reconnect on every MDS start.
+        self.mount_a.umount_wait()
+        for mds_name in self.fs.get_active_names():
+            self.fs.mds_asok(["flush", "journal"], mds_name)
+
+        self.fs.mds_stop()
+        self.fs.mds_fail()
+
+        self.fs.rados(['export', '/tmp/metadata.bin'])
+
+        def is_ignored(obj_id, dentry=None):
+            """
+            A filter to avoid redundantly mutating many similar objects (e.g.
+            stray dirfrags) or similar dentries (e.g. stray dir dentries)
+
+            :param obj_id: name of the object being considered
+            :param dentry: optional omap key within that object
+            :return: True if the object (or the dentry within it) should be
+                     skipped, False otherwise
+            """
+            # Raw strings keep the regex backslashes literal without relying
+            # on Python passing unknown escape sequences through.
+            # Of the objects named 60?.00000000, only 600.00000000 is kept.
+            if re.match(r"60.\.00000000", obj_id) and obj_id != "600.00000000":
+                return True
+
+            # Of the stray*_head keys in 100.00000000, only stray0_head is kept
+            if dentry and obj_id == "100.00000000":
+                if re.match(r"stray.+_head", dentry) and dentry != "stray0_head":
+                    return True
+
+            return False
+
+        def get_path(obj_id, dentry=None):
+            """
+            Map an (object, dentry) pair to the filesystem path that should
+            be poked to see EIO after damaging it.
+
+            :return: the path, or None when the pair is not one of the known
+                     ones (None means ls will do an "ls -R" in hope of
+                     seeing some errors)
+            """
+            known_paths = {
+                ("1.00000000", "subdir_head"): "./subdir",
+                ("10000000000.00000000", "sixmegs_head"): "./subdir/sixmegs",
+            }
+            return known_paths.get((obj_id, dentry))
+
+        objects = self.fs.rados(["ls"]).split("\n")
+        objects = [o for o in objects if not is_ignored(o)]
+
+        # Find all objects with an OMAP header
+        omap_header_objs = []
+        for o in objects:
+            header = self.fs.rados(["getomapheader", o])
+            # The rados CLI wraps the header output in a hex-printed style
+            header_bytes = int(re.match("header \((.+) bytes\)", header).group(1))
+            if header_bytes > 0:
+                omap_header_objs.append(o)
+
+        # Find all OMAP key/vals
+        omap_keys = []
+        for o in objects:
+            keys_str = self.fs.rados(["listomapkeys", o])
+            if keys_str:
+                for key in keys_str.split("\n"):
+                    if not is_ignored(o, key):
+                        omap_keys.append((o, key))
+
+        # Find objects that have data in their bodies
+        data_objects = []
+        for obj_id in objects:
+            stat_out = self.fs.rados(["stat", obj_id])
+            size = int(re.match(".+, size (.+)$", stat_out).group(1))
+            if size > 0:
+                data_objects.append(obj_id)
+
+        # Define the various forms of damage we will inflict
+        class MetadataMutation(object):
+            """
+            One form of damage to inflict: the object it targets, a
+            description, the callable that applies it, the expected outcome,
+            and the path to poke afterwards.  Instances compare and hash by
+            description alone.
+            """
+            def __init__(self, obj_id_, desc_, mutate_fn_, expectation_, ls_path=None):
+                self.obj_id = obj_id_
+                self.desc = desc_
+                self.mutate_fn = mutate_fn_
+                self.expectation = expectation_
+                # Fall back to the current directory when no path was given
+                self.ls_path = "." if ls_path is None else ls_path
+
+            def __hash__(self):
+                return hash(self.desc)
+
+            def __eq__(self, other):
+                return self.desc == other.desc
+
+        junk = "deadbeef" * 10
+        mutations = []
+
+        # Removals
+        for obj_id in objects:
+            if obj_id in [
+                # JournalPointers are auto-replaced if missing (same path as upgrade)
+                "400.00000000",
+                # Missing dirfrags for non-system dirs result in empty directory
+                "10000000000.00000000",
+                # PurgeQueue is auto-created if not found on startup
+                "500.00000000"
+            ]:
+                expectation = NO_DAMAGE
+            else:
+                expectation = DAMAGED_ON_START
+
+            log.info("Expectation on rm '{0}' will be '{1}'".format(
+                obj_id, expectation
+            ))
+
+            mutations.append(MetadataMutation(
+                obj_id,
+                "Delete {0}".format(obj_id),
+                lambda o=obj_id: self.fs.rados(["rm", o]),
+                expectation
+            ))
+
+        # Blatant corruptions
+        mutations.extend([
+            MetadataMutation(
+                o,
+                "Corrupt {0}".format(o),
+                lambda o=o: self.fs.rados(["put", o, "-"], stdin_data=junk),
+                DAMAGED_ON_START
+            ) for o in data_objects
+        ])
+
+        # Truncations
+        for obj_id in data_objects:
+            if obj_id == "500.00000000":
+                # The PurgeQueue is allowed to be empty: Journaler interprets
+                # an empty header object as an empty journal.
+                expectation = NO_DAMAGE
+            else:
+                expectation = DAMAGED_ON_START
+
+            # Build the mutation from this iteration's obj_id (bound as the
+            # lambda's default argument) and from the expectation computed
+            # above.  Referring to a leftover variable from an earlier loop
+            # would make every truncation target the same object, and the
+            # identical descriptions would collapse into one results entry.
+            mutations.append(
+                MetadataMutation(
+                    obj_id,
+                    "Truncate {0}".format(obj_id),
+                    lambda o=obj_id: self.fs.rados(["truncate", o, "0"]),
+                    expectation
+                )
+            )
+
+        # OMAP value corruptions
+        for o, k in omap_keys:
+            if o.startswith("100."):
+                # Anything in rank 0's 'mydir'
+                expectation = DAMAGED_ON_START
+            else:
+                expectation = EIO_ON_LS
+
+            mutations.append(
+                MetadataMutation(
+                    o,
+                    "Corrupt omap key {0}:{1}".format(o, k),
+                    lambda o=o,k=k: self.fs.rados(["setomapval", o, k, junk]),
+                    expectation,
+                    get_path(o, k)
+                )
+            )
+
+        # OMAP header corruptions
+        for obj_id in omap_header_objs:
+            if re.match("60.\.00000000", obj_id) \
+                    or obj_id in ["1.00000000", "100.00000000", "mds0_sessionmap"]:
+                expectation = DAMAGED_ON_START
+            else:
+                expectation = NO_DAMAGE
+
+            log.info("Expectation on corrupt header '{0}' will be '{1}'".format(
+                obj_id, expectation
+            ))
+
+            mutations.append(
+                MetadataMutation(
+                    obj_id,
+                    "Corrupt omap header on {0}".format(obj_id),
+                    lambda o=obj_id: self.fs.rados(["setomapheader", o, junk]),
+                    expectation
+                )
+            )
+
+        results = {}
+
+        for mutation in mutations:
+            log.info("Applying mutation '{0}'".format(mutation.desc))
+
+            # Reset MDS state
+            self.mount_a.umount_wait(force=True)
+            self.fs.mds_stop()
+            self.fs.mds_fail()
+            self.fs.mon_manager.raw_cluster_cmd('mds', 'repaired', '0')
+
+            # Reset RADOS pool state
+            self.fs.rados(['import', '/tmp/metadata.bin'])
+
+            # Inject the mutation
+            mutation.mutate_fn()
+
+            # Try starting the MDS
+            self.fs.mds_restart()
+
+            # How long we'll wait between starting a daemon and expecting
+            # it to make it through startup, and potentially declare itself
+            # damaged to the mon cluster.
+            startup_timeout = 60
+
+            if mutation.expectation not in (EIO_ON_LS, DAMAGED_ON_LS, NO_DAMAGE):
+                if mutation.expectation == DAMAGED_ON_START:
+                    # The MDS may pass through active before making it to damaged
+                    try:
+                        self.wait_until_true(lambda: self.is_marked_damaged(0), startup_timeout)
+                    except RuntimeError:
+                        pass
+
+                # Wait for MDS to either come up or go into damaged state
+                try:
+                    self.wait_until_true(lambda: self.is_marked_damaged(0) or self.fs.are_daemons_healthy(), startup_timeout)
+                except RuntimeError:
+                    crashed = False
+                    # Didn't make it to healthy or damaged, did it crash?
+                    for daemon_id, daemon in self.fs.mds_daemons.items():
+                        if daemon.proc and daemon.proc.finished:
+                            crashed = True
+                            log.error("Daemon {0} crashed!".format(daemon_id))
+                            daemon.proc = None  # So that subsequent stop() doesn't raise error
+                    if not crashed:
+                        # Didn't go health, didn't go damaged, didn't crash, so what?
+                        raise
+                    else:
+                        log.info("Result: Mutation '{0}' led to crash".format(mutation.desc))
+                        results[mutation] = CRASHED
+                        continue
+                if self.is_marked_damaged(0):
+                    log.info("Result: Mutation '{0}' led to DAMAGED state".format(mutation.desc))
+                    results[mutation] = DAMAGED_ON_START
+                    continue
+                else:
+                    log.info("Mutation '{0}' did not prevent MDS startup, attempting ls...".format(mutation.desc))
+            else:
+                try:
+                    self.wait_until_true(self.fs.are_daemons_healthy, 60)
+                except RuntimeError:
+                    log.info("Result: Mutation '{0}' should have left us healthy, actually not.".format(mutation.desc))
+                    if self.is_marked_damaged(0):
+                        results[mutation] = DAMAGED_ON_START
+                    else:
+                        results[mutation] = FAILED_SERVER
+                    continue
+                log.info("Daemons came up after mutation '{0}', proceeding to ls".format(mutation.desc))
+
+            # MDS is up, should go damaged on ls or client mount
+            self.mount_a.mount()
+            self.mount_a.wait_until_mounted()
+            if mutation.ls_path == ".":
+                proc = self.mount_a.run_shell(["ls", "-R", mutation.ls_path], wait=False)
+            else:
+                proc = self.mount_a.stat(mutation.ls_path, wait=False)
+
+            if mutation.expectation == DAMAGED_ON_LS:
+                try:
+                    self.wait_until_true(lambda: self.is_marked_damaged(0), 60)
+                    log.info("Result: Mutation '{0}' led to DAMAGED state after ls".format(mutation.desc))
+                    results[mutation] = DAMAGED_ON_LS
+                except RuntimeError:
+                    if self.fs.are_daemons_healthy():
+                        log.error("Result: Failed to go damaged on mutation '{0}', actually went active".format(
+                            mutation.desc))
+                        results[mutation] = NO_DAMAGE
+                    else:
+                        log.error("Result: Failed to go damaged on mutation '{0}'".format(mutation.desc))
+                        results[mutation] = FAILED_SERVER
+
+            else:
+                try:
+                    wait([proc], 20)
+                    log.info("Result: Mutation '{0}' did not caused DAMAGED state".format(mutation.desc))
+                    results[mutation] = NO_DAMAGE
+                except MaxWhileTries:
+                    log.info("Result: Failed to complete client IO on mutation '{0}'".format(mutation.desc))
+                    results[mutation] = FAILED_CLIENT
+                except CommandFailedError as e:
+                    if e.exitstatus == errno.EIO:
+                        log.info("Result: EIO on client")
+                        results[mutation] = EIO_ON_LS
+                    else:
+                        log.info("Result: unexpected error {0} on client".format(e))
+                        results[mutation] = FAILED_CLIENT
+
+            if mutation.expectation == EIO_ON_LS:
+                # EIOs mean something handled by DamageTable: assert that it has
+                # been populated
+                damage = json.loads(
+                    self.fs.mon_manager.raw_cluster_cmd(
+                        'tell', 'mds.{0}'.format(self.fs.get_active_names()[0]), "damage", "ls", '--format=json-pretty'))
+                if len(damage) == 0:
+                    results[mutation] = EIO_NO_DAMAGE
+
+        failures = [(mutation, result) for (mutation, result) in results.items() if mutation.expectation != result]
+        if failures:
+            log.error("{0} mutations had unexpected outcomes:".format(len(failures)))
+            for mutation, result in failures:
+                log.error("  Expected '{0}' actually '{1}' from '{2}'".format(
+                    mutation.expectation, result, mutation.desc
+                ))
+            raise RuntimeError("{0} mutations had unexpected outcomes".format(len(failures)))
+        else:
+            log.info("All {0} mutations had expected outcomes".format(len(mutations)))
+
+    def test_damaged_dentry(self):
+        # Corrupt the omap value of one dentry in a directory, then check how the
+        # damage surfaces to a client, that it is recorded in the MDS damage
+        # table, what a repairing scrub reports, and that the damage table entry
+        # can be removed again.
+        #
+        # Damage to dentries is interesting because it leaves the
+        # directory's `complete` flag in a subtle state where
+        # we have marked the dir complete in order that folks
+        # can access it, but in actual fact there is a dentry
+        # missing
+        self.mount_a.run_shell(["mkdir", "subdir/"])
+
+        self.mount_a.run_shell(["touch", "subdir/file_undamaged"])
+        self.mount_a.run_shell(["touch", "subdir/file_to_be_damaged"])
+
+        subdir_ino = self.mount_a.path_to_ino("subdir")
+
+        # Unmount the client and flush the journal of every active MDS, then
+        # take the MDS daemons down before editing the omap behind their back
+        self.mount_a.umount_wait()
+        for mds_name in self.fs.get_active_names():
+            self.fs.mds_asok(["flush", "journal"], mds_name)
+
+        self.fs.mds_stop()
+        self.fs.mds_fail()
+
+        # Corrupt a dentry: overwrite its omap value in the directory's
+        # `<ino hex>.00000000` object with junk
+        junk = "deadbeef" * 10
+        dirfrag_obj = "{0:x}.00000000".format(subdir_ino)
+        self.fs.rados(["setomapval", dirfrag_obj, "file_to_be_damaged_head", junk])
+
+        # Start up and try to list it
+        self.fs.mds_restart()
+        self.fs.wait_for_daemons()
+
+        self.mount_a.mount()
+        self.mount_a.wait_until_mounted()
+        dentries = self.mount_a.ls("subdir/")
+
+        # The damaged guy should have disappeared
+        self.assertEqual(dentries, ["file_undamaged"])
+
+        # I should get ENOENT if I try and read it normally, because
+        # the dir is considered complete
+        try:
+            self.mount_a.stat("subdir/file_to_be_damaged", wait=True)
+        except CommandFailedError as e:
+            self.assertEqual(e.exitstatus, errno.ENOENT)
+        else:
+            raise AssertionError("Expected ENOENT")
+
+        # The fact that there is damage should have been recorded
+        damage = json.loads(
+            self.fs.mon_manager.raw_cluster_cmd(
+                'tell', 'mds.{0}'.format(self.fs.get_active_names()[0]),
+                "damage", "ls", '--format=json-pretty'))
+        self.assertEqual(len(damage), 1)
+        # Remember the entry's id so that it can be removed at the end
+        damage_id = damage[0]['id']
+
+        # If I try to create a dentry with the same name as the damaged guy
+        # then that should be forbidden
+        try:
+            self.mount_a.touch("subdir/file_to_be_damaged")
+        except CommandFailedError as e:
+            self.assertEqual(e.exitstatus, errno.EIO)
+        else:
+            raise AssertionError("Expected EIO")
+
+        # Attempting that touch will clear the client's complete flag, now
+        # when I stat it I'll get EIO instead of ENOENT
+        try:
+            self.mount_a.stat("subdir/file_to_be_damaged", wait=True)
+        except CommandFailedError as e:
+            if isinstance(self.mount_a, FuseMount):
+                self.assertEqual(e.exitstatus, errno.EIO)
+            else:
+                # Kernel client handles this case differently
+                self.assertEqual(e.exitstatus, errno.ENOENT)
+        else:
+            raise AssertionError("Expected EIO")
+
+        # Before repair the directory still reports a file count of two,
+        # although only one dentry is listable
+        nfiles = self.mount_a.getfattr("./subdir", "ceph.dir.files")
+        self.assertEqual(nfiles, "2")
+
+        self.mount_a.umount_wait()
+
+        # Now repair the stats
+        scrub_json = self.fs.mds_asok(["scrub_path", "/subdir", "repair"])
+        log.info(json.dumps(scrub_json, indent=2))
+
+        # The scrub is expected to report that validation failed and that the
+        # raw stats were checked and found wrong
+        self.assertEqual(scrub_json["passed_validation"], False)
+        self.assertEqual(scrub_json["raw_stats"]["checked"], True)
+        self.assertEqual(scrub_json["raw_stats"]["passed"], False)
+
+        # Check that the file count is now correct
+        self.mount_a.mount()
+        self.mount_a.wait_until_mounted()
+        nfiles = self.mount_a.getfattr("./subdir", "ceph.dir.files")
+        self.assertEqual(nfiles, "1")
+
+        # Clean up the omap object
+        # NOTE(review): this writes the same junk value to the key again rather
+        # than removing the key -- confirm that this is the intended clean-up
+        self.fs.rados(["setomapval", dirfrag_obj, "file_to_be_damaged_head", junk])
+
+        # Clean up the damagetable entry
+        self.fs.mon_manager.raw_cluster_cmd(
+            'tell', 'mds.{0}'.format(self.fs.get_active_names()[0]),
+            "damage", "rm", "{did}".format(did=damage_id))
+
+        # Now I should be able to create a file with the same name as the
+        # damaged guy if I want.
+        self.mount_a.touch("subdir/file_to_be_damaged")
+
+    def test_open_ino_errors(self):
+        """
+        That errors encountered during opening inos are properly propagated
+        """
+
+        # Two regular files in separate directories, plus a third directory
+        # holding one hard link to each of them
+        self.mount_a.run_shell(["mkdir", "dir1"])
+        self.mount_a.run_shell(["touch", "dir1/file1"])
+        self.mount_a.run_shell(["mkdir", "dir2"])
+        self.mount_a.run_shell(["touch", "dir2/file2"])
+        self.mount_a.run_shell(["mkdir", "testdir"])
+        self.mount_a.run_shell(["ln", "dir1/file1", "testdir/hardlink1"])
+        self.mount_a.run_shell(["ln", "dir2/file2", "testdir/hardlink2"])
+
+        file1_ino = self.mount_a.path_to_ino("dir1/file1")
+        file2_ino = self.mount_a.path_to_ino("dir2/file2")
+        dir2_ino = self.mount_a.path_to_ino("dir2")
+
+        # Ensure everything is written to backing store
+        self.mount_a.umount_wait()
+        self.fs.mds_asok(["flush", "journal"])
+
+        # Drop everything from the MDS cache
+        self.mds_cluster.mds_stop()
+        self.fs.journal_tool(['journal', 'reset'])
+        self.mds_cluster.mds_fail_restart()
+        self.fs.wait_for_daemons()
+
+        self.mount_a.mount()
+
+        # Case 1: un-decodeable backtrace
+
+        # Validate that the backtrace is present and decodable
+        self.fs.read_backtrace(file1_ino)
+        # Go corrupt the backtrace of dir1/file1 (used for resolving
+        # testdir/hardlink1).
+        self.fs._write_data_xattr(file1_ino, "parent", "rhubarb")
+
+        # Check that touching the hardlink gives EIO
+        # NOTE(review): if the stat succeeds nothing is asserted here; only the
+        # damage table check below would then notice the missing error
+        ran = self.mount_a.run_shell(["stat", "testdir/hardlink1"], wait=False)
+        try:
+            ran.wait()
+        except CommandFailedError:
+            self.assertTrue("Input/output error" in ran.stderr.getvalue())
+
+        # Check that an entry is created in the damage table
+        damage = json.loads(
+            self.fs.mon_manager.raw_cluster_cmd(
+                'tell', 'mds.{0}'.format(self.fs.get_active_names()[0]),
+                "damage", "ls", '--format=json-pretty'))
+        self.assertEqual(len(damage), 1)
+        self.assertEqual(damage[0]['damage_type'], "backtrace")
+        self.assertEqual(damage[0]['ino'], file1_ino)
+
+        # Remove the entry again so that case 2 starts from an empty damage table
+        self.fs.mon_manager.raw_cluster_cmd(
+            'tell', 'mds.{0}'.format(self.fs.get_active_names()[0]),
+            "damage", "rm", str(damage[0]['id']))
+
+
+        # Case 2: missing dirfrag for the target inode
+
+        # Delete the `<ino hex>.00000000` object of dir2 from the metadata pool
+        self.fs.rados(["rm", "{0:x}.00000000".format(dir2_ino)])
+
+        # Check that touching the hardlink gives EIO
+        # NOTE(review): as in case 1, a successful stat is not treated as a
+        # failure at this point
+        ran = self.mount_a.run_shell(["stat", "testdir/hardlink2"], wait=False)
+        try:
+            ran.wait()
+        except CommandFailedError:
+            self.assertTrue("Input/output error" in ran.stderr.getvalue())
+
+        # Check that an entry is created in the damage table
+        damage = json.loads(
+            self.fs.mon_manager.raw_cluster_cmd(
+                'tell', 'mds.{0}'.format(self.fs.get_active_names()[0]),
+                "damage", "ls", '--format=json-pretty'))
+        # Two entries are expected, one "backtrace" (for file2) and one
+        # "dir_frag" (for dir2); they are accepted in either order
+        self.assertEqual(len(damage), 2)
+        if damage[0]['damage_type'] == "backtrace" :
+            self.assertEqual(damage[0]['ino'], file2_ino)
+            self.assertEqual(damage[1]['damage_type'], "dir_frag")
+            self.assertEqual(damage[1]['ino'], dir2_ino)
+        else:
+            self.assertEqual(damage[0]['damage_type'], "dir_frag")
+            self.assertEqual(damage[0]['ino'], dir2_ino)
+            self.assertEqual(damage[1]['damage_type'], "backtrace")
+            self.assertEqual(damage[1]['ino'], file2_ino)
+
+        # Leave the damage table empty behind us
+        for entry in damage:
+            self.fs.mon_manager.raw_cluster_cmd(
+                'tell', 'mds.{0}'.format(self.fs.get_active_names()[0]),
+                "damage", "rm", str(entry['id']))
diff --git a/src/ceph/qa/tasks/cephfs/test_data_scan.py b/src/ceph/qa/tasks/cephfs/test_data_scan.py
new file mode 100644
index 0000000..a2d3157
--- /dev/null
+++ b/src/ceph/qa/tasks/cephfs/test_data_scan.py
@@ -0,0 +1,600 @@
+
+"""
+Test our tools for recovering metadata from the data pool
+"""
+import json
+
+import logging
+import os
+from textwrap import dedent
+import traceback
+from collections import namedtuple, defaultdict
+
+from teuthology.orchestra.run import CommandFailedError
+from tasks.cephfs.cephfs_test_case import CephFSTestCase, for_teuthology
+
+log = logging.getLogger(__name__)
+
+
+ValidationError = namedtuple("ValidationError", ["exception", "backtrace"])
+
+
+class Workload(object):
+    def __init__(self, filesystem, mount):
+        self._mount = mount
+        self._filesystem = filesystem
+        self._initial_state = None
+
+        # Accumulate backtraces for every failed validation, and return them.  Backtraces
+        # are rather verbose, but we only see them when something breaks, and they
+        # let us see which check failed without having to decorate each check with
+        # a string
+        self._errors = []
+
+    def assert_equal(self, a, b):
+        try:
+            if a != b:
+                raise AssertionError("{0} != {1}".format(a, b))
+        except AssertionError as e:
+            self._errors.append(
+                ValidationError(e, traceback.format_exc(3))
+            )
+
+    def write(self):
+        """
+        Write the workload files to the mount
+        """
+        raise NotImplementedError()
+
+    def validate(self):
+        """
+        Read from the mount and validate that the workload files are present (i.e. have
+        survived or been reconstructed from the test scenario)
+        """
+        raise NotImplementedError()
+
+    def damage(self):
+        """
+        Damage the filesystem pools in ways that will be interesting to recover from.  By
+        default just wipe everything in the metadata pool
+        """
+        # Delete every object in the metadata pool
+        objects = self._filesystem.rados(["ls"]).split("\n")
+        for o in objects:
+            self._filesystem.rados(["rm", o])
+
+    def flush(self):
+        """
+        Called after client unmount, after write: flush whatever you want
+        """
+        self._filesystem.mds_asok(["flush", "journal"])
+
+
+class SimpleWorkload(Workload):
+    """
+    Single file, single directory, check that it gets recovered and so does its size
+    """
+    def write(self):
+        self._mount.run_shell(["mkdir", "subdir"])
+        self._mount.write_n_mb("subdir/sixmegs", 6)
+        self._initial_state = self._mount.stat("subdir/sixmegs")
+
+    def validate(self):
+        self._mount.run_shell(["ls", "subdir"])
+        st = self._mount.stat("subdir/sixmegs")
+        self.assert_equal(st['st_size'], self._initial_state['st_size'])
+        return self._errors
+
+
+class MovedFile(Workload):
+    def write(self):
+        # Create a file whose backtrace disagrees with his eventual position
+        # in the metadata.  We will see that he gets reconstructed in his
+        # original position according to his backtrace.
+        self._mount.run_shell(["mkdir", "subdir_alpha"])
+        self._mount.run_shell(["mkdir", "subdir_bravo"])
+        self._mount.write_n_mb("subdir_alpha/sixmegs", 6)
+        self._filesystem.mds_asok(["flush", "journal"])
+        self._mount.run_shell(["mv", "subdir_alpha/sixmegs", "subdir_bravo/sixmegs"])
+        self._initial_state = self._mount.stat("subdir_bravo/sixmegs")
+
+    def flush(self):
+        pass
+
+    def validate(self):
+        self.assert_equal(self._mount.ls(), ["subdir_alpha"])
+        st = self._mount.stat("subdir_alpha/sixmegs")
+        self.assert_equal(st['st_size'], self._initial_state['st_size'])
+        return self._errors
+
+
+class BacktracelessFile(Workload):
+    def write(self):
+        self._mount.run_shell(["mkdir", "subdir"])
+        self._mount.write_n_mb("subdir/sixmegs", 6)
+        self._initial_state = self._mount.stat("subdir/sixmegs")
+
+    def flush(self):
+        # Never flush metadata, so backtrace won't be written
+        pass
+
+    def validate(self):
+        ino_name = "%x" % self._initial_state["st_ino"]
+
+        # The inode should be linked into lost+found because we had no path for it
+        self.assert_equal(self._mount.ls(), ["lost+found"])
+        self.assert_equal(self._mount.ls("lost+found"), [ino_name])
+        st = self._mount.stat("lost+found/{ino_name}".format(ino_name=ino_name))
+
+        # We might not have got the name or path, but we should still get the size
+        self.assert_equal(st['st_size'], self._initial_state['st_size'])
+
+        return self._errors
+
+
+class StripedStashedLayout(Workload):
+    def __init__(self, fs, m):
+        super(StripedStashedLayout, self).__init__(fs, m)
+
+        # Nice small stripes so we can quickly do our writes+validates
+        self.sc = 4
+        self.ss = 65536
+        self.os = 262144
+
+        self.interesting_sizes = [
+            # Exactly stripe_count objects will exist
+            self.os * self.sc,
+            # Fewer than stripe_count objects will exist
+            self.os * self.sc / 2,
+            self.os * (self.sc - 1) + self.os / 2,
+            self.os * (self.sc - 1) + self.os / 2 - 1,
+            self.os * (self.sc + 1) + self.os / 2,
+            self.os * (self.sc + 1) + self.os / 2 + 1,
+            # More than stripe_count objects will exist
+            self.os * self.sc + self.os * self.sc / 2
+        ]
+
+    def write(self):
+        # Create a dir with a striped layout set on it
+        self._mount.run_shell(["mkdir", "stripey"])
+
+        self._mount.setfattr("./stripey", "ceph.dir.layout",
+             "stripe_unit={ss} stripe_count={sc} object_size={os} pool={pool}".format(
+                 ss=self.ss, os=self.os, sc=self.sc,
+                 pool=self._filesystem.get_data_pool_name()
+             ))
+
+        # Write files, then flush metadata so that its layout gets written into an xattr
+        for i, n_bytes in enumerate(self.interesting_sizes):
+            self._mount.write_test_pattern("stripey/flushed_file_{0}".format(i), n_bytes)
+            # This is really just validating the validator
+            self._mount.validate_test_pattern("stripey/flushed_file_{0}".format(i), n_bytes)
+        self._filesystem.mds_asok(["flush", "journal"])
+
+        # Write another file in the same way, but this time don't flush the metadata,
+        # so that it won't have the layout xattr
+        self._mount.write_test_pattern("stripey/unflushed_file", 1024 * 512)
+        self._mount.validate_test_pattern("stripey/unflushed_file", 1024 * 512)
+
+        self._initial_state = {
+            "unflushed_ino": self._mount.path_to_ino("stripey/unflushed_file")
+        }
+
+    def flush(self):
+        # Pass because we already selectively flushed during write
+        pass
+
+    def validate(self):
+        # The first files should have been recovered into its original location
+        # with the correct layout: read back correct data
+        for i, n_bytes in enumerate(self.interesting_sizes):
+            try:
+                self._mount.validate_test_pattern("stripey/flushed_file_{0}".format(i), n_bytes)
+            except CommandFailedError as e:
+                self._errors.append(
+                    ValidationError("File {0} (size {1}): {2}".format(i, n_bytes, e), traceback.format_exc(3))
+                )
+
+        # The unflushed file should have been recovered into lost+found without
+        # the correct layout: read back junk
+        ino_name = "%x" % self._initial_state["unflushed_ino"]
+        self.assert_equal(self._mount.ls("lost+found"), [ino_name])
+        try:
+            self._mount.validate_test_pattern(os.path.join("lost+found", ino_name), 1024 * 512)
+        except CommandFailedError:
+            pass
+        else:
+            self._errors.append(
+                ValidationError("Unexpectedly valid data in unflushed striped file", "")
+            )
+
+        return self._errors
+
+
+class ManyFilesWorkload(Workload):
+    def __init__(self, filesystem, mount, file_count):
+        super(ManyFilesWorkload, self).__init__(filesystem, mount)
+        self.file_count = file_count
+
+    def write(self):
+        self._mount.run_shell(["mkdir", "subdir"])
+        for n in range(0, self.file_count):
+            self._mount.write_test_pattern("subdir/{0}".format(n), 6 * 1024 * 1024)
+
+    def validate(self):
+        for n in range(0, self.file_count):
+            try:
+                self._mount.validate_test_pattern("subdir/{0}".format(n), 6 * 1024 * 1024)
+            except CommandFailedError as e:
+                self._errors.append(
+                    ValidationError("File {0}: {1}".format(n, e), traceback.format_exc(3))
+                )
+
+        return self._errors
+
+
+class MovedDir(Workload):
+    def write(self):
+        # Create a nested dir that we will then move.  Two files with two different
+        # backtraces referring to the moved dir, claiming two different locations for
+        # it.  We will see that only one backtrace wins and the dir ends up with
+        # single linkage.
+        self._mount.run_shell(["mkdir", "-p", "grandmother/parent"])
+        self._mount.write_n_mb("grandmother/parent/orig_pos_file", 1)
+        self._filesystem.mds_asok(["flush", "journal"])
+        self._mount.run_shell(["mkdir", "grandfather"])
+        self._mount.run_shell(["mv", "grandmother/parent", "grandfather"])
+        self._mount.write_n_mb("grandfather/parent/new_pos_file", 2)
+        self._filesystem.mds_asok(["flush", "journal"])
+
+        self._initial_state = (
+            self._mount.stat("grandfather/parent/orig_pos_file"),
+            self._mount.stat("grandfather/parent/new_pos_file")
+        )
+
+    def validate(self):
+        root_files = self._mount.ls()
+        self.assert_equal(len(root_files), 1)
+        self.assert_equal(root_files[0] in ["grandfather", "grandmother"], True)
+        winner = root_files[0]
+        st_opf = self._mount.stat("{0}/parent/orig_pos_file".format(winner))
+        st_npf = self._mount.stat("{0}/parent/new_pos_file".format(winner))
+
+        self.assert_equal(st_opf['st_size'], self._initial_state[0]['st_size'])
+        self.assert_equal(st_npf['st_size'], self._initial_state[1]['st_size'])
+
+
+class MissingZerothObject(Workload):
+    def write(self):
+        self._mount.run_shell(["mkdir", "subdir"])
+        self._mount.write_n_mb("subdir/sixmegs", 6)
+        self._initial_state = self._mount.stat("subdir/sixmegs")
+
+    def damage(self):
+        super(MissingZerothObject, self).damage()
+        zeroth_id = "{0:x}.00000000".format(self._initial_state['st_ino'])
+        self._filesystem.rados(["rm", zeroth_id], pool=self._filesystem.get_data_pool_name())
+
+    def validate(self):
+        st = self._mount.stat("lost+found/{0:x}".format(self._initial_state['st_ino']))
+        self.assert_equal(st['st_size'], self._initial_state['st_size'])
+
+
+class NonDefaultLayout(Workload):
+    """
+    Check that the reconstruction copes with files that have a different
+    object size in their layout
+    """
+    def write(self):
+        self._mount.run_shell(["touch", "datafile"])
+        self._mount.setfattr("./datafile", "ceph.file.layout.object_size", "8388608")
+        self._mount.run_shell(["dd", "if=/dev/urandom", "of=./datafile", "bs=1M", "count=32"])
+        self._initial_state = self._mount.stat("datafile")
+
+    def validate(self):
+        # Check we got the layout reconstructed properly
+        object_size = int(self._mount.getfattr(
+            "./datafile", "ceph.file.layout.object_size"))
+        self.assert_equal(object_size, 8388608)
+
+        # Check we got the file size reconstructed properly
+        st = self._mount.stat("datafile")
+        self.assert_equal(st['st_size'], self._initial_state['st_size'])
+
+
+class TestDataScan(CephFSTestCase):
+    MDSS_REQUIRED = 2
+
+    def is_marked_damaged(self, rank):
+        mds_map = self.fs.get_mds_map()
+        return rank in mds_map['damaged']
+
+    def _rebuild_metadata(self, workload, workers=1):
+        """
+        That when all objects in metadata pool are removed, we can rebuild a metadata pool
+        based on the contents of a data pool, and a client can see and read our files.
+        """
+        # `workload` supplies the write/flush/damage/validate steps; `workers` is
+        # passed as worker_count to the scan_extents and scan_inodes runs.
+        # Raises AssertionError if workload.validate() returns any errors.
+
+        # First, inject some files
+
+        workload.write()
+
+        # Unmount the client and flush the journal: the tool should also cope with
+        # situations where there is dirty metadata, but we'll test that separately
+        self.mount_a.umount_wait()
+        workload.flush()
+
+        # Stop the MDS
+        self.fs.mds_stop()
+        self.fs.mds_fail()
+
+        # After recovery, we need the MDS to not be strict about stats (in production these options
+        # are off by default, but in QA we need to explicitly disable them)
+        self.fs.set_ceph_conf('mds', 'mds verify scatter', False)
+        self.fs.set_ceph_conf('mds', 'mds debug scatterstat', False)
+
+        # Apply any data damage the workload wants
+        workload.damage()
+
+        # Reset the MDS map in case multiple ranks were in play: recovery procedure
+        # only understands how to rebuild metadata under rank 0
+        self.fs.mon_manager.raw_cluster_cmd('fs', 'reset', self.fs.name,
+                '--yes-i-really-mean-it')
+
+        self.fs.mds_restart()
+
+        # State of one MDS daemon as reported by the cluster, or None when
+        # there is no info for it
+        def get_state(mds_id):
+            info = self.mds_cluster.get_mds_info(mds_id)
+            return info['state'] if info is not None else None
+
+        # Wait (up to 60s each) for rank 0 to be marked damaged and for every
+        # MDS daemon to be in up:standby before running the offline tools
+        self.wait_until_true(lambda: self.is_marked_damaged(0), 60)
+        for mds_id in self.fs.mds_ids:
+            self.wait_until_equal(
+                    lambda: get_state(mds_id),
+                    "up:standby",
+                    timeout=60)
+
+        # Reset the session, snap and inode tables of rank 0
+        self.fs.table_tool([self.fs.name + ":0", "reset", "session"])
+        self.fs.table_tool([self.fs.name + ":0", "reset", "snap"])
+        self.fs.table_tool([self.fs.name + ":0", "reset", "inode"])
+
+        # Run the recovery procedure
+        # (the check guarded by `if False` below is disabled and never runs)
+        if False:
+            with self.assertRaises(CommandFailedError):
+                # Normal reset should fail when no objects are present, we'll use --force instead
+                self.fs.journal_tool(["journal", "reset"])
+
+        self.fs.journal_tool(["journal", "reset", "--force"])
+        self.fs.data_scan(["init"])
+        self.fs.data_scan(["scan_extents", self.fs.get_data_pool_name()], worker_count=workers)
+        self.fs.data_scan(["scan_inodes", self.fs.get_data_pool_name()], worker_count=workers)
+
+        # Mark the MDS repaired
+        self.fs.mon_manager.raw_cluster_cmd('mds', 'repaired', '0')
+
+        # Start the MDS
+        self.fs.mds_restart()
+        self.fs.wait_for_daemons()
+        log.info(str(self.mds_cluster.status()))
+
+        # Mount a client
+        self.mount_a.mount()
+        self.mount_a.wait_until_mounted()
+
+        # See that the files are present and correct
+        # (log every collected error, then fail citing the first one)
+        errors = workload.validate()
+        if errors:
+            log.error("Validation errors found: {0}".format(len(errors)))
+            for e in errors:
+                log.error(e.exception)
+                log.error(e.backtrace)
+            raise AssertionError("Validation failed, first error: {0}\n{1}".format(
+                errors[0].exception, errors[0].backtrace
+            ))
+
+    def test_rebuild_simple(self):
+        self._rebuild_metadata(SimpleWorkload(self.fs, self.mount_a))
+
+    def test_rebuild_moved_file(self):
+        self._rebuild_metadata(MovedFile(self.fs, self.mount_a))
+
+    def test_rebuild_backtraceless(self):
+        self._rebuild_metadata(BacktracelessFile(self.fs, self.mount_a))
+
+    def test_rebuild_moved_dir(self):
+        self._rebuild_metadata(MovedDir(self.fs, self.mount_a))
+
+    def test_rebuild_missing_zeroth(self):
+        self._rebuild_metadata(MissingZerothObject(self.fs, self.mount_a))
+
+    def test_rebuild_nondefault_layout(self):
+        self._rebuild_metadata(NonDefaultLayout(self.fs, self.mount_a))
+
+    def test_stashed_layout(self):
+        self._rebuild_metadata(StripedStashedLayout(self.fs, self.mount_a))
+
+    def _dirfrag_keys(self, object_id):
+        keys_str = self.fs.rados(["listomapkeys", object_id])
+        if keys_str:
+            return keys_str.split("\n")
+        else:
+            return []
+
+    def test_fragmented_injection(self):
+        """
+        That when injecting a dentry into a fragmented directory, we put it in the right fragment.
+        """
+
+        self.fs.set_allow_dirfrags(True)
+
+        file_count = 100
+        file_names = ["%s" % n for n in range(0, file_count)]
+
+        # Create a directory of `file_count` files, each named after its
+        # decimal number and containing the string of its decimal number
+        self.mount_a.run_python(dedent("""
+        import os
+        path = os.path.join("{path}", "subdir")
+        os.mkdir(path)
+        for n in range(0, {file_count}):
+            open(os.path.join(path, "%s" % n), 'w').write("%s" % n)
+        """.format(
+            path=self.mount_a.mountpoint,
+            file_count=file_count
+        )))
+
+        dir_ino = self.mount_a.path_to_ino("subdir")
+
+        # Only one MDS should be active!
+        self.assertEqual(len(self.fs.get_active_names()), 1)
+
+        # Ensure that one directory is fragmented
+        mds_id = self.fs.get_active_names()[0]
+        self.fs.mds_asok(["dirfrag", "split", "/subdir", "0/0", "1"], mds_id)
+
+        # Flush journal and stop MDS
+        self.mount_a.umount_wait()
+        self.fs.mds_asok(["flush", "journal"], mds_id)
+        self.fs.mds_stop()
+        self.fs.mds_fail()
+
+        # Pick a dentry and wipe out its key
+        # Because I did a 1 bit split, I know one frag will be named <inode>.01000000
+        frag_obj_id = "{0:x}.01000000".format(dir_ino)
+        keys = self._dirfrag_keys(frag_obj_id)
+        victim_key = keys[7]  # arbitrary choice
+        log.info("victim_key={0}".format(victim_key))
+        # The omap key is the dentry name followed by "_head"
+        victim_dentry = victim_key.split("_head")[0]
+        self.fs.rados(["rmomapkey", frag_obj_id, victim_key])
+
+        # Start filesystem back up, observe that the file appears to be gone in an `ls`
+        self.fs.mds_restart()
+        self.fs.wait_for_daemons()
+        self.mount_a.mount()
+        self.mount_a.wait_until_mounted()
+        files = self.mount_a.run_shell(["ls", "subdir/"]).stdout.getvalue().strip().split("\n")
+        # Every file except the victim must still be listed
+        self.assertListEqual(sorted(files), sorted(list(set(file_names) - set([victim_dentry]))))
+
+        # Stop the filesystem
+        self.mount_a.umount_wait()
+        self.fs.mds_stop()
+        self.fs.mds_fail()
+
+        # Run data-scan, observe that it inserts our dentry back into the correct fragment
+        # by checking the omap now has the dentry's key again
+        self.fs.data_scan(["scan_extents", self.fs.get_data_pool_name()])
+        self.fs.data_scan(["scan_inodes", self.fs.get_data_pool_name()])
+        self.assertIn(victim_key, self._dirfrag_keys(frag_obj_id))
+
+        # Start the filesystem and check that the dentry we deleted is now once again visible
+        # and points to the correct file data.
+        self.fs.mds_restart()
+        self.fs.wait_for_daemons()
+        self.mount_a.mount()
+        self.mount_a.wait_until_mounted()
+        # Each file holds its own name, so the content identifies the file
+        out = self.mount_a.run_shell(["cat", "subdir/{0}".format(victim_dentry)]).stdout.getvalue().strip()
+        self.assertEqual(out, victim_dentry)
+
+        # Finally, close the loop by checking our injected dentry survives a merge
+        mds_id = self.fs.get_active_names()[0]
+        self.mount_a.ls("subdir")  # Do an ls to ensure both frags are in cache so the merge will work
+        self.fs.mds_asok(["dirfrag", "merge", "/subdir", "0/0"], mds_id)
+        self.fs.mds_asok(["flush", "journal"], mds_id)
+        # After the merge, <inode>.00000000 is expected to hold the keys of all files
+        frag_obj_id = "{0:x}.00000000".format(dir_ino)
+        keys = self._dirfrag_keys(frag_obj_id)
+        self.assertListEqual(sorted(keys), sorted(["%s_head" % f for f in file_names]))
+
+    @for_teuthology
+    def test_parallel_execution(self):
+        self._rebuild_metadata(ManyFilesWorkload(self.fs, self.mount_a, 25), workers=7)
+
+    def test_pg_files(self):
+        """
+        That the pg files command tells us which files are associated with
+        a particular PG
+        """
+        file_count = 20
+        self.mount_a.run_shell(["mkdir", "mydir"])
+        self.mount_a.create_n_files("mydir/myfile", file_count)
+
+        # Some files elsewhere in the system that we will ignore
+        # to check that the tool is filtering properly
+        self.mount_a.run_shell(["mkdir", "otherdir"])
+        self.mount_a.create_n_files("otherdir/otherfile", file_count)
+
+        # Expected result: pgid -> paths of the files in mydir whose object
+        # number 0 maps to that PG
+        pgs_to_files = defaultdict(list)
+        # Rough (slow) reimplementation of the logic
+        for i in range(0, file_count):
+            file_path = "mydir/myfile_{0}".format(i)
+            ino = self.mount_a.path_to_ino(file_path)
+            # Object name is <ino hex>.<object number as 8 hex digits>
+            obj = "{0:x}.{1:08x}".format(ino, 0)
+            pgid = json.loads(self.fs.mon_manager.raw_cluster_cmd(
+                "osd", "map", self.fs.get_data_pool_name(), obj,
+                "--format=json-pretty"
+            ))['pgid']
+            pgs_to_files[pgid].append(file_path)
+            log.info("{0}: {1}".format(file_path, pgid))
+
+        # Ask the tool about every PG "<data pool id>.<n>" and compare its
+        # answer (one path per non-empty output line) with our own mapping
+        pg_count = self.fs.get_pgs_per_fs_pool()
+        for pg_n in range(0, pg_count):
+            pg_str = "{0}.{1}".format(self.fs.get_data_pool_id(), pg_n)
+            out = self.fs.data_scan(["pg_files", "mydir", pg_str])
+            lines = [l for l in out.split("\n") if l]
+            log.info("{0}: {1}".format(pg_str, lines))
+            self.assertSetEqual(set(lines), set(pgs_to_files[pg_str]))
+
+    def test_scan_links(self):
+        """
+        The scan_links command fixes linkage errors
+        """
+        self.mount_a.run_shell(["mkdir", "testdir1"])
+        self.mount_a.run_shell(["mkdir", "testdir2"])
+        dir1_ino = self.mount_a.path_to_ino("testdir1")
+        dir2_ino = self.mount_a.path_to_ino("testdir2")
+        # Names of the two directories' `<ino hex>.00000000` objects, whose
+        # omap keys are edited directly below
+        dirfrag1_oid = "{0:x}.00000000".format(dir1_ino)
+        dirfrag2_oid = "{0:x}.00000000".format(dir2_ino)
+
+        # One file with two extra hard links, one in each directory
+        self.mount_a.run_shell(["touch", "testdir1/file1"])
+        self.mount_a.run_shell(["ln", "testdir1/file1", "testdir1/link1"])
+        self.mount_a.run_shell(["ln", "testdir1/file1", "testdir2/link2"])
+
+        mds_id = self.fs.get_active_names()[0]
+        self.fs.mds_asok(["flush", "journal"], mds_id)
+
+        dirfrag1_keys = self._dirfrag_keys(dirfrag1_oid)
+
+        # introduce duplicated primary link
+        # (copy file1's omap value from testdir1's object into testdir2's)
+        file1_key = "file1_head"
+        self.assertIn(file1_key, dirfrag1_keys)
+        file1_omap_data = self.fs.rados(["getomapval", dirfrag1_oid, file1_key, '-'])
+        self.fs.rados(["setomapval", dirfrag2_oid, file1_key], stdin_data=file1_omap_data)
+        self.assertIn(file1_key, self._dirfrag_keys(dirfrag2_oid))
+
+        # remove a remote link, make inode link count incorrect
+        link1_key = 'link1_head'
+        self.assertIn(link1_key, dirfrag1_keys)
+        self.fs.rados(["rmomapkey", dirfrag1_oid, link1_key])
+
+        # increase good primary link's version
+        self.mount_a.run_shell(["touch", "testdir1/file1"])
+        self.mount_a.umount_wait()
+
+        # Flush and take the MDS daemons down before running the offline tool
+        self.fs.mds_asok(["flush", "journal"], mds_id)
+        self.fs.mds_stop()
+        self.fs.mds_fail()
+
+        # repair linkage errors
+        self.fs.data_scan(["scan_links"])
+
+        # primary link in testdir2 was deleted?
+        self.assertNotIn(file1_key, self._dirfrag_keys(dirfrag2_oid))
+
+        self.fs.mds_restart()
+        self.fs.wait_for_daemons()
+
+        self.mount_a.mount()
+        self.mount_a.wait_until_mounted()
+
+        # link count was adjusted?
+        # (file1 itself plus testdir2/link2 remain once link1 is gone)
+        file1_nlink = self.mount_a.path_to_nlink("testdir1/file1")
+        self.assertEqual(file1_nlink, 2)
diff --git a/src/ceph/qa/tasks/cephfs/test_dump_tree.py b/src/ceph/qa/tasks/cephfs/test_dump_tree.py
new file mode 100644
index 0000000..6d943f9
--- /dev/null
+++ b/src/ceph/qa/tasks/cephfs/test_dump_tree.py
@@ -0,0 +1,66 @@
+from tasks.cephfs.cephfs_test_case import CephFSTestCase
+import random
+import os
+
+class TestDumpTree(CephFSTestCase):
+    """Check the inodes reported by the MDS "dump tree" admin socket command."""
+    def get_paths_to_ino(self):
+        """Return a {path: inode number} dict for every path `find ./` lists on mount_a."""
+        p = self.mount_a.run_shell(["find", "./"])
+        # one path per output line; splitlines() keeps names containing spaces intact
+        paths = p.stdout.getvalue().strip().splitlines()
+        return {path: self.mount_a.path_to_ino(path, False) for path in paths}
+
+    def populate(self):
+        """Create a sizeable directory tree on mount_a by cloning a git repository."""
+        self.mount_a.run_shell(["git", "clone",
+                                "https://github.com/ceph/ceph-qa-suite"])
+
+    def _assert_tree_matches(self, tree, expected_inos):
+        """Assert that the "dump tree" result `tree` lists exactly `expected_inos`."""
+        remaining = list(expected_inos)
+        for inode in tree:
+            remaining.remove(inode['ino'])  # ValueError == unexpected inode; don't catch!
+        # unlike a bare assert this is not stripped by `python -O` and shows what is missing
+        self.assertEqual(remaining, [])
+
+    def test_basic(self):
+        """A depth-1 dump of a small hand-made tree returns exactly the expected inodes."""
+        self.mount_a.run_shell(["mkdir", "parent"])
+        self.mount_a.run_shell(["mkdir", "parent/child"])
+        self.mount_a.run_shell(["touch", "parent/child/file"])
+        self.mount_a.run_shell(["mkdir", "parent/child/grandchild"])
+        self.mount_a.run_shell(["touch", "parent/child/grandchild/file"])
+
+        inos = self.get_paths_to_ino()
+        tree = self.fs.mds_asok(["dump", "tree", "/parent/child", "1"])
+
+        # expected at depth 1: the directory and its direct children, not grandchild/file
+        self._assert_tree_matches(tree, [inos["./parent/child"], inos["./parent/child/file"],
+                                         inos["./parent/child/grandchild"]])
+
+    def test_random(self):
+        """Dump a randomly chosen subtree of a cloned repository, without and with a depth."""
+        random.seed(0)
+
+        self.populate()
+        inos = self.get_paths_to_ino()
+        target = random.choice(list(inos))  # dict views cannot be indexed on Python 3
+
+        if target != "./":
+            target = os.path.dirname(target)
+
+        # Match whole path components so that a sibling such as "./foobar" is not
+        # counted as part of the subtree rooted at "./foo".
+        prefix = target.rstrip('/') + '/'
+        subtree = [path for path in inos if path == target or path.startswith(prefix)]
+        tree = self.fs.mds_asok(["dump", "tree", target[1:]])
+        self._assert_tree_matches(tree, [inos[path] for path in subtree])
+
+        target_depth = target.count('/')
+        maxdepth = max([path.count('/') for path in subtree]) - target_depth
+        depth = random.randint(0, maxdepth)
+        target_inos = [inos[path] for path in subtree
+                       if path.count('/') <= depth + target_depth]
+        tree = self.fs.mds_asok(["dump", "tree", target[1:], str(depth)])
+        self._assert_tree_matches(tree, target_inos)
diff --git a/src/ceph/qa/tasks/cephfs/test_exports.py b/src/ceph/qa/tasks/cephfs/test_exports.py
new file mode 100644
index 0000000..913999d
--- /dev/null
+++ b/src/ceph/qa/tasks/cephfs/test_exports.py
@@ -0,0 +1,107 @@
+import logging
+import time
+from tasks.cephfs.fuse_mount import FuseMount
+from tasks.cephfs.cephfs_test_case import CephFSTestCase
+
+log = logging.getLogger(__name__)
+
+class TestExports(CephFSTestCase):
+    MDSS_REQUIRED = 2
+
+    def _wait_subtrees(self, status, rank, test):
+        """Poll for ~30s until `rank` reports exactly the (path, auth rank) subtrees in `test`."""
+        timeout, pause = 30, 2
+        test = sorted(test)
+        for _ in range(timeout // pause):
+            subtrees = self.fs.mds_asok(["get", "subtrees"], mds_id=status.get_rank(self.fs.id, rank)['name'])
+            subtrees = [s for s in subtrees if s['dir']['path'].startswith('/')]
+            filtered = sorted([(s['dir']['path'], s['auth_first']) for s in subtrees])
+            log.info("%s =?= %s", filtered, test)
+            if filtered == test:
+                # Confirm export_pin in output is correct:
+                for s in subtrees:
+                    self.assertEqual(s['export_pin'], s['auth_first'])
+                return subtrees
+            time.sleep(pause)
+        raise RuntimeError("rank {0} failed to reach desired subtree state".format(rank))
+
+    def test_export_pin(self):
+        """Exercise ceph.dir.pin: the subtree map must follow every pin change and a rename."""
+        self.fs.set_max_mds(2)
+        self.fs.wait_for_daemons()
+        status = self.fs.status()
+
+        self.mount_a.run_shell(["mkdir", "-p", "1/2/3"])
+        self._wait_subtrees(status, 0, [])
+
+        # NOP
+        self.mount_a.setfattr("1", "ceph.dir.pin", "-1")
+        self._wait_subtrees(status, 0, [])
+        # NOP (rank < -1)
+        self.mount_a.setfattr("1", "ceph.dir.pin", "-2341")
+        self._wait_subtrees(status, 0, [])
+
+        # pin /1 to rank 1
+        self.mount_a.setfattr("1", "ceph.dir.pin", "1")
+        self._wait_subtrees(status, 1, [('/1', 1)])
+
+        # Check export_targets is set properly
+        status = self.fs.status()
+        log.info(status)
+        r0 = status.get_rank(self.fs.id, 0)
+        self.assertEqual(sorted(r0['export_targets']), [1])
+
+        # redundant pin /1/2 to rank 1
+        self.mount_a.setfattr("1/2", "ceph.dir.pin", "1")
+        self._wait_subtrees(status, 1, [('/1', 1), ('/1/2', 1)])
+
+        # change pin /1/2 to rank 0
+        self.mount_a.setfattr("1/2", "ceph.dir.pin", "0")
+        self._wait_subtrees(status, 1, [('/1', 1), ('/1/2', 0)])
+        self._wait_subtrees(status, 0, [('/1', 1), ('/1/2', 0)])
+
+        # change pin /1/2/3 to (presently) non-existent rank 2
+        self.mount_a.setfattr("1/2/3", "ceph.dir.pin", "2")
+        self._wait_subtrees(status, 0, [('/1', 1), ('/1/2', 0)])
+        self._wait_subtrees(status, 1, [('/1', 1), ('/1/2', 0)])
+
+        # change pin /1/2 back to rank 1
+        self.mount_a.setfattr("1/2", "ceph.dir.pin", "1")
+        self._wait_subtrees(status, 1, [('/1', 1), ('/1/2', 1)])
+
+        # add another directory pinned to 1
+        self.mount_a.run_shell(["mkdir", "-p", "1/4/5"])
+        self.mount_a.setfattr("1/4/5", "ceph.dir.pin", "1")
+        self._wait_subtrees(status, 1, [('/1', 1), ('/1/2', 1), ('/1/4/5', 1)])
+
+        # change pin /1 to 0
+        self.mount_a.setfattr("1", "ceph.dir.pin", "0")
+        self._wait_subtrees(status, 0, [('/1', 0), ('/1/2', 1), ('/1/4/5', 1)])
+
+        # change pin /1/2 to default (-1); does the subtree root properly respect its parent pin?
+        self.mount_a.setfattr("1/2", "ceph.dir.pin", "-1")
+        expected = [('/1', 0), ('/1/4/5', 1)]
+        self._wait_subtrees(status, 0, expected)
+
+        if len(list(status.get_standbys())):
+            self.fs.set_max_mds(3)
+            self.fs.wait_for_state('up:active', rank=2)
+            expected.append(('/1/2/3', 2))  # pinned earlier; only expected once rank 2 exists
+            self._wait_subtrees(status, 0, expected)
+            # Check export_targets is set properly
+            status = self.fs.status()
+            log.info(status)
+            r0 = status.get_rank(self.fs.id, 0)
+            self.assertEqual(sorted(r0['export_targets']), [1, 2])
+            r1 = status.get_rank(self.fs.id, 1)
+            self.assertEqual(sorted(r1['export_targets']), [0])
+            r2 = status.get_rank(self.fs.id, 2)
+            self.assertEqual(sorted(r2['export_targets']), [])
+
+        # Test rename
+        self.mount_a.run_shell(["mkdir", "-p", "a/b", "aa/bb"])
+        self.mount_a.setfattr("a", "ceph.dir.pin", "1")
+        self.mount_a.setfattr("aa/bb", "ceph.dir.pin", "0")
+        self._wait_subtrees(status, 0, expected + [('/a', 1), ('/aa/bb', 0)])
+        self.mount_a.run_shell(["mv", "aa", "a/b/"])
+        self._wait_subtrees(status, 0, expected + [('/a', 1), ('/a/b/aa/bb', 0)])
diff --git a/src/ceph/qa/tasks/cephfs/test_failover.py b/src/ceph/qa/tasks/cephfs/test_failover.py
new file mode 100644
index 0000000..9d3392c
--- /dev/null
+++ b/src/ceph/qa/tasks/cephfs/test_failover.py
@@ -0,0 +1,645 @@
+import json
+import logging
+from unittest import case, SkipTest
+
+from cephfs_test_case import CephFSTestCase
+from teuthology.exceptions import CommandFailedError
+from teuthology import misc as teuthology
+from tasks.cephfs.fuse_mount import FuseMount
+
+log = logging.getLogger(__name__)
+
+
+class TestFailover(CephFSTestCase):
+    CLIENTS_REQUIRED = 1
+    MDSS_REQUIRED = 2
+
+    def test_simple(self):
+        """
+        That when the active MDS is killed, a standby MDS is promoted into
+        its rank after the grace period.
+
+        This is just a simple unit test, the harder cases are covered
+        in thrashing tests.
+        """
+
+        # Need all my standbys up as well as the active daemons
+        self.wait_for_daemon_start()
+
+        (original_active, ) = self.fs.get_active_names()
+        original_standbys = self.mds_cluster.get_standby_daemons()
+
+        # Kill the rank 0 daemon's physical process
+        self.fs.mds_stop(original_active)
+
+        grace = float(self.fs.get_config("mds_beacon_grace", service_type="mon"))
+
+        # Wait until the monitor promotes his replacement
+        def promoted():
+            active = self.fs.get_active_names()
+            return active and active[0] in original_standbys
+
+        log.info("Waiting for promotion of one of the original standbys {0}".format(
+            original_standbys))
+        self.wait_until_true(
+            promoted,
+            timeout=grace*2)
+
+        # Start the original rank 0 daemon up again, see that he becomes a standby
+        self.fs.mds_restart(original_active)
+        self.wait_until_true(
+            lambda: original_active in self.mds_cluster.get_standby_daemons(),
+            timeout=60  # Approximately long enough for MDS to start and mon to notice
+        )
+
+    def test_client_abort(self):
+        """
+        That a client will respect fuse_require_active_mds and error out
+        when the cluster appears to be unavailable.
+        """
+
+        if not isinstance(self.mount_a, FuseMount):
+            raise SkipTest("Requires FUSE client to inject client metadata")
+
+        require_active = self.fs.get_config("fuse_require_active_mds", service_type="mon").lower() == "true"
+        if not require_active:
+            raise case.SkipTest("fuse_require_active_mds is not set")
+
+        grace = float(self.fs.get_config("mds_beacon_grace", service_type="mon"))
+
+        # Check it's not laggy to begin with
+        (original_active, ) = self.fs.get_active_names()
+        self.assertNotIn("laggy_since", self.fs.mon_manager.get_mds_status(original_active))
+
+        self.mounts[0].umount_wait()
+
+        # Control: that we can mount and unmount usually, while the cluster is healthy
+        self.mounts[0].mount()
+        self.mounts[0].wait_until_mounted()
+        self.mounts[0].umount_wait()
+
+        # Stop the daemon processes
+        self.fs.mds_stop()
+
+        # Wait for everyone to go laggy
+        def laggy():
+            mdsmap = self.fs.get_mds_map()
+            for info in mdsmap['info'].values():
+                if "laggy_since" not in info:
+                    return False
+
+            return True
+
+        self.wait_until_true(laggy, grace * 2)
+        with self.assertRaises(CommandFailedError):
+            self.mounts[0].mount()
+
+    def test_standby_count_wanted(self):
+        """
+        That cluster health warnings are generated by insufficient standbys available.
+        """
+
+        # Need all my standbys up as well as the active daemons
+        self.wait_for_daemon_start()
+
+        grace = float(self.fs.get_config("mds_beacon_grace", service_type="mon"))
+
+        standbys = self.mds_cluster.get_standby_daemons()
+        self.assertGreaterEqual(len(standbys), 1)
+        self.fs.mon_manager.raw_cluster_cmd('fs', 'set', self.fs.name, 'standby_count_wanted', str(len(standbys)))
+
+        # Kill a standby and check for warning
+        victim = standbys.pop()
+        self.fs.mds_stop(victim)
+        log.info("waiting for insufficient standby daemon warning")
+        self.wait_for_health("MDS_INSUFFICIENT_STANDBY", grace*2)
+
+        # restart the standby, see that he becomes a standby, check health clears
+        self.fs.mds_restart(victim)
+        self.wait_until_true(
+            lambda: victim in self.mds_cluster.get_standby_daemons(),
+            timeout=60  # Approximately long enough for MDS to start and mon to notice
+        )
+        self.wait_for_health_clear(timeout=30)
+
+        # Set it one greater than standbys ever seen
+        standbys = self.mds_cluster.get_standby_daemons()
+        self.assertGreaterEqual(len(standbys), 1)
+        self.fs.mon_manager.raw_cluster_cmd('fs', 'set', self.fs.name, 'standby_count_wanted', str(len(standbys)+1))
+        log.info("waiting for insufficient standby daemon warning")
+        self.wait_for_health("MDS_INSUFFICIENT_STANDBY", grace*2)
+
+        # Set it to 0
+        self.fs.mon_manager.raw_cluster_cmd('fs', 'set', self.fs.name, 'standby_count_wanted', '0')
+        self.wait_for_health_clear(timeout=30)
+
+
+
+
+class TestStandbyReplay(CephFSTestCase):
+    MDSS_REQUIRED = 4
+    REQUIRE_FILESYSTEM = False
+
+    def set_standby_for(self, leader, follower, replay):
+        self.set_conf("mds.{0}".format(follower), "mds_standby_for_name", leader)
+        if replay:
+            self.set_conf("mds.{0}".format(follower), "mds_standby_replay", "true")
+
+    def get_info_by_name(self, mds_name):
+        """Return the cluster status entry for daemon `mds_name`; raise RuntimeError if absent."""
+        status = self.mds_cluster.status()
+        info = status.get_mds(mds_name)
+        if info is None:
+            log.warning(str(status))  # log.warn is a deprecated alias of log.warning
+            raise RuntimeError("MDS '{0}' not found".format(mds_name))
+        return info
+
+    def test_standby_replay_unused(self):
+        # Pick out exactly 3 daemons to be run during test
+        use_daemons = sorted(self.mds_cluster.mds_ids[0:3])
+        mds_a, mds_b, mds_c = use_daemons
+        log.info("Using MDS daemons: {0}".format(use_daemons))
+
+        # B and C should both follow A, but only one will
+        # really get into standby replay state.
+        self.set_standby_for(mds_a, mds_b, True)
+        self.set_standby_for(mds_a, mds_c, True)
+
+        # Create FS and start A
+        fs_a = self.mds_cluster.newfs("alpha")
+        self.mds_cluster.mds_restart(mds_a)
+        fs_a.wait_for_daemons()
+        self.assertEqual(fs_a.get_active_names(), [mds_a])
+
+        # Start B, he should go into standby replay
+        self.mds_cluster.mds_restart(mds_b)
+        self.wait_for_daemon_start([mds_b])
+        info_b = self.get_info_by_name(mds_b)
+        self.assertEqual(info_b['state'], "up:standby-replay")
+        self.assertEqual(info_b['standby_for_name'], mds_a)
+        self.assertEqual(info_b['rank'], 0)
+
+        # Start C, he should go into standby (*not* replay)
+        self.mds_cluster.mds_restart(mds_c)
+        self.wait_for_daemon_start([mds_c])
+        info_c = self.get_info_by_name(mds_c)
+        self.assertEqual(info_c['state'], "up:standby")
+        self.assertEqual(info_c['standby_for_name'], mds_a)
+        self.assertEqual(info_c['rank'], -1)
+
+        # Kill B, C should go into standby replay
+        self.mds_cluster.mds_stop(mds_b)
+        self.mds_cluster.mds_fail(mds_b)
+        self.wait_until_equal(
+                lambda: self.get_info_by_name(mds_c)['state'],
+                "up:standby-replay",
+                60)
+        info_c = self.get_info_by_name(mds_c)
+        self.assertEqual(info_c['state'], "up:standby-replay")
+        self.assertEqual(info_c['standby_for_name'], mds_a)
+        self.assertEqual(info_c['rank'], 0)
+
+    def test_standby_failure(self):
+        """
+        That the failure of a standby-replay daemon happens cleanly
+        and doesn't interrupt anything else.
+        """
+        # Pick out exactly 2 daemons to be run during test
+        use_daemons = sorted(self.mds_cluster.mds_ids[0:2])
+        mds_a, mds_b = use_daemons
+        log.info("Using MDS daemons: {0}".format(use_daemons))
+
+        # Configure two pairs of MDSs that are standby for each other
+        self.set_standby_for(mds_a, mds_b, True)
+        self.set_standby_for(mds_b, mds_a, False)
+
+        # Create FS alpha and get mds_a to come up as active
+        fs_a = self.mds_cluster.newfs("alpha")
+        self.mds_cluster.mds_restart(mds_a)
+        fs_a.wait_for_daemons()
+        self.assertEqual(fs_a.get_active_names(), [mds_a])
+
+        # Start the standbys
+        self.mds_cluster.mds_restart(mds_b)
+        self.wait_for_daemon_start([mds_b])
+
+        # See the standby come up as the correct rank
+        info_b = self.get_info_by_name(mds_b)
+        self.assertEqual(info_b['state'], "up:standby-replay")
+        self.assertEqual(info_b['standby_for_name'], mds_a)
+        self.assertEqual(info_b['rank'], 0)
+
+        # Kill the standby
+        self.mds_cluster.mds_stop(mds_b)
+        self.mds_cluster.mds_fail(mds_b)
+
+        # See that the standby is gone and the active remains
+        self.assertEqual(fs_a.get_active_names(), [mds_a])
+        mds_map = fs_a.get_mds_map()
+        self.assertEqual(len(mds_map['info']), 1)
+        self.assertEqual(mds_map['failed'], [])
+        self.assertEqual(mds_map['damaged'], [])
+        self.assertEqual(mds_map['stopped'], [])
+
+    def test_rank_stopped(self):
+        """
+        That when a rank is STOPPED, standby replays for
+        that rank get torn down
+        """
+        # Pick out exactly 4 daemons to be run during test
+        use_daemons = sorted(self.mds_cluster.mds_ids[0:4])
+        mds_a, mds_b, mds_a_s, mds_b_s = use_daemons
+        log.info("Using MDS daemons: {0}".format(use_daemons))
+
+        # a and b both get a standby
+        self.set_standby_for(mds_a, mds_a_s, True)
+        self.set_standby_for(mds_b, mds_b_s, True)
+
+        # Create FS alpha with two ranks and get mds_a, then mds_b, to come up as active
+        fs_a = self.mds_cluster.newfs("alpha")
+        fs_a.set_max_mds(2)
+
+        self.mds_cluster.mds_restart(mds_a)
+        self.wait_until_equal(lambda: fs_a.get_active_names(), [mds_a], 30)
+        self.mds_cluster.mds_restart(mds_b)
+        fs_a.wait_for_daemons()
+        self.assertEqual(sorted(fs_a.get_active_names()), [mds_a, mds_b])
+
+        # Start the standbys
+        self.mds_cluster.mds_restart(mds_b_s)
+        self.wait_for_daemon_start([mds_b_s])
+        self.mds_cluster.mds_restart(mds_a_s)
+        self.wait_for_daemon_start([mds_a_s])
+        info_b_s = self.get_info_by_name(mds_b_s)
+        self.assertEqual(info_b_s['state'], "up:standby-replay")
+        info_a_s = self.get_info_by_name(mds_a_s)
+        self.assertEqual(info_a_s['state'], "up:standby-replay")
+
+        # Shrink the cluster
+        fs_a.set_max_mds(1)
+        fs_a.mon_manager.raw_cluster_cmd("mds", "stop", "{0}:1".format(fs_a.name))
+        self.wait_until_equal(
+            lambda: fs_a.get_active_names(), [mds_a],
+            60
+        )
+
+        # Both 'b' and 'b_s' should go back to being standbys
+        self.wait_until_equal(
+            lambda: self.mds_cluster.get_standby_daemons(), {mds_b, mds_b_s},
+            60
+        )
+
+
+class TestMultiFilesystems(CephFSTestCase):
+    CLIENTS_REQUIRED = 2
+    MDSS_REQUIRED = 4
+
+    # We'll create our own filesystems and start our own daemons
+    REQUIRE_FILESYSTEM = False
+
+    def setUp(self):
+        super(TestMultiFilesystems, self).setUp()
+        self.mds_cluster.mon_manager.raw_cluster_cmd("fs", "flag", "set",
+            "enable_multiple", "true",
+            "--yes-i-really-mean-it")
+
+    def _setup_two(self):
+        fs_a = self.mds_cluster.newfs("alpha")
+        fs_b = self.mds_cluster.newfs("bravo")
+
+        self.mds_cluster.mds_restart()
+
+        # Wait for both filesystems to go healthy
+        fs_a.wait_for_daemons()
+        fs_b.wait_for_daemons()
+
+        # Reconfigure client auth caps
+        for mount in self.mounts:
+            self.mds_cluster.mon_manager.raw_cluster_cmd_result(
+                'auth', 'caps', "client.{0}".format(mount.client_id),
+                'mds', 'allow',
+                'mon', 'allow r',
+                'osd', 'allow rw pool={0}, allow rw pool={1}'.format(
+                    fs_a.get_data_pool_name(), fs_b.get_data_pool_name()))
+
+        return fs_a, fs_b
+
+    def test_clients(self):
+        fs_a, fs_b = self._setup_two()
+
+        # Mount a client on fs_a
+        self.mount_a.mount(mount_fs_name=fs_a.name)
+        self.mount_a.write_n_mb("pad.bin", 1)
+        self.mount_a.write_n_mb("test.bin", 2)
+        a_created_ino = self.mount_a.path_to_ino("test.bin")
+        self.mount_a.create_files()
+
+        # Mount a client on fs_b
+        self.mount_b.mount(mount_fs_name=fs_b.name)
+        self.mount_b.write_n_mb("test.bin", 1)
+        b_created_ino = self.mount_b.path_to_ino("test.bin")
+        self.mount_b.create_files()
+
+        # Check that a non-default filesystem mount survives an MDS
+        # failover (i.e. that map subscription is continuous, not
+        # just the first time), reproduces #16022
+        old_fs_b_mds = fs_b.get_active_names()[0]
+        self.mds_cluster.mds_stop(old_fs_b_mds)
+        self.mds_cluster.mds_fail(old_fs_b_mds)
+        fs_b.wait_for_daemons()
+        background = self.mount_b.write_background()
+        # Raise exception if the write doesn't finish (i.e. if client
+        # has not kept up with MDS failure)
+        try:
+            self.wait_until_true(lambda: background.finished, timeout=30)
+        except RuntimeError:
+            # The mount is stuck, we'll have to force it to fail cleanly
+            background.stdin.close()
+            self.mount_b.umount_wait(force=True)
+            raise
+
+        self.mount_a.umount_wait()
+        self.mount_b.umount_wait()
+
+        # See that the client's files went into the correct pool
+        self.assertTrue(fs_a.data_objects_present(a_created_ino, 1024 * 1024))
+        self.assertTrue(fs_b.data_objects_present(b_created_ino, 1024 * 1024))
+
+    def test_standby(self):
+        fs_a, fs_b = self._setup_two()
+
+        # Assert that the remaining two MDS daemons are now standbys
+        a_daemons = fs_a.get_active_names()
+        b_daemons = fs_b.get_active_names()
+        self.assertEqual(len(a_daemons), 1)
+        self.assertEqual(len(b_daemons), 1)
+        original_a = a_daemons[0]
+        original_b = b_daemons[0]
+        expect_standby_daemons = set(self.mds_cluster.mds_ids) - (set(a_daemons) | set(b_daemons))
+
+        # Need all my standbys up as well as the active daemons
+        self.wait_for_daemon_start()
+        self.assertEqual(expect_standby_daemons, self.mds_cluster.get_standby_daemons())
+
+        # Kill fs_a's active MDS, see a standby take over
+        self.mds_cluster.mds_stop(original_a)
+        self.mds_cluster.mon_manager.raw_cluster_cmd("mds", "fail", original_a)
+        self.wait_until_equal(lambda: len(fs_a.get_active_names()), 1, 30,
+                              reject_fn=lambda v: v > 1)
+        # Assert that it's a *different* daemon that has now appeared in the map for fs_a
+        self.assertNotEqual(fs_a.get_active_names()[0], original_a)
+
+        # Kill fs_b's active MDS, see a standby take over
+        self.mds_cluster.mds_stop(original_b)
+        self.mds_cluster.mon_manager.raw_cluster_cmd("mds", "fail", original_b)
+        self.wait_until_equal(lambda: len(fs_b.get_active_names()), 1, 30,
+                              reject_fn=lambda v: v > 1)
+        # Assert that it's a *different* daemon that has now appeared in the map for fs_b
+        self.assertNotEqual(fs_b.get_active_names()[0], original_b)
+
+        # Both of the original active daemons should be gone, and all standbys used up
+        self.assertEqual(self.mds_cluster.get_standby_daemons(), set())
+
+        # Restart the ones I killed, see them reappear as standbys
+        self.mds_cluster.mds_restart(original_a)
+        self.mds_cluster.mds_restart(original_b)
+        self.wait_until_true(
+            lambda: {original_a, original_b} == self.mds_cluster.get_standby_daemons(),
+            timeout=30
+        )
+
+    def test_grow_shrink(self):
+        # Usual setup...
+        fs_a, fs_b = self._setup_two()
+
+        # Increase max_mds on fs_b, see a standby take up the role
+        fs_b.set_max_mds(2)
+        self.wait_until_equal(lambda: len(fs_b.get_active_names()), 2, 30,
+                              reject_fn=lambda v: v > 2 or v < 1)
+
+        # Increase max_mds on fs_a, see a standby take up the role
+        fs_a.set_max_mds(2)
+        self.wait_until_equal(lambda: len(fs_a.get_active_names()), 2, 30,
+                              reject_fn=lambda v: v > 2 or v < 1)
+
+        # Shrink fs_b back to 1, see a daemon go back to standby
+        fs_b.set_max_mds(1)
+        fs_b.deactivate(1)
+        self.wait_until_equal(lambda: len(fs_b.get_active_names()), 1, 30,
+                              reject_fn=lambda v: v > 2 or v < 1)
+
+        # Grow fs_a up to 3, see the former fs_b daemon join it.
+        fs_a.set_max_mds(3)
+        self.wait_until_equal(lambda: len(fs_a.get_active_names()), 3, 60,
+                              reject_fn=lambda v: v > 3 or v < 2)
+
+    def test_standby_for_name(self):
+        """That a daemon with mds_standby_for_name follows its leader and replaces it on failure."""
+        # Pick out exactly 4 daemons to be run during test
+        use_daemons = sorted(self.mds_cluster.mds_ids[0:4])
+        mds_a, mds_b, mds_c, mds_d = use_daemons
+        log.info("Using MDS daemons: {0}".format(use_daemons))
+
+        def set_standby_for(leader, follower, replay):
+            self.set_conf("mds.{0}".format(follower), "mds_standby_for_name", leader)
+            if replay:
+                self.set_conf("mds.{0}".format(follower), "mds_standby_replay", "true")
+
+        # Configure two pairs of MDSs that are standby for each other
+        set_standby_for(mds_a, mds_b, True)
+        set_standby_for(mds_b, mds_a, False)
+        set_standby_for(mds_c, mds_d, True)
+        set_standby_for(mds_d, mds_c, False)
+
+        # Create FS alpha and get mds_a to come up as active
+        fs_a = self.mds_cluster.newfs("alpha")
+        self.mds_cluster.mds_restart(mds_a)
+        fs_a.wait_for_daemons()
+        self.assertEqual(fs_a.get_active_names(), [mds_a])
+
+        # Create FS bravo and get mds_c to come up as active
+        fs_b = self.mds_cluster.newfs("bravo")
+        self.mds_cluster.mds_restart(mds_c)
+        fs_b.wait_for_daemons()
+        self.assertEqual(fs_b.get_active_names(), [mds_c])
+
+        # Start the standbys
+        self.mds_cluster.mds_restart(mds_b)
+        self.mds_cluster.mds_restart(mds_d)
+        self.wait_for_daemon_start([mds_b, mds_d])
+
+        def get_info_by_name(fs, mds_name):
+            mds_map = fs.get_mds_map()
+            for info in mds_map['info'].values():
+                if info['name'] == mds_name:
+                    return info
+
+            log.warning(json.dumps(mds_map, indent=2))  # log.warn is a deprecated alias
+            raise RuntimeError("MDS '{0}' not found in filesystem MDSMap".format(mds_name))
+
+        # See both standbys come up as standby replay for the correct ranks
+        # mds_b should be in filesystem alpha following mds_a
+        info_b = get_info_by_name(fs_a, mds_b)
+        self.assertEqual(info_b['state'], "up:standby-replay")
+        self.assertEqual(info_b['standby_for_name'], mds_a)
+        self.assertEqual(info_b['rank'], 0)
+        # mds_d should be in filesystem bravo following mds_c
+        info_d = get_info_by_name(fs_b, mds_d)
+        self.assertEqual(info_d['state'], "up:standby-replay")
+        self.assertEqual(info_d['standby_for_name'], mds_c)
+        self.assertEqual(info_d['rank'], 0)
+
+        # Kill both active daemons
+        self.mds_cluster.mds_stop(mds_a)
+        self.mds_cluster.mds_fail(mds_a)
+        self.mds_cluster.mds_stop(mds_c)
+        self.mds_cluster.mds_fail(mds_c)
+
+        # Wait for standbys to take over
+        fs_a.wait_for_daemons()
+        self.assertEqual(fs_a.get_active_names(), [mds_b])
+        fs_b.wait_for_daemons()
+        self.assertEqual(fs_b.get_active_names(), [mds_d])
+
+        # Start the original active daemons up again
+        self.mds_cluster.mds_restart(mds_a)
+        self.mds_cluster.mds_restart(mds_c)
+        self.wait_for_daemon_start([mds_a, mds_c])
+
+        self.assertEqual(set(self.mds_cluster.get_standby_daemons()), {mds_a, mds_c})
+
+    def test_standby_for_rank(self):
+        use_daemons = sorted(self.mds_cluster.mds_ids[0:4])
+        mds_a, mds_b, mds_c, mds_d = use_daemons
+        log.info("Using MDS daemons: {0}".format(use_daemons))
+
+        def set_standby_for(leader_rank, leader_fs, follower_id):
+            self.set_conf("mds.{0}".format(follower_id),
+                          "mds_standby_for_rank", leader_rank)
+
+            fscid = leader_fs.get_namespace_id()
+            self.set_conf("mds.{0}".format(follower_id),
+                          "mds_standby_for_fscid", fscid)
+
+        fs_a = self.mds_cluster.newfs("alpha")
+        fs_b = self.mds_cluster.newfs("bravo")
+        set_standby_for(0, fs_a, mds_a)
+        set_standby_for(0, fs_a, mds_b)
+        set_standby_for(0, fs_b, mds_c)
+        set_standby_for(0, fs_b, mds_d)
+
+        self.mds_cluster.mds_restart(mds_a)
+        fs_a.wait_for_daemons()
+        self.assertEqual(fs_a.get_active_names(), [mds_a])
+
+        self.mds_cluster.mds_restart(mds_c)
+        fs_b.wait_for_daemons()
+        self.assertEqual(fs_b.get_active_names(), [mds_c])
+
+        self.mds_cluster.mds_restart(mds_b)
+        self.mds_cluster.mds_restart(mds_d)
+        self.wait_for_daemon_start([mds_b, mds_d])
+
+        self.mds_cluster.mds_stop(mds_a)
+        self.mds_cluster.mds_fail(mds_a)
+        self.mds_cluster.mds_stop(mds_c)
+        self.mds_cluster.mds_fail(mds_c)
+
+        fs_a.wait_for_daemons()
+        self.assertEqual(fs_a.get_active_names(), [mds_b])
+        fs_b.wait_for_daemons()
+        self.assertEqual(fs_b.get_active_names(), [mds_d])
+
+    def test_standby_for_fscid(self):
+        """
+        That I can set a standby FSCID with no rank, and the result is
+        that daemons join any rank for that filesystem.
+        """
+        use_daemons = sorted(self.mds_cluster.mds_ids[0:4])
+        mds_a, mds_b, mds_c, mds_d = use_daemons
+
+        log.info("Using MDS daemons: {0}".format(use_daemons))
+
+        def set_standby_for(leader_fs, follower_id):
+            fscid = leader_fs.get_namespace_id()
+            self.set_conf("mds.{0}".format(follower_id),
+                          "mds_standby_for_fscid", fscid)
+
+        # Create two filesystems which should have two ranks each
+        fs_a = self.mds_cluster.newfs("alpha")
+
+        fs_b = self.mds_cluster.newfs("bravo")
+
+        fs_a.set_max_mds(2)
+        fs_b.set_max_mds(2)
+
+        # Set all the daemons to have a FSCID assignment but no other
+        # standby preferences.
+        set_standby_for(fs_a, mds_a)
+        set_standby_for(fs_a, mds_b)
+        set_standby_for(fs_b, mds_c)
+        set_standby_for(fs_b, mds_d)
+
+        # Now when we start all daemons at once, they should fall into
+        # ranks in the right filesystem
+        self.mds_cluster.mds_restart(mds_a)
+        self.mds_cluster.mds_restart(mds_b)
+        self.mds_cluster.mds_restart(mds_c)
+        self.mds_cluster.mds_restart(mds_d)
+        self.wait_for_daemon_start([mds_a, mds_b, mds_c, mds_d])
+        fs_a.wait_for_daemons()
+        fs_b.wait_for_daemons()
+        self.assertEqual(set(fs_a.get_active_names()), {mds_a, mds_b})
+        self.assertEqual(set(fs_b.get_active_names()), {mds_c, mds_d})
+
+    def test_standby_for_invalid_fscid(self):
+        """
+        That an invalid standby_fscid does not cause a mon crash
+        """
+        use_daemons = sorted(self.mds_cluster.mds_ids[0:3])
+        mds_a, mds_b, mds_c = use_daemons
+        log.info("Using MDS daemons: {0}".format(use_daemons))
+
+        def set_standby_for_rank(leader_rank, follower_id):
+            self.set_conf("mds.{0}".format(follower_id),
+                          "mds_standby_for_rank", leader_rank)
+
+        # Create one fs
+        fs_a = self.mds_cluster.newfs("cephfs")
+
+        # Get configured mons in the cluster, so we can see if any
+        # crashed later.
+        configured_mons = fs_a.mon_manager.get_mon_quorum()
+
+        # Set the first two daemons to have a rank assignment but no other
+        # standby preferences.
+        set_standby_for_rank(0, mds_a)
+        set_standby_for_rank(0, mds_b)
+
+        # Set third daemon to have invalid fscid assignment and no other
+        # standby preferences
+        invalid_fscid = 123
+        self.set_conf("mds.{0}".format(mds_c), "mds_standby_for_fscid", invalid_fscid)
+
+        # Restart all the daemons to make the standby preference applied
+        self.mds_cluster.mds_restart(mds_a)
+        self.mds_cluster.mds_restart(mds_b)
+        self.mds_cluster.mds_restart(mds_c)
+        self.wait_for_daemon_start([mds_a, mds_b, mds_c])
+
+        # Stop whichever of mds_a / mds_b is the active daemon (compare, don't test a tuple)
+        if fs_a.get_active_names() == [mds_a]:
+            self.mds_cluster.mds_stop(mds_a)
+            self.mds_cluster.mds_fail(mds_a)
+            fs_a.wait_for_daemons()
+        else:
+            self.mds_cluster.mds_stop(mds_b)
+            self.mds_cluster.mds_fail(mds_b)
+            fs_a.wait_for_daemons()
+
+        # Get active mons from cluster
+        active_mons = fs_a.mon_manager.get_mon_quorum()
+
+        # Check for active quorum mon status and configured mon status
+        self.assertEqual(active_mons, configured_mons,
+                "Not all mons are in quorum Invalid standby invalid fscid test failed!")
diff --git a/src/ceph/qa/tasks/cephfs/test_flush.py b/src/ceph/qa/tasks/cephfs/test_flush.py
new file mode 100644
index 0000000..1f84e42
--- /dev/null
+++ b/src/ceph/qa/tasks/cephfs/test_flush.py
@@ -0,0 +1,113 @@
+
+from textwrap import dedent
+from tasks.cephfs.cephfs_test_case import CephFSTestCase
+from tasks.cephfs.filesystem import ObjectNotFound, ROOT_INO
+
+
+class TestFlush(CephFSTestCase):
+    """
+    Check what an MDS journal flush writes to, and later removes from, the
+    backing RADOS pools.
+    """
+
+    def test_flush(self):
+        """
+        That flushing the journal writes out dirfrags and backtraces, trims
+        the journal down to a single SUBTREEMAP event, and that deleting the
+        files afterwards results in the corresponding objects going away.
+        """
+        def flush_journal():
+            # Every flush in this test is required to report success
+            outcome = self.fs.mds_asok(["flush", "journal"])
+            self.assertEqual(outcome['return_code'], 0)
+
+        def strays_enqueued():
+            return self.fs.mds_asok(['perf', 'dump', 'mds_cache'])['mds_cache']['strays_enqueued']
+
+        def osd_deletes():
+            return self.fs.mds_asok(['perf', 'dump', 'objecter'])['objecter']['osdop_delete']
+
+        def journal_summary():
+            return self.fs.journal_tool(["event", "get", "summary"])
+
+        # Journal summary once it holds nothing but the new segment's
+        # subtreemap
+        trimmed = dedent(
+            """
+            Events by type:
+              SUBTREEMAP: 1
+            Errors: 0
+            """
+        ).strip()
+
+        # Journal summary when one extra UPDATE event slipped in
+        trimmed_plus_update = dedent(
+            """
+            Events by type:
+              SUBTREEMAP: 1
+              UPDATE: 1
+            Errors: 0
+            """
+        ).strip()
+
+        self.mount_a.run_shell(["mkdir", "mydir"])
+        self.mount_a.run_shell(["touch", "mydir/alpha"])
+        dir_ino = self.mount_a.path_to_ino("mydir")
+        file_ino = self.mount_a.path_to_ino("mydir/alpha")
+
+        # Drop the client so that it no longer holds any caps
+        self.mount_a.umount_wait()
+
+        # Nothing has been flushed yet: no dirfrag object, no backtrace on
+        # the file, and an empty root dirfrag
+        with self.assertRaises(ObjectNotFound):
+            self.fs.list_dirfrag(dir_ino)
+        with self.assertRaises(ObjectNotFound):
+            self.fs.read_backtrace(file_ino)
+        self.assertEqual(self.fs.list_dirfrag(ROOT_INO), [])
+
+        flush_journal()
+
+        # The flush created the dirfrag object ...
+        self.assertEqual(self.fs.list_dirfrag(dir_ino), ["alpha_head"])
+
+        # ... linked 'mydir' into the root ...
+        self.assertEqual(self.fs.list_dirfrag(ROOT_INO), ['mydir_head'])
+
+        # ... and wrote the backtrace onto the data object
+        backtrace = self.fs.read_backtrace(file_ino)
+        ancestors = backtrace['ancestors']
+        self.assertEqual(['alpha', 'mydir'], [a['dname'] for a in ancestors])
+        self.assertEqual([dir_ino, 1], [a['dirino'] for a in ancestors])
+        self.assertEqual(file_ino, backtrace['ino'])
+
+        # The journal should now be down to a single subtreemap from the
+        # newly created segment
+        summary_output = journal_summary()
+        try:
+            self.assertEqual(summary_output, trimmed)
+        except AssertionError:
+            # In some states the flush leaves behind one extra event from
+            # locks a client held.  That is correct behaviour (new events
+            # arrive while the MDS is flushing), but a second flush must
+            # then leave the journal fully trimmed.
+            self.assertEqual(summary_output, trimmed_plus_update)
+            flush_journal()
+            self.assertEqual(journal_summary(), trimmed)
+
+        # Now for deletion!  Take baselines of the RADOS delete ops and the
+        # MDS stray counter so that the effect of the purge can be measured.
+        initial_dels = osd_deletes()
+        initial_purges = strays_enqueued()
+
+        # Use a client to remove the directory and the file inside it
+        self.mount_a.mount()
+        self.mount_a.wait_until_mounted()
+        self.mount_a.run_shell(["rm", "-rf", "mydir"])
+
+        # Flush the journal so that the directory inode can be purged
+        flush_journal()
+
+        # Wait for at least two strays to be enqueued since the baseline
+        # (presumably one for the file and one for the directory)
+        self.wait_until_true(
+            lambda: strays_enqueued() - initial_purges >= 2,
+            60)
+
+        # We expect two deletions, one of the dirfrag and one of the
+        # backtrace.  The timeout is fairly long to allow for tick+rados
+        # latencies.
+        self.wait_until_true(
+            lambda: osd_deletes() - initial_dels >= 2,
+            60)
+
+        # Everything written by the first flush is gone again
+        with self.assertRaises(ObjectNotFound):
+            self.fs.list_dirfrag(dir_ino)
+        with self.assertRaises(ObjectNotFound):
+            self.fs.read_backtrace(file_ino)
+        self.assertEqual(self.fs.list_dirfrag(ROOT_INO), [])
diff --git a/src/ceph/qa/tasks/cephfs/test_forward_scrub.py b/src/ceph/qa/tasks/cephfs/test_forward_scrub.py
new file mode 100644
index 0000000..ac912dd
--- /dev/null
+++ b/src/ceph/qa/tasks/cephfs/test_forward_scrub.py
@@ -0,0 +1,291 @@
+
+"""
+Test that the forward scrub functionality can traverse metadata and apply
+requested tags, on well formed metadata.
+
+This is *not* the real testing for forward scrub, which will need to test
+how the functionality responds to damaged metadata.
+
+"""
+import json
+
+import logging
+from collections import namedtuple
+from textwrap import dedent
+
+from teuthology.orchestra.run import CommandFailedError
+from tasks.cephfs.cephfs_test_case import CephFSTestCase
+
+import struct
+
+log = logging.getLogger(__name__)
+
+
+# Record pairing an exception with its backtrace.  Nothing else in this
+# module refers to it.
+ValidationError = namedtuple("ValidationError", ["exception", "backtrace"])
+
+
+class TestForwardScrub(CephFSTestCase):
+    MDSS_REQUIRED = 1
+
+    def _read_str_xattr(self, pool, obj, attr):
+        """
+        Read a ceph-encoded string from a rados xattr.
+
+        The xattr value is treated as a 4-byte length prefix followed by
+        the string payload.
+
+        :param pool: name of the pool holding the object
+        :param obj: name of the rados object
+        :param attr: name of the xattr to read
+        :return: the decoded string payload (without the length prefix)
+        """
+        output = self.fs.rados(["getxattr", obj, attr], pool=pool)
+        # Ceph encodes the length as a little-endian unsigned 32-bit integer
+        # whatever the host architecture is, so spell out byte order and
+        # signedness instead of relying on the native, signed 'i'.
+        (strlen,) = struct.unpack('<I', output[0:4])
+        return output[4:(4 + strlen)]
+
+    def _get_paths_to_ino(self):
+        """
+        Build a mapping of path to inode number for everything that
+        ``find ./`` reports on mount_a.
+
+        :return: dict of path (as printed by find, e.g. "./dir/file") to
+                 the value returned by ``mount_a.path_to_ino`` for it
+        """
+        p = self.mount_a.run_shell(["find", "./"])
+        # find prints one path per line: split on line boundaries rather
+        # than on arbitrary whitespace, so that a path containing a space
+        # is not broken up into several bogus entries.
+        paths = p.stdout.getvalue().strip().splitlines()
+
+        inos = {}
+        for path in paths:
+            inos[path] = self.mount_a.path_to_ino(path)
+
+        return inos
+
+    def test_apply_tag(self):
+        """
+        That a tagging scrub of /parentdir tags the directories and files
+        beneath it, and leaves a file outside that path untagged.
+        """
+        for new_dir in ["parentdir", "parentdir/childdir"]:
+            self.mount_a.run_shell(["mkdir", new_dir])
+        for new_file in ["rfile", "parentdir/pfile", "parentdir/childdir/cfile"]:
+            self.mount_a.run_shell(["touch", new_file])
+
+        # Objects are named after inode numbers, so remember which inode
+        # belongs to which path before the client goes away
+        inos = self._get_paths_to_ino()
+
+        # This is a friendly test of forward scrub: flush all metadata so
+        # that the scrub does not have to cope with anything dirty
+        self.mount_a.umount_wait()
+        self.fs.mds_asok(["flush", "journal"])
+
+        tag = "mytag"
+
+        # Kick off the tagging forward scrub, then give it time to finish.
+        # FIXME a fixed sleep isn't a nice mechanism for this, once we have
+        # a ScrubMap we'll watch that instead
+        self.fs.mds_asok(["tag", "path", "/parentdir", tag])
+        import time
+        time.sleep(10)
+
+        # Directories are looked up in the metadata pool, files in the data
+        # pool.  The pool name is fetched afresh for every check.
+        expectations = [
+            (["./parentdir", "./parentdir/childdir"],
+             self.fs.get_metadata_pool_name),
+            (["./parentdir/pfile", "./parentdir/childdir/cfile"],
+             self.fs.get_data_pool_name),
+        ]
+        for tagged_paths, pool_name_fn in expectations:
+            for tagged_path in tagged_paths:
+                self.assertTagged(inos[tagged_path], tag, pool_name_fn())
+
+        # This guy wasn't in the tag path, shouldn't have been tagged
+        self.assertUntagged(inos["./rfile"])
+
+    def assertUntagged(self, ino):
+        """
+        Assert that reading the "scrub_tag" xattr from the first data pool
+        object of inode `ino` fails with CommandFailedError.
+        """
+        obj_name = "{0:x}.00000000".format(ino)
+        with self.assertRaises(CommandFailedError):
+            data_pool = self.fs.get_data_pool_name()
+            self._read_str_xattr(data_pool, obj_name, "scrub_tag")
+
+    def assertTagged(self, ino, tag, pool):
+        """
+        Assert that the "scrub_tag" xattr on the first object of inode
+        `ino` in `pool` equals `tag`.
+        """
+        obj_name = "{0:x}.00000000".format(ino)
+        self.assertEqual(self._read_str_xattr(pool, obj_name, "scrub_tag"), tag)
+
+    def _validate_linkage(self, expected):
+        """
+        Assert that the current path-to-inode mapping on mount_a equals
+        `expected`.  On mismatch both mappings are logged as JSON before
+        the assertion failure is re-raised.
+        """
+        actual = self._get_paths_to_ino()
+        try:
+            self.assertDictEqual(actual, expected)
+        except AssertionError:
+            expected_dump = json.dumps(expected, indent=2)
+            log.error("Expected: {0}".format(expected_dump))
+            actual_dump = json.dumps(actual, indent=2)
+            log.error("Actual: {0}".format(actual_dump))
+            raise
+
+    def test_orphan_scan(self):
+        """
+        That a tagging forward scrub followed by cephfs-data-scan with
+        --filter-tag restores exactly the linkage that existed before one
+        dentry was removed behind the MDS's back.
+        """
+        # Create some files whose metadata we will flush
+        self.mount_a.run_python(dedent("""
+            import os
+            mount_point = "{mount_point}"
+            parent = os.path.join(mount_point, "parent")
+            os.mkdir(parent)
+            flushed = os.path.join(parent, "flushed")
+            os.mkdir(flushed)
+            for f in ["alpha", "bravo", "charlie"]:
+                open(os.path.join(flushed, f), 'w').write(f)
+        """.format(mount_point=self.mount_a.mountpoint)))
+
+        # Reference linkage that the final check compares against
+        inos = self._get_paths_to_ino()
+
+        # Flush journal
+        # Umount before flush to avoid cap releases putting
+        # things we don't want in the journal later.
+        self.mount_a.umount_wait()
+        self.fs.mds_asok(["flush", "journal"])
+
+        # Create a new inode that's just in the log, i.e. one that would
+        # look orphaned to the backward scan if the backward scan were not
+        # respecting the scrub_tag xattr.
+        self.mount_a.mount()
+        self.mount_a.run_shell(["mkdir", "parent/unflushed"])
+        self.mount_a.run_shell(["dd", "if=/dev/urandom",
+                                "of=./parent/unflushed/jfile",
+                                "bs=1M", "count=8"])
+        # Add the journal-only entries to the reference linkage as well
+        inos["./parent/unflushed"] = self.mount_a.path_to_ino("./parent/unflushed")
+        inos["./parent/unflushed/jfile"] = self.mount_a.path_to_ino("./parent/unflushed/jfile")
+        self.mount_a.umount_wait()
+
+        # Orphan an inode by deleting its dentry
+        # Our victim will be.... bravo.
+        # NOTE(review): the mount was already unmounted just above, so this
+        # second umount_wait looks redundant -- confirm before removing.
+        self.mount_a.umount_wait()
+        self.fs.mds_stop()
+        self.fs.mds_fail()
+        # Scatter checks are switched off while the metadata is knowingly
+        # inconsistent; they are switched back on after the repair below.
+        self.fs.set_ceph_conf('mds', 'mds verify scatter', False)
+        self.fs.set_ceph_conf('mds', 'mds debug scatterstat', False)
+        # Remove the "bravo_head" omap key from the dirfrag object of
+        # parent/flushed
+        frag_obj_id = "{0:x}.00000000".format(inos["./parent/flushed"])
+        self.fs.rados(["rmomapkey", frag_obj_id, "bravo_head"])
+
+        self.fs.mds_restart()
+        self.fs.wait_for_daemons()
+
+        # See that the orphaned file is indeed missing from a client's POV
+        self.mount_a.mount()
+        damaged_state = self._get_paths_to_ino()
+        self.assertNotIn("./parent/flushed/bravo", damaged_state)
+        self.mount_a.umount_wait()
+
+        # Run a tagging forward scrub
+        # NOTE(review): unlike test_apply_tag there is no wait between
+        # issuing the tag command and checking its results -- confirm that
+        # the command has completed by the time it returns.
+        tag = "mytag123"
+        self.fs.mds_asok(["tag", "path", "/parent", tag])
+
+        # See that the orphan was not tagged
+        self.assertUntagged(inos['./parent/flushed/bravo'])
+
+        # See that the flushed-metadata-and-still-present files are tagged
+        self.assertTagged(inos['./parent/flushed/alpha'], tag, self.fs.get_data_pool_name())
+        self.assertTagged(inos['./parent/flushed/charlie'], tag, self.fs.get_data_pool_name())
+
+        # See that journalled-but-not-flushed file *was* tagged
+        self.assertTagged(inos['./parent/unflushed/jfile'], tag, self.fs.get_data_pool_name())
+
+        # Run cephfs-data-scan targeting only orphans
+        self.fs.mds_stop()
+        self.fs.mds_fail()
+        self.fs.data_scan(["scan_extents", self.fs.get_data_pool_name()])
+        self.fs.data_scan([
+            "scan_inodes",
+            "--filter-tag", tag,
+            self.fs.get_data_pool_name()
+        ])
+
+        # After in-place injection stats should be kosher again
+        self.fs.set_ceph_conf('mds', 'mds verify scatter', True)
+        self.fs.set_ceph_conf('mds', 'mds debug scatterstat', True)
+
+        # And we should have all the same linkage we started with,
+        # and no lost+found, and no extra inodes!
+        self.fs.mds_restart()
+        self.fs.wait_for_daemons()
+        self.mount_a.mount()
+        self._validate_linkage(inos)
+
+    def _stash_inotable(self):
+        """
+        Fetch the raw inotable object of every rank reported by
+        ``fs.get_all_mds_rank()``.
+
+        :return: dict mapping inotable object name ("mds<rank>_inotable")
+                 to the raw contents of that metadata object
+        """
+        inotable_dict = {}
+        for rank in self.fs.get_all_mds_rank():
+            inotable_oid = "mds{rank:d}_".format(rank=rank) + "inotable"
+            # Report through the module logger like the rest of this file,
+            # rather than through a Python-2-only print statement.
+            log.info("Trying to fetch inotable object: " + inotable_oid)
+
+            inotable_dict[inotable_oid] = self.fs.get_metadata_object_raw(inotable_oid)
+        return inotable_dict
+
+    def test_inotable_sync(self):
+        """
+        That a repairing scrub notices when the on-disk inotable has been
+        rolled back to a state older than the inodes in use, and repairs
+        it so that the free range starts beyond the newest file's inode.
+        """
+        self.mount_a.write_n_mb("file1_sixmegs", 6)
+
+        # Flush journal
+        self.mount_a.umount_wait()
+        self.fs.mds_asok(["flush", "journal"])
+
+        # Snapshot of the inotable(s) taken before the later files exist
+        inotable_copy = self._stash_inotable()
+
+        self.mount_a.mount()
+
+        self.mount_a.write_n_mb("file2_sixmegs", 6)
+        self.mount_a.write_n_mb("file3_sixmegs", 6)
+
+        inos = self._get_paths_to_ino()
+
+        # Flush journal
+        self.mount_a.umount_wait()
+        self.fs.mds_asok(["flush", "journal"])
+
+        # NOTE(review): the mount was already unmounted before the flush
+        # above, so this call looks redundant -- kept to preserve the
+        # original sequence.
+        self.mount_a.umount_wait()
+
+        # With a consistent inotable, the scrub must not report a repair
+        with self.assert_cluster_log("inode table repaired", invert_match=True):
+            self.fs.mds_asok(["scrub_path", "/", "repair", "recursive"])
+
+        self.mds_cluster.mds_stop()
+        self.mds_cluster.mds_fail()
+
+        # Truncate the journal (to ensure the inotable on disk
+        # is all that will be in the InoTable in memory)
+
+        self.fs.journal_tool(["event", "splice",
+            "--inode={0}".format(inos["./file2_sixmegs"]), "summary"])
+
+        self.fs.journal_tool(["event", "splice",
+            "--inode={0}".format(inos["./file3_sixmegs"]), "summary"])
+
+        # Revert to old inotable.
+        for key, value in inotable_copy.items():
+            self.fs.put_metadata_object_raw(key, value)
+
+        self.mds_cluster.mds_restart()
+        self.fs.wait_for_daemons()
+
+        # Now the inotable is stale, so this time a repair must be logged
+        with self.assert_cluster_log("inode table repaired"):
+            self.fs.mds_asok(["scrub_path", "/", "repair", "recursive"])
+
+        # The repaired table's first free range has to begin after the
+        # inode of the last file that was created
+        self.mds_cluster.mds_stop()
+        table_text = self.fs.table_tool(["0", "show", "inode"])
+        table = json.loads(table_text)
+        self.assertGreater(
+                table['0']['data']['inotable']['free'][0]['start'],
+                inos['./file3_sixmegs'])
+
+    def test_backtrace_repair(self):
+        """
+        That the MDS can repair an inode's backtrace in the data pool
+        if it is found to be damaged.
+        """
+        # Create a file for subsequent checks
+        self.mount_a.run_shell(["mkdir", "parent_a"])
+        self.mount_a.run_shell(["touch", "parent_a/alpha"])
+        file_ino = self.mount_a.path_to_ino("parent_a/alpha")
+
+        def flush_and_check_backtrace():
+            # Flush the journal, then require the on-disk backtrace to
+            # name the file and its parent directory
+            self.fs.mds_asok(["flush", "journal"])
+            on_disk = self.fs.read_backtrace(file_ino)
+            dnames = [a['dname'] for a in on_disk['ancestors']]
+            self.assertEqual(['alpha', 'parent_a'], dnames)
+
+        # The backtrace is written by the initial flush
+        flush_and_check_backtrace()
+
+        # Go corrupt the backtrace
+        self.fs._write_data_xattr(file_ino, "parent",
+                                  "oh i'm sorry did i overwrite your xattr?")
+
+        # The repairing scrub has to complain about the damage ...
+        with self.assert_cluster_log("bad backtrace on inode"):
+            self.fs.mds_asok(["scrub_path", "/", "repair", "recursive"])
+
+        # ... and after another flush the backtrace is intact again
+        flush_and_check_backtrace()
diff --git a/src/ceph/qa/tasks/cephfs/test_fragment.py b/src/ceph/qa/tasks/cephfs/test_fragment.py
new file mode 100644
index 0000000..a62ef74
--- /dev/null
+++ b/src/ceph/qa/tasks/cephfs/test_fragment.py
@@ -0,0 +1,232 @@
+
+
+from tasks.cephfs.cephfs_test_case import CephFSTestCase
+from teuthology.orchestra import run
+
+import logging
+log = logging.getLogger(__name__)
+
+
+class TestFragmentation(CephFSTestCase):
+    CLIENTS_REQUIRED = 1
+    MDSS_REQUIRED = 1
+
+    def get_splits(self):
+        """Return the 'dir_split' counter from the MDS perf dump."""
+        mds_counters = self.fs.mds_asok(['perf', 'dump', 'mds'])['mds']
+        return mds_counters['dir_split']
+
+    def get_merges(self):
+        """Return the 'dir_merge' counter from the MDS perf dump."""
+        mds_counters = self.fs.mds_asok(['perf', 'dump', 'mds'])['mds']
+        return mds_counters['dir_merge']
+
+    def get_dir_ino(self, path):
+        """
+        Return the entry of the MDS cache dump for `path` whose 'ino'
+        matches the inode number the client reports for that path.
+
+        Fails the test if the cache dump has no such entry.
+        """
+        cache_entries = self.fs.read_cache(path, 0)
+        wanted_ino = self.mount_a.path_to_ino(path.strip("/"))
+
+        # First entry with the wanted inode number, or None
+        found = next(
+            (entry for entry in cache_entries if entry['ino'] == wanted_ino),
+            None
+        )
+        self.assertIsNotNone(found)
+        return found
+
+    def _configure(self, **kwargs):
+        """
+        Write each keyword argument into the "mds" section of the cluster
+        configuration (values are stringified), always adding
+        mds_bal_frag=true, then allow dirfrags on the filesystem and
+        fail/restart the MDS daemons so the settings take effect.
+        """
+        kwargs['mds_bal_frag'] = "true"
+
+        for option, value in kwargs.items():
+            self.ceph_cluster.set_ceph_conf("mds", option, str(value))
+
+        self.fs.set_allow_dirfrags(True)
+
+        # Restart and wait until the daemons are back
+        self.mds_cluster.mds_fail_restart()
+        self.fs.wait_for_daemons()
+
+    def test_oversize(self):
+        """
+        That a directory is split when it becomes too large, and that the
+        fragments are merged again once its files have been removed.
+        """
+        split_size, merge_size = 20, 5
+        n_files = split_size + 1
+
+        self._configure(
+            mds_bal_split_size=split_size,
+            mds_bal_merge_size=merge_size,
+            mds_bal_split_bits=1
+        )
+
+        # Nothing has been split so far
+        self.assertEqual(self.get_splits(), 0)
+
+        # One file more than the split size has to trigger exactly one split
+        self.mount_a.create_n_files("splitdir/file", n_files)
+        self.wait_until_true(lambda: self.get_splits() == 1, timeout=30)
+
+        # With one split bit there are two fragments, which between them
+        # hold all of the dentries
+        frags = self.get_dir_ino("/splitdir")['dirfrags']
+        self.assertEqual(len(frags), 2)
+        for idx, frag_name in enumerate(["0x10000000000.0*", "0x10000000000.1*"]):
+            self.assertEqual(frags[idx]['dirfrag'], frag_name)
+        self.assertEqual(sum([len(f['dentries']) for f in frags]), n_files)
+
+        # Nothing has been merged so far
+        self.assertEqual(self.get_merges(), 0)
+
+        # Emptying the directory has to bring it back to a single fragment
+        self.mount_a.run_shell(["rm", "-f", run.Raw("splitdir/file*")])
+        self.wait_until_true(lambda: self.get_merges() == 1, timeout=30)
+
+        remaining_frags = self.get_dir_ino("/splitdir")["dirfrags"]
+        self.assertEqual(len(remaining_frags), 1)
+
+    def test_rapid_creation(self):
+        """
+        That the fast-splitting limit of 1.5x normal limit is
+        applied when creating dentries quickly.
+        """
+        split_size = 100
+        merge_size = 1
+        # Hard cap on fragment size: just above 1.5x the split size
+        fragment_size_max = int(split_size * 1.5 + 2)
+        files_to_create = split_size * 4
+
+        self._configure(
+            mds_bal_split_size=split_size,
+            mds_bal_merge_size=merge_size,
+            mds_bal_split_bits=3,
+            mds_bal_fragment_size_max=fragment_size_max
+        )
+
+        # Only a single split level is tested.  A client sending IO fast
+        # enough to hit a second split before the first one completed
+        # could violate mds_bal_fragment_size_max: there is a window in
+        # which the children of a split are unfrozen (so they can grow)
+        # but still carry STATE_FRAGMENTING (so they cannot be split).
+        #
+        # Writing 4x the split size with 3 split bits (i.e. 2**3 = 8
+        # child fragments per split) should give precisely one split.
+        # What is being checked is that the split happens soon enough for
+        # the client to stay below the fragment size cap; the "immediate"
+        # split mode should kick in at 1.5x the split size.
+
+        self.assertEqual(self.get_splits(), 0)
+        self.mount_a.create_n_files("splitdir/file", files_to_create)
+
+        # Exactly one split is expected; seeing more than one fails early
+        self.wait_until_equal(
+            self.get_splits,
+            1,
+            reject_fn=lambda s: s > 1,
+            timeout=30
+        )
+
+    def test_deep_split(self):
+        """
+        That when the directory grows many times larger than split size,
+        the fragments get split again.
+
+        After the last level has been checked the directory is removed and
+        the test verifies that no objects prefixed with the directory's
+        inode number are left in the listing returned by ``fs.rados(["ls"])``.
+        """
+
+        split_size = 100
+        merge_size = 1  # i.e. don't merge frag unless its empty
+        split_bits = 1
+
+        # Number of children each fragment is split into
+        branch_factor = 2**split_bits
+
+        # Arbitrary: how many levels shall we try fragmenting before
+        # ending the test?
+        max_depth = 5
+
+        self._configure(
+            mds_bal_split_size=split_size,
+            mds_bal_merge_size=merge_size,
+            mds_bal_split_bits=split_bits
+        )
+
+        # Each iteration we will create another level of fragments.  The
+        # placement of dentries into fragments is by hashes (i.e. pseudo
+        # random), so we rely on statistics to get the behaviour that
+        # by writing about 1.5x as many dentries as the split_size times
+        # the number of frags, we will get them all to exceed their
+        # split size and trigger a split.
+        depth = 0
+        files_written = 0
+        splits_expected = 0
+        while depth < max_depth:
+            log.info("Writing files for depth {0}".format(depth))
+            # Total number of files wanted at this depth; only the
+            # difference to what already exists is created.
+            target_files = branch_factor**depth * int(split_size * 1.5)
+            create_files = target_files - files_written
+
+            # Bracket the file creation with markers in the cluster log
+            self.ceph_cluster.mon_manager.raw_cluster_cmd("log",
+                "{0} Writing {1} files (depth={2})".format(
+                    self.__class__.__name__, create_files, depth
+                ))
+            self.mount_a.create_n_files("splitdir/file_{0}".format(depth),
+                                        create_files)
+            self.ceph_cluster.mon_manager.raw_cluster_cmd("log",
+                "{0} Done".format(self.__class__.__name__))
+
+            files_written += create_files
+            log.info("Now have {0} files".format(files_written))
+
+            # Every fragment that exists at this depth is expected to
+            # split once, on top of all the splits seen so far.
+            splits_expected += branch_factor**depth
+            log.info("Waiting to see {0} splits".format(splits_expected))
+            try:
+                self.wait_until_equal(
+                    self.get_splits,
+                    splits_expected,
+                    timeout=30,
+                    reject_fn=lambda x: x > splits_expected
+                )
+
+                frags = self.get_dir_ino("/splitdir")['dirfrags']
+                self.assertEqual(len(frags), branch_factor**(depth+1))
+                self.assertEqual(
+                    sum([len(f['dentries']) for f in frags]),
+                    target_files
+                )
+            except:
+                # On failures, log what fragmentation we actually ended
+                # up with.  This block is just for logging, at the end
+                # we raise the exception again.
+                # NOTE(review): if get_dir_ino itself fails here, its
+                # exception is what propagates instead of the original one.
+                frags = self.get_dir_ino("/splitdir")['dirfrags']
+                log.info("depth={0} splits_expected={1} files_written={2}".format(
+                    depth, splits_expected, files_written
+                ))
+                log.info("Dirfrags:")
+                for f in frags:
+                    log.info("{0}: {1}".format(
+                        f['dirfrag'], len(f['dentries'])
+                    ))
+                raise
+
+            depth += 1
+
+        # Remember the inode number because we will be checking for
+        # objects later.
+        dir_inode_no = self.mount_a.path_to_ino("splitdir")
+
+        self.mount_a.run_shell(["rm", "-rf", "splitdir/"])
+        self.mount_a.umount_wait()
+
+        self.fs.mds_asok(['flush', 'journal'])
+
+        # Wait for all strays to purge
+        self.wait_until_equal(
+            lambda: self.fs.mds_asok(['perf', 'dump', 'mds_cache']
+                                     )['mds_cache']['num_strays'],
+            0,
+            timeout=1200
+        )
+        # Check that the metadata pool objects for all the myriad
+        # child fragments are gone
+        metadata_objs = self.fs.rados(["ls"])
+        frag_objs = []
+        for o in metadata_objs:
+            # Fragment objects are named "<dir inode in hex>.<suffix>"
+            if o.startswith("{0:x}.".format(dir_inode_no)):
+                frag_objs.append(o)
+        self.assertListEqual(frag_objs, [])
diff --git a/src/ceph/qa/tasks/cephfs/test_full.py b/src/ceph/qa/tasks/cephfs/test_full.py
new file mode 100644
index 0000000..e69ccb3
--- /dev/null
+++ b/src/ceph/qa/tasks/cephfs/test_full.py
@@ -0,0 +1,414 @@
+
+
+import json
+import logging
+import os
+from textwrap import dedent
+import time
+from teuthology.orchestra.run import CommandFailedError
+from tasks.cephfs.fuse_mount import FuseMount
+from tasks.cephfs.cephfs_test_case import CephFSTestCase
+
+
+log = logging.getLogger(__name__)
+
+
+class FullnessTestCase(CephFSTestCase):
+    CLIENTS_REQUIRED = 2
+
+    # Subclasses define whether they're filling whole cluster or just data pool
+    data_only = False
+
+    # Subclasses define how many bytes should be written to achieve fullness
+    pool_capacity = None
+    fill_mb = None
+
+    # Subclasses define what fullness means to them
+    def is_full(self):
+        raise NotImplementedError()  # abstract hook: each subclass supplies its own fullness check
+
+    def setUp(self):
+        CephFSTestCase.setUp(self)
+
+        # Only one MDS is ever active in these tests; remember its ID so
+        # that later mds_asok calls can be addressed to it
+        self.active_mds_id = self.fs.get_active_names()[0]
+
+        # Record the OSD map epoch seen at the start of the test for later use
+        osd_dump = self.fs.mon_manager.raw_cluster_cmd("osd", "dump", "--format=json")
+        self.initial_osd_epoch = json.loads(osd_dump.strip())['epoch']
+
+        # The MDS should have set its barrier epoch to the latest map when
+        # it started, so verify that here.  Doing the check in this setUp
+        # gets in before subclasses might touch things in their own
+        # setUp functions.
+        mds_status = self.fs.mds_asok(["status"], mds_id=self.active_mds_id)
+        self.assertGreaterEqual(mds_status['osdmap_epoch_barrier'],
+                                self.initial_osd_epoch)
+
+    def test_barrier(self):
+        """
+        That when an OSD epoch barrier is set on an MDS, subsequently
+        issued capabilities cause clients to update their OSD map to that
+        epoch.
+        """
+
+        # Sync up clients with initial MDS OSD map barrier
+        self.mount_a.open_no_data("foo")
+        self.mount_b.open_no_data("bar")
+
+        # Grab mounts' initial OSD epochs: later we will check that
+        # it hasn't advanced beyond this point.
+        mount_a_initial_epoch = self.mount_a.get_osd_epoch()[0]
+        mount_b_initial_epoch = self.mount_b.get_osd_epoch()[0]
+
+        # Freshly mounted at start of test, should be up to date with OSD map
+        self.assertGreaterEqual(mount_a_initial_epoch, self.initial_osd_epoch)
+        self.assertGreaterEqual(mount_b_initial_epoch, self.initial_osd_epoch)
+
+        # Set and unset a flag to cause OSD epoch to increment
+        self.fs.mon_manager.raw_cluster_cmd("osd", "set", "pause")
+        self.fs.mon_manager.raw_cluster_cmd("osd", "unset", "pause")
+
+        out = self.fs.mon_manager.raw_cluster_cmd("osd", "dump", "--format=json").strip()
+        new_epoch = json.loads(out)['epoch']
+        self.assertNotEqual(self.initial_osd_epoch, new_epoch)
+
+        # Do a metadata operation on clients, witness that they end up with
+        # the old OSD map from startup time (nothing has prompted client
+        # to update its map)
+        self.mount_a.open_no_data("alpha")
+        self.mount_b.open_no_data("bravo1")
+
+        # Sleep long enough that if the OSD map was propagating it would
+        # have done so (this is arbitrary because we are 'waiting' for something
+        # to *not* happen).
+        time.sleep(30)
+
+        mount_a_epoch = self.mount_a.get_osd_epoch()[0]
+        self.assertEqual(mount_a_epoch, mount_a_initial_epoch)
+        mount_b_epoch = self.mount_b.get_osd_epoch()[0]
+        self.assertEqual(mount_b_epoch, mount_b_initial_epoch)
+
+        # Set a barrier on the MDS
+        self.fs.mds_asok(["osdmap", "barrier", str(new_epoch)], mds_id=self.active_mds_id)
+
+        # Do an operation on client B, witness that it ends up with
+        # the latest OSD map from the barrier.  This shouldn't generate any
+        # cap revokes to A because B was already the last one to touch
+        # a file in root.
+        self.mount_b.run_shell(["touch", "bravo2"])
+        self.mount_b.open_no_data("bravo2")
+
+        # Some time passes here because the metadata part of the operation
+        # completes immediately, while the resulting OSD map update happens
+        # asynchronously (it's an Objecter::_maybe_request_map) as a result
+        # of seeing the new epoch barrier.
+        self.wait_until_equal(
+            self.mount_b.get_osd_epoch,
+            (new_epoch, new_epoch),
+            timeout=30,
+            reject_fn=lambda x: x[0] > new_epoch or x[1] > new_epoch)
+
+        # ...and none of this should have affected the oblivious mount a,
+        # because it wasn't doing any data or metadata IO
+        mount_a_epoch = self.mount_a.get_osd_epoch()[0]
+        self.assertEqual(mount_a_epoch, mount_a_initial_epoch)
+
+    def _data_pool_name(self):
+        # Name of the filesystem's data pool; refuses to pick among several
+        pools = self.fs.get_data_pool_names()
+        if len(pools) > 1:
+            raise RuntimeError("This test can't handle multiple data pools")
+        return pools[0]
+
+    def _test_full(self, easy_case):
+        """
+        - That a client trying to write data to a file is prevented
+        from doing so with an -EFULL result
+        - That they are also prevented from creating new files by the MDS.
+        - That they may delete another file to get the system healthy again
+
+        :param easy_case: if true, delete a successfully written file to
+                          free up space.  else, delete the file that experienced
+                          the failed write.
+        """
+
+        osd_mon_report_interval_max = int(self.fs.get_config("osd_mon_report_interval_max", service_type='osd'))
+
+        log.info("Writing {0}MB should fill this cluster".format(self.fill_mb))
+
+        # Fill up the cluster.  This dd may or may not fail, as it depends on
+        # how soon the cluster recognises its own fullness
+        self.mount_a.write_n_mb("large_file_a", self.fill_mb // 2)
+        try:
+            self.mount_a.write_n_mb("large_file_b", self.fill_mb // 2)
+        except CommandFailedError:
+            log.info("Writing file B failed (full status happened already)")
+            self.assertTrue(self.is_full())
+        else:
+            log.info("Writing file B succeeded (full status will happen soon)")
+            self.wait_until_true(self.is_full,
+                                 timeout=osd_mon_report_interval_max * 5)
+
+        # Attempting to write more data should give me ENOSPC
+        with self.assertRaises(CommandFailedError) as ar:
+            self.mount_a.write_n_mb("large_file_b", 50, seek=self.fill_mb // 2)
+        self.assertEqual(ar.exception.exitstatus, 1)  # dd returns 1 on "No space"
+
+        # Wait for the MDS to see the latest OSD map so that it will reliably
+        # be applying the policy of rejecting non-deletion metadata operations
+        # while in the full state.
+        osd_epoch = json.loads(self.fs.mon_manager.raw_cluster_cmd("osd", "dump", "--format=json-pretty"))['epoch']
+        self.wait_until_true(
+            lambda: self.fs.mds_asok(['status'], mds_id=self.active_mds_id)['osdmap_epoch'] >= osd_epoch,
+            timeout=10)
+
+        if not self.data_only:
+            with self.assertRaises(CommandFailedError):
+                self.mount_a.write_n_mb("small_file_1", 0)
+
+        # Clear out some space
+        if easy_case:
+            self.mount_a.run_shell(['rm', '-f', 'large_file_a'])
+            self.mount_a.run_shell(['rm', '-f', 'large_file_b'])
+        else:
+            # In the hard case it is the file that filled the system.
+            # Before the new #7317 (ENOSPC, epoch barrier) changes, this
+            # would fail because the last objects written would be
+            # stuck in the client cache as objecter operations.
+            self.mount_a.run_shell(['rm', '-f', 'large_file_b'])
+            self.mount_a.run_shell(['rm', '-f', 'large_file_a'])
+
+        # Here we are waiting for two things to happen:
+        # * The MDS to purge the stray folder and execute object deletions
+        # * The OSDs to inform the mon that they are no longer full
+        self.wait_until_true(lambda: not self.is_full(),
+                             timeout=osd_mon_report_interval_max * 5)
+
+        # Wait for the MDS to see the latest OSD map so that it will reliably
+        # be applying the free space policy
+        osd_epoch = json.loads(self.fs.mon_manager.raw_cluster_cmd("osd", "dump", "--format=json-pretty"))['epoch']
+        self.wait_until_true(
+            lambda: self.fs.mds_asok(['status'], mds_id=self.active_mds_id)['osdmap_epoch'] >= osd_epoch,
+            timeout=10)
+
+        # Now I should be able to write again
+        self.mount_a.write_n_mb("large_file", 50, seek=0)
+
+        # TODO: ensure that the MDS keeps its OSD epoch barrier across a restart
+
+    def test_full_different_file(self):
+        self._test_full(easy_case=True)  # free space starting with the file that was written successfully
+
+    def test_full_same_file(self):
+        self._test_full(easy_case=False)  # free space starting with the file that filled the system
+
+    def _remote_write_test(self, template):
+        """
+        Fill in `template` (a python script that may use the {fill_mb}, {file_path},
+        {full_wait} and {is_fuse} fields) and run it on mount_a (see test_* callers)
+        """
+        file_path = os.path.join(self.mount_a.mountpoint, "full_test_file")
+
+        # Enough to trip the full flag
+        osd_mon_report_interval_max = int(self.fs.get_config("osd_mon_report_interval_max", service_type='osd'))
+        mon_tick_interval = int(self.fs.get_config("mon_tick_interval", service_type="mon"))
+
+        # Sufficient data to cause RADOS cluster to go 'full'
+        log.info("pool capacity {0}, {1}MB should be enough to fill it".format(self.pool_capacity, self.fill_mb))
+
+        # Long enough for RADOS cluster to notice it is full and set flag on mons
+        # (report_interval for mon to learn PG stats, tick interval for it to update OSD map,
+        #  factor of 1.5 for I/O + network latency in committing OSD map and distributing it
+        #  to the OSDs)
+        full_wait = (osd_mon_report_interval_max + mon_tick_interval) * 1.5
+
+        # Configs for this test should bring this setting down in order to
+        # run reasonably quickly
+        if osd_mon_report_interval_max > 10:
+            log.warning("This test may run rather slowly unless you decrease "
+                        "osd_mon_report_interval_max (5 is a good setting)!")
+
+        self.mount_a.run_python(template.format(
+            fill_mb=self.fill_mb,
+            file_path=file_path,
+            full_wait=full_wait,
+            is_fuse=isinstance(self.mount_a, FuseMount)
+        ))
+
+    def test_full_fclose(self):
+        # A remote script which opens a file handle, fills up the filesystem, and then
+        # checks that the failed background flush surfaces as an error from close() (FUSE only)
+        remote_script = dedent("""
+            import time
+            import datetime
+            import subprocess
+            import os
+
+            # Write some buffered data through before going full, all should be well
+            print "writing some data through which we expect to succeed"
+            bytes = 0
+            f = os.open("{file_path}", os.O_WRONLY | os.O_CREAT)
+            bytes += os.write(f, 'a' * 4096)
+            os.fsync(f)
+            print "fsync'ed data successfully, will now attempt to fill fs"
+
+            # Okay, now we're going to fill up the filesystem, and then keep
+            # writing until we see an error from fsync.  As long as we're doing
+            # buffered IO, the error should always only appear from fsync and not
+            # from write
+            full = False
+
+            for n in range(0, {fill_mb}):
+                bytes += os.write(f, 'x' * 1024 * 1024)
+                print "wrote bytes via buffered write, may repeat"
+            print "done writing bytes"
+
+            # OK, now we should sneak in under the full condition
+            # due to the time it takes the OSDs to report to the
+            # mons, and get a successful fsync on our full-making data
+            os.fsync(f)
+            print "successfully fsync'ed prior to getting full state reported"
+
+            # Now wait for the full flag to get set so that our
+            # next flush IO will fail
+            time.sleep(30)
+
+            # A buffered IO, should succeed
+            print "starting buffered write we expect to succeed"
+            os.write(f, 'x' * 4096)
+            print "wrote, now waiting 30s and then doing a close we expect to fail"
+
+            # Wait long enough for a background flush that should fail
+            time.sleep(30)
+
+            if {is_fuse}:
+                # ...and check that the failed background flush is reflected in fclose
+                try:
+                    os.close(f)
+                except OSError:
+                    print "close() returned an error as expected"
+                else:
+                    raise RuntimeError("close() failed to raise error")
+            else:
+                # The kernel cephfs client does not raise errors on fclose
+                os.close(f)
+
+            os.unlink("{file_path}")
+            """)
+        self._remote_write_test(remote_script)
+
+    def test_full_fsync(self):
+        """
+        That when the full flag is encountered during asynchronous
+        flushes, a buffered write() still succeeds but the following
+        fsync() reports the error, and a later close() does not.
+        """
+
+        # A remote script which opens a file handle, fills up the filesystem, and then
+        # checks that ENOSPC errors on buffered writes appear correctly as errors in fsync
+        remote_script = dedent("""
+            import time
+            import datetime
+            import subprocess
+            import os
+
+            # Write some buffered data through before going full, all should be well
+            print "writing some data through which we expect to succeed"
+            bytes = 0
+            f = os.open("{file_path}", os.O_WRONLY | os.O_CREAT)
+            bytes += os.write(f, 'a' * 4096)
+            os.fsync(f)
+            print "fsync'ed data successfully, will now attempt to fill fs"
+
+            # Okay, now we're going to fill up the filesystem, and then keep
+            # writing until we see an error from fsync.  As long as we're doing
+            # buffered IO, the error should always only appear from fsync and not
+            # from write
+            full = False
+
+            for n in range(0, {fill_mb} + 1):
+                try:
+                    bytes += os.write(f, 'x' * 1024 * 1024)
+                    print "wrote bytes via buffered write, moving on to fsync"
+                except OSError as e:
+                    print "Unexpected error %s from write() instead of fsync()" % e
+                    raise
+
+                try:
+                    os.fsync(f)
+                    print "fsync'ed successfully"
+                except OSError as e:
+                    print "Reached fullness after %.2f MB" % (bytes / (1024.0 * 1024.0))
+                    full = True
+                    break
+                else:
+                    print "Not full yet after %.2f MB" % (bytes / (1024.0 * 1024.0))
+
+                if n > {fill_mb} * 0.8:
+                    # Be cautious in the last region where we expect to hit
+                    # the full condition, so that we don't overshoot too dramatically
+                    print "sleeping a bit as we've exceeded 80% of our expected full ratio"
+                    time.sleep({full_wait})
+
+            if not full:
+                raise RuntimeError("Failed to reach fullness after writing %d bytes" % bytes)
+
+            # close() should not raise an error because we already caught it in
+            # fsync.  There shouldn't have been any more writeback errors
+            # since then because all IOs got cancelled on the full flag.
+            print "calling close"
+            os.close(f)
+            print "close() did not raise error"
+
+            os.unlink("{file_path}")
+            """)
+
+        self._remote_write_test(remote_script)
+
+
+class TestQuotaFull(FullnessTestCase):
+    """
+    Test per-pool fullness, which indicates quota limits exceeded
+    """
+    pool_capacity = 1024 * 1024 * 32   # bytes (applied as max_bytes quota); arbitrary low-ish limit
+    fill_mb = pool_capacity // (1024 * 1024)  # must stay an int: it is fed to range() in the remote scripts
+
+    # We are only testing quota handling on the data pool, not the metadata
+    # pool.
+    data_only = True
+
+    def setUp(self):
+        super(TestQuotaFull, self).setUp()
+
+        pool_name = self.fs.get_data_pool_name()  # the quota goes on the data pool only
+        self.fs.mon_manager.raw_cluster_cmd("osd", "pool", "set-quota", pool_name,
+                                            "max_bytes", "{0}".format(self.pool_capacity))
+
+    def is_full(self):
+        return self.fs.is_pool_full(self.fs.get_data_pool_name())  # "full" here is per-pool
+
+
+class TestClusterFull(FullnessTestCase):
+    """
+    Test cluster-wide fullness, which indicates that an OSD has become too full
+    """
+    pool_capacity = None
+    REQUIRE_MEMSTORE = True
+
+    def setUp(self):
+        super(TestClusterFull, self).setUp()
+
+        if self.pool_capacity is not None:
+            return  # capacity is already known, keep using it
+        # Hack: read `max_avail` once and keep it on the class, dodging the weird
+        # (as yet unexplained) fluctuations it sometimes shows in between tests
+        TestClusterFull.pool_capacity = self.fs.get_pool_df(self._data_pool_name())['max_avail']
+        TestClusterFull.fill_mb = int(1.05 * (self.pool_capacity / (1024.0 * 1024.0)))
+
+    def is_full(self):
+        return self.fs.is_full()
+
+# Drop the abstract parent from the module namespace so unittest.loader won't collect it.
+del FullnessTestCase
diff --git a/src/ceph/qa/tasks/cephfs/test_journal_migration.py b/src/ceph/qa/tasks/cephfs/test_journal_migration.py
new file mode 100644
index 0000000..64fe939
--- /dev/null
+++ b/src/ceph/qa/tasks/cephfs/test_journal_migration.py
@@ -0,0 +1,118 @@
+
+from StringIO import StringIO
+from tasks.cephfs.cephfs_test_case import CephFSTestCase
+from tasks.workunit import task as workunit
+
+JOURNAL_FORMAT_LEGACY = 0
+JOURNAL_FORMAT_RESILIENT = 1
+
+
+class TestJournalMigration(CephFSTestCase):
+    """
+    Test that a legacy-format MDS journal is rewritten in the new format
+    """
+    CLIENTS_REQUIRED = 1
+    MDSS_REQUIRED = 2
+
+    def test_journal_migration(self):
+        """
+        That restarting the MDSs with a newer 'mds journal format' upgrades the journal.
+        """
+        old_journal_version = JOURNAL_FORMAT_LEGACY
+        new_journal_version = JOURNAL_FORMAT_RESILIENT
+
+        # Pick out two daemons to use
+        mds_a, mds_b = sorted(self.mds_cluster.mds_ids[0:2])
+
+        self.mount_a.umount_wait()
+        self.fs.mds_stop()
+
+        # Enable standby replay, to cover the bug case #8811 where
+        # a standby replay might mistakenly end up trying to rewrite
+        # the journal at the same time as an active daemon.
+        self.fs.set_ceph_conf('mds', 'mds standby replay', "true")
+        self.fs.set_ceph_conf('mds', 'mds standby for rank', "0")
+
+        # Create a filesystem using the older journal format.
+        self.fs.set_ceph_conf('mds', 'mds journal format', old_journal_version)
+        self.fs.recreate()
+        self.fs.mds_restart(mds_id=mds_a)
+        self.fs.wait_for_daemons()
+        self.assertEqual(self.fs.get_active_names(), [mds_a])
+
+        def replay_names():
+            return [s['name']
+                    for s in self.fs.status().get_replays(fscid=self.fs.id)]
+
+        # Start the standby and wait for it to come up
+        self.fs.mds_restart(mds_id=mds_b)
+        self.wait_until_equal(replay_names, [mds_b], timeout=30)
+
+        # Do some client work so that the log is populated with something.
+        with self.mount_a.mounted():
+            self.mount_a.create_files()
+            self.mount_a.check_files()  # sanity, this should always pass
+
+            # Run a more substantial workunit so that the length of the log to be
+            # converted is going to span at least a few segments
+            workunit(self.ctx, {
+                'clients': {
+                    "client.{0}".format(self.mount_a.client_id): ["suites/fsstress.sh"],
+                },
+                "timeout": "3h"
+            })
+
+        # Modify the ceph.conf to ask the MDS to use the new journal format.
+        self.fs.set_ceph_conf('mds', 'mds journal format', new_journal_version)
+
+        # Restart the MDS.
+        self.fs.mds_fail_restart(mds_id=mds_a)
+        self.fs.mds_fail_restart(mds_id=mds_b)
+
+        # This ensures that all daemons come up into a valid state
+        self.fs.wait_for_daemons()
+
+        # Check that files created in the initial client workload are still visible
+        # in a client mount.
+        with self.mount_a.mounted():
+            self.mount_a.check_files()
+
+        # Verify that the journal really has been rewritten.
+        journal_version = self.fs.get_journal_version()
+        if journal_version != new_journal_version:
+            raise RuntimeError("Journal was not upgraded, version should be {0} but is {1}".format(
+                new_journal_version, journal_version))
+
+        # Verify that cephfs-journal-tool can now read the rewritten journal
+        inspect_out = self.fs.journal_tool(["journal", "inspect"])
+        if not inspect_out.endswith(": OK"):
+            raise RuntimeError("Unexpected journal-tool result: '{0}'".format(inspect_out))
+
+        self.fs.journal_tool(["event", "get", "json", "--path", "/tmp/journal.json"])
+        p = self.fs.tool_remote.run(
+            args=[
+                "python",
+                "-c",
+                "import json; print len(json.load(open('/tmp/journal.json')))"
+            ],
+            stdout=StringIO())
+        event_count = int(p.stdout.getvalue().strip())
+        if event_count < 1000:
+            # Approximate value of "lots", expected from having run fsstress
+            raise RuntimeError("Unexpectedly few journal events: {0}".format(event_count))
+
+        # Do some client work to check that writing the log is still working
+        with self.mount_a.mounted():
+            workunit(self.ctx, {
+                'clients': {
+                    "client.{0}".format(self.mount_a.client_id): ["fs/misc/trivial_sync.sh"],
+                },
+                "timeout": "3h"
+            })
+
+        # Check that both an active and a standby replay are still up
+        self.assertEqual(len(replay_names()), 1)
+        self.assertEqual(len(self.fs.get_active_names()), 1)
+        self.assertTrue(self.mds_cluster.mds_daemons[mds_a].running())
+        self.assertTrue(self.mds_cluster.mds_daemons[mds_b].running())
+
diff --git a/src/ceph/qa/tasks/cephfs/test_journal_repair.py b/src/ceph/qa/tasks/cephfs/test_journal_repair.py
new file mode 100644
index 0000000..62cbbb0
--- /dev/null
+++ b/src/ceph/qa/tasks/cephfs/test_journal_repair.py
@@ -0,0 +1,443 @@
+
+"""
+Test our tools for recovering the content of damaged journals
+"""
+
+import json
+import logging
+from textwrap import dedent
+import time
+
+from teuthology.exceptions import CommandFailedError, ConnectionLostError
+from tasks.cephfs.filesystem import ObjectNotFound, ROOT_INO
+from tasks.cephfs.cephfs_test_case import CephFSTestCase, for_teuthology
+from tasks.workunit import task as workunit
+
+log = logging.getLogger(__name__)
+
+
+class TestJournalRepair(CephFSTestCase):
+    MDSS_REQUIRED = 2
+
+    def test_inject_to_empty(self):
+        """
+        That when some dentries are in the journal but nothing is in
+        the backing store, we correctly populate the backing store
+        from the journalled dentries.
+        """
+
+        # Inject metadata operations
+        self.mount_a.run_shell(["touch", "rootfile"])
+        self.mount_a.run_shell(["mkdir", "subdir"])
+        self.mount_a.run_shell(["touch", "subdir/subdirfile"])
+        # There are several different paths for handling hardlinks, depending
+        # on whether an existing dentry (being overwritten) is also a hardlink
+        self.mount_a.run_shell(["mkdir", "linkdir"])
+
+        # Test inode -> remote transition for a dentry
+        self.mount_a.run_shell(["touch", "linkdir/link0"])
+        self.mount_a.run_shell(["rm", "-f", "linkdir/link0"])
+        self.mount_a.run_shell(["ln", "subdir/subdirfile", "linkdir/link0"])
+
+        # Test nothing -> remote transition
+        self.mount_a.run_shell(["ln", "subdir/subdirfile", "linkdir/link1"])
+
+        # Test remote -> inode transition
+        self.mount_a.run_shell(["ln", "subdir/subdirfile", "linkdir/link2"])
+        self.mount_a.run_shell(["rm", "-f", "linkdir/link2"])
+        self.mount_a.run_shell(["touch", "linkdir/link2"])
+
+        # Test remote -> diff remote transition
+        self.mount_a.run_shell(["ln", "subdir/subdirfile", "linkdir/link3"])
+        self.mount_a.run_shell(["rm", "-f", "linkdir/link3"])
+        self.mount_a.run_shell(["ln", "rootfile", "linkdir/link3"])
+
+        # Test an empty directory
+        self.mount_a.run_shell(["mkdir", "subdir/subsubdir"])
+        self.mount_a.run_shell(["sync"])
+
+        # Before we unmount, make a note of the inode numbers, later we will
+        # check that they match what we recover from the journal
+        rootfile_ino = self.mount_a.path_to_ino("rootfile")
+        subdir_ino = self.mount_a.path_to_ino("subdir")
+        linkdir_ino = self.mount_a.path_to_ino("linkdir")
+        subdirfile_ino = self.mount_a.path_to_ino("subdir/subdirfile")
+        subsubdir_ino = self.mount_a.path_to_ino("subdir/subsubdir")
+
+        self.mount_a.umount_wait()
+
+        # Stop the MDS
+        self.fs.mds_stop()
+        self.fs.mds_fail()
+
+        # Now, the journal should contain the operations, but the backing
+        # store shouldn't
+        with self.assertRaises(ObjectNotFound):
+            self.fs.list_dirfrag(subdir_ino)
+        self.assertEqual(self.fs.list_dirfrag(ROOT_INO), [])
+
+        # Execute the dentry recovery, this should populate the backing store
+        self.fs.journal_tool(['event', 'recover_dentries', 'list'])
+
+        # Dentries in ROOT_INO are present
+        self.assertEqual(sorted(self.fs.list_dirfrag(ROOT_INO)), sorted(['rootfile_head', 'subdir_head', 'linkdir_head']))
+        self.assertEqual(self.fs.list_dirfrag(subdir_ino), ['subdirfile_head', 'subsubdir_head'])
+        self.assertEqual(sorted(self.fs.list_dirfrag(linkdir_ino)),
+                         sorted(['link0_head', 'link1_head', 'link2_head', 'link3_head']))
+
+        # Now check the MDS can read what we wrote: truncate the journal
+        # and start the mds.
+        self.fs.journal_tool(['journal', 'reset'])
+        self.fs.mds_fail_restart()
+        self.fs.wait_for_daemons()
+
+        # List files
+        self.mount_a.mount()
+        self.mount_a.wait_until_mounted()
+
+        # First ls -R to populate MDCache, such that hardlinks will
+        # resolve properly (recover_dentries does not create backtraces,
+        # so ordinarily hardlinks to inodes that happen not to have backtraces
+        # will be invisible in readdir).
+        # FIXME: hook in forward scrub here to regenerate backtraces
+        self.mount_a.run_shell(['ls', '-R'])  # output unused, run only for its side effect
+        self.mount_a.umount_wait()  # remount to clear client cache before our second ls
+        self.mount_a.mount()
+        self.mount_a.wait_until_mounted()
+
+        proc = self.mount_a.run_shell(['ls', '-R'])
+        self.assertEqual(proc.stdout.getvalue().strip(),
+                         dedent("""
+                         .:
+                         linkdir
+                         rootfile
+                         subdir
+
+                         ./linkdir:
+                         link0
+                         link1
+                         link2
+                         link3
+
+                         ./subdir:
+                         subdirfile
+                         subsubdir
+
+                         ./subdir/subsubdir:
+                         """).strip())
+
+        # Check the correct inos were preserved by path
+        self.assertEqual(rootfile_ino, self.mount_a.path_to_ino("rootfile"))
+        self.assertEqual(subdir_ino, self.mount_a.path_to_ino("subdir"))
+        self.assertEqual(subdirfile_ino, self.mount_a.path_to_ino("subdir/subdirfile"))
+        self.assertEqual(subsubdir_ino, self.mount_a.path_to_ino("subdir/subsubdir"))
+
+        # Check that the hard link handling came out correctly
+        self.assertEqual(self.mount_a.path_to_ino("linkdir/link0"), subdirfile_ino)
+        self.assertEqual(self.mount_a.path_to_ino("linkdir/link1"), subdirfile_ino)
+        self.assertNotEqual(self.mount_a.path_to_ino("linkdir/link2"), subdirfile_ino)
+        self.assertEqual(self.mount_a.path_to_ino("linkdir/link3"), rootfile_ino)
+
+        # Create a new file, ensure it is not issued the same ino as one of the
+        # recovered ones
+        self.mount_a.run_shell(["touch", "afterwards"])
+        new_ino = self.mount_a.path_to_ino("afterwards")
+        self.assertNotIn(new_ino, [rootfile_ino, subdir_ino, linkdir_ino, subdirfile_ino, subsubdir_ino])
+
+        # Check that we can do metadata ops in the recovered directory
+        self.mount_a.run_shell(["touch", "subdir/subsubdir/subsubdirfile"])
+
+    @for_teuthology # 308s
+    def test_reset(self):
+        """
+        That after forcibly modifying the backing store, we can get back into
+        a good state by resetting the MDSMap.
+
+        The scenario is that we have two active MDSs, and we lose the journals.  Once
+        we have completely lost confidence in the integrity of the metadata, we want to
+        return the system to a single-MDS state to go into a scrub to recover what we
+        can.
+        """
+
+        # Set max_mds to 2
+        self.fs.set_max_mds(2)
+
+        # See that we have two active MDSs
+        self.wait_until_equal(lambda: len(self.fs.get_active_names()), 2, 30,
+                              reject_fn=lambda v: v > 2 or v < 1)
+        active_mds_names = self.fs.get_active_names()
+
+        # Switch off any unneeded MDS daemons
+        for unneeded_mds in set(self.mds_cluster.mds_ids) - set(active_mds_names):
+            self.mds_cluster.mds_stop(unneeded_mds)
+            self.mds_cluster.mds_fail(unneeded_mds)
+
+        # Create a dir on each rank
+        self.mount_a.run_shell(["mkdir", "alpha"])
+        self.mount_a.run_shell(["mkdir", "bravo"])
+        self.mount_a.setfattr("alpha/", "ceph.dir.pin", "0")
+        self.mount_a.setfattr("bravo/", "ceph.dir.pin", "1")
+
+        def subtrees_assigned():
+            got_subtrees = self.fs.mds_asok(["get", "subtrees"], mds_id=active_mds_names[0])
+
+            for s in got_subtrees:
+                if s['dir']['path'] == '/bravo':
+                    if s['auth_first'] == 1:
+                        return True
+                    else:
+                        # Should not happen
+                        raise RuntimeError("/bravo is subtree but not rank 1!")
+
+            return False
+
+        # Ensure the pinning has taken effect and the /bravo dir is now
+        # migrated to rank 1.
+        self.wait_until_true(subtrees_assigned, 30)
+
+        # Do some IO (this should be split across ranks according to
+        # the rank-pinned dirs)
+        self.mount_a.create_n_files("alpha/file", 1000)
+        self.mount_a.create_n_files("bravo/file", 1000)
+
+        # Flush the journals so that we have some backing store data
+        # belonging to one MDS, and some to the other MDS.
+        for mds_name in active_mds_names:
+            self.fs.mds_asok(["flush", "journal"], mds_name)
+
+        # Stop (hard) the second MDS daemon
+        self.fs.mds_stop(active_mds_names[1])
+
+        # Wipe out the tables for MDS rank 1 so that it is broken and can't start
+        # (this is the simulated failure that we will demonstrate that the disaster
+        #  recovery tools can get us back from)
+        self.fs.erase_metadata_objects(prefix="mds1_")
+
+        # Try to access files from the client
+        blocked_ls = self.mount_a.run_shell(["ls", "-R"], wait=False)
+
+        # Check that this "ls -R" blocked rather than completing: indicates
+        # it got stuck trying to access subtrees which were on the now-dead MDS.
+        log.info("Sleeping to check ls is blocked...")
+        time.sleep(60)
+        self.assertFalse(blocked_ls.finished)
+
+        # This mount is now useless because it will depend on MDS rank 1, and MDS rank 1
+        # is not coming back.  Kill it.
+        log.info("Killing mount, it's blocked on the MDS we killed")
+        self.mount_a.kill()
+        self.mount_a.kill_cleanup()
+        try:
+            # Now that the mount is dead, the ls -R should error out.
+            blocked_ls.wait()
+        except (CommandFailedError, ConnectionLostError):
+            # The ConnectionLostError case is for kernel client, where
+            # killing the mount also means killing the node.
+            pass
+
+        # See that the second MDS will crash when it starts and tries to
+        # acquire rank 1
+        damaged_id = active_mds_names[1]
+        self.fs.mds_restart(damaged_id)
+
+        # The daemon taking the damaged rank should start starting, then
+        # restart back into standby after asking the mon to mark the rank
+        # damaged.
+        def is_marked_damaged():
+            mds_map = self.fs.get_mds_map()
+            return 1 in mds_map['damaged']
+
+        self.wait_until_true(is_marked_damaged, 60)
+
+        def get_state():
+            info = self.mds_cluster.get_mds_info(damaged_id)
+            return info['state'] if info is not None else None
+
+        self.wait_until_equal(
+                get_state,
+                "up:standby",
+                timeout=60)
+
+        self.fs.mds_stop(damaged_id)
+        self.fs.mds_fail(damaged_id)
+
+        # Now give up and go through a disaster recovery procedure
+        self.fs.mds_stop(active_mds_names[0])
+        self.fs.mds_fail(active_mds_names[0])
+        # Invoke recover_dentries quietly, because otherwise log spews millions of lines
+        self.fs.journal_tool(["event", "recover_dentries", "summary"], rank=0, quiet=True)
+        self.fs.journal_tool(["event", "recover_dentries", "summary"], rank=1, quiet=True)
+        self.fs.table_tool(["0", "reset", "session"])
+        self.fs.journal_tool(["journal", "reset"], rank=0)
+        self.fs.erase_mds_objects(1)
+        self.fs.mon_manager.raw_cluster_cmd('fs', 'reset', self.fs.name,
+                '--yes-i-really-mean-it')
+
+        # Bring an MDS back online, mount a client, and see that we can walk the full
+        # filesystem tree again
+        self.fs.mds_fail_restart(active_mds_names[0])
+        self.wait_until_equal(lambda: self.fs.get_active_names(), [active_mds_names[0]], 30,
+                              reject_fn=lambda v: len(v) > 1)
+        self.mount_a.mount()
+        self.mount_a.run_shell(["ls", "-R"], wait=True)
+
+    def test_table_tool(self):
+        """
+        Check what the table tool shows for the inode, session and snap
+        tables after a single file has been created and journaled, then
+        reset all three tables and check what it shows afterwards.
+        """
+        def show(table):
+            # Raw text output of "show" for one table across all ranks
+            return self.fs.table_tool(["all", "show", table])
+
+        def log_tables():
+            for table in ("inode", "snap", "session"):
+                log.info(show(table))
+
+        def expected_inotable(version, start, length):
+            # The same single free extent is expected in both the
+            # projected and the committed free list.
+            return {"0": {
+                "data": {
+                    "version": version,
+                    "inotable": {
+                        "projected_free": [{"start": start, "len": length}],
+                        "free": [{"start": start, "len": length}]}},
+                "result": 0}}
+
+        def expected_snaptable(version):
+            return {"version": version,
+                    "snapserver": {"last_snap": 1,
+                                   "pending_noop": [],
+                                   "snaps": [],
+                                   "need_to_purge": {},
+                                   "pending_update": [],
+                                   "pending_destroy": []},
+                    "result": 0}
+
+        def check_session_count(count):
+            sessions = json.loads(show("session"))
+            self.assertEqual(len(sessions["0"]["data"]["Sessions"]), count)
+            self.assertEqual(sessions["0"]["result"], 0)
+
+        active = self.fs.get_active_names()
+        self.assertEqual(len(active), 1)
+        sole_mds = active[0]
+
+        # Create one file and flush the journal so the tables get updated
+        self.mount_a.run_shell(["touch", "foo"])
+        self.fs.mds_asok(["flush", "journal"], sole_mds)
+
+        log_tables()
+
+        # Inode table should always be the same because initial state
+        # and choice of inode are deterministic.
+        # Should see one inode consumed
+        self.assertEqual(json.loads(show("inode")),
+                         expected_inotable(2, 1099511628777, 1099511626775))
+
+        # Should see one session
+        check_session_count(1)
+
+        # Should see no snaps
+        self.assertEqual(json.loads(show("snap")), expected_snaptable(0))
+
+        # Reset everything
+        for table in ["session", "inode", "snap"]:
+            self.fs.table_tool(["all", "reset", table])
+
+        log_tables()
+
+        # Should see 0 sessions
+        check_session_count(0)
+
+        # Should see entire inode range now marked free
+        self.assertEqual(json.loads(show("inode")),
+                         expected_inotable(1, 1099511627776, 1099511627776))
+
+        # Should see no snaps
+        self.assertEqual(json.loads(show("snap")), expected_snaptable(1))
+
+    def test_table_tool_take_inos(self):
+        """
+        Check that "take_inos" up to a given inode number removes the
+        start of the free inode range and bumps the table version.
+        """
+        range_start = 1099511627776
+        range_len = 1099511627776
+
+        def expected(version, consumed):
+            # Free range with `consumed` inodes removed from its start,
+            # reported identically as projected and committed.
+            start = range_start + consumed
+            length = range_len - consumed
+            return {"0": {"data": {"version": version,
+                                   "inotable": {
+                                       "projected_free": [
+                                           {"start": start, "len": length}],
+                                       "free": [
+                                           {"start": start, "len": length}]}},
+                          "result": 0}}
+
+        # Initially a completely clear range
+        shown = json.loads(self.fs.table_tool(["all", "show", "inode"]))
+        self.assertEqual(shown, expected(0, 0))
+
+        # Remove some: taking up to start + 100 consumes 101 inodes
+        taken = json.loads(self.fs.table_tool(
+            ["all", "take_inos", str(range_start + 100)]))
+        self.assertEqual(taken, expected(1, 101))
+
+    @for_teuthology  # Hack: "for_teuthology" because .sh doesn't work outside teuth
+    def test_journal_smoke(self):
+        """
+        Run the trivial sync workunit, take all clients and the MDS
+        daemons down, run the journal tool smoke workunit, then bring the
+        MDS and mount_a back and run the trivial sync workunit again.
+        """
+        def run_workunit(script):
+            # Run a single workunit script as mount_a's client (1h timeout)
+            workunit(self.ctx, {
+                'clients': {
+                    "client.{0}".format(self.mount_a.client_id): [script],
+                },
+                "timeout": "1h"
+            })
+
+        run_workunit("fs/misc/trivial_sync.sh")
+
+        for mount in self.mounts:
+            mount.umount_wait()
+
+        self.fs.mds_stop()
+        self.fs.mds_fail()
+
+        # journal tool smoke
+        run_workunit("suites/cephfs_journal_tool_smoke.sh")
+
+        self.fs.mds_restart()
+        self.fs.wait_for_daemons()
+
+        self.mount_a.mount()
+
+        # trivial sync on mount a
+        run_workunit("fs/misc/trivial_sync.sh")
+
diff --git a/src/ceph/qa/tasks/cephfs/test_mantle.py b/src/ceph/qa/tasks/cephfs/test_mantle.py
new file mode 100644
index 0000000..6cd86ad
--- /dev/null
+++ b/src/ceph/qa/tasks/cephfs/test_mantle.py
@@ -0,0 +1,109 @@
+from tasks.cephfs.cephfs_test_case import CephFSTestCase
+import json
+import logging
+
+log = logging.getLogger(__name__)
+# Message prefixes that the tests below look for in the cluster log; each
+# test appends the balancer object name (and, for failures, an error string).
+failure = "using old balancer; mantle failed for balancer="
+success = "mantle balancer version changed: "
+
+class TestMantle(CephFSTestCase):
+    """
+    Tests that set the filesystem's 'balancer' to various objects and check
+    the resulting success/failure message in the cluster log.
+    """
+
+    def start_mantle(self):
+        """
+        Common setup: wait for health to clear, raise max_mds to 2, wait
+        for two active MDS daemons and set debug levels on each of them.
+        """
+        self.wait_for_health_clear(timeout=30)
+        self.fs.set_max_mds(2)
+        self.wait_until_equal(lambda: len(self.fs.get_active_names()), 2, 30,
+                              reject_fn=lambda v: v > 2 or v < 1)
+
+        for m in self.fs.get_active_names():
+            self.fs.mds_asok(['config', 'set', 'debug_objecter', '20'], mds_id=m)
+            self.fs.mds_asok(['config', 'set', 'debug_ms', '0'], mds_id=m)
+            self.fs.mds_asok(['config', 'set', 'debug_mds', '0'], mds_id=m)
+            self.fs.mds_asok(['config', 'set', 'debug_mds_balancer', '5'], mds_id=m)
+
+    def push_balancer(self, obj, lua_code, expect):
+        """
+        Select `obj` as the balancer, store `lua_code` in a RADOS object of
+        that name, and wait for the failure message for `obj` followed by
+        `expect` to show up in the cluster log.
+        """
+        self.fs.mon_manager.raw_cluster_cmd_result('fs', 'set', self.fs.name, 'balancer', obj)
+        self.fs.rados(["put", obj, "-"], stdin_data=lua_code)
+        with self.assert_cluster_log(failure + obj + " " + expect):
+            log.info("run a " + obj + " balancer that expects=" + expect)
+
+    def test_version_empty(self):
+        """
+        That omitting the balancer name is rejected with EINVAL, and that a
+        blank name produces a "No such file or directory" failure.
+        """
+        self.start_mantle()
+        expect = " : (2) No such file or directory"
+
+        ret = self.fs.mon_manager.raw_cluster_cmd_result('fs', 'set', self.fs.name, 'balancer')
+        # 22 == EINVAL.  Use a unittest assertion rather than a bare
+        # assert so a mismatch reports the actual return code.
+        self.assertEqual(ret, 22)
+
+        self.fs.mon_manager.raw_cluster_cmd_result('fs', 'set', self.fs.name, 'balancer', " ")
+        with self.assert_cluster_log(failure + " " + expect):
+            pass
+
+    def test_version_not_in_rados(self):
+        """
+        That naming a balancer object which was never stored produces a
+        "No such file or directory" failure.
+        """
+        self.start_mantle()
+        expect = failure + "ghost.lua : (2) No such file or directory"
+        self.fs.mon_manager.raw_cluster_cmd_result('fs', 'set', self.fs.name, 'balancer', "ghost.lua")
+        with self.assert_cluster_log(expect):
+            pass
+
+    def test_balancer_invalid(self):
+        """
+        That balancers with bad Lua code or bad BAL_LOG calls are reported
+        as "Invalid argument".
+        """
+        self.start_mantle()
+        expect = ": (22) Invalid argument"
+
+        lua_code = "this is invalid lua code!"
+        self.push_balancer("invalid.lua", lua_code, expect)
+
+        lua_code = "BAL_LOG()"
+        self.push_balancer("invalid_log.lua", lua_code, expect)
+
+        lua_code = "BAL_LOG(0)"
+        self.push_balancer("invalid_log_again.lua", lua_code, expect)
+
+    def test_balancer_valid(self):
+        """
+        That a well-formed balancer produces the success message.
+        """
+        self.start_mantle()
+        lua_code = "BAL_LOG(0, \"test\")\nreturn {3, 4}"
+        self.fs.mon_manager.raw_cluster_cmd_result('fs', 'set', self.fs.name, 'balancer', "valid.lua")
+        self.fs.rados(["put", "valid.lua", "-"], stdin_data=lua_code)
+        with self.assert_cluster_log(success + "valid.lua"):
+            log.info("run a valid.lua balancer")
+
+    def test_return_invalid(self):
+        """
+        That balancers returning something other than the expected list of
+        numbers are reported as "Invalid argument".
+        """
+        self.start_mantle()
+        expect = ": (22) Invalid argument"
+
+        lua_code = "return \"hello\""
+        self.push_balancer("string.lua", lua_code, expect)
+
+        lua_code = "return 3"
+        self.push_balancer("number.lua", lua_code, expect)
+
+        lua_code = "return {}"
+        self.push_balancer("dict_empty.lua", lua_code, expect)
+
+        lua_code = "return {\"this\", \"is\", \"a\", \"test\"}"
+        self.push_balancer("dict_of_strings.lua", lua_code, expect)
+
+        lua_code = "return {3, \"test\"}"
+        self.push_balancer("dict_of_mixed.lua", lua_code, expect)
+
+        lua_code = "return {3}"
+        self.push_balancer("not_enough_numbers.lua", lua_code, expect)
+
+        lua_code = "return {3, 4, 5, 6, 7, 8, 9}"
+        self.push_balancer("too_many_numbers.lua", lua_code, expect)
+
+    def test_dead_osd(self):
+        """
+        That pulling the balancer while all OSDs are marked down/out is
+        reported as "Connection timed out".
+        """
+        self.start_mantle()
+        expect = " : (110) Connection timed out"
+
+        osd_map = json.loads(self.fs.mon_manager.raw_cluster_cmd('osd', 'dump', '--format=json-pretty'))
+        # As before, OSDs are addressed by their index in the dump
+        # (i.e. ids are taken to be 0..N-1).
+        osd_ids = [str(i) for i in range(len(osd_map['osds']))]
+
+        try:
+            # kill the OSDs so that the balancer pull from RADOS times out
+            for osd_id in osd_ids:
+                self.fs.mon_manager.raw_cluster_cmd_result('osd', 'down', osd_id)
+                self.fs.mon_manager.raw_cluster_cmd_result('osd', 'out', osd_id)
+
+            # trigger a pull from RADOS
+            self.fs.mon_manager.raw_cluster_cmd_result('fs', 'set', self.fs.name, 'balancer', "valid.lua")
+
+            # make the timeout a little longer since dead OSDs spam ceph -w
+            with self.assert_cluster_log(failure + "valid.lua" + expect, timeout=30):
+                log.info("run a balancer that should timeout")
+        finally:
+            # cleanup: always mark the OSDs back in, even when the check
+            # above fails, so later tests do not run with every OSD out
+            for osd_id in osd_ids:
+                self.fs.mon_manager.raw_cluster_cmd_result('osd', 'in', osd_id)
diff --git a/src/ceph/qa/tasks/cephfs/test_misc.py b/src/ceph/qa/tasks/cephfs/test_misc.py
new file mode 100644
index 0000000..d857cfd
--- /dev/null
+++ b/src/ceph/qa/tasks/cephfs/test_misc.py
@@ -0,0 +1,149 @@
+
+from unittest import SkipTest
+from tasks.cephfs.fuse_mount import FuseMount
+from tasks.cephfs.cephfs_test_case import CephFSTestCase
+from teuthology.orchestra.run import CommandFailedError
+import errno
+import time
+import json
+
+
+class TestMisc(CephFSTestCase):
+    """
+    Assorted CephFS tests: caps on open requests, 'fs new' on a non-empty
+    metadata pool, session eviction, and 'df' output.
+    """
+    CLIENTS_REQUIRED = 2
+
+    # NOTE(review): presumably the base class fills in
+    # mds_session_autoclose from the setting named in LOAD_SETTINGS before
+    # the tests run -- confirm against CephFSTestCase.
+    LOAD_SETTINGS = ["mds_session_autoclose"]
+    mds_session_autoclose = None
+
+    def _recreate_metadata_pool(self):
+        """Delete the metadata pool and create it again, empty."""
+        self.fs.mon_manager.raw_cluster_cmd('osd', 'pool', 'delete',
+                                            self.fs.metadata_pool_name,
+                                            self.fs.metadata_pool_name,
+                                            '--yes-i-really-really-mean-it')
+        self.fs.mon_manager.raw_cluster_cmd('osd', 'pool', 'create',
+                                            self.fs.metadata_pool_name,
+                                            str(self.fs.get_pgs_per_fs_pool()))
+
+    def test_getattr_caps(self):
+        """
+        Check if MDS recognizes the 'mask' parameter of open request.
+        The parameter allows client to request caps when opening file
+        """
+
+        if not isinstance(self.mount_a, FuseMount):
+            raise SkipTest("Require FUSE client")
+
+        # Enable debug. Client will request CEPH_CAP_XATTR_SHARED
+        # on lookup/open
+        self.mount_b.umount_wait()
+        self.set_conf('client', 'client debug getattr caps', 'true')
+        self.mount_b.mount()
+        self.mount_b.wait_until_mounted()
+
+        # create a file and hold it open. MDS will issue CEPH_CAP_EXCL_*
+        # to mount_a
+        p = self.mount_a.open_background("testfile")
+        self.mount_b.wait_for_visible("testfile")
+
+        # this triggers a lookup request and an open request. The debug
+        # code will check if lookup/open reply contains xattrs
+        self.mount_b.run_shell(["cat", "testfile"])
+
+        self.mount_a.kill_background(p)
+
+    def test_fs_new(self):
+        """
+        That 'fs new' fails with EINVAL when the metadata pool already
+        contains objects, succeeds with --force, and succeeds without it
+        on a freshly created pool.
+        """
+        data_pool_name = self.fs.get_data_pool_name()
+
+        self.fs.mds_stop()
+        self.fs.mds_fail()
+
+        self.fs.mon_manager.raw_cluster_cmd('fs', 'rm', self.fs.name,
+                                            '--yes-i-really-mean-it')
+
+        self._recreate_metadata_pool()
+
+        # Put an arbitrary object into the new pool so it is not empty
+        dummyfile = '/etc/fstab'
+
+        self.fs.put_metadata_object_raw("key", dummyfile)
+
+        def metadata_pool_has_objects():
+            # A RuntimeError from get_pool_df is treated as "not yet"
+            try:
+                return self.fs.get_pool_df(self.fs.metadata_pool_name)['objects'] > 0
+            except RuntimeError:
+                return False
+
+        self.wait_until_true(metadata_pool_has_objects, timeout=30)
+
+        try:
+            self.fs.mon_manager.raw_cluster_cmd('fs', 'new', self.fs.name,
+                                                self.fs.metadata_pool_name,
+                                                data_pool_name)
+        except CommandFailedError as e:
+            self.assertEqual(e.exitstatus, errno.EINVAL)
+        else:
+            raise AssertionError("Expected EINVAL")
+
+        self.fs.mon_manager.raw_cluster_cmd('fs', 'new', self.fs.name,
+                                            self.fs.metadata_pool_name,
+                                            data_pool_name, "--force")
+
+        self.fs.mon_manager.raw_cluster_cmd('fs', 'rm', self.fs.name,
+                                            '--yes-i-really-mean-it')
+
+        self._recreate_metadata_pool()
+        self.fs.mon_manager.raw_cluster_cmd('fs', 'new', self.fs.name,
+                                            self.fs.metadata_pool_name,
+                                            data_pool_name)
+
+    def test_evict_client(self):
+        """
+        Check that a slow client session won't get evicted if it's the
+        only session, and that the session count drops back to one when
+        one of two clients is killed.
+        """
+
+        self.mount_b.umount_wait()
+        ls_data = self.fs.mds_asok(['session', 'ls'])
+        self.assert_session_count(1, ls_data)
+
+        self.mount_a.kill()
+        self.mount_a.kill_cleanup()
+
+        # Wait well past the autoclose interval; the sole session remains
+        time.sleep(self.mds_session_autoclose * 1.5)
+        ls_data = self.fs.mds_asok(['session', 'ls'])
+        self.assert_session_count(1, ls_data)
+
+        self.mount_a.mount()
+        self.mount_a.wait_until_mounted()
+        self.mount_b.mount()
+        self.mount_b.wait_until_mounted()
+
+        ls_data = self._session_list()
+        self.assert_session_count(2, ls_data)
+
+        self.mount_a.kill()
+        self.mount_a.kill_cleanup()
+
+        time.sleep(self.mds_session_autoclose * 1.5)
+        ls_data = self.fs.mds_asok(['session', 'ls'])
+        self.assert_session_count(1, ls_data)
+
+    def test_filtered_df(self):
+        """
+        That the available space reported by 'df' on the mount is within
+        10% of the data pool's max_avail.
+        """
+        pool_name = self.fs.get_data_pool_name()
+        raw_df = self.fs.get_pool_df(pool_name)
+        raw_avail = float(raw_df["max_avail"])
+        out = self.fs.mon_manager.raw_cluster_cmd('osd', 'pool', 'get',
+                                                  pool_name, 'size',
+                                                  '-f', 'json-pretty')
+        # The pool size does not enter the comparison; it is only
+        # reported in the failure message below.
+        pool_size = int(json.loads(out)['size'])
+
+        proc = self.mount_a.run_shell(['df', '.'])
+        output = proc.stdout.getvalue()
+        # Fourth column of the first data line, scaled by 1024
+        fs_avail = float(output.split('\n')[1].split()[3]) * 1024
+
+        ratio = raw_avail / fs_avail
+        self.assertTrue(
+            0.9 < ratio < 1.1,
+            "pool max_avail {0} vs df available {1} (ratio {2}, pool size "
+            "{3})".format(raw_avail, fs_avail, ratio, pool_size))
diff --git a/src/ceph/qa/tasks/cephfs/test_pool_perm.py b/src/ceph/qa/tasks/cephfs/test_pool_perm.py
new file mode 100644
index 0000000..22775e7
--- /dev/null
+++ b/src/ceph/qa/tasks/cephfs/test_pool_perm.py
@@ -0,0 +1,113 @@
+from textwrap import dedent
+from teuthology.exceptions import CommandFailedError
+from tasks.cephfs.cephfs_test_case import CephFSTestCase
+import os
+
+
+class TestPoolPerm(CephFSTestCase):
+    """
+    Tests that change the OSD/MDS caps of mount_a's client and check that
+    data access and layout changes are refused accordingly.
+    """
+
+    def test_pool_perm(self):
+        """
+        That writing fails with EPERM when the client only has read caps
+        on the data pool, and reading fails with EPERM when it only has
+        write caps.
+        """
+        self.mount_a.run_shell(["touch", "test_file"])
+
+        file_path = os.path.join(self.mount_a.mountpoint, "test_file")
+
+        # Script run on the client.  It uses "except ... as" and a bytes
+        # literal so that it is valid under both Python 2.6+ and Python 3.
+        remote_script = dedent("""
+            import os
+            import errno
+
+            fd = os.open("{path}", os.O_RDWR)
+            try:
+                if {check_read}:
+                    ret = os.read(fd, 1024)
+                else:
+                    os.write(fd, b'content')
+            except OSError as e:
+                if e.errno != errno.EPERM:
+                    raise
+            else:
+                raise RuntimeError("client does not check permission of data pool")
+            """)
+
+        client_name = "client.{0}".format(self.mount_a.client_id)
+        data_pool = self.fs.get_data_pool_name()
+
+        # set data pool read only
+        self.fs.mon_manager.raw_cluster_cmd_result(
+            'auth', 'caps', client_name, 'mds', 'allow', 'mon', 'allow r', 'osd',
+            'allow r pool={0}'.format(data_pool))
+
+        self.mount_a.umount_wait()
+        self.mount_a.mount()
+        self.mount_a.wait_until_mounted()
+
+        # write should fail
+        self.mount_a.run_python(remote_script.format(path=file_path, check_read=str(False)))
+
+        # set data pool write only
+        self.fs.mon_manager.raw_cluster_cmd_result(
+            'auth', 'caps', client_name, 'mds', 'allow', 'mon', 'allow r', 'osd',
+            'allow w pool={0}'.format(data_pool))
+
+        self.mount_a.umount_wait()
+        self.mount_a.mount()
+        self.mount_a.wait_until_mounted()
+
+        # read should fail
+        self.mount_a.run_python(remote_script.format(path=file_path, check_read=str(True)))
+
+    def test_forbidden_modification(self):
+        """
+        That a client who does not have the capability for setting
+        layout pools is prevented from doing so.
+        """
+
+        # Set up
+        client_name = "client.{0}".format(self.mount_a.client_id)
+        new_pool_name = "data_new"
+        self.fs.add_data_pool(new_pool_name)
+
+        self.mount_a.run_shell(["touch", "layoutfile"])
+        self.mount_a.run_shell(["mkdir", "layoutdir"])
+
+        # Same OSD caps in both phases: rw on the first two data pools
+        pool_names = self.fs.get_data_pool_names()
+        osd_caps = 'allow rw pool={0},allow rw pool={1}'.format(
+            pool_names[0], pool_names[1])
+
+        # Set MDS 'rw' perms: missing 'p' means no setting pool layouts
+        self.fs.mon_manager.raw_cluster_cmd_result(
+            'auth', 'caps', client_name, 'mds', 'allow rw', 'mon', 'allow r',
+            'osd', osd_caps)
+
+        self.mount_a.umount_wait()
+        self.mount_a.mount()
+        self.mount_a.wait_until_mounted()
+
+        with self.assertRaises(CommandFailedError):
+            self.mount_a.setfattr("layoutfile", "ceph.file.layout.pool",
+                                  new_pool_name)
+        with self.assertRaises(CommandFailedError):
+            self.mount_a.setfattr("layoutdir", "ceph.dir.layout.pool",
+                                  new_pool_name)
+        self.mount_a.umount_wait()
+
+        # Set MDS 'rwp' perms: should now be able to set layouts
+        self.fs.mon_manager.raw_cluster_cmd_result(
+            'auth', 'caps', client_name, 'mds', 'allow rwp', 'mon', 'allow r',
+            'osd', osd_caps)
+        self.mount_a.mount()
+        self.mount_a.wait_until_mounted()
+        self.mount_a.setfattr("layoutfile", "ceph.file.layout.pool",
+                              new_pool_name)
+        self.mount_a.setfattr("layoutdir", "ceph.dir.layout.pool",
+                              new_pool_name)
+        self.mount_a.umount_wait()
+
+    def tearDown(self):
+        """
+        Put the client's caps back to 'allow' on the MDS and rw on the
+        first data pool before running the base class tearDown.
+        """
+        self.fs.mon_manager.raw_cluster_cmd_result(
+            'auth', 'caps', "client.{0}".format(self.mount_a.client_id),
+            'mds', 'allow', 'mon', 'allow r', 'osd',
+            'allow rw pool={0}'.format(self.fs.get_data_pool_names()[0]))
+        super(TestPoolPerm, self).tearDown()
+
diff --git a/src/ceph/qa/tasks/cephfs/test_quota.py b/src/ceph/qa/tasks/cephfs/test_quota.py
new file mode 100644
index 0000000..ee11c58
--- /dev/null
+++ b/src/ceph/qa/tasks/cephfs/test_quota.py
@@ -0,0 +1,106 @@
+
+from cephfs_test_case import CephFSTestCase
+
+from teuthology.exceptions import CommandFailedError
+
+class TestQuota(CephFSTestCase):
+    """
+    Tests in which one client changes ceph.quota xattrs and a second
+    client observes the change.
+    """
+    CLIENTS_REQUIRED = 2
+    MDSS_REQUIRED = 1
+
+    def test_remote_update_getfattr(self):
+        """
+        That quota changes made from one client are visible to another
+        client looking at ceph.quota xattrs
+        """
+        quota_dir = "./subdir"
+        xattr = "ceph.quota.max_files"
+
+        self.mount_a.run_shell(["mkdir", "subdir"])
+
+        # Neither client reports a quota before one has been set
+        for mount in (self.mount_a, self.mount_b):
+            self.assertEqual(mount.getfattr(quota_dir, xattr), None)
+
+        self.mount_a.setfattr(quota_dir, xattr, "10")
+        self.assertEqual(self.mount_a.getfattr(quota_dir, xattr), "10")
+
+        # Should be visible as soon as setxattr operation completes on
+        # mds (we get here sooner because setfattr gets an early reply)
+        self.wait_until_equal(
+            lambda: self.mount_b.getfattr(quota_dir, xattr),
+            "10", timeout=10)
+
+    def test_remote_update_df(self):
+        """
+        That when a client modifies the quota on a directory used
+        as another client's root, the other client sees the change
+        reflected in their statfs output.
+        """
+        def set_max_bytes(size):
+            self.mount_a.setfattr("./subdir", "ceph.quota.max_bytes",
+                                  str(size))
+
+        def empty_df(size):
+            # df output expected for an unused tree with quota `size`
+            return {"total": size, "used": 0, "available": size}
+
+        self.mount_b.umount_wait()
+
+        self.mount_a.run_shell(["mkdir", "subdir"])
+
+        size_before = 128 * 1024 * 1024
+        set_max_bytes(size_before)
+
+        # mount_b now has the quota'd directory as its root
+        self.mount_b.mount(mount_path="/subdir")
+
+        self.assertDictEqual(self.mount_b.df(), empty_df(size_before))
+
+        size_after = 256 * 1024 * 1024
+        set_max_bytes(size_after)
+
+        # Should be visible as soon as setxattr operation completes on
+        # mds (we get here sooner because setfattr gets an early reply)
+        self.wait_until_equal(
+            lambda: self.mount_b.df(),
+            empty_df(size_after),
+            timeout=10
+        )
+
+    def test_remote_update_write(self):
+        """
+        That when a client modifies the quota on a directory used
+        as another client's root, the other client sees the effect
+        of the change when writing data.
+        """
+        files_dir = "./subdir_files"
+        data_dir = "./subdir_data"
+
+        for name in ("subdir_files", "subdir_data"):
+            self.mount_a.run_shell(["mkdir", name])
+
+        # Set some nice high quotas that mount_b's initial operations
+        # will be well within
+        self.mount_a.setfattr(files_dir, "ceph.quota.max_files", "100")
+        self.mount_a.setfattr(data_dir, "ceph.quota.max_bytes", "104857600")
+
+        # Do some writes within my quota
+        self.mount_b.create_n_files("subdir_files/file", 20)
+        self.mount_b.write_n_mb("subdir_data/file", 20)
+
+        # Set quotas lower than what mount_b already wrote, it should
+        # refuse to write more once it's seen them
+        self.mount_a.setfattr(files_dir, "ceph.quota.max_files", "10")
+        self.mount_a.setfattr(data_dir, "ceph.quota.max_bytes", "1048576")
+
+        # Do some writes that would have been okay within the old quota,
+        # but are forbidden under the new quota
+        with self.assertRaises(CommandFailedError):
+            self.mount_b.create_n_files("subdir_files/file", 40)
+        with self.assertRaises(CommandFailedError):
+            self.mount_b.write_n_mb("subdir_data/file", 40)
+
diff --git a/src/ceph/qa/tasks/cephfs/test_readahead.py b/src/ceph/qa/tasks/cephfs/test_readahead.py
new file mode 100644
index 0000000..31e7bf1
--- /dev/null
+++ b/src/ceph/qa/tasks/cephfs/test_readahead.py
@@ -0,0 +1,31 @@
+import logging
+from tasks.cephfs.fuse_mount import FuseMount
+from tasks.cephfs.cephfs_test_case import CephFSTestCase
+
+log = logging.getLogger(__name__)
+
+
+class TestReadahead(CephFSTestCase):
+    """Check that a FUSE client does not issue one remote read per local read."""
+
+    def test_flush(self):
+        """
+        Write a 32MB file, remount, read 32 blocks of 128k from it and
+        fail if the client's objecter 'op_r' counter grew by 32 or more.
+        """
+        if not isinstance(self.mount_a, FuseMount):
+            self.skipTest("FUSE needed for measuring op counts")
+
+        def objecter_reads():
+            # Current value of the client's objecter 'op_r' perf counter
+            perf = self.mount_a.admin_socket(['perf', 'dump', 'objecter'])
+            return perf['objecter']['op_r']
+
+        # Create 32MB file
+        self.mount_a.run_shell(["dd", "if=/dev/urandom", "of=foo", "bs=1M", "count=32"])
+
+        # Unmount and remount the client to flush cache
+        self.mount_a.umount_wait()
+        self.mount_a.mount()
+        self.mount_a.wait_until_mounted()
+
+        reads_before = objecter_reads()
+        self.mount_a.run_shell(["dd", "if=foo", "of=/dev/null", "bs=128k", "count=32"])
+        reads_after = objecter_reads()
+        assert reads_after >= reads_before
+        op_r = reads_after - reads_before
+        log.info("read operations: {0}".format(op_r))
+
+        # with exponentially increasing readahead, we should see fewer than 10 operations
+        # but this test simply checks if the client is doing a remote read for each local read
+        if op_r >= 32:
+            raise RuntimeError("readahead not working")
diff --git a/src/ceph/qa/tasks/cephfs/test_recovery_pool.py b/src/ceph/qa/tasks/cephfs/test_recovery_pool.py
new file mode 100644
index 0000000..097342a
--- /dev/null
+++ b/src/ceph/qa/tasks/cephfs/test_recovery_pool.py
@@ -0,0 +1,220 @@
+
+"""
+Test our tools for recovering metadata from the data pool into an alternate pool
+"""
+import json
+
+import logging
+import os
+from textwrap import dedent
+import traceback
+from collections import namedtuple, defaultdict
+
+from teuthology.orchestra.run import CommandFailedError
+from tasks.cephfs.cephfs_test_case import CephFSTestCase, for_teuthology
+
+log = logging.getLogger(__name__)
+
+
+ValidationError = namedtuple("ValidationError", ["exception", "backtrace"])
+
+
+class OverlayWorkload(object):
+    """Base class for workloads that a recovery test writes, flushes, damages and validates."""
+    def __init__(self, orig_fs, recovery_fs, orig_mount, recovery_mount):
+        """Store the original/recovery filesystem handles and their client mounts."""
+        self._orig_fs = orig_fs
+        self._recovery_fs = recovery_fs
+        self._orig_mount = orig_mount
+        self._recovery_mount = recovery_mount
+        self._initial_state = None
+
+        # Accumulate backtraces for every failed validation, and return them.  Backtraces
+        # are rather verbose, but we only see them when something breaks, and they
+        # let us see which check failed without having to decorate each check with
+        # a string
+        self._errors = []
+
+    def assert_equal(self, a, b):
+        """Append a ValidationError to self._errors (instead of raising) when a != b."""
+        try:
+            if a != b:
+                raise AssertionError("{0} != {1}".format(a, b))
+        except AssertionError as e:
+            # Raised and caught on purpose: format_exc() needs an active exception
+            self._errors.append(ValidationError(e, traceback.format_exc(3)))
+
+    def write(self):
+        """Write the workload files to the mount"""
+        raise NotImplementedError()
+
+    def validate(self):
+        """
+        Read from the mount and validate that the workload files are present (i.e. have
+        survived or been reconstructed from the test scenario)
+        """
+        raise NotImplementedError()
+
+    def damage(self):
+        """
+        Damage the filesystem pools in ways that will be interesting to recover from.  By
+        default just wipe everything in the metadata pool
+        """
+        # Delete every object in the metadata pool, ignoring empty lines in the listing
+        objects = [o for o in self._orig_fs.rados(["ls"]).split("\n") if o]
+        for o in objects:
+            self._orig_fs.rados(["rm", o])
+
+    def flush(self):
+        """
+        Called after client unmount, after write: flush whatever you want
+        """
+        self._orig_fs.mds_asok(["flush", "journal"])
+        self._recovery_fs.mds_asok(["flush", "journal"])
+
+
+class SimpleOverlayWorkload(OverlayWorkload):
+    """Workload of one directory holding one file; checks the file's size after recovery."""
+    def write(self):
+        """Create subdir/sixmegs through the original mount and record its stat."""
+        self._orig_mount.run_shell(["mkdir", "subdir"])
+        self._orig_mount.write_n_mb("subdir/sixmegs", 6)
+        self._initial_state = self._orig_mount.stat("subdir/sixmegs")
+
+    def validate(self):
+        """List subdir on the recovery mount, compare st_size, and return collected errors."""
+        self._recovery_mount.run_shell(["ls", "subdir"])
+        recovered_size = self._recovery_mount.stat("subdir/sixmegs")['st_size']
+        self.assert_equal(recovered_size, self._initial_state['st_size'])
+        return self._errors
+
+class TestRecoveryPool(CephFSTestCase):
+    MDSS_REQUIRED = 2
+    CLIENTS_REQUIRED = 2
+    REQUIRE_RECOVERY_FILESYSTEM = True
+
+    def is_marked_damaged(self, rank):
+        """Return whether `rank` appears under 'damaged' in this filesystem's MDS map."""
+        return rank in self.fs.get_mds_map()['damaged']
+
+    def _rebuild_metadata(self, workload, other_pool=None, workers=1):
+        """
+        That when all objects in metadata pool are removed, we can rebuild a metadata pool
+        based on the contents of a data pool, and a client can see and read our files.
+
+        :param workload: provides the write/flush/damage/validate hooks called below
+        :param other_pool: currently unused by this method
+        :param workers: currently unused by this method
+        :raises AssertionError: if workload.validate() returns any errors
+        """
+
+        # First, inject some files
+
+        workload.write()
+
+        # Unmount the client and flush the journal: the tool should also cope with
+        # situations where there is dirty metadata, but we'll test that separately
+        self.mount_a.umount_wait()
+        self.mount_b.umount_wait()
+        workload.flush()
+
+        # Run `data_scan init` for the recovery fs, naming its metadata pool as the alternate pool
+        recovery_fs = self.recovery_fs.name
+        recovery_pool = self.recovery_fs.get_metadata_pool_name()
+        self.recovery_fs.data_scan(['init', '--force-init',
+                                    '--filesystem', recovery_fs,
+                                    '--alternate-pool', recovery_pool])
+        self.recovery_fs.mon_manager.raw_cluster_cmd('-s')
+        self.recovery_fs.table_tool([recovery_fs + ":0", "reset", "session"])
+        self.recovery_fs.table_tool([recovery_fs + ":0", "reset", "snap"])
+        self.recovery_fs.table_tool([recovery_fs + ":0", "reset", "inode"])
+
+        # Stop the MDS
+        self.fs.mds_stop()
+        self.fs.mds_fail()
+
+        # After recovery, we need the MDS to not be strict about stats (in production these options
+        # are off by default, but in QA we need to explicitly disable them)
+        self.fs.set_ceph_conf('mds', 'mds verify scatter', False)
+        self.fs.set_ceph_conf('mds', 'mds debug scatterstat', False)
+
+        # Apply any data damage the workload wants
+        workload.damage()
+
+        # Reset the MDS map in case multiple ranks were in play: recovery procedure
+        # only understands how to rebuild metadata under rank 0
+        self.fs.mon_manager.raw_cluster_cmd('fs', 'reset', self.fs.name,
+                '--yes-i-really-mean-it')
+
+        self.fs.table_tool([self.fs.name + ":0", "reset", "session"])
+        self.fs.table_tool([self.fs.name + ":0", "reset", "snap"])
+        self.fs.table_tool([self.fs.name + ":0", "reset", "inode"])
+
+        # Run the recovery procedure.  First pass: scan the data pool and recover journal
+        # dentries with --alternate-pool set to the recovery filesystem's metadata pool
+        self.fs.mds_stop()
+        self.fs.data_scan(['scan_extents', '--alternate-pool',
+                           recovery_pool, '--filesystem', self.fs.name,
+                           self.fs.get_data_pool_name()])
+        self.fs.data_scan(['scan_inodes', '--alternate-pool',
+                           recovery_pool, '--filesystem', self.fs.name,
+                           '--force-corrupt', '--force-init',
+                           self.fs.get_data_pool_name()])
+        self.fs.journal_tool(['--rank=' + self.fs.name + ":0", 'event',
+                              'recover_dentries', 'list',
+                              '--alternate-pool', recovery_pool])
+
+        # Second pass: init, scan_inodes and recover_dentries again, without --alternate-pool
+        self.fs.data_scan(['init', '--force-init', '--filesystem',
+                           self.fs.name])
+        self.fs.data_scan(['scan_inodes', '--filesystem', self.fs.name,
+                           '--force-corrupt', '--force-init',
+                           self.fs.get_data_pool_name()])
+        self.fs.journal_tool(['--rank=' + self.fs.name + ":0", 'event',
+                              'recover_dentries', 'list'])
+
+        # Force-reset the rank 0 journal of the recovery fs and of the original fs
+        self.fs.journal_tool(['--rank=' + recovery_fs + ":0", 'journal',
+                              'reset', '--force'])
+        self.fs.journal_tool(['--rank=' + self.fs.name + ":0", 'journal',
+                              'reset', '--force'])
+        self.fs.mon_manager.raw_cluster_cmd('mds', 'repaired',
+                                            recovery_fs + ":0")
+
+        # Mark the MDS repaired
+        self.fs.mon_manager.raw_cluster_cmd('mds', 'repaired', '0')
+
+        # Start the MDS
+        self.fs.mds_restart()
+        self.recovery_fs.mds_restart()
+        self.fs.wait_for_daemons()
+        self.recovery_fs.wait_for_daemons()
+        # On every recovery-fs MDS: set debug-mds to 20, then scrub '/' with recursive + repair
+        for mds_id in self.recovery_fs.mds_ids:
+            self.fs.mon_manager.raw_cluster_cmd('tell', "mds." + mds_id,
+                                                'injectargs', '--debug-mds=20')
+            self.fs.mon_manager.raw_cluster_cmd('daemon', "mds." + mds_id,
+                                                'scrub_path', '/',
+                                                'recursive', 'repair')
+        log.info(str(self.mds_cluster.status()))
+
+        # Mount a client
+        self.mount_a.mount()
+        self.mount_b.mount(mount_fs_name=recovery_fs)
+        self.mount_a.wait_until_mounted()
+        self.mount_b.wait_until_mounted()
+
+        # See that the files are present and correct
+        errors = workload.validate()
+        if errors:
+            log.error("Validation errors found: {0}".format(len(errors)))
+            for e in errors:
+                log.error(e.exception)
+                log.error(e.backtrace)
+            raise AssertionError("Validation failed, first error: {0}\n{1}".format(
+                errors[0].exception, errors[0].backtrace
+            ))
+
+    def test_rebuild_simple(self):
+        workload = SimpleOverlayWorkload(self.fs, self.recovery_fs, self.mount_a, self.mount_b)
+        self._rebuild_metadata(workload)  # mount_a is the original-fs client, mount_b the recovery one
diff --git a/src/ceph/qa/tasks/cephfs/test_scrub_checks.py b/src/ceph/qa/tasks/cephfs/test_scrub_checks.py
new file mode 100644
index 0000000..a2de527
--- /dev/null
+++ b/src/ceph/qa/tasks/cephfs/test_scrub_checks.py
@@ -0,0 +1,245 @@
+"""
+MDS admin socket scrubbing-related tests.
+"""
+import json
+import logging
+import errno
+import time
+from teuthology.exceptions import CommandFailedError
+import os
+from tasks.cephfs.cephfs_test_case import CephFSTestCase
+
+log = logging.getLogger(__name__)
+
+
+class TestScrubChecks(CephFSTestCase):
+    """
+    Run flush and scrub commands on the specified files in the filesystem. This
+    task will run through a sequence of operations, but it is not comprehensive
+    on its own -- it doesn't manipulate the mds cache state to test on both
+    in- and out-of-memory parts of the hierarchy. So it's designed to be run
+    multiple times within a single test run, so that the test can manipulate
+    memory state.
+
+    Usage:
+    mds_scrub_checks:
+      mds_rank: 0
+      path: path/to/test/dir
+      client: 0
+      run_seq: [0-9]+
+
+    Increment the run_seq on subsequent invocations within a single test run;
+    it uses that value to generate unique folder and file names.
+    """
+
+    MDSS_REQUIRED = 1
+    CLIENTS_REQUIRED = 1
+
+    def test_scrub_checks(self):
+        for run_seq in (0, 1):  # the first pass (0) also flushes the paths before scrubbing them
+            self._checks(run_seq)
+
+    def _checks(self, run_seq):
+        """Run one pass of flush_path/scrub_path checks; run_seq keys the new dir/file names."""
+        mds_rank = 0
+        test_dir = "scrub_test_path"
+
+        abs_test_path = "/{0}".format(test_dir)
+
+        log.info("mountpoint: {0}".format(self.mount_a.mountpoint))
+        client_path = os.path.join(self.mount_a.mountpoint, test_dir)
+        log.info("client_path: {0}".format(client_path))
+
+        log.info("Cloning repo into place")
+        repo_path = self.clone_repo(self.mount_a, client_path)
+
+        # Adjacent literals are joined before .format() runs, so {id_} gets filled in as well
+        log.info("Initiating mds_scrub_checks on mds.{id_}, "
+                 "test_path {path}, run_seq {seq}".format(
+                     id_=mds_rank, path=abs_test_path, seq=run_seq))
+
+        success_validator = lambda j, r: self.json_validator(j, r, "return_code", 0)
+
+        nep = "{test_path}/i/dont/exist".format(test_path=abs_test_path)
+        self.asok_command(mds_rank, "flush_path {nep}".format(nep=nep),
+                          lambda j, r: self.json_validator(j, r, "return_code", -errno.ENOENT))
+        self.asok_command(mds_rank, "scrub_path {nep}".format(nep=nep),
+                          lambda j, r: self.json_validator(j, r, "return_code", -errno.ENOENT))
+
+        test_repo_path = "{test_path}/ceph-qa-suite".format(test_path=abs_test_path)
+        dirpath = "{repo_path}/suites".format(repo_path=test_repo_path)
+
+        if run_seq == 0:
+            log.info("First run: flushing {dirpath}".format(dirpath=dirpath))
+            command = "flush_path {dirpath}".format(dirpath=dirpath)
+            self.asok_command(mds_rank, command, success_validator)
+        command = "scrub_path {dirpath}".format(dirpath=dirpath)
+        self.asok_command(mds_rank, command, success_validator)
+
+        filepath = "{repo_path}/suites/fs/verify/validater/valgrind.yaml".format(
+            repo_path=test_repo_path)
+        if run_seq == 0:
+            log.info("First run: flushing {filepath}".format(filepath=filepath))
+            command = "flush_path {filepath}".format(filepath=filepath)
+            self.asok_command(mds_rank, command, success_validator)
+        command = "scrub_path {filepath}".format(filepath=filepath)
+        self.asok_command(mds_rank, command, success_validator)
+
+        filepath = "{repo_path}/suites/fs/basic/clusters/fixed-3-cephfs.yaml". \
+            format(repo_path=test_repo_path)
+        command = "scrub_path {filepath}".format(filepath=filepath)
+        self.asok_command(mds_rank, command,
+                          lambda j, r: self.json_validator(j, r, "performed_validation",
+                                                           False))
+
+        if run_seq == 0:
+            log.info("First run: flushing base dir /")
+            command = "flush_path /"
+            self.asok_command(mds_rank, command, success_validator)
+        command = "scrub_path /"
+        self.asok_command(mds_rank, command, success_validator)
+
+        new_dir = "{repo_path}/new_dir_{i}".format(repo_path=repo_path, i=run_seq)
+        test_new_dir = "{repo_path}/new_dir_{i}".format(repo_path=test_repo_path,
+                                                        i=run_seq)
+        self.mount_a.run_shell(["mkdir", new_dir])
+        command = "flush_path {dir}".format(dir=test_new_dir)
+        self.asok_command(mds_rank, command, success_validator)
+
+        new_file = "{repo_path}/new_file_{i}".format(repo_path=repo_path,
+                                                     i=run_seq)
+        test_new_file = "{repo_path}/new_file_{i}".format(repo_path=test_repo_path,
+                                                          i=run_seq)
+        self.mount_a.write_n_mb(new_file, 1)
+
+        command = "flush_path {file}".format(file=test_new_file)
+        self.asok_command(mds_rank, command, success_validator)
+
+        # check that scrub fails on errors
+        ino = self.mount_a.path_to_ino(new_file)
+        rados_obj_name = "{ino:x}.00000000".format(ino=ino)
+        command = "scrub_path {file}".format(file=test_new_file)
+
+        # Missing parent xattr -> ENODATA
+        self.fs.rados(["rmxattr", rados_obj_name, "parent"], pool=self.fs.get_data_pool_name())
+        self.asok_command(mds_rank, command,
+                          lambda j, r: self.json_validator(j, r, "return_code", -errno.ENODATA))
+
+        # Missing object -> ENOENT
+        self.fs.rados(["rm", rados_obj_name], pool=self.fs.get_data_pool_name())
+        self.asok_command(mds_rank, command,
+                          lambda j, r: self.json_validator(j, r, "return_code", -errno.ENOENT))
+
+        command = "flush_path /"
+        self.asok_command(mds_rank, command, success_validator)
+
+    def test_scrub_repair(self):
+        """That `scrub_path <dir> repair` lets rmdir succeed after a dentry was removed via rados."""
+        mds_rank = 0
+        test_dir = "scrub_repair_path"
+
+        self.mount_a.run_shell(["sudo", "mkdir", test_dir])
+        self.mount_a.run_shell(["sudo", "touch", "{0}/file".format(test_dir)])
+        dir_objname = "{:x}.00000000".format(self.mount_a.path_to_ino(test_dir))
+        self.mount_a.umount_wait()
+
+        # flush journal entries to dirfrag objects, and expire journal
+        self.fs.mds_asok(['flush', 'journal'])
+        self.fs.mds_stop()
+
+        # remove the dentry from dirfrag, cause incorrect fragstat/rstat
+        self.fs.rados(["rmomapkey", dir_objname, "file_head"],
+                      pool=self.fs.get_metadata_pool_name())
+
+        self.fs.mds_fail_restart()
+        self.fs.wait_for_daemons()
+
+        self.mount_a.mount()
+        self.mount_a.wait_until_mounted()
+
+        # fragstat indicates the directory is not empty, rmdir should fail
+        with self.assertRaises(CommandFailedError) as ar:
+            self.mount_a.run_shell(["sudo", "rmdir", test_dir])
+        self.assertEqual(ar.exception.exitstatus, 1)
+
+        self.asok_command(mds_rank, "scrub_path /{0} repair".format(test_dir),
+                          lambda j, r: self.json_validator(j, r, "return_code", 0))
+
+        # wait a few seconds for background repair
+        time.sleep(10)
+
+        # fragstat should be fixed
+        self.mount_a.run_shell(["sudo", "rmdir", test_dir])
+
+    @staticmethod
+    def json_validator(json_out, rc, element, expected_value):
+        """Return (ok, message); ok means rc == 0 and json_out[element] == expected_value."""
+        if rc != 0:
+            return False, "asok command returned error {rc}".format(rc=rc)
+        value = (json_out or {}).get(element)  # json_out is None when the command printed nothing
+        if value != expected_value:
+            return False, "unexpectedly got {jv} instead of {ev}!".format(jv=value, ev=expected_value)
+        return True, "Succeeded"
+
+    def asok_command(self, mds_rank, command, validator):
+        """
+        Run `command` on the admin socket of the MDS at index `mds_rank` of the active names.
+
+        `validator(json_out, rc)` must return (success, errstring); on failure an
+        AsokCommandFailedError is raised, otherwise the decoded JSON (or None) is returned.
+        """
+        log.info("Running command '{command}'".format(command=command))
+        command_list = command.split()
+
+        # we just assume there's an active mds for every rank
+        mds_id = self.fs.get_active_names()[mds_rank]
+        proc = self.fs.mon_manager.admin_socket('mds', mds_id,
+                                                command_list, check_status=False)
+        rout = proc.exitstatus
+        sout = proc.stdout.getvalue()
+        jout = json.loads(sout) if sout.strip() else None
+
+        # Adjacent literals are joined before .format() runs, so {command} is filled in too
+        log.info("command '{command}' got response code "
+                 "'{rout}' and stdout '{sout}'".format(
+                     command=command, rout=rout, sout=sout))
+
+        success, errstring = validator(jout, rout)
+        if not success:
+            raise AsokCommandFailedError(command, rout, jout, errstring)
+        return jout
+
+    def clone_repo(self, client_mount, path):
+        """Make sure ceph-qa-suite is cloned under `path` on the client; return the clone's path."""
+        repo = "ceph-qa-suite"
+        repo_path = os.path.join(path, repo)
+        client_mount.run_shell(["mkdir", "-p", path])
+
+        try:
+            client_mount.stat(repo_path)
+        except CommandFailedError:
+            # stat failed, so treat the checkout as missing and clone the 'giant' branch
+            url = "http://github.com/ceph/{repo}".format(repo=repo)
+            target = "{path}/{repo}".format(path=path, repo=repo)
+            client_mount.run_shell(["git", "clone", '--branch', 'giant', url, target])
+
+        return repo_path
+
+
+class AsokCommandFailedError(Exception):
+    """
+    Exception thrown when we get an unexpected response
+    on an admin socket command
+    """
+    def __init__(self, command, rc, json_out, errstring):
+        """Keep the command, its exit status, its decoded JSON output and the validator's message."""
+        self.command = command
+        self.rc = rc
+        self.json = json_out
+        self.errstring = errstring
+
+    def __str__(self):
+        # Parenthesised so .format() applies to the whole message, not just its second half
+        return ("Admin socket: {command} failed with rc={rc},"
+                "json output={json}, because '{es}'").format(
+                    command=self.command, rc=self.rc, json=self.json, es=self.errstring)
diff --git a/src/ceph/qa/tasks/cephfs/test_sessionmap.py b/src/ceph/qa/tasks/cephfs/test_sessionmap.py
new file mode 100644
index 0000000..9d12ab6
--- /dev/null
+++ b/src/ceph/qa/tasks/cephfs/test_sessionmap.py
@@ -0,0 +1,235 @@
+from StringIO import StringIO
+import json
+import logging
+from unittest import SkipTest
+
+from tasks.cephfs.fuse_mount import FuseMount
+from teuthology.exceptions import CommandFailedError
+from tasks.cephfs.cephfs_test_case import CephFSTestCase
+
+log = logging.getLogger(__name__)
+
+
+class TestSessionMap(CephFSTestCase):
+    CLIENTS_REQUIRED = 2
+    MDSS_REQUIRED = 2
+
+    def test_tell_session_drop(self):
+        """
+        That the MDS session opened by a `tell` command sent through the
+        python CLI is gone once that command has terminated
+        """
+        for mount in (self.mount_a, self.mount_b):
+            mount.umount_wait()
+
+        mds_id = self.fs.get_lone_mds_id()
+        self.fs.mon_manager.raw_cluster_cmd("tell", "mds.{0}".format(mds_id), "session", "ls")
+
+        sessions = self.fs.mds_asok(['session', 'ls'])
+        self.assertEqual(len(sessions), 0)
+
+    def _get_thread_count(self, mds_id):
+        """Return the `nlwp` value ps reports for the ceph-mds process run with `-i <mds_id>`."""
+        remote = self.fs.mds_daemons[mds_id].remote
+        proc = remote.run(args=["ps", "-ww", "axo", "nlwp,cmd"], stdout=StringIO())
+        ps_txt = proc.stdout.getvalue().strip()
+        id_arg = "-i {0}".format(mds_id)
+
+        # Drop the first (header) line, then skip every line that is not the wanted ceph-mds
+        # NOTE(review): substring match, so an id that is a prefix of another id also matches
+        for line in ps_txt.split("\n")[1:]:
+            if "ceph-mds" not in line or "daemon-helper" in line or id_arg not in line:
+                continue
+            log.info("Found ps line for daemon: {0}".format(line))
+            return int(line.split()[0])
+
+        raise RuntimeError("No process found in ps output for MDS {0}: {1}".format(
+            mds_id, ps_txt
+        ))
+
+    def test_tell_conn_close(self):
+        """
+        That sending a `tell` command through the python CLI leaves the
+        MDS thread count where it started (i.e. the connection it opened
+        is not left behind)
+        """
+        self.mount_a.umount_wait()
+        self.mount_b.umount_wait()
+
+        mds_id = self.fs.get_lone_mds_id()
+        target = "mds.{0}".format(mds_id)
+
+        threads_before = self._get_thread_count(mds_id)
+        self.fs.mon_manager.raw_cluster_cmd("tell", target, "session", "ls")
+        threads_after = self._get_thread_count(mds_id)
+        self.assertEqual(threads_before, threads_after)
+
+    def test_mount_conn_close(self):
+        """
+        That the MDS thread count rises while a client is mounted and drops
+        back to its pre-mount value once that client has unmounted
+        """
+        self.mount_a.umount_wait()
+        self.mount_b.umount_wait()
+
+        mds_id = self.fs.get_lone_mds_id()
+
+        threads_before = self._get_thread_count(mds_id)
+        self.mount_a.mount()
+        self.mount_a.wait_until_mounted()
+        threads_mounted = self._get_thread_count(mds_id)
+        self.assertGreater(threads_mounted, threads_before)
+        self.mount_a.umount_wait()
+        threads_after = self._get_thread_count(mds_id)
+        self.assertEqual(threads_before, threads_after)
+
+    def test_version_splitting(self):
+        """
+        That when many sessions are updated, they are correctly
+        split into multiple versions to obey mds_sessionmap_keys_per_op
+        """
+
+        # Start with both clients unmounted
+        self.mount_a.umount_wait()
+        self.mount_b.umount_wait()
+
+        # Configure MDS to write one OMAP key at once, then fail/restart the MDS daemons
+        self.set_conf('mds', 'mds_sessionmap_keys_per_op', 1)
+        self.fs.mds_fail_restart()
+        self.fs.wait_for_daemons()
+
+        # I would like two MDSs, so that I can do an export dir later
+        self.fs.set_max_mds(2)
+        self.fs.wait_for_daemons()
+
+        active_mds_names = self.fs.get_active_names()
+        rank_0_id = active_mds_names[0]
+        rank_1_id = active_mds_names[1]
+        log.info("Ranks 0 and 1 are {0} and {1}".format(
+            rank_0_id, rank_1_id))
+
+        # Bring the clients back
+        self.mount_a.mount()
+        self.mount_b.mount()
+        self.mount_a.create_files()  # Kick the client into opening sessions
+        self.mount_b.create_files()
+
+        # See that rank 0 has a session for each of the two clients
+        self.assert_session_count(2, mds_id=rank_0_id)
+
+        # Flush rank 0's journal and see that both sessions show up in the session table
+        self.fs.mds_asok(["flush", "journal"], rank_0_id)
+        table_json = json.loads(self.fs.table_tool(["0", "show", "session"]))
+        log.info("SessionMap: {0}".format(json.dumps(table_json, indent=2)))
+        self.assertEqual(table_json['0']['result'], 0)
+        self.assertEqual(len(table_json['0']['data']['Sessions']), 2)
+
+        # Now, induce a "force_open_sessions" event by exporting a dir
+        self.mount_a.run_shell(["mkdir", "bravo"])
+        self.mount_a.run_shell(["touch", "bravo/file"])
+        self.mount_b.run_shell(["ls", "-l", "bravo/file"])
+
+        def get_omap_wrs():
+            return self.fs.mds_asok(['perf', 'dump', 'objecter'], rank_1_id)['objecter']['omap_wr']
+
+        # Flush so that there are no dirty sessions on rank 1
+        self.fs.mds_asok(["flush", "journal"], rank_1_id)
+
+        # Export so that we get a force_open to rank 1 for the two sessions from rank 0
+        initial_omap_wrs = get_omap_wrs()
+        self.fs.mds_asok(['export', 'dir', '/bravo', '1'], rank_0_id)
+
+        # This is the critical (if rather subtle) check: that in the process of doing an export dir,
+        # we hit force_open_sessions, and as a result we end up writing out the sessionmap.  There
+        # will be two sessions dirtied here, and because we have set keys_per_op to 1, we should see
+        # a single session get written out (the first of the two, triggered by the second getting marked
+        # dirty)
+        # The number of writes is two per session, because the header (sessionmap version) update and
+        # KV write both count.
+        self.wait_until_true(
+            lambda: get_omap_wrs() - initial_omap_wrs == 2,
+            timeout=10  # Long enough for an export to get acked
+        )
+
+        # Now end our sessions and check the backing sessionmap is updated correctly
+        self.mount_a.umount_wait()
+        self.mount_b.umount_wait()
+
+        # In-memory sessionmap check
+        self.assert_session_count(0, mds_id=rank_0_id)
+
+        # On-disk sessionmap check
+        self.fs.mds_asok(["flush", "journal"], rank_0_id)
+        table_json = json.loads(self.fs.table_tool(["0", "show", "session"]))
+        log.info("SessionMap: {0}".format(json.dumps(table_json, indent=2)))
+        self.assertEqual(table_json['0']['result'], 0)
+        self.assertEqual(len(table_json['0']['data']['Sessions']), 0)
+
+    def _sudo_write_file(self, remote, path, data):
+        """
+        Write data to a remote file as super user, by piping it into a
+        `sudo python -c` one-liner that copies its stdin to the file.
+
+        :param remote: Remote site.
+        :param path: Path on the remote being written to.
+        :param data: Data to be written.
+        """
+        remote.run(
+            args=[
+                'sudo',
+                'python',
+                '-c',
+                'import shutil, sys; '
+                'shutil.copyfileobj(getattr(sys.stdin, "buffer", sys.stdin), open(sys.argv[1], "wb"))',
+                path,
+            ],
+            stdin=data,
+        )
+
+    def _configure_auth(self, mount, id_name, mds_caps, osd_caps=None, mon_caps=None):
+        """
+        Get or create the auth entity client.<id_name> with the given caps, point
+        `mount` at it, and write the returned keyring to the mount's keyring path.
+
+        osd_caps defaults to "allow rw" and mon_caps to "allow r" when None.
+        """
+        osd_caps = "allow rw" if osd_caps is None else osd_caps
+        mon_caps = "allow r" if mon_caps is None else mon_caps
+        entity = "client.{name}".format(name=id_name)
+
+        keyring = self.fs.mon_manager.raw_cluster_cmd(
+            "auth", "get-or-create", entity,
+            "mds", mds_caps,
+            "osd", osd_caps,
+            "mon", mon_caps
+        )
+        # NOTE(review): get_keyring_path() presumably depends on client_id, so set it first
+        mount.client_id = id_name
+        self._sudo_write_file(mount.client_remote, mount.get_keyring_path(), keyring)
+        self.set_conf(entity, "keyring", mount.get_keyring_path())
+
+    def test_session_reject(self):
+        """That a client whose client_metadata claims root=/baz fails to mount /foo/bar."""
+        if not isinstance(self.mount_a, FuseMount):
+            raise SkipTest("Requires FUSE client to inject client metadata")
+
+        self.mount_a.run_shell(["mkdir", "foo"])
+        self.mount_a.run_shell(["mkdir", "foo/bar"])
+        self.mount_a.umount_wait()
+        # Mount B will be my rejected client
+        self.mount_b.umount_wait()
+
+        # Configure a client that is limited to /foo/bar
+        self._configure_auth(self.mount_b, "badguy", "allow rw path=/foo/bar")
+        # Check he can mount that dir and do IO
+        self.mount_b.mount(mount_path="/foo/bar")
+        self.mount_b.wait_until_mounted()
+        self.mount_b.create_destroy()
+        self.mount_b.umount_wait()
+
+        # Configure the client to claim that its mount point metadata is /baz
+        self.set_conf("client.badguy", "client_metadata", "root=/baz")
+        # Try to mount the client, see that it fails
+        with self.assert_cluster_log("client session with invalid root '/baz' denied"):
+            with self.assertRaises(CommandFailedError):
+                self.mount_b.mount(mount_path="/foo/bar")
diff --git a/src/ceph/qa/tasks/cephfs/test_strays.py b/src/ceph/qa/tasks/cephfs/test_strays.py
new file mode 100644
index 0000000..b64f3e9
--- /dev/null
+++ b/src/ceph/qa/tasks/cephfs/test_strays.py
@@ -0,0 +1,1049 @@
+import json
+import time
+import logging
+from textwrap import dedent
+import datetime
+import gevent
+import datetime
+
+from teuthology.orchestra.run import CommandFailedError, Raw
+from tasks.cephfs.cephfs_test_case import CephFSTestCase, for_teuthology
+
+log = logging.getLogger(__name__)
+
+
+class TestStrays(CephFSTestCase):
+    MDSS_REQUIRED = 2
+
+    OPS_THROTTLE = 1
+    FILES_THROTTLE = 2
+
+    # Range of different file sizes used in throttle test's workload
+    throttle_workload_size_range = 16
+
+    @for_teuthology
+    def test_ops_throttle(self):
+        self._test_throttling(self.OPS_THROTTLE)
+
+    @for_teuthology
+    def test_files_throttle(self):
+        self._test_throttling(self.FILES_THROTTLE)
+
+    def test_dir_deletion(self):
+        """
+        That when deleting a bunch of dentries and the containing
+        directory, everything gets purged.
+        Catches cases where the client might e.g. fail to trim
+        the unlinked dir from its cache.
+        """
+        file_count = 1000
+        create_script = dedent("""
+            import os
+
+            mount_path = "{mount_path}"
+            subdir = "delete_me"
+            size = {size}
+            file_count = {file_count}
+            os.mkdir(os.path.join(mount_path, subdir))
+            for i in xrange(0, file_count):
+                filename = "{{0}}_{{1}}.bin".format(i, size)
+                f = open(os.path.join(mount_path, subdir, filename), 'w')
+                f.write(size * 'x')
+                f.close()
+        """.format(
+            mount_path=self.mount_a.mountpoint,
+            size=1024,
+            file_count=file_count
+        ))
+
+        self.mount_a.run_python(create_script)
+
+        # That the dirfrag object is created
+        self.fs.mds_asok(["flush", "journal"])
+        dir_ino = self.mount_a.path_to_ino("delete_me")
+        self.assertTrue(self.fs.dirfrag_exists(dir_ino, 0))
+
+        # Remove everything
+        self.mount_a.run_shell(["rm", "-rf", "delete_me"])
+        self.fs.mds_asok(["flush", "journal"])
+
+        # That all the removed files get created as strays
+        strays = self.get_mdc_stat("strays_created")
+        self.assertEqual(strays, file_count + 1)
+
+        # That the strays all get enqueued for purge
+        self.wait_until_equal(
+            lambda: self.get_mdc_stat("strays_enqueued"),
+            strays,
+            timeout=600
+
+        )
+
+        # That all the purge operations execute
+        self.wait_until_equal(
+            lambda: self.get_stat("purge_queue", "pq_executed"),
+            strays,
+            timeout=600
+        )
+
+        # That finally, the directory metadata object is gone
+        self.assertFalse(self.fs.dirfrag_exists(dir_ino, 0))
+
+        # That finally, the data objects are all gone
+        self.await_data_pool_empty()
+
+    def _test_throttling(self, throttle_type):
+        self.data_log = []  # samples appended by _do_test_throttling
+        try:
+            return self._do_test_throttling(throttle_type)
+        except:  # on any failure dump the collected samples, then re-raise
+            for sample in self.data_log:
+                log.info(",".join(map(str, sample)))
+            raise
+
+    def _do_test_throttling(self, throttle_type):
+        """
+        That the purge throttle picked by throttle_type (OPS_THROTTLE or FILES_THROTTLE) is respected
+        """
+
+        def set_throttles(files, ops):
+            """
+            Helper for updating ops/files limits, and calculating effective
+            ops_per_pg setting to give the same ops limit.
+            """
+            self.set_conf('mds', 'mds_max_purge_files', "%d" % files)
+            self.set_conf('mds', 'mds_max_purge_ops', "%d" % ops)
+
+            pgs = self.fs.mon_manager.get_pool_property(
+                self.fs.get_data_pool_name(),
+                "pg_num"
+            )
+            ops_per_pg = float(ops) / pgs
+            self.set_conf('mds', 'mds_max_purge_ops_per_pg', "%s" % ops_per_pg)
+
+        # Test conditions depend on what we're going to be exercising.
+        # * Lift the threshold on whatever throttle we are *not* testing, so
+        #   that the throttle of interest is the one that will be the bottleneck
+        # * Create either many small files (test file count throttling) or fewer
+        #   large files (test op throttling)
+        if throttle_type == self.OPS_THROTTLE:
+            set_throttles(files=100000000, ops=16)
+            size_unit = 1024 * 1024  # big files, generate lots of ops
+            file_multiplier = 100
+        elif throttle_type == self.FILES_THROTTLE:
+            # The default value of file limit is pretty permissive, so to avoid
+            # the test running too fast, create lots of files and set the limit
+            # pretty low.
+            set_throttles(ops=100000000, files=6)
+            size_unit = 1024  # small, numerous files
+            file_multiplier = 200
+        else:
+            raise NotImplementedError(throttle_type)
+
+        # Pick up config changes
+        self.fs.mds_fail_restart()
+        self.fs.wait_for_daemons()
+
+        create_script = dedent("""
+            import os
+
+            mount_path = "{mount_path}"
+            subdir = "delete_me"
+            size_unit = {size_unit}
+            file_multiplier = {file_multiplier}
+            os.mkdir(os.path.join(mount_path, subdir))
+            for i in xrange(0, file_multiplier):
+                for size in xrange(0, {size_range}*size_unit, size_unit):
+                    filename = "{{0}}_{{1}}.bin".format(i, size / size_unit)
+                    f = open(os.path.join(mount_path, subdir, filename), 'w')
+                    f.write(size * 'x')
+                    f.close()
+        """.format(
+            mount_path=self.mount_a.mountpoint,
+            size_unit=size_unit,
+            file_multiplier=file_multiplier,
+            size_range=self.throttle_workload_size_range
+        ))
+
+        self.mount_a.run_python(create_script)
+
+        # We will run the deletion in the background, to reduce the risk of it completing before
+        # we have started monitoring the stray statistics.
+        def background():
+            self.mount_a.run_shell(["rm", "-rf", "delete_me"])
+            self.fs.mds_asok(["flush", "journal"])
+
+        background_thread = gevent.spawn(background)
+
+        total_inodes = file_multiplier * self.throttle_workload_size_range + 1
+        mds_max_purge_ops = int(self.fs.get_config("mds_max_purge_ops", 'mds'))
+        mds_max_purge_files = int(self.fs.get_config("mds_max_purge_files", 'mds'))
+
+        # During this phase we look for the concurrent ops to exceed half
+        # the limit (a heuristic) and not exceed the limit (a correctness
+        # condition).
+        purge_timeout = 600
+        elapsed = 0
+        files_high_water = 0
+        ops_high_water = 0
+
+        while True:
+            stats = self.fs.mds_asok(['perf', 'dump'])
+            mdc_stats = stats['mds_cache']
+            pq_stats = stats['purge_queue']
+            if elapsed >= purge_timeout:
+                raise RuntimeError("Timeout waiting for {0} inodes to purge, stats:{1}".format(total_inodes, mdc_stats))
+
+            num_strays = mdc_stats['num_strays']
+            num_strays_purging = pq_stats['pq_executing']
+            num_purge_ops = pq_stats['pq_executing_ops']
+
+            self.data_log.append([datetime.datetime.now(), num_strays, num_strays_purging, num_purge_ops])
+
+            files_high_water = max(files_high_water, num_strays_purging)
+            ops_high_water = max(ops_high_water, num_purge_ops)
+
+            total_strays_created = mdc_stats['strays_created']
+            total_strays_purged = pq_stats['pq_executed']
+
+            if total_strays_purged == total_inodes:
+                log.info("Complete purge in {0} seconds".format(elapsed))
+                break
+            elif total_strays_purged > total_inodes:
+                raise RuntimeError("Saw more strays than expected, mdc stats: {0}".format(mdc_stats))
+            else:
+                if throttle_type == self.OPS_THROTTLE:
+                    # 11 is filer_max_purge_ops plus one for the backtrace:
+                    # limit is allowed to be overshot by this much.
+                    if num_purge_ops > mds_max_purge_ops + 11:
+                        raise RuntimeError("num_purge_ops violates threshold {0}/{1}".format(
+                            num_purge_ops, mds_max_purge_ops
+                        ))
+                elif throttle_type == self.FILES_THROTTLE:
+                    if num_strays_purging > mds_max_purge_files:
+                        raise RuntimeError("num_strays_purging violates threshold {0}/{1}".format(
+                            num_strays_purging, mds_max_purge_files
+                        ))
+                else:
+                    raise NotImplementedError(throttle_type)
+
+                log.info("Waiting for purge to complete {0}/{1}, {2}/{3}".format(
+                    num_strays_purging, num_strays,
+                    total_strays_purged, total_strays_created
+                ))
+                time.sleep(1)
+                elapsed += 1
+
+        background_thread.join()
+
+        # Check that we got up to a respectable rate during the purge.  This is totally
+        # racy, but should be safeish unless the cluster is pathologically slow, or
+        # insanely fast such that the deletions all pass before we have polled the
+        # statistics.
+        if throttle_type == self.OPS_THROTTLE:
+            if ops_high_water < mds_max_purge_ops / 2:
+                raise RuntimeError("Ops in flight high water is unexpectedly low ({0} / {1})".format(
+                    ops_high_water, mds_max_purge_ops
+                ))
+        elif throttle_type == self.FILES_THROTTLE:
+            if files_high_water < mds_max_purge_files / 2:
+                raise RuntimeError("Files in flight high water is unexpectedly low ({0} / {1})".format(
+                    files_high_water, mds_max_purge_files
+                ))
+
+        # Sanity check all MDC stray stats
+        stats = self.fs.mds_asok(['perf', 'dump'])
+        mdc_stats = stats['mds_cache']
+        pq_stats = stats['purge_queue']
+        self.assertEqual(mdc_stats['num_strays'], 0)
+        self.assertEqual(mdc_stats['num_strays_delayed'], 0)
+        self.assertEqual(pq_stats['pq_executing'], 0)
+        self.assertEqual(pq_stats['pq_executing_ops'], 0)
+        self.assertEqual(mdc_stats['strays_created'], total_inodes)
+        self.assertEqual(mdc_stats['strays_enqueued'], total_inodes)
+        self.assertEqual(pq_stats['pq_executed'], total_inodes)
+
+    def get_mdc_stat(self, name, mds_id=None):
+        return self.get_stat("mds_cache", name, mds_id)  # shorthand for the "mds_cache" section
+
+    def get_stat(self, subsys, name, mds_id=None):
+        perf = self.fs.mds_asok(['perf', 'dump', subsys, name], mds_id=mds_id)
+        return perf[subsys][name]
+
+    def _wait_for_counter(self, subsys, counter, expect_val, timeout=60,
+                          mds_id=None):
+        # reject_fn flags any reading above expect_val (counter overshot the target)
+        def read_counter():
+            return self.get_stat(subsys, counter, mds_id)
+        self.wait_until_equal(read_counter, expect_val=expect_val, timeout=timeout,
+                              reject_fn=lambda seen: seen > expect_val)
+
+    def test_open_inode(self):
+        """
+        That the case of a dentry unlinked while a client holds an
+        inode open is handled correctly.
+
+        The inode should be moved into a stray dentry, while the original
+        dentry and directory should be purged.
+
+        The inode's data should be purged when the client eventually closes
+        it.
+        """
+        mount_a_client_id = self.mount_a.get_global_id()
+
+        # Size of the file written below
+        size_mb = 8
+
+        # Hold the file open, then write some bytes to it
+        p = self.mount_a.open_background("open_file")
+        self.mount_a.write_n_mb("open_file", size_mb)
+        open_file_ino = self.mount_a.path_to_ino("open_file")
+
+        self.assertEqual(self.get_session(mount_a_client_id)['num_caps'], 2)
+
+        # Unlink the dentry
+        self.mount_a.run_shell(["rm", "-f", "open_file"])
+
+        # Wait to see the stray count increment
+        self.wait_until_equal(
+            lambda: self.get_mdc_stat("num_strays"),
+            expect_val=1, timeout=60, reject_fn=lambda x: x > 1)
+
+        # See that while the stray count has incremented, none have passed
+        # on to the purge queue
+        self.assertEqual(self.get_mdc_stat("strays_created"), 1)
+        self.assertEqual(self.get_mdc_stat("strays_enqueued"), 0)
+
+        # See that the client still holds 2 caps
+        self.assertEqual(self.get_session(mount_a_client_id)['num_caps'], 2)
+
+        # See that the data objects remain in the data pool
+        self.assertTrue(self.fs.data_objects_present(open_file_ino, size_mb * 1024 * 1024))
+
+        # Now close the file
+        self.mount_a.kill_background(p)
+
+        # Wait to see the client cap count decrement
+        self.wait_until_equal(
+            lambda: self.get_session(mount_a_client_id)['num_caps'],
+            expect_val=1, timeout=60, reject_fn=lambda x: x > 2 or x < 1
+        )
+        # Wait to see the purge counter increment, stray count go to zero
+        self._wait_for_counter("mds_cache", "strays_enqueued", 1)
+        self.wait_until_equal(
+            lambda: self.get_mdc_stat("num_strays"),
+            expect_val=0, timeout=6, reject_fn=lambda x: x > 1  # NOTE(review): 6, not 60 -- intentional?
+        )
+        self._wait_for_counter("purge_queue", "pq_executed", 1)
+
+        # See that the data objects no longer exist
+        self.assertTrue(self.fs.data_objects_absent(open_file_ino, size_mb * 1024 * 1024))
+
+        self.await_data_pool_empty()
+
+    def test_hardlink_reintegration(self):
+        """
+        That removal of primary dentry of hardlinked inode results
+        in reintegration of inode into the previously-remote dentry,
+        rather than lingering as a stray indefinitely.
+        """
+        # Write some bytes to file_a
+        size_mb = 8
+        self.mount_a.run_shell(["mkdir", "dir_1"])
+        self.mount_a.write_n_mb("dir_1/file_a", size_mb)
+        ino = self.mount_a.path_to_ino("dir_1/file_a")
+
+        # Create a hardlink named file_b
+        self.mount_a.run_shell(["mkdir", "dir_2"])
+        self.mount_a.run_shell(["ln", "dir_1/file_a", "dir_2/file_b"])
+        self.assertEqual(self.mount_a.path_to_ino("dir_2/file_b"), ino)
+
+        # Flush journal
+        self.fs.mds_asok(['flush', 'journal'])
+
+        # See that backtrace for the file points to the file_a path
+        pre_unlink_bt = self.fs.read_backtrace(ino)
+        self.assertEqual(pre_unlink_bt['ancestors'][0]['dname'], "file_a")
+
+        # Empty the MDS cache, otherwise the MDS reintegrates the stray as soon as the unlink finishes
+        self.mount_a.umount_wait()
+        self.fs.mds_asok(['flush', 'journal'])
+        self.fs.mds_fail_restart()
+        self.fs.wait_for_daemons()
+        self.mount_a.mount()
+
+        # Unlink file_a
+        self.mount_a.run_shell(["rm", "-f", "dir_1/file_a"])
+
+        # See that a stray was created
+        self.assertEqual(self.get_mdc_stat("num_strays"), 1)
+        self.assertEqual(self.get_mdc_stat("strays_created"), 1)
+
+        # Wait, see that data objects are still present (i.e. that the
+        # stray did not advance to purging given time)
+        time.sleep(30)
+        self.assertTrue(self.fs.data_objects_present(ino, size_mb * 1024 * 1024))
+        self.assertEqual(self.get_mdc_stat("strays_enqueued"), 0)
+
+        # See that before reintegration, the inode's backtrace points to a stray dir
+        self.fs.mds_asok(['flush', 'journal'])
+        self.assertTrue(self.get_backtrace_path(ino).startswith("stray"))
+
+        last_reintegrated = self.get_mdc_stat("strays_reintegrated")
+
+        # Do a metadata operation on the remaining link (mv is heavy handed, but
+        # others like touch may be satisfied from caps without poking MDS)
+        self.mount_a.run_shell(["mv", "dir_2/file_b", "dir_2/file_c"])
+
+        # Stray reintegration should happen as a result of the eval_remote call
+        # on responding to a client request.
+        self.wait_until_equal(
+            lambda: self.get_mdc_stat("num_strays"),
+            expect_val=0,
+            timeout=60
+        )
+
+        # See the reintegration counter increment
+        curr_reintegrated = self.get_mdc_stat("strays_reintegrated")
+        self.assertGreater(curr_reintegrated, last_reintegrated)
+        last_reintegrated = curr_reintegrated
+
+        # Flush the journal
+        self.fs.mds_asok(['flush', 'journal'])
+
+        # See that the backtrace for the file points to the remaining link's path
+        post_reint_bt = self.fs.read_backtrace(ino)
+        self.assertEqual(post_reint_bt['ancestors'][0]['dname'], "file_c")
+
+        # This time the MDS should reintegrate the stray when the unlink finishes
+        self.mount_a.run_shell(["ln", "dir_2/file_c", "dir_2/file_d"])
+        self.mount_a.run_shell(["rm", "-f", "dir_2/file_c"])
+
+        # Stray reintegration should happen as a result of the notify_stray call
+        # on completion of unlink
+        self.wait_until_equal(
+            lambda: self.get_mdc_stat("num_strays"),
+            expect_val=0,
+            timeout=60
+        )
+
+        # See the reintegration counter increment
+        curr_reintegrated = self.get_mdc_stat("strays_reintegrated")
+        self.assertGreater(curr_reintegrated, last_reintegrated)
+        last_reintegrated = curr_reintegrated
+
+        # Flush the journal
+        self.fs.mds_asok(['flush', 'journal'])
+
+        # See that the backtrace for the file points to the newest link's path
+        post_reint_bt = self.fs.read_backtrace(ino)
+        self.assertEqual(post_reint_bt['ancestors'][0]['dname'], "file_d")
+
+        # Now really delete it
+        self.mount_a.run_shell(["rm", "-f", "dir_2/file_d"])
+        self._wait_for_counter("mds_cache", "strays_enqueued", 1)
+        self._wait_for_counter("purge_queue", "pq_executed", 1)
+
+        self.assert_purge_idle()
+        self.assertTrue(self.fs.data_objects_absent(ino, size_mb * 1024 * 1024))
+
+        # We caused the inode to go stray 3 times
+        self.assertEqual(self.get_mdc_stat("strays_created"), 3)
+        # We purged it only once, at the very end
+        self.assertEqual(self.get_mdc_stat("strays_enqueued"), 1)
+
+    def test_mv_hardlink_cleanup(self):
+        """
+        That when doing a rename from A to B, and B has hardlinks,
+        then we make a stray for B which is then reintegrated
+        into one of its hardlinks.
+        """
+        # Create file_a, file_b, and a hardlink to file_b
+        size_mb = 8
+        self.mount_a.write_n_mb("file_a", size_mb)
+        file_a_ino = self.mount_a.path_to_ino("file_a")
+
+        self.mount_a.write_n_mb("file_b", size_mb)
+        file_b_ino = self.mount_a.path_to_ino("file_b")
+
+        self.mount_a.run_shell(["ln", "file_b", "linkto_b"])
+        self.assertEqual(self.mount_a.path_to_ino("linkto_b"), file_b_ino)
+
+        # mv file_a file_b
+        self.mount_a.run_shell(["mv", "file_a", "file_b"])
+
+        # Stray reintegration should happen as a result of the notify_stray call on
+        # completion of rename
+        self.wait_until_equal(
+            lambda: self.get_mdc_stat("num_strays"),
+            expect_val=0,
+            timeout=60
+        )
+
+        self.assertEqual(self.get_mdc_stat("strays_created"), 1)
+        self.assertGreaterEqual(self.get_mdc_stat("strays_reintegrated"), 1)
+
+        # No data objects should have been deleted, as both files still have linkage.
+        self.assertTrue(self.fs.data_objects_present(file_a_ino, size_mb * 1024 * 1024))
+        self.assertTrue(self.fs.data_objects_present(file_b_ino, size_mb * 1024 * 1024))
+
+        self.fs.mds_asok(['flush', 'journal'])
+
+        # The old file_b inode's backtrace should now name its surviving hardlink
+        post_reint_bt = self.fs.read_backtrace(file_b_ino)
+        self.assertEqual(post_reint_bt['ancestors'][0]['dname'], "linkto_b")
+
+    def _setup_two_ranks(self):
+        """Get two active MDSs, stop/fail all other daemons; return (rank_0_id, rank_1_id)."""
+        self.fs.set_max_mds(2)
+
+        # See that we have two active MDSs
+        self.wait_until_equal(lambda: len(self.fs.get_active_names()), 2, 30,
+                              reject_fn=lambda v: v > 2 or v < 1)
+
+        active_mds_names = self.fs.get_active_names()
+        rank_0_id = active_mds_names[0]
+        rank_1_id = active_mds_names[1]
+        log.info("Ranks 0 and 1 are {0} and {1}".format(
+            rank_0_id, rank_1_id))
+
+        # Get rid of other MDS daemons so that it's easier to know which
+        # daemons to expect in which ranks after restarts
+        for unneeded_mds in set(self.mds_cluster.mds_ids) - {rank_0_id, rank_1_id}:
+            self.mds_cluster.mds_stop(unneeded_mds)
+            self.mds_cluster.mds_fail(unneeded_mds)
+
+        return rank_0_id, rank_1_id
+
+    def _force_migrate(self, to_id, path, watch_ino):
+        """
+        :param to_id: MDS id to move it to; pin is hard-coded to "1" (rank 1? confirm)
+        :param path: Filesystem path (string) to move
+        :param watch_ino: Inode number to look for at destination to confirm move
+        :return: None (raises RuntimeError if not auth on to_id after ~60 polls)
+        """
+        self.mount_a.run_shell(["setfattr", "-n", "ceph.dir.pin", "-v", "1", path])
+
+        # Poll the MDS cache dump to watch for the export completing
+        migrated = False
+        migrate_timeout = 60
+        migrate_elapsed = 0
+        while not migrated:
+            data = self.fs.mds_asok(["dump", "cache"], to_id)
+            for inode_data in data:
+                if inode_data['ino'] == watch_ino:
+                    log.debug("Found ino in cache: {0}".format(json.dumps(inode_data, indent=2)))
+                    if inode_data['is_auth'] is True:
+                        migrated = True
+                    break
+
+            if not migrated:
+                if migrate_elapsed > migrate_timeout:
+                    raise RuntimeError("Migration hasn't happened after {0}s!".format(migrate_elapsed))
+                else:
+                    migrate_elapsed += 1
+                    time.sleep(1)
+
+    def _is_stopped(self, rank):
+        daemons = self.fs.get_mds_map()['info'].values()
+        return all(daemon['rank'] != rank for daemon in daemons)
+
+    def test_purge_on_shutdown(self):
+        """
+        That when an MDS rank is shut down, its purge queue is
+        drained in the process.
+        """
+        rank_0_id, rank_1_id = self._setup_two_ranks()
+
+        self.set_conf("mds.{0}".format(rank_1_id), 'mds_max_purge_files', "0")
+        self.mds_cluster.mds_fail_restart(rank_1_id)
+        self.fs.wait_for_daemons()
+
+        file_count = 5
+
+        self.mount_a.create_n_files("delete_me/file", file_count)
+
+        self._force_migrate(rank_1_id, "delete_me",
+                            self.mount_a.path_to_ino("delete_me/file_0"))
+
+        self.mount_a.run_shell(["rm", "-rf", Raw("delete_me/*")])
+        self.mount_a.umount_wait()
+
+        # See all the strays go into the purge queue
+        self._wait_for_counter("mds_cache", "strays_created", file_count, mds_id=rank_1_id)
+        self._wait_for_counter("mds_cache", "strays_enqueued", file_count, mds_id=rank_1_id)
+        self.assertEqual(self.get_stat("mds_cache", "num_strays", mds_id=rank_1_id), 0)
+
+        # See nothing get purged from the purge queue (yet)
+        time.sleep(10)
+        self.assertEqual(self.get_stat("purge_queue", "pq_executed", mds_id=rank_1_id), 0)
+
+        # Shut down rank 1
+        self.fs.set_max_mds(1)
+        self.fs.deactivate(1)
+
+        # It shouldn't proceed past stopping because it's still not allowed
+        # to purge
+        time.sleep(10)
+        self.assertEqual(self.get_stat("purge_queue", "pq_executed", mds_id=rank_1_id), 0)
+        self.assertFalse(self._is_stopped(1))
+
+        # Permit the daemon to start purging again
+        self.fs.mon_manager.raw_cluster_cmd('tell', 'mds.{0}'.format(rank_1_id),
+                                            'injectargs',
+                                            "--mds_max_purge_files 100")
+
+        # It should now proceed through shutdown
+        self.wait_until_true(
+            lambda: self._is_stopped(1),
+            timeout=60
+        )
+
+        # ...and in the process purge all that data
+        self.await_data_pool_empty()
+
+    def test_migration_on_shutdown(self):
+        """
+        That when an MDS rank is shut down, any non-purgeable strays
+        get migrated to another rank.
+        """
+
+        rank_0_id, rank_1_id = self._setup_two_ranks()
+
+        # Create a non-purgeable stray in a ~mds1 stray directory
+        # by doing a hard link and deleting the original file
+        self.mount_a.run_shell(["mkdir", "dir_1", "dir_2"])
+        self.mount_a.run_shell(["touch", "dir_1/original"])
+        self.mount_a.run_shell(["ln", "dir_1/original", "dir_2/linkto"])
+
+        self._force_migrate(rank_1_id, "dir_1",
+                            self.mount_a.path_to_ino("dir_1/original"))
+
+        # Empty the MDS cache, otherwise the MDS reintegrates the stray as soon as the unlink finishes
+        self.mount_a.umount_wait()
+        self.fs.mds_asok(['flush', 'journal'], rank_0_id)
+        self.fs.mds_asok(['flush', 'journal'], rank_1_id)
+        self.fs.mds_fail_restart()
+        self.fs.wait_for_daemons()
+
+        active_mds_names = self.fs.get_active_names()
+        rank_0_id = active_mds_names[0]
+        rank_1_id = active_mds_names[1]
+
+        self.mount_a.mount()
+
+        self.mount_a.run_shell(["rm", "-f", "dir_1/original"])
+        self.mount_a.umount_wait()
+
+        self._wait_for_counter("mds_cache", "strays_created", 1,
+                               mds_id=rank_1_id)
+
+        # Shut down rank 1 (same helpers as test_purge_on_shutdown, so a failed command is not ignored)
+        self.fs.set_max_mds(1)
+        self.fs.deactivate(1)
+
+        # Wait til we get to a single active MDS mdsmap state
+        self.wait_until_true(lambda: self._is_stopped(1), timeout=120)
+
+        # See that the stray counter on rank 0 has incremented
+        self.assertEqual(self.get_mdc_stat("strays_created", rank_0_id), 1)
+
+    def assert_backtrace(self, ino, expected_path):
+        """
+        Check that the backtrace read back for inode `ino` spells out
+        `expected_path` (given like /foo/bar), compared element by element.
+        """
+        want = expected_path.strip("/").split("/")
+        ancestors = self.fs.read_backtrace(ino)['ancestors']
+        got = [dn['dname'] for dn in ancestors][::-1]
+        self.assertListEqual(want, got)
+
+    def get_backtrace_path(self, ino):
+        # Join the ancestors' dnames in reverse of the order the backtrace lists them.
+        ancestors = self.fs.read_backtrace(ino)['ancestors']
+        return "/".join([entry['dname'] for entry in ancestors][::-1])
+
+    def assert_purge_idle(self):
+        """
+        Sanity check for points where the PurgeQueue is expected to be
+        idle: the stray gauges and the in-flight purge gauges reported
+        by the MDS perf counters must all read zero.
+        """
+        idle_gauges = (("mds_cache", ("num_strays", "num_strays_delayed")),
+                       ("purge_queue", ("pq_executing", "pq_executing_ops")))
+        dumps = [self.fs.mds_asok(['perf', 'dump', subsys])[subsys] for subsys, _ in idle_gauges]
+        for dump, (_, gauges) in zip(dumps, idle_gauges):
+            for gauge in gauges:
+                self.assertEqual(dump[gauge], 0)
+
+    def test_mv_cleanup(self):
+        """
+        That when doing a rename from A to B, and B has no hardlinks,
+        then we make a stray for B and purge it.
+        """
+        # Create file_a and file_b, write some to both
+        size_mb = 8
+        self.mount_a.write_n_mb("file_a", size_mb)
+        file_a_ino = self.mount_a.path_to_ino("file_a")
+        self.mount_a.write_n_mb("file_b", size_mb)
+        file_b_ino = self.mount_a.path_to_ino("file_b")
+
+        self.fs.mds_asok(['flush', 'journal'])
+        self.assert_backtrace(file_a_ino, "file_a")
+        self.assert_backtrace(file_b_ino, "file_b")
+
+        # mv file_a file_b
+        self.mount_a.run_shell(['mv', 'file_a', 'file_b'])
+
+        # See that stray counter increments
+        self.assertEqual(self.get_mdc_stat("strays_created"), 1)
+        # Wait for purge counter to increment
+        self._wait_for_counter("mds_cache", "strays_enqueued", 1)
+        self._wait_for_counter("purge_queue", "pq_executed", 1)
+
+        self.assert_purge_idle()
+
+        # The data of the overwritten file_b inode should have been purged
+        self.assertTrue(self.fs.data_objects_absent(file_b_ino, size_mb * 1024 * 1024))
+
+        # Backtrace of the renamed inode should have updated from file_a to file_b
+        self.fs.mds_asok(['flush', 'journal'])
+        self.assert_backtrace(file_a_ino, "file_b")
+
+        # file_a's data should still exist
+        self.assertTrue(self.fs.data_objects_present(file_a_ino, size_mb * 1024 * 1024))
+
+    def _pool_df(self, pool_name):
+        """
+        Return the stats dict of one pool from the cluster "df" output, e.g.
+            {
+                "kb_used": 0,
+                "bytes_used": 0,
+                "max_avail": 19630292406,
+                "objects": 0
+            }
+
+        :param pool_name: Which pool (must exist)
+        """
+        df_json = self.fs.mon_manager.raw_cluster_cmd("df", "--format=json-pretty")
+        pools = json.loads(df_json)['pools']
+        match = next((p for p in pools if p['name'] == pool_name), None)
+        if match is None:
+            raise RuntimeError("Pool '{0}' not found".format(pool_name))
+        return match['stats']
+
+    def await_data_pool_empty(self):
+        # Wait until "df" reports zero objects in the data pool (timeout=60).
+        def pool_is_empty():
+            pool_name = self.fs.get_data_pool_name()
+            return self._pool_df(pool_name)['objects'] == 0
+        self.wait_until_true(pool_is_empty, timeout=60)
+
+    def test_snapshot_remove(self):
+        """
+        That removal of a snapshot that references a now-unlinked file results
+        in purging on the stray for the file.
+        """
+        # Enable snapshots
+        self.fs.mon_manager.raw_cluster_cmd("mds", "set", "allow_new_snaps", "true",
+                                            "--yes-i-really-mean-it")
+
+        # Create a dir with a file in it
+        size_mb = 8
+        self.mount_a.run_shell(["mkdir", "snapdir"])
+        self.mount_a.run_shell(["mkdir", "snapdir/subdir"])
+        self.mount_a.write_test_pattern("snapdir/subdir/file_a", size_mb * 1024 * 1024)
+        file_a_ino = self.mount_a.path_to_ino("snapdir/subdir/file_a")
+
+        # Snapshot the dir
+        self.mount_a.run_shell(["mkdir", "snapdir/.snap/snap1"])
+
+        # Cause the head revision to deviate from the snapshot
+        self.mount_a.write_n_mb("snapdir/subdir/file_a", size_mb)
+
+        # Flush the journal so that backtraces, dirfrag objects will actually be written
+        self.fs.mds_asok(["flush", "journal"])
+
+        # Unlink the file
+        self.mount_a.run_shell(["rm", "-f", "snapdir/subdir/file_a"])
+        self.mount_a.run_shell(["rmdir", "snapdir/subdir"])
+
+        # Unmount the client because when I come back to check the data is still
+        # in the file I don't want to just see what's in the page cache.
+        self.mount_a.umount_wait()
+
+        self.assertEqual(self.get_mdc_stat("strays_created"), 2)
+
+        # FIXME: at this stage we see a purge and the stray count drops to
+        # zero, but there's actually still a stray, so at the very
+        # least the StrayManager stats code is slightly off
+
+        self.mount_a.mount()
+
+        # See that the data from the snapshotted revision of the file is still present
+        # and correct
+        self.mount_a.validate_test_pattern("snapdir/.snap/snap1/subdir/file_a", size_mb * 1024 * 1024)
+
+        # Remove the snapshot
+        self.mount_a.run_shell(["rmdir", "snapdir/.snap/snap1"])
+
+        # Purging file_a doesn't happen until after we've flushed the journal, because
+        # it is referenced by the snapshotted subdir, and the snapshot isn't really
+        # gone until the journal references to it are gone
+        self.fs.mds_asok(["flush", "journal"])
+
+        # Wait for purging to complete, which requires the OSDMap to propagate to the OSDs.
+        # See also: http://tracker.ceph.com/issues/20072
+        self.wait_until_true(
+            lambda: self.fs.data_objects_absent(file_a_ino, size_mb * 1024 * 1024),
+            timeout=60
+        )
+
+        # See that a purge happens now
+        self._wait_for_counter("mds_cache", "strays_enqueued", 2)
+        self._wait_for_counter("purge_queue", "pq_executed", 2)
+
+        self.await_data_pool_empty()
+
+    def test_fancy_layout(self):
+        """
+        That a stray file with a non-default (striped) layout is purged.
+        """
+
+        file_name = "fancy_layout_file"
+        self.mount_a.run_shell(["touch", file_name])
+
+        file_layout = "stripe_unit=1048576 stripe_count=4 object_size=8388608"
+        self.mount_a.setfattr(file_name, "ceph.file.layout", file_layout)
+
+        # 35MB requires 7 objects
+        size_mb = 35
+        self.mount_a.write_n_mb(file_name, size_mb)
+
+        self.mount_a.run_shell(["rm", "-f", file_name])
+        self.fs.mds_asok(["flush", "journal"])
+
+        # Can't use self.fs.data_objects_absent here: it does not support fancy layouts
+        self.await_data_pool_empty()
+
+    def test_dirfrag_limit(self):
+        """
+        That the directory fragment size cannot exceed mds_bal_fragment_size_max (using a limit of 50 in all configurations).
+
+        That fragmentation (forced) will allow more entries to be created.
+
+        That unlinking fails when the stray directory fragment becomes too large and that unlinking may continue once those strays are purged.
+        """
+
+        self.fs.set_allow_dirfrags(True)
+
+        LOW_LIMIT = 50
+        for mds in self.fs.get_daemon_names():
+            self.fs.mds_asok(["config", "set", "mds_bal_fragment_size_max", str(LOW_LIMIT)], mds)
+
+        try:
+            self.mount_a.run_python(dedent("""
+                import os
+                path = os.path.join("{path}", "subdir")
+                os.mkdir(path)
+                for n in range(0, {file_count}):
+                    open(os.path.join(path, "%s" % n), 'w').write("%s" % n)
+                """.format(
+            path=self.mount_a.mountpoint,
+            file_count=LOW_LIMIT+1
+            )))
+        except CommandFailedError:
+            pass # ENOSPC
+        else:
+            raise RuntimeError("fragment size exceeded")
+
+        # Now test that we can go beyond the limit if we fragment the directory
+
+        self.mount_a.run_python(dedent("""
+            import os
+            path = os.path.join("{path}", "subdir2")
+            os.mkdir(path)
+            for n in range(0, {file_count}):
+                open(os.path.join(path, "%s" % n), 'w').write("%s" % n)
+            dfd = os.open(path, os.O_DIRECTORY)
+            os.fsync(dfd)
+            """.format(
+        path=self.mount_a.mountpoint,
+        file_count=LOW_LIMIT
+        )))
+
+        # Ensure that subdir2 is fragmented
+        mds_id = self.fs.get_active_names()[0]
+        self.fs.mds_asok(["dirfrag", "split", "/subdir2", "0/0", "1"], mds_id)
+
+        # remount+flush (release client caps)
+        self.mount_a.umount_wait()
+        self.fs.mds_asok(["flush", "journal"], mds_id)
+        self.mount_a.mount()
+        self.mount_a.wait_until_mounted()
+
+        # Create 50% more files than the current fragment limit
+        self.mount_a.run_python(dedent("""
+            import os
+            path = os.path.join("{path}", "subdir2")
+            for n in range({file_count}, ({file_count}*3)//2):
+                open(os.path.join(path, "%s" % n), 'w').write("%s" % n)
+            """.format(
+        path=self.mount_a.mountpoint,
+        file_count=LOW_LIMIT
+        )))
+
+        # Now test the stray directory size is limited and recovers
+        strays_before = self.get_mdc_stat("strays_created")
+        try:
+            self.mount_a.run_python(dedent("""
+                import os
+                path = os.path.join("{path}", "subdir3")
+                os.mkdir(path)
+                for n in range({file_count}):
+                    fpath = os.path.join(path, "%s" % n)
+                    f = open(fpath, 'w')
+                    f.write("%s" % n)
+                    f.close()
+                    os.unlink(fpath)
+                """.format(
+            path=self.mount_a.mountpoint,
+            file_count=LOW_LIMIT*10 # 10 stray directories, should collide before this count
+            )))
+        except CommandFailedError:
+            pass # ENOSPC
+        else:
+            raise RuntimeError("fragment size exceeded")
+
+        strays_after = self.get_mdc_stat("strays_created")
+        self.assertGreaterEqual(strays_after-strays_before, LOW_LIMIT)
+
+        self._wait_for_counter("mds_cache", "strays_enqueued", strays_after)
+        self._wait_for_counter("purge_queue", "pq_executed", strays_after)
+
+        self.mount_a.run_python(dedent("""
+            import os
+            path = os.path.join("{path}", "subdir4")
+            os.mkdir(path)
+            for n in range({file_count}):
+                fpath = os.path.join(path, "%s" % n)
+                f = open(fpath, 'w')
+                f.write("%s" % n)
+                f.close()
+                os.unlink(fpath)
+            """.format(
+        path=self.mount_a.mountpoint,
+        file_count=LOW_LIMIT
+        )))
+
+    def test_purge_queue_upgrade(self):
+        """
+        That when starting on a system with no purge queue in the metadata
+        pool, we silently create one.
+        Simulated by removing object 500.00000000 while the MDS daemons are down.
+        """
+
+        self.mds_cluster.mds_stop()
+        self.mds_cluster.mds_fail()
+        self.fs.rados(["rm", "500.00000000"])
+        self.mds_cluster.mds_restart()
+        self.fs.wait_for_daemons()
+
+    def test_purge_queue_op_rate(self):
+        """
+        A busy purge queue is meant to aggregate operations sufficiently
+        that our RADOS ops to the metadata pool are not O(files).  Check
+        that that is so.
+        :return:
+        """
+
+        # For low rates of deletion, the rate of metadata ops actually
+        # will be O(files), so to see the desired behaviour we have to give
+        # the system a significant quantity, i.e. an order of magnitude
+        # more than the number of files it will purge at one time.
+
+        max_purge_files = 2
+
+        self.set_conf('mds', 'mds_bal_frag', 'false')
+        self.set_conf('mds', 'mds_max_purge_files', "%d" % max_purge_files)
+        self.fs.mds_fail_restart()
+        self.fs.wait_for_daemons()
+
+        phase_1_files = 256
+        phase_2_files = 512
+
+        self.mount_a.run_shell(["mkdir", "phase1"])
+        self.mount_a.create_n_files("phase1/file", phase_1_files)
+
+        self.mount_a.run_shell(["mkdir", "phase2"])
+        self.mount_a.create_n_files("phase2/file", phase_2_files)
+
+        def unlink_and_count_ops(path, expected_deletions):
+            initial_ops = self.get_stat("objecter", "op")
+            initial_pq_executed = self.get_stat("purge_queue", "pq_executed")
+
+            self.mount_a.run_shell(["rm", "-rf", path])
+
+            self._wait_for_counter(
+                "purge_queue", "pq_executed", initial_pq_executed + expected_deletions
+            )
+
+            final_ops = self.get_stat("objecter", "op")
+
+            # Calculation of the *overhead* operations, i.e. do not include
+            # the operations where we actually delete files.
+            return final_ops - initial_ops - expected_deletions
+
+        self.fs.mds_asok(['flush', 'journal'])
+        phase1_ops = unlink_and_count_ops("phase1/", phase_1_files + 1)
+
+        self.fs.mds_asok(['flush', 'journal'])
+        phase2_ops = unlink_and_count_ops("phase2/", phase_2_files + 1)
+
+        log.info("Phase 1: {0}".format(phase1_ops))
+        log.info("Phase 2: {0}".format(phase2_ops))
+
+        # The success criterion is that deleting double the number
+        # of files doesn't generate double the number of overhead ops
+        # -- this comparison is a rough approximation of that rule.
+        self.assertTrue(phase2_ops < phase1_ops * 1.25)
+
+        # Finally, check that our activity did include properly quiescing
+        # the queue (i.e. call to Journaler::write_head in the right place),
+        # by restarting the MDS and checking that it doesn't try re-executing
+        # any of the work we did.
+        self.fs.mds_asok(['flush', 'journal'])  # flush to ensure no strays
+                                                # hanging around
+        self.fs.mds_fail_restart()
+        self.fs.wait_for_daemons()
+        time.sleep(10)
+        self.assertEqual(self.get_stat("purge_queue", "pq_executed"), 0)
+
+    def test_replicated_delete_speed(self):
+        """
+        That deletions of replicated metadata are not pathologically slow
+        """
+        rank_0_id, rank_1_id = self._setup_two_ranks()
+
+        self.set_conf("mds.{0}".format(rank_1_id), 'mds_max_purge_files', "0")
+        self.mds_cluster.mds_fail_restart(rank_1_id)
+        self.fs.wait_for_daemons()
+
+        file_count = 10
+
+        self.mount_a.create_n_files("delete_me/file", file_count)
+
+        self._force_migrate(rank_1_id, "delete_me",
+                            self.mount_a.path_to_ino("delete_me/file_0"))
+
+        begin = datetime.datetime.now()
+        self.mount_a.run_shell(["rm", "-rf", Raw("delete_me/*")])
+        end = datetime.datetime.now()
+
+        # What we're really checking here is that client operations complete immediately
+        # rather than waiting for the next tick: the rm must take < 1/4 tick per file.
+        tick_period = float(self.fs.get_config("mds_tick_interval",
+                                               service_type="mds"))
+
+        duration = (end - begin).total_seconds()
+        self.assertLess(duration, (file_count * tick_period) * 0.25)
+
diff --git a/src/ceph/qa/tasks/cephfs/test_volume_client.py b/src/ceph/qa/tasks/cephfs/test_volume_client.py
new file mode 100644
index 0000000..0876af9
--- /dev/null
+++ b/src/ceph/qa/tasks/cephfs/test_volume_client.py
@@ -0,0 +1,1016 @@
+import json
+import logging
+import time
+import os
+from textwrap import dedent
+from tasks.cephfs.cephfs_test_case import CephFSTestCase
+from tasks.cephfs.fuse_mount import FuseMount
+from teuthology.exceptions import CommandFailedError
+
+log = logging.getLogger(__name__)
+
+
+class TestVolumeClient(CephFSTestCase):
+    # One for looking at the global filesystem, one for being
+    # the VolumeClient, two for mounting the created shares
+    CLIENTS_REQUIRED = 4
+
+    def _volume_client_python(self, client, script, vol_prefix=None, ns_prefix=None):
+        # Run `script` on `client` between vc.connect() and vc.disconnect(); returns run_python()'s
+        # result. The template stays zero-indented because it can't be dedented together with `script`.
+        if vol_prefix is not None:
+            vol_prefix = repr(vol_prefix)  # a valid literal even with quotes/backslashes/empty
+        if ns_prefix is not None:
+            ns_prefix = repr(ns_prefix)  # a prefix left as None is emitted as the literal None
+        return client.run_python("""
+from ceph_volume_client import CephFSVolumeClient, VolumePath
+import logging
+log = logging.getLogger("ceph_volume_client")
+log.addHandler(logging.StreamHandler())
+log.setLevel(logging.DEBUG)
+vc = CephFSVolumeClient("manila", "{conf_path}", "ceph", {vol_prefix}, {ns_prefix})
+vc.connect()
+{payload}
+vc.disconnect()
+        """.format(payload=script, conf_path=client.config_path, vol_prefix=vol_prefix, ns_prefix=ns_prefix))
+
+    def _sudo_write_file(self, remote, path, data):
+        """
+        Write data to a remote file as super user
+
+        :param remote: Remote site.
+        :param path: Path on the remote being written to.
+        :param data: Data to be written.
+
+        The data is passed on stdin to a `sudo python -c` helper that copies it to path.
+        """
+        remote.run(
+            args=[
+                'sudo',
+                'python',
+                '-c',
+                'import shutil, sys; shutil.copyfileobj(sys.stdin, file(sys.argv[1], "wb"))',
+                path,
+            ],
+            stdin=data,
+        )
+
+    def _configure_vc_auth(self, mount, id_name):
+        """
+        Get or create the key for client.<id_name> and install it as the mount's keyring
+        """
+        out = self.fs.mon_manager.raw_cluster_cmd(
+            "auth", "get-or-create", "client.{name}".format(name=id_name),
+            "mds", "allow *",
+            "osd", "allow rw",
+            "mon", "allow *"
+        )
+        mount.client_id = id_name
+        self._sudo_write_file(mount.client_remote, mount.get_keyring_path(), out)
+        self.set_conf("client.{name}".format(name=id_name), "keyring", mount.get_keyring_path())
+
+    def _configure_guest_auth(self, volumeclient_mount, guest_mount,
+                              guest_entity, mount_path,
+                              namespace_prefix=None, readonly=False,
+                              tenant_id=None):
+        """
+        Set up auth credentials for the guest client to mount a volume.
+
+        :param volumeclient_mount: mount used as the handle for driving
+                                   volumeclient.
+        :param guest_mount: mount used by the guest client.
+        :param guest_entity: auth ID used by the guest client.
+        :param mount_path: path of the volume.
+        :param namespace_prefix: name prefix of the RADOS namespace, which
+                                 is used for the volume's layout.
+        :param readonly: defaults to False. If set to 'True' only read-only
+                         mount access is granted to the guest.
+        :param tenant_id: (OpenStack) tenant ID of the guest client.
+        """
+
+        head, volume_id = os.path.split(mount_path)
+        head, group_id = os.path.split(head)
+        head, volume_prefix = os.path.split(head)
+        volume_prefix = "/" + volume_prefix
+
+        # Authorize the guest auth ID. NOTE(review): tenant_id=None is sent as the string "None" -- confirm.
+        key = self._volume_client_python(volumeclient_mount, dedent("""
+            vp = VolumePath("{group_id}", "{volume_id}")
+            auth_result = vc.authorize(vp, "{guest_entity}", readonly={readonly},
+                                       tenant_id="{tenant_id}")
+            print auth_result['auth_key']
+        """.format(
+            group_id=group_id,
+            volume_id=volume_id,
+            guest_entity=guest_entity,
+            readonly=readonly,
+            tenant_id=tenant_id)), volume_prefix, namespace_prefix
+        )
+
+        # CephFSVolumeClient's authorize() does not return the secret
+        # key to a caller who isn't multi-tenant aware. Explicitly
+        # query the key for such a client.
+        if not tenant_id:
+            key = self.fs.mon_manager.raw_cluster_cmd(
+            "auth", "get-key", "client.{name}".format(name=guest_entity),
+            )
+
+        # The guest auth ID should exist.
+        existing_ids = [a['entity'] for a in self.auth_list()]
+        self.assertIn("client.{0}".format(guest_entity), existing_ids)
+
+        # Create keyring file for the guest client.
+        keyring_txt = dedent("""
+        [client.{guest_entity}]
+            key = {key}
+
+        """.format(
+            guest_entity=guest_entity,
+            key=key
+        ))
+        guest_mount.client_id = guest_entity
+        self._sudo_write_file(guest_mount.client_remote,
+                              guest_mount.get_keyring_path(),
+                              keyring_txt)
+
+        # Add a guest client section to the ceph config file.
+        self.set_conf("client.{0}".format(guest_entity), "client quota", "True")
+        self.set_conf("client.{0}".format(guest_entity), "debug client", "20")
+        self.set_conf("client.{0}".format(guest_entity), "debug objecter", "20")
+        self.set_conf("client.{0}".format(guest_entity),
+                      "keyring", guest_mount.get_keyring_path())
+
+    def test_default_prefix(self):
+        group_id = "grpid"
+        volume_id = "volid"
+        DEFAULT_VOL_PREFIX = "volumes"
+        DEFAULT_NS_PREFIX = "fsvolumens_"
+
+        self.mount_b.umount_wait()
+        self._configure_vc_auth(self.mount_b, "manila")
+
+        # Create a volume without passing any volume or namespace prefix
+        self._volume_client_python(self.mount_b, dedent("""
+            vp = VolumePath("{group_id}", "{volume_id}")
+            vc.create_volume(vp, 10, data_isolated=True)
+        """.format(
+            group_id=group_id,
+            volume_id=volume_id,
+        )))
+
+        # The dir should be created
+        self.mount_a.stat(os.path.join(DEFAULT_VOL_PREFIX, group_id, volume_id))
+
+        # The namespace should be the default namespace prefix followed by the volume ID
+        ns_in_attr = self.mount_a.getfattr(os.path.join(DEFAULT_VOL_PREFIX, group_id, volume_id), "ceph.dir.layout.pool_namespace")
+        namespace = "{0}{1}".format(DEFAULT_NS_PREFIX, volume_id)
+        self.assertEqual(namespace, ns_in_attr)
+
+
+    def test_lifecycle(self):
+        """
+        General smoke test for create, authorize, mount, deauthorize, destroy
+        """
+
+        # I'm going to use mount_c later as a guest for mounting the created
+        # shares
+        self.mounts[2].umount_wait()
+
+        # I'm going to leave mount_b unmounted and just use it as a handle for
+        # driving volumeclient.  It's a little hacky but we don't have a more
+        # general concept for librados/libcephfs clients as opposed to full
+        # blown mounting clients.
+        self.mount_b.umount_wait()
+        self._configure_vc_auth(self.mount_b, "manila")
+
+        guest_entity = "guest"
+        group_id = "grpid"
+        volume_id = "volid"
+
+        volume_prefix = "/myprefix"
+        namespace_prefix = "mynsprefix_"
+
+        # Create a 100MB volume
+        volume_size = 100
+        mount_path = self._volume_client_python(self.mount_b, dedent("""
+            vp = VolumePath("{group_id}", "{volume_id}")
+            create_result = vc.create_volume(vp, 1024*1024*{volume_size})
+            print create_result['mount_path']
+        """.format(
+            group_id=group_id,
+            volume_id=volume_id,
+            volume_size=volume_size
+        )), volume_prefix, namespace_prefix)
+
+        # The dir should be created
+        self.mount_a.stat(os.path.join("myprefix", group_id, volume_id))
+
+        # Authorize and configure credentials for the guest to mount
+        # the volume.
+        self._configure_guest_auth(self.mount_b, self.mounts[2], guest_entity,
+                                   mount_path, namespace_prefix)
+        self.mounts[2].mount(mount_path=mount_path)
+
+        # The kernel client doesn't have the quota-based df behaviour,
+        # or quotas at all, so only exercise the client behaviour when
+        # running fuse.
+        if isinstance(self.mounts[2], FuseMount):
+            # df should see volume size, same as the quota set on volume's dir
+            self.assertEqual(self.mounts[2].df()['total'],
+                             volume_size * 1024 * 1024)
+            self.assertEqual(
+                    self.mount_a.getfattr(
+                        os.path.join(volume_prefix.strip("/"), group_id, volume_id),
+                        "ceph.quota.max_bytes"),
+                    "%s" % (volume_size * 1024 * 1024))
+
+            # df granularity is 4MB block so have to write at least that much
+            data_bin_mb = 4
+            self.mounts[2].write_n_mb("data.bin", data_bin_mb)
+
+            # Write something outside volume to check this space usage is
+            # not reported in the volume's DF.
+            other_bin_mb = 8
+            self.mount_a.write_n_mb("other.bin", other_bin_mb)
+
+            # global: df should see all the writes (data + other).  This is a >=
+            # rather than a == because the global space used includes all pools
+            def check_df():
+                used = self.mount_a.df()['used']
+                return used >= (other_bin_mb * 1024 * 1024)
+
+            self.wait_until_true(check_df, timeout=30)
+
+            # Hack: do a metadata IO to kick rstats
+            self.mounts[2].run_shell(["touch", "foo"])
+
+            # volume: df should see the data_bin_mb consumed from quota, same
+            # as the rbytes for the volume's dir
+            self.wait_until_equal(
+                    lambda: self.mounts[2].df()['used'],
+                    data_bin_mb * 1024 * 1024, timeout=60)
+            self.wait_until_equal(
+                    lambda: self.mount_a.getfattr(
+                        os.path.join(volume_prefix.strip("/"), group_id, volume_id),
+                        "ceph.dir.rbytes"),
+                    "%s" % (data_bin_mb * 1024 * 1024), timeout=60)
+
+            # sync so that file data is persisted to rados
+            self.mounts[2].run_shell(["sync"])
+
+            # Our data should live in the volume's own rados namespace
+            pool_name = self.mount_a.getfattr(os.path.join("myprefix", group_id, volume_id), "ceph.dir.layout.pool")
+            namespace = "{0}{1}".format(namespace_prefix, volume_id)
+            ns_in_attr = self.mount_a.getfattr(os.path.join("myprefix", group_id, volume_id), "ceph.dir.layout.pool_namespace")
+            self.assertEqual(namespace, ns_in_attr)
+
+            objects_in_ns = set(self.fs.rados(["ls"], pool=pool_name, namespace=namespace).splitlines())
+            self.assertNotEqual(objects_in_ns, set())  # splitlines(): empty output -> empty set
+
+            # De-authorize the guest
+            self._volume_client_python(self.mount_b, dedent("""
+                vp = VolumePath("{group_id}", "{volume_id}")
+                vc.deauthorize(vp, "{guest_entity}")
+                vc.evict("{guest_entity}")
+            """.format(
+                group_id=group_id,
+                volume_id=volume_id,
+                guest_entity=guest_entity
+            )), volume_prefix, namespace_prefix)
+
+            # Once deauthorized, the client should be unable to do any more metadata ops
+            # The way that the client currently behaves here is to block (it acts like
+            # it has lost network, because there is nothing to tell it that its messages
+            # are being dropped because its identity is gone)
+            background = self.mounts[2].write_n_mb("rogue.bin", 1, wait=False)
+            time.sleep(10)  # Approximate check for 'stuck' as 'still running after 10s'
+            self.assertFalse(background.finished)
+
+            # After deauthorisation, the client ID should be gone (this was the only
+            # volume it was authorised for)
+            self.assertNotIn("client.{0}".format(guest_entity), [e['entity'] for e in self.auth_list()])
+
+            # Clean up the dead mount (ceph-fuse's behaviour here is a bit undefined)
+            self.mounts[2].kill()
+            self.mounts[2].kill_cleanup()
+            try:
+                background.wait()
+            except CommandFailedError:
+                # We killed the mount out from under you
+                pass
+
+        self._volume_client_python(self.mount_b, dedent("""
+            vp = VolumePath("{group_id}", "{volume_id}")
+            vc.delete_volume(vp)
+            vc.purge_volume(vp)
+        """.format(
+            group_id=group_id,
+            volume_id=volume_id,
+        )), volume_prefix, namespace_prefix)
+
+    def test_idempotency(self):
+        """
+        That each volumeclient call can be issued twice in a row, for plain and data-isolated volumes
+        """
+        self.mount_b.umount_wait()
+        self._configure_vc_auth(self.mount_b, "manila")
+
+        guest_entity = "guest"
+        group_id = "grpid"
+        volume_id = "volid"
+        self._volume_client_python(self.mount_b, dedent("""
+            vp = VolumePath("{group_id}", "{volume_id}")
+            vc.create_volume(vp, 10)
+            vc.create_volume(vp, 10)
+            vc.authorize(vp, "{guest_entity}")
+            vc.authorize(vp, "{guest_entity}")
+            vc.deauthorize(vp, "{guest_entity}")
+            vc.deauthorize(vp, "{guest_entity}")
+            vc.delete_volume(vp)
+            vc.delete_volume(vp)
+            vc.purge_volume(vp)
+            vc.purge_volume(vp)
+
+            vc.create_volume(vp, 10, data_isolated=True)
+            vc.create_volume(vp, 10, data_isolated=True)
+            vc.authorize(vp, "{guest_entity}")
+            vc.authorize(vp, "{guest_entity}")
+            vc.deauthorize(vp, "{guest_entity}")
+            vc.deauthorize(vp, "{guest_entity}")
+            vc.evict("{guest_entity}")
+            vc.evict("{guest_entity}")
+            vc.delete_volume(vp, data_isolated=True)
+            vc.delete_volume(vp, data_isolated=True)
+            vc.purge_volume(vp, data_isolated=True)
+            vc.purge_volume(vp, data_isolated=True)
+        """.format(
+            group_id=group_id,
+            volume_id=volume_id,
+            guest_entity=guest_entity
+        )))
+
+    def test_data_isolated(self):
+        """
+        That data isolated shares get their own pool
+        :return:
+        """
+
+        # Because the teuthology config template sets mon_max_pg_per_osd to
+        # 10000 (i.e. it just tries to ignore health warnings), reset it to something
+        # sane before using volume_client, to avoid creating pools with absurdly large
+        # numbers of PGs.
+        self.set_conf("global", "mon max pg per osd", "300")
+        for mon_daemon_state in self.ctx.daemons.iter_daemons_of_role('mon'):
+            mon_daemon_state.restart()
+
+        self.mount_b.umount_wait()
+        self._configure_vc_auth(self.mount_b, "manila")
+
+        # Calculate how many PGs we'll expect the new volume pool to have
+        osd_map = json.loads(self.fs.mon_manager.raw_cluster_cmd('osd', 'dump', '--format=json-pretty'))
+        max_per_osd = int(self.fs.get_config('mon_max_pg_per_osd'))
+        osd_count = len(osd_map['osds'])
+        max_overall = osd_count * max_per_osd
+
+        existing_pg_count = 0
+        for p in osd_map['pools']:
+            existing_pg_count += p['pg_num']
+
+        expected_pg_num = (max_overall - existing_pg_count) // 10  # explicit floor division: stays an int
+        log.info("max_per_osd {0}".format(max_per_osd))
+        log.info("osd_count {0}".format(osd_count))
+        log.info("max_overall {0}".format(max_overall))
+        log.info("existing_pg_count {0}".format(existing_pg_count))
+        log.info("expected_pg_num {0}".format(expected_pg_num))
+
+        pools_a = json.loads(self.fs.mon_manager.raw_cluster_cmd("osd", "dump", "--format=json-pretty"))['pools']
+
+        group_id = "grpid"
+        volume_id = "volid"
+        self._volume_client_python(self.mount_b, dedent("""
+            vp = VolumePath("{group_id}", "{volume_id}")
+            vc.create_volume(vp, 10, data_isolated=True)
+        """.format(
+            group_id=group_id,
+            volume_id=volume_id,
+        )))
+
+        pools_b = json.loads(self.fs.mon_manager.raw_cluster_cmd("osd", "dump", "--format=json-pretty"))['pools']
+
+        # Should have created one new pool
+        new_pools = set(p['pool_name'] for p in pools_b) - set([p['pool_name'] for p in pools_a])
+        self.assertEqual(len(new_pools), 1)
+
+        # It should have followed the heuristic for PG count
+        # (this is an overly strict test condition, so we may want to remove
+        #  it at some point as/when the logic gets fancier)
+        created_pg_num = self.fs.mon_manager.get_pool_property(list(new_pools)[0], "pg_num")
+        self.assertEqual(expected_pg_num, created_pg_num)
+
+    def test_15303(self):
+        """
+        Reproducer for #15303 "Client holds incorrect complete flag on dir
+        after losing caps" (http://tracker.ceph.com/issues/15303)
+        """
+        for m in self.mounts:
+            m.umount_wait()
+
+        # Create the directories on mount A
+        self.mount_a.mount()
+        self.mount_a.run_shell(["mkdir", "parent1"])
+        self.mount_a.run_shell(["mkdir", "parent2"])
+        self.mount_a.run_shell(["mkdir", "parent1/mydir"])
+
+        # Put a file in parent1/mydir from mount B, then unmount B
+        self.mount_b.mount()
+        self.mount_b.run_shell(["touch", "parent1/mydir/afile"])
+        self.mount_b.umount_wait()
+
+        # List the dir's contents on mount A
+        self.assertListEqual(self.mount_a.ls("parent1/mydir"),
+                             ["afile"])
+
+    def test_evict_client(self):
+        """
+        That a volume client can be evicted based on its auth ID and the volume
+        path it has mounted.
+        """
+
+        if not isinstance(self.mount_a, FuseMount):
+            self.skipTest("Requires FUSE client to inject client metadata")
+
+        # mounts[1] is used as the handle for driving VolumeClient. mounts[2]
+        # and mounts[3] are used as guests to mount the volumes/shares.
+
+        for i in range(1, 4):
+            self.mounts[i].umount_wait()
+
+        volumeclient_mount = self.mounts[1]
+        self._configure_vc_auth(volumeclient_mount, "manila")
+        guest_mounts = (self.mounts[2], self.mounts[3])
+
+        guest_entity = "guest"
+        group_id = "grpid"
+        mount_paths = []
+        volume_ids = []
+
+        # Create two volumes. Authorize 'guest' auth ID to mount the two
+        # volumes. Mount the two volumes. Write data to the volumes.
+        for i in range(2):
+            # Create volume.
+            volume_ids.append("volid_{0}".format(str(i)))
+            mount_paths.append(
+                self._volume_client_python(volumeclient_mount, dedent("""
+                    vp = VolumePath("{group_id}", "{volume_id}")
+                    create_result = vc.create_volume(vp, 10 * 1024 * 1024)
+                    print create_result['mount_path']
+                """.format(
+                    group_id=group_id,
+                    volume_id=volume_ids[i]
+            ))))
+
+            # Authorize 'guest' auth ID to mount the volume.
+            self._configure_guest_auth(volumeclient_mount, guest_mounts[i],
+                                       guest_entity, mount_paths[i])
+
+            # Mount the volume.
+            guest_mounts[i].mountpoint_dir_name = 'mnt.{id}.{suffix}'.format(
+                id=guest_entity, suffix=str(i))
+            guest_mounts[i].mount(mount_path=mount_paths[i])
+            guest_mounts[i].write_n_mb("data.bin", 1)
+
+
+        # Evict the client guest_mounts[0], which uses auth ID 'guest' and has
+        # mounted the first volume (driven via the same VolumeClient handle).
+        self._volume_client_python(volumeclient_mount, dedent("""
+            vp = VolumePath("{group_id}", "{volume_id}")
+            vc.deauthorize(vp, "{guest_entity}")
+            vc.evict("{guest_entity}", volume_path=vp)
+        """.format(
+            group_id=group_id,
+            volume_id=volume_ids[0],
+            guest_entity=guest_entity
+        )))
+
+        # Evicted guest client, guest_mounts[0], should not be able to do
+        # any more metadata ops.  It should start failing all operations
+        # when it sees that its own address is in the blacklist.
+        try:
+            guest_mounts[0].write_n_mb("rogue.bin", 1)
+        except CommandFailedError:
+            pass
+        else:
+            raise RuntimeError("post-eviction write should have failed!")
+
+        # The blacklisted guest client should now be unmountable
+        guest_mounts[0].umount_wait()
+
+        # Guest client, guest_mounts[1], using the same auth ID 'guest', but
+        # has mounted the other volume, should be able to use its volume
+        # unaffected.
+        guest_mounts[1].write_n_mb("data.bin.1", 1)
+
+        # Cleanup.
+        for i in range(2):
+            self._volume_client_python(volumeclient_mount, dedent("""
+                vp = VolumePath("{group_id}", "{volume_id}")
+                vc.deauthorize(vp, "{guest_entity}")
+                vc.delete_volume(vp)
+                vc.purge_volume(vp)
+            """.format(
+                group_id=group_id,
+                volume_id=volume_ids[i],
+                guest_entity=guest_entity
+            )))
+
+
+    def test_purge(self):
+        """
+        Reproducer for #15266, exception trying to purge volumes that
+        contain non-ascii filenames.
+
+        Additionally test any other purge corner cases here.
+        """
+        # I'm going to leave mount_b unmounted and just use it as a handle for
+        # driving volumeclient.  It's a little hacky but we don't have a more
+        # general concept for librados/libcephfs clients as opposed to full
+        # blown mounting clients.
+        self.mount_b.umount_wait()
+        self._configure_vc_auth(self.mount_b, "manila")
+
+        group_id = "grpid"
+        # Use a unicode volume ID (like Manila), to reproduce #15266
+        volume_id = u"volid"
+
+        # Create
+        mount_path = self._volume_client_python(self.mount_b, dedent("""
+            vp = VolumePath("{group_id}", u"{volume_id}")
+            create_result = vc.create_volume(vp, 10)
+            print create_result['mount_path']
+        """.format(
+            group_id=group_id,
+            volume_id=volume_id
+        )))
+
+        # Strip leading "/"
+        mount_path = mount_path[1:]
+
+        # A file with non-ascii characters
+        self.mount_a.run_shell(["touch", os.path.join(mount_path, u"b\u00F6b")])
+
+        # A file with no permissions to do anything
+        self.mount_a.run_shell(["touch", os.path.join(mount_path, "noperms")])
+        self.mount_a.run_shell(["chmod", "0000", os.path.join(mount_path, "noperms")])
+
+        self._volume_client_python(self.mount_b, dedent("""
+            vp = VolumePath("{group_id}", u"{volume_id}")
+            vc.delete_volume(vp)
+            vc.purge_volume(vp)
+        """.format(
+            group_id=group_id,
+            volume_id=volume_id
+        )))
+
+        # Check it's really gone
+        self.assertEqual(self.mount_a.ls("volumes/_deleting"), [])
+        self.assertEqual(self.mount_a.ls("volumes/"), ["_deleting", group_id])
+
+    def test_readonly_authorization(self):
+        """
+        That guest clients can be restricted to read-only mounts of volumes.
+        """
+
+        volumeclient_mount = self.mounts[1]
+        guest_mount = self.mounts[2]
+        volumeclient_mount.umount_wait()
+        guest_mount.umount_wait()
+
+        # Configure volumeclient_mount as the handle for driving volumeclient.
+        self._configure_vc_auth(volumeclient_mount, "manila")
+
+        guest_entity = "guest"
+        group_id = "grpid"
+        volume_id = "volid"
+
+        # Create a volume.
+        mount_path = self._volume_client_python(volumeclient_mount, dedent("""
+            vp = VolumePath("{group_id}", "{volume_id}")
+            create_result = vc.create_volume(vp, 1024*1024*10)
+            print create_result['mount_path']
+        """.format(
+            group_id=group_id,
+            volume_id=volume_id,
+        )))
+
+        # Authorize and configure credentials for the guest to mount the
+        # the volume with read-write access.
+        self._configure_guest_auth(volumeclient_mount, guest_mount, guest_entity,
+                                   mount_path, readonly=False)
+
+        # Mount the volume, and write to it.
+        guest_mount.mount(mount_path=mount_path)
+        guest_mount.write_n_mb("data.bin", 1)
+
+        # Change the guest auth ID's authorization to read-only mount access.
+        self._volume_client_python(volumeclient_mount, dedent("""
+            vp = VolumePath("{group_id}", "{volume_id}")
+            vc.deauthorize(vp, "{guest_entity}")
+        """.format(
+            group_id=group_id,
+            volume_id=volume_id,
+            guest_entity=guest_entity
+        )))
+        self._configure_guest_auth(volumeclient_mount, guest_mount, guest_entity,
+                                   mount_path, readonly=True)
+
+        # The effect of the change in access level to read-only is not
+        # immediate. The guest sees the change only after a remount of
+        # the volume.
+        guest_mount.umount_wait()
+        guest_mount.mount(mount_path=mount_path)
+
+        # Read existing content of the volume.
+        self.assertListEqual(guest_mount.ls(guest_mount.mountpoint), ["data.bin"])
+        # Cannot write into read-only volume.
+        with self.assertRaises(CommandFailedError):
+            guest_mount.write_n_mb("rogue.bin", 1)
+
+    def test_get_authorized_ids(self):
+        """
+        That for a volume, the authorized IDs and their access levels
+        can be obtained using CephFSVolumeClient's get_authorized_ids().
+        """
+        volumeclient_mount = self.mounts[1]
+        volumeclient_mount.umount_wait()
+
+        # Configure volumeclient_mount as the handle for driving volumeclient.
+        self._configure_vc_auth(volumeclient_mount, "manila")
+
+        group_id = "grpid"
+        volume_id = "volid"
+        guest_entity_1 = "guest1"
+        guest_entity_2 = "guest2"
+
+        log.info("print group ID: {0}".format(group_id))
+
+        # Create a volume.
+        auths = self._volume_client_python(volumeclient_mount, dedent("""
+            vp = VolumePath("{group_id}", "{volume_id}")
+            vc.create_volume(vp, 1024*1024*10)
+            auths = vc.get_authorized_ids(vp)
+            print auths
+        """.format(
+            group_id=group_id,
+            volume_id=volume_id,
+        )))
+        # Check the list of authorized IDs for the volume.
+        expected_result = None
+        self.assertEqual(str(expected_result), auths)
+
+        # Allow two auth IDs access to the volume.
+        auths = self._volume_client_python(volumeclient_mount, dedent("""
+            vp = VolumePath("{group_id}", "{volume_id}")
+            vc.authorize(vp, "{guest_entity_1}", readonly=False)
+            vc.authorize(vp, "{guest_entity_2}", readonly=True)
+            auths = vc.get_authorized_ids(vp)
+            print sorted(auths)
+        """.format(
+            group_id=group_id,
+            volume_id=volume_id,
+            guest_entity_1=guest_entity_1,
+            guest_entity_2=guest_entity_2,
+        )))
+        # Check the sorted list of authorized IDs and their access levels.
+        expected_result = [(u'guest1', u'rw'), (u'guest2', u'r')]
+        self.assertEqual(str(expected_result), auths)
+
+        # Disallow both the auth IDs' access to the volume.
+        auths = self._volume_client_python(volumeclient_mount, dedent("""
+            vp = VolumePath("{group_id}", "{volume_id}")
+            vc.deauthorize(vp, "{guest_entity_1}")
+            vc.deauthorize(vp, "{guest_entity_2}")
+            auths = vc.get_authorized_ids(vp)
+            print auths
+        """.format(
+            group_id=group_id,
+            volume_id=volume_id,
+            guest_entity_1=guest_entity_1,
+            guest_entity_2=guest_entity_2,
+        )))
+        # Check the list of authorized IDs for the volume.
+        expected_result = None
+        self.assertEqual(str(expected_result), auths)
+
+    def test_multitenant_volumes(self):
+        """
+        That volume access can be restricted to a tenant.
+
+        That metadata used to enforce tenant isolation of
+        volumes is stored as a two-way mapping between auth
+        IDs and volumes that they're authorized to access.
+        """
+        import ast  # parses the repr() text printed by the remote scripts
+        volumeclient_mount = self.mounts[1]
+        volumeclient_mount.umount_wait()
+
+        # Configure volumeclient_mount as the handle for driving volumeclient.
+        self._configure_vc_auth(volumeclient_mount, "manila")
+
+        group_id = "groupid"
+        volume_id = "volumeid"
+
+        # Guest clients belonging to different tenants, but using the same
+        # auth ID.
+        auth_id = "guest"
+        guestclient_1 = {
+            "auth_id": auth_id,
+            "tenant_id": "tenant1",
+        }
+        guestclient_2 = {
+            "auth_id": auth_id,
+            "tenant_id": "tenant2",
+        }
+
+        # Create a volume.
+        self._volume_client_python(volumeclient_mount, dedent("""
+            vp = VolumePath("{group_id}", "{volume_id}")
+            vc.create_volume(vp, 1024*1024*10)
+        """.format(
+            group_id=group_id,
+            volume_id=volume_id,
+        )))
+
+        # Check that volume metadata file is created on volume creation.
+        vol_metadata_filename = "_{0}:{1}.meta".format(group_id, volume_id)
+        self.assertIn(vol_metadata_filename, self.mounts[0].ls("volumes"))
+
+        # Authorize 'guestclient_1', using auth ID 'guest' and belonging to
+        # 'tenant1', with 'rw' access to the volume.
+        self._volume_client_python(volumeclient_mount, dedent("""
+            vp = VolumePath("{group_id}", "{volume_id}")
+            vc.authorize(vp, "{auth_id}", tenant_id="{tenant_id}")
+        """.format(
+            group_id=group_id,
+            volume_id=volume_id,
+            auth_id=guestclient_1["auth_id"],
+            tenant_id=guestclient_1["tenant_id"]
+        )))
+
+        # Check that auth metadata file for auth ID 'guest', is
+        # created on authorizing 'guest' access to the volume.
+        auth_metadata_filename = "${0}.meta".format(guestclient_1["auth_id"])
+        self.assertIn(auth_metadata_filename, self.mounts[0].ls("volumes"))
+
+        # Verify that the auth metadata file stores the tenant ID that the
+        # auth ID belongs to, the auth ID's authorized access levels
+        # for different volumes, versioning details, etc.
+        expected_auth_metadata = {
+            u"version": 2,
+            u"compat_version": 1,
+            u"dirty": False,
+            u"tenant_id": u"tenant1",
+            u"volumes": {
+                u"groupid/volumeid": {
+                    u"dirty": False,
+                    u"access_level": u"rw",
+                }
+            }
+        }
+
+        auth_metadata = self._volume_client_python(volumeclient_mount, dedent("""
+            vp = VolumePath("{group_id}", "{volume_id}")
+            auth_metadata = vc._auth_metadata_get("{auth_id}")
+            print auth_metadata
+        """.format(
+            group_id=group_id,
+            volume_id=volume_id,
+            auth_id=guestclient_1["auth_id"],
+        )))
+        self.assertEqual(expected_auth_metadata, ast.literal_eval(auth_metadata))
+
+        # Verify that the volume metadata file stores info about auth IDs
+        # and their access levels to the volume, versioning details, etc.
+        expected_vol_metadata = {
+            u"version": 2,
+            u"compat_version": 1,
+            u"auths": {
+                u"guest": {
+                    u"dirty": False,
+                    u"access_level": u"rw"
+                }
+            }
+        }
+
+        vol_metadata = self._volume_client_python(volumeclient_mount, dedent("""
+            vp = VolumePath("{group_id}", "{volume_id}")
+            volume_metadata = vc._volume_metadata_get(vp)
+            print volume_metadata
+        """.format(
+            group_id=group_id,
+            volume_id=volume_id,
+        )))
+        self.assertEqual(expected_vol_metadata, ast.literal_eval(vol_metadata))
+
+        # Cannot authorize 'guestclient_2' to access the volume.
+        # It uses auth ID 'guest', which has already been used by a
+        # 'guestclient_1' belonging to an another tenant for accessing
+        # the volume.
+        with self.assertRaises(CommandFailedError):
+            self._volume_client_python(volumeclient_mount, dedent("""
+                vp = VolumePath("{group_id}", "{volume_id}")
+                vc.authorize(vp, "{auth_id}", tenant_id="{tenant_id}")
+            """.format(
+                group_id=group_id,
+                volume_id=volume_id,
+                auth_id=guestclient_2["auth_id"],
+                tenant_id=guestclient_2["tenant_id"]
+            )))
+
+        # Check that auth metadata file is cleaned up on removing
+        # auth ID's only access to a volume.
+        self._volume_client_python(volumeclient_mount, dedent("""
+            vp = VolumePath("{group_id}", "{volume_id}")
+            vc.deauthorize(vp, "{guest_entity}")
+        """.format(
+            group_id=group_id,
+            volume_id=volume_id,
+            guest_entity=guestclient_1["auth_id"]
+        )))
+
+        self.assertNotIn(auth_metadata_filename, self.mounts[0].ls("volumes"))
+
+        # Check that volume metadata file is cleaned up on volume deletion.
+        self._volume_client_python(volumeclient_mount, dedent("""
+            vp = VolumePath("{group_id}", "{volume_id}")
+            vc.delete_volume(vp)
+        """.format(
+            group_id=group_id,
+            volume_id=volume_id,
+        )))
+        self.assertNotIn(vol_metadata_filename, self.mounts[0].ls("volumes"))
+
+    def test_recover_metadata(self):
+        """
+        That volume client can recover from partial auth updates using
+        metadata files, which store auth info and its update status info.
+        """
+        volumeclient_mount = self.mounts[1]
+        volumeclient_mount.umount_wait()
+
+        # Configure volumeclient_mount as the handle for driving volumeclient.
+        self._configure_vc_auth(volumeclient_mount, "manila")
+
+        group_id = "groupid"
+        volume_id = "volumeid"
+
+        guestclient = {
+            "auth_id": "guest",
+            "tenant_id": "tenant",
+        }
+
+        # Create a volume.
+        self._volume_client_python(volumeclient_mount, dedent("""
+            vp = VolumePath("{group_id}", "{volume_id}")
+            vc.create_volume(vp, 1024*1024*10)
+        """.format(
+            group_id=group_id,
+            volume_id=volume_id,
+        )))
+
+        # Authorize 'guestclient' access to the volume.
+        self._volume_client_python(volumeclient_mount, dedent("""
+            vp = VolumePath("{group_id}", "{volume_id}")
+            vc.authorize(vp, "{auth_id}", tenant_id="{tenant_id}")
+        """.format(
+            group_id=group_id,
+            volume_id=volume_id,
+            auth_id=guestclient["auth_id"],
+            tenant_id=guestclient["tenant_id"]
+        )))
+
+        # Check that auth metadata file for auth ID 'guest' is created.
+        auth_metadata_filename = "${0}.meta".format(guestclient["auth_id"])
+        self.assertIn(auth_metadata_filename, self.mounts[0].ls("volumes"))
+
+        # Induce partial auth update state by modifying the auth metadata file,
+        # and then run recovery procedure.
+        self._volume_client_python(volumeclient_mount, dedent("""
+            vp = VolumePath("{group_id}", "{volume_id}")
+            auth_metadata = vc._auth_metadata_get("{auth_id}")
+            auth_metadata['dirty'] = True
+            vc._auth_metadata_set("{auth_id}", auth_metadata)
+            vc.recover()
+        """.format(
+            group_id=group_id,
+            volume_id=volume_id,
+            auth_id=guestclient["auth_id"],
+        )))
+
+    def test_put_object(self):
+        vc_mount = self.mounts[1]
+        vc_mount.umount_wait()
+        self._configure_vc_auth(vc_mount, "manila")
+
+        obj_data = 'test data'
+        obj_name = 'test_vc_obj_1'
+        pool_name = self.fs.get_data_pool_names()[0]
+
+        self._volume_client_python(vc_mount, dedent("""
+            vc.put_object("{pool_name}", "{obj_name}", b"{obj_data}")
+        """.format(
+            pool_name = pool_name,
+            obj_name = obj_name,
+            obj_data = obj_data
+        )))
+
+        read_data = self.fs.rados(['get', obj_name, '-'], pool=pool_name)
+        self.assertEqual(obj_data, read_data)
+
+    def test_get_object(self):
+        vc_mount = self.mounts[1]
+        vc_mount.umount_wait()
+        self._configure_vc_auth(vc_mount, "manila")
+
+        obj_data = 'test_data'
+        obj_name = 'test_vc_ob_2'
+        pool_name = self.fs.get_data_pool_names()[0]
+
+        self.fs.rados(['put', obj_name, '-'], pool=pool_name, stdin_data=obj_data)
+
+        self._volume_client_python(vc_mount, dedent("""
+            data_read = vc.get_object("{pool_name}", "{obj_name}")
+            assert data_read == b"{obj_data}"
+        """.format(
+            pool_name = pool_name,
+            obj_name = obj_name,
+            obj_data = obj_data
+        )))
+
+    def test_delete_object(self):
+        vc_mount = self.mounts[1]
+        vc_mount.umount_wait()
+        self._configure_vc_auth(vc_mount, "manila")
+
+        obj_data = 'test data'
+        obj_name = 'test_vc_obj_3'
+        pool_name = self.fs.get_data_pool_names()[0]
+
+        self.fs.rados(['put', obj_name, '-'], pool=pool_name, stdin_data=obj_data)
+
+        self._volume_client_python(vc_mount, dedent("""
+            data_read = vc.delete_object("{pool_name}", "{obj_name}")
+        """.format(
+            pool_name = pool_name,
+            obj_name = obj_name,
+        )))
+
+        with self.assertRaises(CommandFailedError):
+            self.fs.rados(['stat', obj_name], pool=pool_name)
+
+        # Check idempotency -- no error raised trying to delete non-existent
+        # object
+        self._volume_client_python(vc_mount, dedent("""
+            data_read = vc.delete_object("{pool_name}", "{obj_name}")
+        """.format(
+            pool_name = pool_name,
+            obj_name = obj_name,
+        )))
+
+    def test_21501(self):
+        """
+        Reproducer for #21501 "ceph_volume_client: sets invalid caps for
+        existing IDs with no caps" (http://tracker.ceph.com/issues/21501)
+        """
+
+        vc_mount = self.mounts[1]
+        vc_mount.umount_wait()
+
+        # Configure vc_mount as the handle for driving volumeclient
+        self._configure_vc_auth(vc_mount, "manila")
+
+        # Create a volume
+        group_id = "grpid"
+        volume_id = "volid"
+        mount_path = self._volume_client_python(vc_mount, dedent("""
+            vp = VolumePath("{group_id}", "{volume_id}")
+            create_result = vc.create_volume(vp, 1024*1024*10)
+            print create_result['mount_path']
+        """.format(
+            group_id=group_id,
+            volume_id=volume_id
+        )))
+
+        # Create an auth ID with no caps
+        guest_id = '21501'
+        self.fs.mon_manager.raw_cluster_cmd_result(
+            'auth', 'get-or-create', 'client.{0}'.format(guest_id))
+
+        guest_mount = self.mounts[2]
+        guest_mount.umount_wait()
+
+        # Set auth caps for the auth ID using the volumeclient
+        self._configure_guest_auth(vc_mount, guest_mount, guest_id, mount_path)
+
+        # Mount the volume in the guest using the auth ID to assert that the
+        # auth caps are valid
+        guest_mount.mount(mount_path=mount_path)
diff --git a/src/ceph/qa/tasks/cephfs_test_runner.py b/src/ceph/qa/tasks/cephfs_test_runner.py
new file mode 100644
index 0000000..d57e85d
--- /dev/null
+++ b/src/ceph/qa/tasks/cephfs_test_runner.py
@@ -0,0 +1,209 @@
+import contextlib
+import logging
+import os
+import unittest
+from unittest import suite, loader, case
+from teuthology.task import interactive
+from teuthology import misc
+from tasks.cephfs.filesystem import Filesystem, MDSCluster, CephCluster
+from tasks.mgr.mgr_test_case import MgrCluster
+
+log = logging.getLogger(__name__)
+
+
+class DecoratingLoader(loader.TestLoader):
+    """
+    TestLoader variant that stamps a fixed set of attributes onto every
+    test class (or single test case instance) it loads.
+    """
+    def __init__(self, params):
+        super(DecoratingLoader, self).__init__()
+        self._params = params
+
+    def _apply_params(self, obj):
+        for attr_name in self._params:
+            setattr(obj, attr_name, self._params[attr_name])
+
+    def loadTestsFromTestCase(self, testCaseClass):
+        self._apply_params(testCaseClass)
+        return super(DecoratingLoader, self).loadTestsFromTestCase(testCaseClass)
+
+    def loadTestsFromName(self, name, module=None):
+        loaded = super(DecoratingLoader, self).loadTestsFromName(name, module)
+
+        # When the name refers to a single test method the suite holds exactly
+        # one TestCase instance; tag that instance directly.
+        members = list(loaded)
+        if len(members) == 1 and isinstance(members[0], case.TestCase):
+            self._apply_params(members[0])
+
+        return loaded
+
+
+class LogStream(object):
+    def __init__(self):
+        self.buffer = ""
+
+    def write(self, data):
+        """Buffer data and log every completed line at INFO level."""
+        pending = (self.buffer + data).split("\n")
+        # The last piece is an unterminated partial line (possibly empty).
+        self.buffer = pending.pop()
+        for complete in pending:
+            log.info(complete)
+
+    def flush(self):
+        """No-op: a trailing partial line simply stays in the buffer."""
+
+
+class InteractiveFailureResult(unittest.TextTestResult):
+    """
+    Specialization that implements interactive-on-error style behavior
+    while still recording the failure/error on the result object.
+    """
+    ctx = None
+
+    def addFailure(self, test, err):
+        log.error(self._exc_info_to_string(err, test))
+        log.error("Failure in test '{0}', going interactive".format(
+            self.getDescription(test)))
+        super(InteractiveFailureResult, self).addFailure(test, err)
+        interactive.task(ctx=self.ctx, config=None)
+
+    def addError(self, test, err):
+        log.error(self._exc_info_to_string(err, test))
+        log.error("Error in test '{0}', going interactive".format(
+            self.getDescription(test)))
+        super(InteractiveFailureResult, self).addError(test, err)
+        interactive.task(ctx=self.ctx, config=None)
+
+
+@contextlib.contextmanager
+def task(ctx, config):
+    """
+    Run the CephFS test cases.
+
+    Run everything in tasks/cephfs/test_*.py:
+
+    ::
+
+        tasks:
+          - install:
+          - ceph:
+          - ceph-fuse:
+          - cephfs_test_runner:
+
+    `modules` argument allows running only some specific modules:
+
+    ::
+
+        tasks:
+            ...
+          - cephfs_test_runner:
+              modules:
+                - tasks.cephfs.test_sessionmap
+                - tasks.cephfs.test_auto_repair
+
+    By default, any cases that can't be run on the current cluster configuration
+    will generate a failure.  When the optional `fail_on_skip` argument is set
+    to false, any tests that can't be run on the current configuration will
+    simply be skipped:
+
+    ::
+
+        tasks:
+            ...
+          - cephfs_test_runner:
+              fail_on_skip: false
+    """
+
+    ceph_cluster = CephCluster(ctx)
+
+    if len(list(misc.all_roles_of_type(ctx.cluster, 'mds'))):
+        mds_cluster = MDSCluster(ctx)
+        fs = Filesystem(ctx)
+    else:
+        mds_cluster = None
+        fs = None
+
+    if len(list(misc.all_roles_of_type(ctx.cluster, 'mgr'))):
+        mgr_cluster = MgrCluster(ctx)
+    else:
+        mgr_cluster = None
+
+    # Mount objects, sorted by ID
+    if hasattr(ctx, 'mounts'):
+        mounts = [v for k, v in sorted(ctx.mounts.items(), key=lambda kv: kv[0])]
+    else:
+        # The test configuration has a filesystem but no fuse/kclient mounts
+        mounts = []
+
+    decorating_loader = DecoratingLoader({
+        "ctx": ctx,
+        "mounts": mounts,
+        "fs": fs,
+        "ceph_cluster": ceph_cluster,
+        "mds_cluster": mds_cluster,
+        "mgr_cluster": mgr_cluster,
+    })
+
+    fail_on_skip = (config or {}).get('fail_on_skip', True)
+
+    # Put useful things onto ctx for interactive debugging
+    ctx.fs = fs
+    ctx.mds_cluster = mds_cluster
+    ctx.mgr_cluster = mgr_cluster
+
+    # Depending on config, either load specific modules, or scan for modules
+    if config and 'modules' in config and config['modules']:
+        module_suites = []
+        for mod_name in config['modules']:
+            # Test names like cephfs.test_auto_repair
+            module_suites.append(decorating_loader.loadTestsFromName(mod_name))
+        overall_suite = suite.TestSuite(module_suites)
+    else:
+        # Default, run all tests
+        overall_suite = decorating_loader.discover(
+            os.path.join(
+                os.path.dirname(os.path.abspath(__file__)),
+                "cephfs/"
+            )
+        )
+
+    if ctx.config.get("interactive-on-error", False):
+        InteractiveFailureResult.ctx = ctx
+        result_class = InteractiveFailureResult
+    else:
+        result_class = unittest.TextTestResult
+
+    class LoggingResult(result_class):
+        def startTest(self, test):
+            log.info("Starting test: {0}".format(self.getDescription(test)))
+            return super(LoggingResult, self).startTest(test)
+
+        def addSkip(self, test, reason):
+            if fail_on_skip:
+                # Don't just call addFailure because that requires a traceback
+                self.failures.append((test, reason))
+            else:
+                super(LoggingResult, self).addSkip(test, reason)
+
+    # Execute!
+    result = unittest.TextTestRunner(
+        stream=LogStream(),
+        resultclass=LoggingResult,
+        verbosity=2,
+        failfast=True).run(overall_suite)
+
+    if not result.wasSuccessful():
+        result.printErrors()  # duplicate output at end for convenience
+
+        bad_tests = []
+        for test, error in result.errors:
+            bad_tests.append(str(test))
+        for test, failure in result.failures:
+            bad_tests.append(str(test))
+
+        raise RuntimeError("Test failure: {0}".format(", ".join(bad_tests)))
+
+    yield
diff --git a/src/ceph/qa/tasks/check_counter.py b/src/ceph/qa/tasks/check_counter.py
new file mode 100644
index 0000000..a3d84e0
--- /dev/null
+++ b/src/ceph/qa/tasks/check_counter.py
@@ -0,0 +1,96 @@
+
+import logging
+import json
+
+from teuthology.task import Task
+from teuthology import misc
+import ceph_manager
+
+log = logging.getLogger(__name__)
+
+
+class CheckCounter(Task):
+    """
+    Use this task to validate that some daemon perf counters were
+    incremented by the nested tasks.
+
+    Config:
+     'cluster_name': optional, specify which cluster
+     'counters': dictionary of daemon type to list of performance counters.
+     'dry_run': just log the value of the counters, don't fail if they
+                aren't nonzero.
+
+    Success condition is that for all of the named counters, at least
+    one of the daemons of that type has the counter nonzero.
+
+    Example to check cephfs dirfrag splits are happening:
+    - install:
+    - ceph:
+    - ceph-fuse:
+    - check-counter:
+        counters:
+            mds:
+                - "mds.dir_split"
+    - workunit: ...
+    """
+
+    def start(self):
+        log.info("START")
+
+    def end(self):
+        """Read perf counters; raise unless dry_run if any stayed at zero."""
+        cluster_name = self.config.get('cluster_name', None)
+        dry_run = self.config.get('dry_run', False)
+        targets = self.config.get('counters', {})
+
+        if cluster_name is None:
+            cluster_name = next(iter(self.ctx.managers))
+
+        for daemon_type, counters in targets.items():
+            # List of 'a', 'b', 'c'...
+            daemon_ids = list(misc.all_roles_of_type(self.ctx.cluster, daemon_type))
+            daemons = dict([(daemon_id,
+                             self.ctx.daemons.get_daemon(daemon_type, daemon_id))
+                            for daemon_id in daemon_ids])
+
+            seen = set()
+
+            for daemon_id, daemon in daemons.items():
+                if not daemon.running():
+                    log.info("Ignoring daemon {0}, it isn't running".format(daemon_id))
+                    continue
+                log.debug("Getting stats from {0}".format(daemon_id))
+
+                manager = self.ctx.managers[cluster_name]
+                proc = manager.admin_socket(daemon_type, daemon_id, ["perf", "dump"])
+                response_data = proc.stdout.getvalue().strip()
+                if response_data:
+                    perf_dump = json.loads(response_data)
+                else:
+                    log.warning("No admin socket response from {0}, skipping".format(daemon_id))
+                    continue
+
+                for counter in counters:
+                    subsys, counter_id = counter.split(".")
+                    if subsys not in perf_dump or counter_id not in perf_dump[subsys]:
+                        log.warning("Counter '{0}' not found on daemon {1}.{2}".format(
+                            counter, daemon_type, daemon_id))
+                        continue
+                    value = perf_dump[subsys][counter_id]
+
+                    log.info("Daemon {0}.{1} {2}={3}".format(
+                        daemon_type, daemon_id, counter, value
+                    ))
+
+                    if value > 0:
+                        seen.add(counter)
+
+            if not dry_run:
+                unseen = set(counters) - set(seen)
+                if unseen:
+                    raise RuntimeError("The following counters failed to be set "
+                                       "on {0} daemons: {1}".format(
+                        daemon_type, unseen
+                    ))
+
+task = CheckCounter
diff --git a/src/ceph/qa/tasks/cifs_mount.py b/src/ceph/qa/tasks/cifs_mount.py
new file mode 100644
index 0000000..b282b0b
--- /dev/null
+++ b/src/ceph/qa/tasks/cifs_mount.py
@@ -0,0 +1,137 @@
+"""
+Mount cifs clients.  Unmount when finished.
+"""
+import contextlib
+import logging
+import os
+
+from teuthology import misc as teuthology
+from teuthology.orchestra import run
+
+log = logging.getLogger(__name__)
+
+@contextlib.contextmanager
+def task(ctx, config):
+    """
+    Mount/unmount a cifs client.
+
+    The config is optional and defaults to mounting on all clients. If
+    a config is given, it is expected to be a list of clients to do
+    this operation on.
+
+    Example that starts smbd and mounts cifs on all nodes::
+
+        tasks:
+        - ceph:
+        - samba:
+        - cifs-mount:
+        - interactive:
+
+    Example that splits smbd and cifs::
+
+        tasks:
+        - ceph:
+        - samba: [samba.0]
+        - cifs-mount: [client.0]
+        - ceph-fuse: [client.1]
+        - interactive:
+
+    Example that specifies the share name::
+
+        tasks:
+        - ceph:
+        - ceph-fuse:
+        - samba:
+            samba.0:
+                cephfuse: "{testdir}/mnt.0"
+        - cifs-mount:
+            client.0:
+                share: cephfuse
+
+    :param ctx: Context
+    :param config: Configuration
+    """
+    log.info('Mounting cifs clients...')
+
+    if config is None:
+        config = dict(('client.{id}'.format(id=id_), None)
+                  for id_ in teuthology.all_roles_of_type(ctx.cluster, 'client'))
+    elif isinstance(config, list):
+        config = dict((name, None) for name in config)
+
+    clients = list(teuthology.get_clients(ctx=ctx, roles=config.keys()))
+
+    from .samba import get_sambas
+    samba_roles = ['samba.{id_}'.format(id_=id_) for id_ in teuthology.all_roles_of_type(ctx.cluster, 'samba')]
+    sambas = list(get_sambas(ctx=ctx, roles=samba_roles))
+    (ip, _) = sambas[0][1].ssh.get_transport().getpeername()
+    log.info('samba ip: {ip}'.format(ip=ip))
+
+    for id_, remote in clients:
+        mnt = os.path.join(teuthology.get_testdir(ctx), 'mnt.{id}'.format(id=id_))
+        log.info('Mounting cifs client.{id} at {remote} {mnt}...'.format(
+                id=id_, remote=remote,mnt=mnt))
+
+        remote.run(
+            args=[
+                'mkdir',
+                '--',
+                mnt,
+                ],
+            )
+
+        rolestr = 'client.{id_}'.format(id_=id_)
+        unc = "ceph"
+        log.info("config: {c}".format(c=config))
+        if config[rolestr] is not None and 'share' in config[rolestr]:
+            unc = config[rolestr]['share']
+
+        remote.run(
+            args=[
+                'sudo',
+                'mount',
+                '-t',
+                'cifs',
+                '//{sambaip}/{unc}'.format(sambaip=ip, unc=unc),
+                '-o',
+                'username=ubuntu,password=ubuntu',
+                mnt,
+                ],
+            )
+
+        remote.run(
+            args=[
+                'sudo',
+                'chown',
+                'ubuntu:ubuntu',
+                '{m}/'.format(m=mnt),
+                ],
+            )
+
+    try:
+        yield
+    finally:
+        log.info('Unmounting cifs clients...')
+        import time
+        testdir = teuthology.get_testdir(ctx)
+        # 'mnt' still holds only the last client's path here, so rebuild
+        # each client's own mountpoint instead of reusing it.
+        mnts = [(remote, os.path.join(testdir, 'mnt.{id}'.format(id=id_)))
+                for id_, remote in clients]
+        for remote, mnt in mnts:
+            remote.run(args=['sudo', 'umount', mnt])
+        for remote, mnt in mnts:
+            while True:
+                try:
+                    # grep succeeds only while rmdir reports "busy": retry then.
+                    remote.run(
+                        args=[
+                            'rmdir', '--', mnt,
+                            run.Raw('2>&1'),
+                            run.Raw('|'),
+                            'grep', 'Device or resource busy',
+                            ],
+                        )
+                    time.sleep(1)
+                except Exception:
+                    break
diff --git a/src/ceph/qa/tasks/cram.py b/src/ceph/qa/tasks/cram.py
new file mode 100644
index 0000000..02c6667
--- /dev/null
+++ b/src/ceph/qa/tasks/cram.py
@@ -0,0 +1,155 @@
+"""
+Cram tests
+"""
+import logging
+import os
+
+from teuthology import misc as teuthology
+from teuthology.parallel import parallel
+from teuthology.orchestra import run
+from teuthology.config import config as teuth_config
+
+log = logging.getLogger(__name__)
+
+def task(ctx, config):
+    """
+    Run all cram tests from the specified urls on the specified
+    clients. Each client runs tests in parallel.
+
+    Limitations:
+    Tests must have a .t suffix. Tests with duplicate names will
+    overwrite each other, so only the last one will run.
+
+    For example::
+
+        tasks:
+        - ceph:
+        - cram:
+            clients:
+              client.0:
+              - http://download.ceph.com/qa/test.t
+              - http://download.ceph.com/qa/test2.t
+              client.1: [http://download.ceph.com/qa/test.t]
+            branch: foo
+
+    You can also run a list of cram tests on all clients::
+
+        tasks:
+        - ceph:
+        - cram:
+            clients:
+              all: [http://download.ceph.com/qa/test.t]
+
+    Test urls may contain ``{repo}`` and ``{branch}`` placeholders; they
+    are filled in with the ceph repository name and with the configured
+    ``branch``, ``tag`` or ``sha1`` (first one set, else ``HEAD``).
+
+    :param ctx: Context
+    :param config: Configuration
+    """
+    assert isinstance(config, dict)
+    assert 'clients' in config and isinstance(config['clients'], dict), \
+           'configuration must contain a dictionary of clients'
+
+    clients = teuthology.replace_all_with_clients(ctx.cluster,
+                                                  config['clients'])
+    testdir = teuthology.get_testdir(ctx)
+
+    # The 'workunit' overrides section is merged into this task's config,
+    # so branch/tag/sha1 given there apply here as well.
+    overrides = ctx.config.get('overrides', {})
+    teuthology.deep_merge(config, overrides.get('workunit', {}))
+
+    # Pick the refspec by precedence: branch, then tag, then sha1.
+    refspec = config.get('branch')
+    if refspec is None:
+        refspec = config.get('tag')
+    if refspec is None:
+        refspec = config.get('sha1')
+    if refspec is None:
+        refspec = 'HEAD'
+
+    # hack: the git_url is always ceph-ci or ceph
+    git_url = teuth_config.get_ceph_git_url()
+    repo_name = 'ceph.git'
+    if git_url.count('ceph-ci'):
+        repo_name = 'ceph-ci.git'
+
+    try:
+        for client, tests in clients.iteritems():
+            (remote,) = ctx.cluster.only(client).remotes.iterkeys()
+            client_dir = '{tdir}/archive/cram.{role}'.format(tdir=testdir, role=client)
+            # Create the per-client directory, then a virtualenv with a
+            # pinned cram release installed into it.
+            remote.run(
+                args=[
+                    'mkdir', '--', client_dir,
+                    run.Raw('&&'),
+                    'virtualenv', '{tdir}/virtualenv'.format(tdir=testdir),
+                    run.Raw('&&'),
+                    '{tdir}/virtualenv/bin/pip'.format(tdir=testdir),
+                    'install', 'cram==0.6',
+                    ],
+                )
+            # Download every test file into the client's directory.
+            for test in tests:
+                url = test.format(repo=repo_name, branch=refspec)
+                log.info('fetching test %s for %s', url, client)
+                assert test.endswith('.t'), 'tests must end in .t'
+                remote.run(
+                    args=[
+                        'wget', '-nc', '-nv', '-P', client_dir, '--', url,
+                        ],
+                    )
+
+        # Run each client's tests concurrently.
+        with parallel() as p:
+            for role in clients.iterkeys():
+                p.spawn(_run_tests, ctx, role)
+    finally:
+        # Cleanup runs whether or not setup and the tests succeeded.
+        for client, tests in clients.iteritems():
+            (remote,) = ctx.cluster.only(client).remotes.iterkeys()
+            client_dir = '{tdir}/archive/cram.{role}'.format(tdir=testdir, role=client)
+            # Only the basename of each url is needed to locate the file.
+            test_files = set([test.rsplit('/', 1)[1] for test in tests])
+
+            # remove test files unless they failed
+            # (a test file is kept when a matching '<file>.err' exists)
+            for test_file in test_files:
+                abs_file = os.path.join(client_dir, test_file)
+                remote.run(
+                    args=[
+                        'test', '-f', abs_file + '.err',
+                        run.Raw('||'),
+                        'rm', '-f', '--', abs_file,
+                        ],
+                    )
+
+            # ignore failure since more than one client may
+            # be run on a host, and the client dir should be
+            # non-empty if the test failed
+            remote.run(
+                args=[
+                    'rm', '-rf', '--',
+                    '{tdir}/virtualenv'.format(tdir=testdir),
+                    run.Raw(';'),
+                    'rmdir', '--ignore-fail-on-non-empty', client_dir,
+                    ],
+                )
+
+def _run_tests(ctx, role):
+    """
+    Run every downloaded cram test (``*.t``) for one client role.
+
+    The role must be a string of the form ``client.<id>``; the id is
+    exported to the tests as CEPH_ID and the ceph sha1 recorded in the
+    run summary (``master`` when absent) as CEPH_REF.
+
+    :param ctx: Context
+    :param role: Roles
+    """
+    client_prefix = 'client.'
+    assert isinstance(role, basestring)
+    assert role.startswith(client_prefix)
+    client_id = role[len(client_prefix):]
+    (remote,) = ctx.cluster.only(role).remotes.iterkeys()
+    ceph_ref = ctx.summary.get('ceph-sha1', 'master')
+
+    testdir = teuthology.get_testdir(ctx)
+    log.info('Running tests for %s...', role)
+
+    # The environment assignments and the glob are wrapped in run.Raw;
+    # presumably so the remote shell sees them unquoted -- TODO confirm.
+    cram_cmd = [
+        run.Raw('CEPH_REF={ref}'.format(ref=ceph_ref)),
+        run.Raw('CEPH_ID="{id}"'.format(id=client_id)),
+        'adjust-ulimits',
+        'ceph-coverage',
+        '{tdir}/archive/coverage'.format(tdir=testdir),
+        '{tdir}/virtualenv/bin/cram'.format(tdir=testdir),
+        '-v', '--',
+        run.Raw('{tdir}/archive/cram.{role}/*.t'.format(tdir=testdir, role=role)),
+    ]
+    remote.run(args=cram_cmd, logger=log.getChild(role))
diff --git a/src/ceph/qa/tasks/create_verify_lfn_objects.py b/src/ceph/qa/tasks/create_verify_lfn_objects.py
new file mode 100644
index 0000000..01ab1a3
--- /dev/null
+++ b/src/ceph/qa/tasks/create_verify_lfn_objects.py
@@ -0,0 +1,83 @@
+"""
+Rados model-based integration tests
+"""
+import contextlib
+import logging
+
+log = logging.getLogger(__name__)
+
+@contextlib.contextmanager
+def task(ctx, config):
+    """
+    For each combination of namespace and name_length, create
+    <num_objects> objects with name length <name_length>
+    on entry.  On exit, verify that the objects still exist, can
+    be deleted, and then don't exist.
+
+    The lengths of the namespace and of the prefix both count against
+    <name_length>; the rest of the name is padded with 'a' characters
+    and ends with the object's index.
+
+    Usage::
+
+       create_verify_lfn_objects.py:
+         pool: <pool_name> default: 'data'
+         prefix: <prefix> default: ''
+         namespace: [<namespace>] default: [None]
+         num_objects: <num_objects> default: 10
+         name_length: [<name_length>] default: [400]
+
+    :param ctx: Context; ctx.managers['ceph'] performs the object I/O
+    :param config: Configuration dict (see Usage), or None for defaults
+    """
+    if config is None:
+        config = {}
+    pool = config.get('pool', 'data')
+    num_objects = config.get('num_objects', 10)
+    name_length = config.get('name_length', [400])
+    namespace = config.get('namespace', [None])
+    # An unset (or explicitly null) prefix means "no prefix"; it must be a
+    # string because it is measured and concatenated below.
+    prefix = config.get('prefix') or ''
+    manager = ctx.managers['ceph']
+
+    def object_name(length, ns, i):
+        # Budget the current namespace (not the list of namespaces) and
+        # the prefix against the requested total length.
+        nslength = len(ns) if ns else 0
+        numstr = str(i)
+        fillerlen = length - nslength - len(prefix) - len(numstr)
+        assert fillerlen >= 0
+        return prefix + ('a' * fillerlen) + numstr
+
+    objects = []
+    for length in name_length:
+        for ns in namespace:
+            objects += [(ns, object_name(length, ns, i))
+                        for i in range(num_objects)]
+
+    # Create every object up front; any failure aborts the task.
+    for ns, name in objects:
+        err = manager.do_put(
+            pool,
+            name,
+            '/etc/resolv.conf',
+            namespace=ns)
+        log.info("err is " + str(err))
+        assert err == 0
+
+    try:
+        yield
+    finally:
+        log.info('ceph_verify_lfn_objects verifying...')
+        for ns, name in objects:
+            err = manager.do_get(
+                pool,
+                name,
+                namespace=ns)
+            log.info("err is " + str(err))
+            assert err == 0
+
+        log.info('ceph_verify_lfn_objects deleting...')
+        for ns, name in objects:
+            err = manager.do_rm(
+                pool,
+                name,
+                namespace=ns)
+            log.info("err is " + str(err))
+            assert err == 0
+
+        # A successful read after deletion means the object survived.
+        log.info('ceph_verify_lfn_objects verifying absent...')
+        for ns, name in objects:
+            err = manager.do_get(
+                pool,
+                name,
+                namespace=ns)
+            log.info("err is " + str(err))
+            assert err != 0
diff --git a/src/ceph/qa/tasks/devstack.py b/src/ceph/qa/tasks/devstack.py
new file mode 100644
index 0000000..943a9ff
--- /dev/null
+++ b/src/ceph/qa/tasks/devstack.py
@@ -0,0 +1,382 @@
+#!/usr/bin/env python
+import contextlib
+import logging
+from cStringIO import StringIO
+import textwrap
+from configparser import ConfigParser
+import time
+
+from teuthology.orchestra import run
+from teuthology import misc
+from teuthology.contextutil import nested
+
+log = logging.getLogger(__name__)
+
+DEVSTACK_GIT_REPO = 'https://github.com/openstack-dev/devstack.git'
+DS_STABLE_BRANCHES = ("havana", "grizzly")
+
+def is_devstack_node(role):
+    """Return True when the role name starts with 'devstack'."""
+    return role.startswith('devstack')
+
+
+def is_osd_node(role):
+    """Return True when the role name starts with 'osd'."""
+    return role.startswith('osd')
+
+
+@contextlib.contextmanager
+def task(ctx, config):
+    """
+    Install DevStack against the Ceph cluster, then run the smoke test.
+
+    :param ctx: Context
+    :param config: Configuration dict, or None for an empty one
+    :raises TypeError: if config is neither None nor a dict
+    """
+    config = {} if config is None else config
+    if not isinstance(config, dict):
+        raise TypeError("config must be a dict")
+    # The subtasks are entered in order: install first, then smoke.
+    subtasks = (
+        lambda: install(ctx=ctx, config=config),
+        lambda: smoke(ctx=ctx, config=config),
+    )
+    with nested(*subtasks):
+        yield
+
+
+@contextlib.contextmanager
+def install(ctx, config):
+    """
+    Install OpenStack DevStack and set it up to use a Ceph cluster for
+    Glance and Cinder.
+
+    One node carrying a 'devstack' role is required.
+
+    DevStack runs rampant on the machine it is installed on, so you will
+    typically want to reprovision that machine afterwards.
+
+    The default 2GB of RAM given to vps nodes is also insufficient; 4GB is
+    recommended. Downburst can be told to give a vps node 4GB by adding
+    this to the yaml:
+
+    downburst:
+        ram: 4G
+
+    Written using the documentation found here:
+        https://github.com/openstack-dev/devstack/blob/master/README.md
+        http://docs.ceph.com/docs/master/rbd/rbd-openstack/
+
+    :param ctx: Context
+    :param config: Configuration dict, or None; 'branch' selects the
+                   DevStack branch (default 'master')
+    :raises TypeError: if config is neither None nor a dict
+    """
+    if config is None:
+        config = {}
+    elif not isinstance(config, dict):
+        raise TypeError("config must be a dict")
+
+    devstack_remotes = ctx.cluster.only(is_devstack_node).remotes
+    devstack_node = devstack_remotes.keys()[0]
+    osd_remotes = ctx.cluster.only(is_osd_node).remotes
+    an_osd_node = osd_remotes.keys()[0]
+
+    install_devstack(devstack_node, config.get("branch", "master"))
+    # Nothing is torn down on exit, so no cleanup handler is needed.
+    configure_devstack_and_ceph(ctx, config, devstack_node, an_osd_node)
+    yield
+
+
+def install_devstack(devstack_node, branch="master"):
+    """
+    Clone DevStack on the node, optionally check out a branch, and run
+    its stack.sh installer.
+
+    :param devstack_node: Remote on which DevStack is installed
+    :param branch: Branch to check out; names listed in
+                   DS_STABLE_BRANCHES get a 'stable/' prefix
+    """
+    log.info("Cloning DevStack repo...")
+    devstack_node.run(args=['git', 'clone', DEVSTACK_GIT_REPO])
+
+    if branch != "master":
+        needs_stable_prefix = (branch in DS_STABLE_BRANCHES and
+                               not branch.startswith("stable"))
+        if needs_stable_prefix:
+            branch = "stable/" + branch
+        log.info("Checking out {branch} branch...".format(branch=branch))
+        devstack_node.run(args="cd devstack && git checkout " + branch)
+
+    log.info("Installing DevStack...")
+    stack_cmd = ['cd', 'devstack', run.Raw('&&'), './stack.sh']
+    devstack_node.run(args=stack_cmd)
+
+
+def configure_devstack_and_ceph(ctx, config, devstack_node, ceph_node):
+    """
+    Wire an installed DevStack node up to the Ceph cluster.
+
+    Runs the provisioning steps in order: create pools, copy ceph.conf,
+    create and copy the client keys, register the libvirt secret, rewrite
+    the DevStack service configs, then reboot and restart the services.
+
+    :param ctx: Context (not used by this function)
+    :param config: Configuration dict; 'pool_size' is handed to
+                   create_pools (default '128')
+    :param devstack_node: Remote running DevStack
+    :param ceph_node: Remote on which the ceph commands are run
+    """
+    pool_size = config.get('pool_size', '128')
+    create_pools(ceph_node, pool_size)
+    distribute_ceph_conf(devstack_node, ceph_node)
+    # This is where we would install python-ceph and ceph-common but it appears
+    # the ceph task does that for us.
+    generate_ceph_keys(ceph_node)
+    distribute_ceph_keys(devstack_node, ceph_node)
+    # The uuid of the libvirt secret is written into the service configs.
+    secret_uuid = set_libvirt_secret(devstack_node, ceph_node)
+    update_devstack_config_files(devstack_node, secret_uuid)
+    set_apache_servername(devstack_node)
+    # Rebooting is the most-often-used method of restarting devstack services
+    misc.reboot(devstack_node)
+    start_devstack(devstack_node)
+    restart_apache(devstack_node)
+
+
+def create_pools(ceph_node, pool_size):
+    """
+    Create the 'volumes', 'images' and 'backups' pools.
+
+    :param ceph_node: Remote on which 'ceph osd pool create' is run
+    :param pool_size: Value passed as the final argument of
+                      'ceph osd pool create <name>'
+    """
+    log.info("Creating pools on Ceph cluster...")
+
+    create_cmd = ['sudo', 'ceph', 'osd', 'pool', 'create']
+    for pool_name in ('volumes', 'images', 'backups'):
+        ceph_node.run(args=create_cmd + [pool_name, pool_size])
+
+
+def distribute_ceph_conf(devstack_node, ceph_node):
+    """
+    Copy /etc/ceph/ceph.conf from the Ceph node to the same path on the
+    DevStack node.
+
+    :param devstack_node: Remote that receives the file
+    :param ceph_node: Remote the file is read from
+    """
+    log.info("Copying ceph.conf to DevStack node...")
+
+    conf_path = '/etc/ceph/ceph.conf'
+    conf_data = misc.get_file(ceph_node, conf_path, sudo=True)
+    misc.sudo_write_file(devstack_node, conf_path, conf_data)
+
+
+def generate_ceph_keys(ceph_node):
+    """
+    Create (or fetch) the cinder, glance and cinder-backup client keys.
+
+    Every client gets mon 'allow r' plus the osd capabilities listed below.
+
+    :param ceph_node: Remote on which 'ceph auth get-or-create' is run
+    """
+    log.info("Generating Ceph keys...")
+
+    client_osd_caps = [
+        ('client.cinder',
+            'allow class-read object_prefix rbd_children, allow rwx pool=volumes, allow rx pool=images'),  # noqa
+        ('client.glance',
+            'allow class-read object_prefix rbd_children, allow rwx pool=images'),  # noqa
+        ('client.cinder-backup',
+            'allow class-read object_prefix rbd_children, allow rwx pool=backups'),  # noqa
+    ]
+    for client_name, osd_caps in client_osd_caps:
+        ceph_node.run(args=[
+            'sudo', 'ceph', 'auth', 'get-or-create', client_name,
+            'mon', 'allow r', 'osd', osd_caps,
+        ])
+
+
+def distribute_ceph_keys(devstack_node, ceph_node):
+    """
+    Fetch the glance, cinder and cinder-backup keyrings from the Ceph node
+    and write them under /etc/ceph on the DevStack node.
+
+    :param devstack_node: Remote that receives the keyrings
+    :param ceph_node: Remote on which 'ceph auth get-or-create' is run
+    """
+    log.info("Copying Ceph keys to DevStack node...")
+
+    # (key name, destination path, owner). devstack appears to just want
+    # root:root, so no explicit owner (e.g. glance:glance, cinder:cinder)
+    # is requested for any of them.
+    keyrings = [
+        ('client.glance',
+         '/etc/ceph/ceph.client.glance.keyring', None),
+        ('client.cinder',
+         '/etc/ceph/ceph.client.cinder.keyring', None),
+        ('client.cinder-backup',
+         '/etc/ceph/ceph.client.cinder-backup.keyring', None),
+    ]
+    for key_name, dest_path, owner in keyrings:
+        key_buf = StringIO()
+        ceph_node.run(
+            args=['sudo', 'ceph', 'auth', 'get-or-create', key_name],
+            stdout=key_buf)
+        # Rewind so the captured keyring is written from its start.
+        key_buf.seek(0)
+        misc.sudo_write_file(devstack_node, dest_path,
+                             key_buf, owner=owner)
+
+
+def set_libvirt_secret(devstack_node, ceph_node):
+    """
+    Register the client.cinder key as a libvirt secret on the DevStack node.
+
+    :param devstack_node: Remote on which virsh is run
+    :param ceph_node: Remote the client.cinder key is read from
+    :returns: The uuid (generated with uuidgen) of the new secret
+    """
+    def stripped_output(node, args):
+        # Run a command on the node and return its stdout, stripped.
+        buf = StringIO()
+        node.run(args=args, stdout=buf)
+        return buf.getvalue().strip()
+
+    log.info("Setting libvirt secret...")
+
+    cinder_key = stripped_output(
+        ceph_node, ['sudo', 'ceph', 'auth', 'get-key', 'client.cinder'])
+    uuid = stripped_output(devstack_node, ['uuidgen'])
+
+    secret_path = '/tmp/secret.xml'
+    secret_template = textwrap.dedent("""
+    <secret ephemeral='no' private='no'>
+        <uuid>{uuid}</uuid>
+        <usage type='ceph'>
+            <name>client.cinder secret</name>
+        </usage>
+    </secret>""")
+    secret_xml = secret_template.format(uuid=uuid)
+    misc.sudo_write_file(devstack_node, secret_path, secret_xml)
+
+    # Define the secret from the XML file, then attach the key to it.
+    define_cmd = ['sudo', 'virsh', 'secret-define', '--file', secret_path]
+    devstack_node.run(args=define_cmd)
+    set_value_cmd = ['sudo', 'virsh', 'secret-set-value', '--secret',
+                     uuid, '--base64', cinder_key]
+    devstack_node.run(args=set_value_cmd)
+    return uuid
+
+
+def update_devstack_config_files(devstack_node, secret_uuid):
+    """
+    Rewrite the Glance, Cinder and Nova config files on the DevStack node
+    so that they use Ceph.
+
+    Each file is read, copied to '<file>.orig.teuth' as a backup, updated
+    with the options listed below and written back.
+
+    :param devstack_node: Remote whose config files are updated
+    :param secret_uuid: Written as rbd_secret_uuid into the Cinder and
+                        Nova configs
+    """
+    log.info("Updating DevStack config files to use Ceph...")
+
+    # NOTE(review): the backup copy runs without sudo although the file is
+    # read with sudo=True below -- confirm the login user may write there.
+    def backup_config(node, file_name, backup_ext='.orig.teuth'):
+        node.run(args=['cp', '-f', file_name, file_name + backup_ext])
+
+    # Parse config_stream, set every key of update_dict in the given
+    # section and return the result as a rewound stream. config_name is
+    # accepted but not used.
+    def update_config(config_name, config_stream, update_dict,
+                      section='DEFAULT'):
+        parser = ConfigParser()
+        parser.read_file(config_stream)
+        for (key, value) in update_dict.items():
+            parser.set(section, key, value)
+        out_stream = StringIO()
+        parser.write(out_stream)
+        out_stream.seek(0)
+        return out_stream
+
+    # One entry per config file: its path and the options to set in it.
+    updates = [
+        dict(name='/etc/glance/glance-api.conf', options=dict(
+            default_store='rbd',
+            rbd_store_user='glance',
+            rbd_store_pool='images',
+            show_image_direct_url='True',)),
+        dict(name='/etc/cinder/cinder.conf', options=dict(
+            volume_driver='cinder.volume.drivers.rbd.RBDDriver',
+            rbd_pool='volumes',
+            rbd_ceph_conf='/etc/ceph/ceph.conf',
+            rbd_flatten_volume_from_snapshot='false',
+            rbd_max_clone_depth='5',
+            glance_api_version='2',
+            rbd_user='cinder',
+            rbd_secret_uuid=secret_uuid,
+            backup_driver='cinder.backup.drivers.ceph',
+            backup_ceph_conf='/etc/ceph/ceph.conf',
+            backup_ceph_user='cinder-backup',
+            backup_ceph_chunk_size='134217728',
+            backup_ceph_pool='backups',
+            backup_ceph_stripe_unit='0',
+            backup_ceph_stripe_count='0',
+            restore_discard_excess_bytes='true',
+            )),
+        dict(name='/etc/nova/nova.conf', options=dict(
+            libvirt_images_type='rbd',
+            libvirt_images_rbd_pool='volumes',
+            libvirt_images_rbd_ceph_conf='/etc/ceph/ceph.conf',
+            rbd_user='cinder',
+            rbd_secret_uuid=secret_uuid,
+            libvirt_inject_password='false',
+            libvirt_inject_key='false',
+            libvirt_inject_partition='-2',
+            )),
+    ]
+
+    for update in updates:
+        file_name = update['name']
+        options = update['options']
+        config_str = misc.get_file(devstack_node, file_name, sudo=True)
+        config_stream = StringIO(config_str)
+        # Back up the original before it is overwritten.
+        backup_config(devstack_node, file_name)
+        new_config_stream = update_config(file_name, config_stream, options)
+        misc.sudo_write_file(devstack_node, file_name, new_config_stream)
+
+
+def set_apache_servername(node):
+    """
+    Write a ServerName directive holding the node's hostname.
+
+    Apache complains: "Could not reliably determine the server's fully
+    qualified domain name, using 127.0.0.1 for ServerName", so make sure
+    it knows its name.
+
+    :param node: Remote whose Apache configuration is extended
+    """
+    log.info("Setting Apache ServerName...")
+
+    servername_line = "ServerName {name}".format(name=node.hostname)
+    misc.sudo_write_file(node, '/etc/apache2/conf.d/servername',
+                         servername_line)
+
+
+def start_devstack(devstack_node):
+    """
+    Patch rejoin-stack.sh to start screen detached, run it, then wait.
+
+    :param devstack_node: Remote on which DevStack is started
+    """
+    log.info("Patching devstack start script...")
+    # This causes screen to start headless - otherwise rejoin-stack.sh fails
+    # because there is no terminal attached.
+    # '-i -e' rather than '-ie': GNU sed reads '-ie' as in-place editing
+    # with backup suffix 'e', which leaves a stray rejoin-stack.she behind.
+    cmd = "cd devstack && sed -i -e 's/screen -c/screen -dm -c/' rejoin-stack.sh"
+    devstack_node.run(args=cmd)
+
+    log.info("Starting devstack...")
+    cmd = "cd devstack && ./rejoin-stack.sh"
+    devstack_node.run(args=cmd)
+
+    # This was added because I was getting timeouts on Cinder requests - which
+    # were trying to access Keystone on port 5000. A more robust way to handle
+    # this would be to introduce a wait-loop on devstack_node that checks to
+    # see if a service is listening on port 5000.
+    log.info("Waiting 30s for devstack to start...")
+    time.sleep(30)
+
+
+def restart_apache(node):
+    """Restart Apache on the node through its init script and wait."""
+    restart_cmd = ['sudo', '/etc/init.d/apache2', 'restart']
+    node.run(args=restart_cmd, wait=True)
+
+
+@contextlib.contextmanager
+def exercise(ctx, config):
+    """
+    Run DevStack's exercise.sh on the DevStack node.
+
+    :param ctx: Context
+    :param config: Configuration dict, or None for an empty one
+    :raises TypeError: if config is neither None nor a dict
+    """
+    log.info("Running devstack exercises...")
+
+    if config is None:
+        config = {}
+    elif not isinstance(config, dict):
+        raise TypeError("config must be a dict")
+
+    devstack_node = ctx.cluster.only(is_devstack_node).remotes.keys()[0]
+
+    # TODO: save the log *and* preserve failures, e.g. by piping the output
+    # through tee into a directory made by create_devstack_archive().
+    devstack_node.run(args="cd devstack && ./exercise.sh", wait=True)
+    # Nothing to clean up on exit.
+    yield
+
+
+def create_devstack_archive(ctx, devstack_node):
+    """
+    Create '<testdir>/archive/devstack' on the node and return its path.
+
+    :param ctx: Context used to look up the test directory
+    :param devstack_node: Remote on which the directory is created
+    """
+    archive_dir = "{test_dir}/archive/devstack".format(
+        test_dir=misc.get_testdir(ctx))
+    mkdir_cmd = "mkdir -p " + archive_dir
+    devstack_node.run(args=mkdir_cmd)
+    return archive_dir
+
+
+@contextlib.contextmanager
+def smoke(ctx, config):
+    """
+    Basic smoke test: create a 1GB volume named 'smoke0'.
+
+    :param ctx: Context
+    :param config: Configuration (not used)
+    """
+    log.info("Running a basic smoketest...")
+
+    devstack_remotes = ctx.cluster.only(is_devstack_node).remotes
+    devstack_node = devstack_remotes.keys()[0]
+    osd_remotes = ctx.cluster.only(is_osd_node).remotes
+    an_osd_node = osd_remotes.keys()[0]
+
+    create_volume(devstack_node, an_osd_node, 'smoke0', 1)
+    # Nothing to clean up on exit.
+    yield
+
+
+def create_volume(devstack_node, ceph_node, vol_name, size):
+    """
+    Create a Cinder volume and check that it is listed in the 'volumes'
+    rbd pool.
+
+    :param devstack_node: Remote on which 'cinder create' is run
+    :param ceph_node: Remote on which the rbd pool is listed
+    :param vol_name: Display name of the new volume
+    :param size: The size of the volume, in GB
+    :returns: The id of the new volume as reported by cinder
+    """
+    size = str(size)
+    log.info("Creating a {size}GB volume named {name}...".format(
+        name=vol_name,
+        size=size))
+    cinder_out = StringIO()
+    devstack_node.run(
+        args=['source', 'devstack/openrc', run.Raw('&&'), 'cinder', 'create',
+              '--display-name', vol_name, size],
+        stdout=cinder_out,
+        wait=True)
+    vol_info = parse_os_table(cinder_out.getvalue())
+    log.debug("Volume info: %s", str(vol_info))
+
+    rbd_out = StringIO()
+    try:
+        ceph_node.run(args="rbd --id cinder ls -l volumes", stdout=rbd_out,
+                      wait=True)
+    except run.CommandFailedError:
+        # Fall back to listing the pool without the cinder identity.
+        log.debug("Original rbd call failed; retrying without '--id cinder'")
+        ceph_node.run(args="rbd ls -l volumes", stdout=rbd_out,
+                      wait=True)
+
+    volume_id = vol_info['id']
+    assert volume_id in rbd_out.getvalue(), \
+        "Volume not found on Ceph cluster"
+    # The size compared here is the one from the cinder output table.
+    assert vol_info['size'] == size, \
+        "Volume size on Ceph cluster is different than specified"
+    return volume_id
+
+
+def parse_os_table(table_str):
+    """
+    Parse a two-column ASCII table (rows like ``| key | value |``) into a
+    dict.
+
+    Only lines starting with '|' are rows; the header row is included like
+    any other. Cells are split on the column separator, so empty values
+    become '' and values containing spaces are kept whole. Rows without a
+    second column or with an empty key are skipped.
+
+    :param table_str: Table text, e.g. the output of 'cinder create'
+    :returns: dict mapping each row's first cell to its second cell
+    """
+    out_dict = dict()
+    for line in table_str.split('\n'):
+        if not line.startswith('|'):
+            continue
+        # Drop the outer borders, then split on the first inner separator.
+        key, sep, value = line.strip().strip('|').partition('|')
+        key = key.strip()
+        if not sep or not key:
+            continue
+        out_dict[key] = value.strip()
+    return out_dict
diff --git a/src/ceph/qa/tasks/die_on_err.py b/src/ceph/qa/tasks/die_on_err.py
new file mode 100644
index 0000000..bf422ae
--- /dev/null
+++ b/src/ceph/qa/tasks/die_on_err.py
@@ -0,0 +1,70 @@
+"""
+Raise exceptions on osd coredumps or test err directories
+"""
+import contextlib
+import logging
+import time
+from teuthology.orchestra import run
+
+import ceph_manager
+from teuthology import misc as teuthology
+
+log = logging.getLogger(__name__)
+
+@contextlib.contextmanager
+def task(ctx, config):
+    """
+    Die if {testdir}/err exists or if an OSD dumps core
+
+    Waits until all OSDs are up, then polls every OSD host in turn (with
+    a 5 second pause between rounds) and raises as soon as one of the two
+    conditions is seen.
+
+    NOTE(review): the body contains no yield although the function is
+    decorated as a context manager; it only ends by raising -- confirm
+    how callers are expected to run it.
+
+    :param ctx: Context
+    :param config: Configuration, or None for an empty one
+    :raises Exception: when an err file or a core dump marker is found
+    """
+    if config is None:
+        config = {}
+
+    first_mon = teuthology.get_first_mon(ctx, config)
+    (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys()
+
+    num_osds = teuthology.num_instances_of_type(ctx.cluster, 'osd')
+    log.info('num_osds is %s' % num_osds)
+
+    manager = ceph_manager.CephManager(
+        mon,
+        ctx=ctx,
+        logger=log.getChild('ceph_manager'),
+        )
+
+    # Do not start watching before every OSD is reported up.
+    while len(manager.get_osd_status()['up']) < num_osds:
+        time.sleep(10)
+
+    testdir = teuthology.get_testdir(ctx)
+
+    while True:
+        for i in range(num_osds):
+            (osd_remote,) = ctx.cluster.only('osd.%d' % i).remotes.iterkeys()
+            # 'test -e' exits 0 when the err path exists; the status is
+            # inspected here instead of being raised by run().
+            p = osd_remote.run(
+                args = [ 'test', '-e', '{tdir}/err'.format(tdir=testdir) ],
+                wait=True,
+                check_status=False,
+            )
+            exit_status = p.exitstatus
+
+            if exit_status == 0:
+                log.info("osd %d has an error" % i)
+                raise Exception("osd %d error" % i)
+
+            log_path = '/var/log/ceph/osd.%d.log' % (i)
+
+            # A last log line containing 'end dump' is treated as the sign
+            # of a core dump.
+            p = osd_remote.run(
+                args = [
+                         'tail', '-1', log_path,
+                         run.Raw('|'),
+                         'grep', '-q', 'end dump'
+                       ],
+                wait=True,
+                check_status=False,
+            )
+            exit_status = p.exitstatus
+
+            if exit_status == 0:
+                log.info("osd %d dumped core" % i)
+                raise Exception("osd %d dumped core" % i)
+
+        time.sleep(5)
diff --git a/src/ceph/qa/tasks/divergent_priors.py b/src/ceph/qa/tasks/divergent_priors.py
new file mode 100644
index 0000000..12ea933
--- /dev/null
+++ b/src/ceph/qa/tasks/divergent_priors.py
@@ -0,0 +1,160 @@
+"""
+Special case divergence test
+"""
+import logging
+import time
+
+from teuthology import misc as teuthology
+from util.rados import rados
+
+
+log = logging.getLogger(__name__)
+
+
+def task(ctx, config):
+    """
+    Test handling of divergent entries with prior_version
+    prior to log_tail
+
+    overrides:
+      ceph:
+        conf:
+          osd:
+            debug osd: 5
+
+    Requires 3 osds on a single test node.
+
+    :param ctx: Context; ctx.managers['ceph'] drives the cluster
+    :param config: Configuration dict, or None for an empty one
+    """
+    if config is None:
+        config = {}
+    assert isinstance(config, dict), \
+        'divergent_priors task only accepts a dict for configuration'
+
+    manager = ctx.managers['ceph']
+
+    while len(manager.get_osd_status()['up']) < 3:
+        time.sleep(10)
+    manager.flush_pg_stats([0, 1, 2])
+    manager.raw_cluster_cmd('osd', 'set', 'noout')
+    manager.raw_cluster_cmd('osd', 'set', 'noin')
+    manager.raw_cluster_cmd('osd', 'set', 'nodown')
+    manager.wait_for_clean()
+
+    # something that is always there
+    dummyfile = '/etc/fstab'
+    dummyfile2 = '/etc/resolv.conf'
+
+    # create 1 pg pool
+    log.info('creating foo')
+    manager.raw_cluster_cmd('osd', 'pool', 'create', 'foo', '1')
+
+    # keep the pg log short on every osd
+    osds = [0, 1, 2]
+    for i in osds:
+        manager.set_config(i, osd_min_pg_log_entries=10)
+        manager.set_config(i, osd_max_pg_log_entries=10)
+        manager.set_config(i, osd_pg_log_trim_min=5)
+
+    # determine primary
+    divergent = manager.get_pg_primary('foo', 0)
+    log.info("primary and soon to be divergent is %d", divergent)
+    non_divergent = list(osds)
+    non_divergent.remove(divergent)
+
+    log.info('writing initial objects')
+    first_mon = teuthology.get_first_mon(ctx, config)
+    (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys()
+    # write 100 objects
+    for i in range(100):
+        rados(ctx, mon, ['-p', 'foo', 'put', 'existing_%d' % i, dummyfile])
+
+    manager.wait_for_clean()
+
+    # blackhole non_divergent
+    log.info("blackholing osds %s", str(non_divergent))
+    for i in non_divergent:
+        manager.set_config(i, objectstore_blackhole=1)
+
+    DIVERGENT_WRITE = 5
+    DIVERGENT_REMOVE = 5
+    # Write some soon to be divergent
+    log.info('writing divergent objects')
+    for i in range(DIVERGENT_WRITE):
+        rados(ctx, mon, ['-p', 'foo', 'put', 'existing_%d' % i,
+                         dummyfile2], wait=False)
+    # Remove some soon to be divergent
+    log.info('remove divergent objects')
+    for i in range(DIVERGENT_REMOVE):
+        rados(ctx, mon, ['-p', 'foo', 'rm',
+                         'existing_%d' % (i + DIVERGENT_WRITE)], wait=False)
+    time.sleep(10)
+    # the rados clients above were started with wait=False; kill them
+    mon.run(
+        args=['killall', '-9', 'rados'],
+        wait=True,
+        check_status=False)
+
+    # kill all the osds but leave divergent in
+    log.info('killing all the osds')
+    for i in osds:
+        manager.kill_osd(i)
+    for i in osds:
+        manager.mark_down_osd(i)
+    for i in non_divergent:
+        manager.mark_out_osd(i)
+
+    # bring up non-divergent
+    log.info("bringing up non_divergent %s", str(non_divergent))
+    for i in non_divergent:
+        manager.revive_osd(i)
+    for i in non_divergent:
+        manager.mark_in_osd(i)
+
+    # write 1 non-divergent object (ensure that old divergent one is divergent)
+    objname = "existing_%d" % (DIVERGENT_WRITE + DIVERGENT_REMOVE)
+    log.info('writing non-divergent object ' + objname)
+    rados(ctx, mon, ['-p', 'foo', 'put', objname, dummyfile2])
+
+    manager.wait_for_recovery()
+
+    # ensure no recovery of up osds first
+    log.info('delay recovery')
+    for i in non_divergent:
+        manager.wait_run_admin_socket(
+            'osd', i, ['set_recovery_delay', '100000'])
+
+    # bring in our divergent friend
+    log.info("revive divergent %d", divergent)
+    manager.raw_cluster_cmd('osd', 'set', 'noup')
+    manager.revive_osd(divergent)
+
+    log.info('delay recovery divergent')
+    manager.wait_run_admin_socket(
+        'osd', divergent, ['set_recovery_delay', '100000'])
+
+    manager.raw_cluster_cmd('osd', 'unset', 'noup')
+    while len(manager.get_osd_status()['up']) < 3:
+        time.sleep(10)
+
+    log.info('wait for peering')
+    rados(ctx, mon, ['-p', 'foo', 'put', 'foo', dummyfile])
+
+    # At this point the divergent_priors should have been detected
+
+    log.info("killing divergent %d", divergent)
+    manager.kill_osd(divergent)
+    log.info("reviving divergent %d", divergent)
+    manager.revive_osd(divergent)
+
+    time.sleep(20)
+
+    log.info('allowing recovery')
+    # Set osd_recovery_delay_start back to 0 and kick the queue
+    for i in osds:
+        manager.raw_cluster_cmd('tell', 'osd.%d' % i, 'debug',
+                                    'kick_recovery_wq', ' 0')
+
+    log.info('reading divergent objects')
+    for i in range(DIVERGENT_WRITE + DIVERGENT_REMOVE):
+        exit_status = rados(ctx, mon, ['-p', 'foo', 'get', 'existing_%d' % i,
+                                       '/tmp/existing'])
+        # compare by value; identity of int objects is an implementation
+        # detail of the interpreter
+        assert exit_status == 0
+
+    log.info("success")
diff --git a/src/ceph/qa/tasks/divergent_priors2.py b/src/ceph/qa/tasks/divergent_priors2.py
new file mode 100644
index 0000000..0ed7532
--- /dev/null
+++ b/src/ceph/qa/tasks/divergent_priors2.py
@@ -0,0 +1,190 @@
+"""
+Special case divergence test with ceph-objectstore-tool export/remove/import
+"""
+import logging
+import time
+from cStringIO import StringIO
+
+from teuthology import misc as teuthology
+from util.rados import rados
+import os
+
+
+log = logging.getLogger(__name__)
+
+
+def task(ctx, config):
+    """
+    Test handling of divergent entries with prior_version prior to log_tail
+    across a ceph-objectstore-tool export-remove/import on the divergent osd
+
+    overrides:
+      ceph:
+        conf:
+          osd:
+            debug osd: 5
+
+    Requires 3 osds on a single test node.
+    """
+    if config is None:
+        config = {}
+    assert isinstance(config, dict), \
+        'divergent_priors task only accepts a dict for configuration'
+
+    manager = ctx.managers['ceph']
+    # wait for all three osds to be up before setting cluster flags
+    while len(manager.get_osd_status()['up']) < 3:
+        time.sleep(10)
+    manager.flush_pg_stats([0, 1, 2])
+    manager.raw_cluster_cmd('osd', 'set', 'noout')
+    manager.raw_cluster_cmd('osd', 'set', 'noin')
+    manager.raw_cluster_cmd('osd', 'set', 'nodown')
+    manager.wait_for_clean()
+
+    # something that is always there
+    dummyfile = '/etc/fstab'
+    dummyfile2 = '/etc/resolv.conf'
+    testdir = teuthology.get_testdir(ctx)
+
+    # create 1 pg pool
+    log.info('creating foo')
+    manager.raw_cluster_cmd('osd', 'pool', 'create', 'foo', '1')
+
+    osds = [0, 1, 2]
+    for i in osds:
+        manager.set_config(i, osd_min_pg_log_entries=10)
+        manager.set_config(i, osd_max_pg_log_entries=10)
+        manager.set_config(i, osd_pg_log_trim_min=5)
+
+    # determine primary
+    divergent = manager.get_pg_primary('foo', 0)
+    log.info("primary and soon to be divergent is %d", divergent)
+    non_divergent = list(osds)
+    non_divergent.remove(divergent)
+
+    log.info('writing initial objects')
+    first_mon = teuthology.get_first_mon(ctx, config)
+    (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys()
+    # write 100 objects
+    for i in range(100):
+        rados(ctx, mon, ['-p', 'foo', 'put', 'existing_%d' % i, dummyfile])
+
+    manager.wait_for_clean()
+
+    # blackhole non_divergent
+    log.info("blackholing osds %s", str(non_divergent))
+    for i in non_divergent:
+        manager.set_config(i, objectstore_blackhole=1)
+
+    DIVERGENT_WRITE = 5
+    DIVERGENT_REMOVE = 5
+    # Write some soon to be divergent
+    log.info('writing divergent objects')
+    for i in range(DIVERGENT_WRITE):
+        rados(ctx, mon, ['-p', 'foo', 'put', 'existing_%d' % i,
+                         dummyfile2], wait=False)
+    # Remove some soon to be divergent
+    log.info('remove divergent objects')
+    for i in range(DIVERGENT_REMOVE):
+        rados(ctx, mon, ['-p', 'foo', 'rm',
+                         'existing_%d' % (i + DIVERGENT_WRITE)], wait=False)
+    time.sleep(10)
+    mon.run(
+        args=['killall', '-9', 'rados'],
+        wait=True,
+        check_status=False)
+
+    # kill all the osds but leave divergent in
+    log.info('killing all the osds')
+    for i in osds:
+        manager.kill_osd(i)
+    for i in osds:
+        manager.mark_down_osd(i)
+    for i in non_divergent:
+        manager.mark_out_osd(i)
+
+    # bring up non-divergent
+    log.info("bringing up non_divergent %s", str(non_divergent))
+    for i in non_divergent:
+        manager.revive_osd(i)
+    for i in non_divergent:
+        manager.mark_in_osd(i)
+
+    # write 1 non-divergent object (ensure that old divergent one is divergent)
+    objname = "existing_%d" % (DIVERGENT_WRITE + DIVERGENT_REMOVE)
+    log.info('writing non-divergent object ' + objname)
+    rados(ctx, mon, ['-p', 'foo', 'put', objname, dummyfile2])
+
+    manager.wait_for_recovery()
+
+    # ensure no recovery of up osds first
+    log.info('delay recovery')
+    for i in non_divergent:
+        manager.wait_run_admin_socket(
+            'osd', i, ['set_recovery_delay', '100000'])
+
+    # bring in our divergent friend
+    log.info("revive divergent %d", divergent)
+    manager.raw_cluster_cmd('osd', 'set', 'noup')
+    manager.revive_osd(divergent)
+
+    log.info('delay recovery divergent')
+    manager.wait_run_admin_socket(
+        'osd', divergent, ['set_recovery_delay', '100000'])
+
+    manager.raw_cluster_cmd('osd', 'unset', 'noup')
+    while len(manager.get_osd_status()['up']) < 3:
+        time.sleep(10)
+
+    log.info('wait for peering')
+    rados(ctx, mon, ['-p', 'foo', 'put', 'foo', dummyfile])
+
+    # At this point the divergent_priors should have been detected
+
+    log.info("killing divergent %d", divergent)
+    manager.kill_osd(divergent)
+    # NOTE(review): pgid 2.0 is hard-coded below; assumes pool 'foo' has id 2
+    # Export-remove the pg from the stopped divergent osd, then import it back
+    (exp_remote,) = ctx.\
+        cluster.only('osd.{o}'.format(o=divergent)).remotes.iterkeys()
+    FSPATH = manager.get_filepath()
+    JPATH = os.path.join(FSPATH, "journal")
+    prefix = ("sudo adjust-ulimits ceph-objectstore-tool "
+              "--data-path {fpath} --journal-path {jpath} "
+              "--log-file="
+              "/var/log/ceph/objectstore_tool.$$.log ".
+              format(fpath=FSPATH, jpath=JPATH))
+    pid = os.getpid()
+    expfile = os.path.join(testdir, "exp.{pid}.out".format(pid=pid))
+    cmd = ((prefix + "--op export-remove --pgid 2.0 --file {file}").
+           format(file=expfile))
+    proc = exp_remote.run(args=cmd, wait=True,
+                          check_status=False, stdout=StringIO())
+    assert proc.exitstatus == 0
+
+    cmd = ((prefix + "--op import --file {file}").
+           format(file=expfile))
+    proc = exp_remote.run(args=cmd, wait=True,
+                          check_status=False, stdout=StringIO())
+    assert proc.exitstatus == 0
+
+    log.info("reviving divergent %d", divergent)
+    manager.revive_osd(divergent)
+    manager.wait_run_admin_socket('osd', divergent, ['dump_ops_in_flight'])
+    time.sleep(20)
+
+    log.info('allowing recovery')
+    # Set osd_recovery_delay_start back to 0 and kick the queue
+    for i in osds:
+        manager.raw_cluster_cmd('tell', 'osd.%d' % i, 'debug',
+                                    'kick_recovery_wq', ' 0')
+
+    log.info('reading divergent objects')
+    for i in range(DIVERGENT_WRITE + DIVERGENT_REMOVE):
+        exit_status = rados(ctx, mon, ['-p', 'foo', 'get', 'existing_%d' % i,
+                                       '/tmp/existing'])
+        assert exit_status == 0
+    # remove the temporary export file from the osd host
+    cmd = 'rm {file}'.format(file=expfile)
+    exp_remote.run(args=cmd, wait=True)
+    log.info("success")
diff --git a/src/ceph/qa/tasks/dnsmasq.py b/src/ceph/qa/tasks/dnsmasq.py
new file mode 100644
index 0000000..ee01b17
--- /dev/null
+++ b/src/ceph/qa/tasks/dnsmasq.py
@@ -0,0 +1,102 @@
+"""
+Task for dnsmasq configuration
+"""
+import contextlib
+import logging
+
+from teuthology import misc
+from teuthology.exceptions import ConfigError
+from teuthology import contextutil
+from util import get_remote_for_role
+
+log = logging.getLogger(__name__)
+
+@contextlib.contextmanager
+def setup_dnsmasq(remote, cnames):
+    """ add each cname to dnsmasq on remote; restore resolv.conf on exit """
+    log.info('Configuring dnsmasq on remote %s..', remote.name)
+
+    # back up existing resolv.conf
+    resolv_conf = misc.get_file(remote, '/etc/resolv.conf')
+    # point resolv.conf to local dnsmasq
+    misc.sudo_write_file(remote, '/etc/resolv.conf',
+                         "nameserver 127.0.0.1\n")
+    try:
+        # add address entries to /etc/dnsmasq.d/ceph
+        dnsmasq = "server=8.8.8.8\nserver=8.8.4.4\n"
+        entry = "address=/{cname}/{ip_address}\n"
+        for cname, ip_address in cnames.iteritems():
+            dnsmasq += entry.format(cname=cname, ip_address=ip_address)
+        misc.sudo_write_file(remote, '/etc/dnsmasq.d/ceph', dnsmasq)
+
+        remote.run(args=['cat', '/etc/dnsmasq.d/ceph'])
+        # restart dnsmasq
+        remote.run(args=['sudo', 'systemctl', 'restart', 'dnsmasq'])
+        remote.run(args=['sudo', 'systemctl', 'status', 'dnsmasq'])
+        # verify dns name is set
+        remote.run(args=['ping', '-c', '4', cnames.keys()[0]])
+
+        yield
+    finally:
+        log.info('Removing dnsmasq configuration from remote %s..', remote.name)
+        # restore resolv.conf, even if setup or the wrapped body raised
+        misc.sudo_write_file(remote, '/etc/resolv.conf', resolv_conf)
+        # restart dnsmasq
+        remote.run(args=['sudo', 'systemctl', 'restart', 'dnsmasq'])
+
+@contextlib.contextmanager
+def task(ctx, config):
+    """
+    Configures dnsmasq to add cnames for teuthology remotes. The task expects a
+    dictionary, where each key is a role. If all cnames for that role use the
+    same address as that role, the cnames can be given as a list. For example,
+    this entry configures dnsmasq on the remote associated with client.0, adding
+    two cnames for the ip address associated with client.0:
+
+        - dnsmasq:
+            client.0:
+            - client0.example.com
+            - c0.example.com
+
+    If the addresses do not all match the given role, a dictionary can be given
+    to specify the ip address by its target role. For example:
+
+        - dnsmasq:
+            client.0:
+              client.0.example.com: client.0
+              client.1.example.com: client.1
+    """
+    # apply overrides from the job config (ctx.config), not the task's own
+    overrides = ctx.config.get('overrides', {})
+    misc.deep_merge(config, overrides.get('dnsmasq', {}))
+
+    # multiple roles may map to the same remote, so collect names by remote
+    remote_names = {}
+    for role, cnames in config.iteritems():
+        remote = get_remote_for_role(ctx, role)
+        if remote is None:
+            raise ConfigError('no remote for role %s' % role)
+
+        names = remote_names.get(remote, {})
+
+        if isinstance(cnames, list):
+            # when given a list of cnames, point to local ip
+            for cname in cnames:
+                names[cname] = remote.ip_address
+        elif isinstance(cnames, dict):
+            # when given a dict, look up the remote ip for each
+            for cname, client in cnames.iteritems():
+                r = get_remote_for_role(ctx, client)
+                if r is None:
+                    raise ConfigError('no remote for role %s' % client)
+                names[cname] = r.ip_address
+
+        remote_names[remote] = names
+
+    # run a subtask for each unique remote (defaults bind the loop variables)
+    subtasks = []
+    for remote, cnames in remote_names.iteritems():
+        subtasks.append(lambda r=remote, cn=cnames: setup_dnsmasq(r, cn))
+
+    with contextutil.nested(*subtasks):
+        yield
diff --git a/src/ceph/qa/tasks/dump_stuck.py b/src/ceph/qa/tasks/dump_stuck.py
new file mode 100644
index 0000000..39429d2
--- /dev/null
+++ b/src/ceph/qa/tasks/dump_stuck.py
@@ -0,0 +1,162 @@
+"""
+Dump_stuck command
+"""
+import logging
+import re
+import time
+
+import ceph_manager
+from teuthology import misc as teuthology
+
+
+log = logging.getLogger(__name__)
+
+def check_stuck(manager, num_inactive, num_unclean, num_stale, timeout=10):
+    """
+    Compare the number of stuck pgs in each state with the expected counts.
+    Fetches the 'inactive', 'unclean' and 'stale' lists via
+    manager.get_stuck_pgs(), logs found / expected, then asserts equality.
+
+    :param manager: Ceph manager
+    :param num_inactive: expected number of stuck inactive pgs
+    :param num_unclean: expected number of stuck unclean pgs
+    :param num_stale: expected number of stuck stale pgs
+    :param timeout: timeout value for get_stuck_pgs calls
+    """
+    found_inactive = len(manager.get_stuck_pgs('inactive', timeout))
+    found_unclean = len(manager.get_stuck_pgs('unclean', timeout))
+    found_stale = len(manager.get_stuck_pgs('stale', timeout))
+    log.info('inactive %s / %d,  unclean %s / %d,  stale %s / %d',
+             found_inactive, num_inactive,
+             found_unclean, num_unclean,
+             found_stale, num_stale)
+    assert found_inactive == num_inactive
+    assert found_unclean == num_unclean
+    assert found_stale == num_stale
+
+def task(ctx, config):
+    """
+    Test the dump_stuck command on a cluster with exactly 2 osds.
+
+    :param ctx: Context
+    :param config: Configuration (must be None)
+    """
+    assert config is None, \
+        'dump_stuck requires no configuration'
+    assert teuthology.num_instances_of_type(ctx.cluster, 'osd') == 2, \
+        'dump_stuck requires exactly 2 osds'
+
+    timeout = 60
+    first_mon = teuthology.get_first_mon(ctx, config)
+    (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys()
+
+    manager = ceph_manager.CephManager(
+        mon,
+        ctx=ctx,
+        logger=log.getChild('ceph_manager'),
+        )
+
+    manager.flush_pg_stats([0, 1])
+    manager.wait_for_clean(timeout)
+
+    manager.raw_cluster_cmd('tell', 'mon.0', 'injectargs', '--',
+#                            '--mon-osd-report-timeout 90',
+                            '--mon-pg-stuck-threshold 10')
+
+    # all active+clean
+    check_stuck(
+        manager,
+        num_inactive=0,
+        num_unclean=0,
+        num_stale=0,
+        )
+    num_pgs = manager.get_num_pgs()
+
+    manager.mark_out_osd(0)
+    time.sleep(timeout)
+    manager.flush_pg_stats([1])
+    manager.wait_for_recovery(timeout)
+
+    # all active+clean+remapped
+    check_stuck(
+        manager,
+        num_inactive=0,
+        num_unclean=0,
+        num_stale=0,
+        )
+
+    manager.mark_in_osd(0)
+    manager.flush_pg_stats([0, 1])
+    manager.wait_for_clean(timeout)
+
+    # all active+clean
+    check_stuck(
+        manager,
+        num_inactive=0,
+        num_unclean=0,
+        num_stale=0,
+        )
+
+    log.info('stopping first osd')
+    manager.kill_osd(0)
+    manager.mark_down_osd(0)
+    manager.wait_for_active(timeout)
+
+    log.info('waiting for all to be unclean')
+    starttime = time.time()
+    done = False
+    while not done:
+        try:
+            check_stuck(
+                manager,
+                num_inactive=0,
+                num_unclean=num_pgs,
+                num_stale=0,
+                )
+            done = True
+        except AssertionError:
+            # wait up to 15 minutes to become unclean
+            if time.time() - starttime > 900:
+                raise
+            time.sleep(1)  # pause between polls instead of hammering the mon
+
+    log.info('stopping second osd')
+    manager.kill_osd(1)
+    manager.mark_down_osd(1)
+
+    log.info('waiting for all to be stale')
+    starttime = time.time()
+    done = False
+    while not done:
+        try:
+            check_stuck(
+                manager,
+                num_inactive=0,
+                num_unclean=num_pgs,
+                num_stale=num_pgs,
+                )
+            done = True
+        except AssertionError:
+            # wait up to 15 minutes to become stale
+            if time.time() - starttime > 900:
+                raise
+            time.sleep(1)  # pause between polls instead of hammering the mon
+    log.info('reviving')
+    for id_ in teuthology.all_roles_of_type(ctx.cluster, 'osd'):
+        manager.revive_osd(id_)
+        manager.mark_in_osd(id_)
+    while True:
+        try:
+            manager.flush_pg_stats([0, 1])
+            break
+        except Exception:
+            log.exception('osds must not be started yet, waiting...')
+            time.sleep(1)
+    manager.wait_for_clean(timeout)
+
+    check_stuck(
+        manager,
+        num_inactive=0,
+        num_unclean=0,
+        num_stale=0,
+        )
diff --git a/src/ceph/qa/tasks/ec_lost_unfound.py b/src/ceph/qa/tasks/ec_lost_unfound.py
new file mode 100644
index 0000000..cc0bdb2
--- /dev/null
+++ b/src/ceph/qa/tasks/ec_lost_unfound.py
@@ -0,0 +1,158 @@
+"""
+Lost_unfound
+"""
+from teuthology.orchestra import run
+import logging
+import ceph_manager
+from teuthology import misc as teuthology
+from util.rados import rados
+import time
+
+log = logging.getLogger(__name__)
+
+def task(ctx, config):
+    """
+    Test handling of lost objects on an ec pool.
+
+    A pretty rigid cluster is brought up and tested by this task
+    """
+    if config is None:
+        config = {}
+    assert isinstance(config, dict), \
+        'lost_unfound task only accepts a dict for configuration'
+    first_mon = teuthology.get_first_mon(ctx, config)
+    (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys()
+
+    manager = ceph_manager.CephManager(
+        mon,
+        ctx=ctx,
+        logger=log.getChild('ceph_manager'),
+        )
+
+    manager.wait_for_clean()
+
+    profile = config.get('erasure_code_profile', {
+        'k': '2',
+        'm': '2',
+        'crush-failure-domain': 'osd'
+    })
+    profile_name = profile.get('name', 'lost_unfound')
+    manager.create_erasure_code_profile(profile_name, profile)
+    pool = manager.create_pool_with_unique_name(
+        erasure_code_profile_name=profile_name,
+        min_size=2)
+
+    # something that is always there, readable and never empty
+    dummyfile = '/etc/group'
+
+    # kludge to make sure they get a map
+    rados(ctx, mon, ['-p', pool, 'put', 'dummy', dummyfile])
+
+    manager.flush_pg_stats([0, 1])
+    manager.wait_for_recovery()
+
+    # create old objects
+    for f in range(1, 10):
+        rados(ctx, mon, ['-p', pool, 'put', 'existing_%d' % f, dummyfile])
+        rados(ctx, mon, ['-p', pool, 'put', 'existed_%d' % f, dummyfile])
+        rados(ctx, mon, ['-p', pool, 'rm', 'existed_%d' % f])
+
+    # delay recovery, and make the pg log very long (to prevent backfill)
+    manager.raw_cluster_cmd(
+            'tell', 'osd.1',
+            'injectargs',
+            '--osd-recovery-delay-start 1000 --osd-min-pg-log-entries 100000000'
+            )
+
+    manager.kill_osd(0)
+    manager.mark_down_osd(0)
+    manager.kill_osd(3)
+    manager.mark_down_osd(3)
+
+    for f in range(1, 10):
+        rados(ctx, mon, ['-p', pool, 'put', 'new_%d' % f, dummyfile])
+        rados(ctx, mon, ['-p', pool, 'put', 'existed_%d' % f, dummyfile])
+        rados(ctx, mon, ['-p', pool, 'put', 'existing_%d' % f, dummyfile])
+
+    # take out osd.1 and a necessary shard of those objects.
+    manager.kill_osd(1)
+    manager.mark_down_osd(1)
+    manager.raw_cluster_cmd('osd', 'lost', '1', '--yes-i-really-mean-it')
+    manager.revive_osd(0)
+    manager.wait_till_osd_is_up(0)
+    manager.revive_osd(3)
+    manager.wait_till_osd_is_up(3)
+
+    manager.flush_pg_stats([0, 2, 3])
+    manager.wait_till_active()
+    manager.flush_pg_stats([0, 2, 3])
+
+    # verify that there are unfound objects
+    unfound = manager.get_num_unfound_objects()
+    log.info("there are %d unfound objects", unfound)
+    assert unfound
+
+    testdir = teuthology.get_testdir(ctx)
+    procs = []
+    if config.get('parallel_bench', True):
+        procs.append(mon.run(
+            args=[
+                "/bin/sh", "-c",
+                " ".join(['adjust-ulimits',
+                          'ceph-coverage',
+                          '{tdir}/archive/coverage',
+                          'rados',
+                          '--no-log-to-stderr',
+                          '--name', 'client.admin',
+                          '-b', str(4<<10),
+                          '-p', pool,
+                          '-t', '20',
+                          'bench', '240', 'write',
+                      ]).format(tdir=testdir),
+            ],
+            logger=log.getChild('radosbench.{id}'.format(id='client.admin')),
+            stdin=run.PIPE,
+            wait=False
+        ))
+    time.sleep(10)
+
+    # mark stuff lost
+    pgs = manager.get_pg_stats()
+    for pg in pgs:
+        if pg['stat_sum']['num_objects_unfound'] > 0:
+            # verify that i can list them direct from the osd
+            log.info('listing missing/lost in %s state %s', pg['pgid'],
+                     pg['state'])
+            m = manager.list_pg_missing(pg['pgid'])
+            log.info('%s', m)
+            assert m['num_unfound'] == pg['stat_sum']['num_objects_unfound']
+
+            log.info("reverting unfound in %s", pg['pgid'])
+            manager.raw_cluster_cmd('pg', pg['pgid'],
+                                    'mark_unfound_lost', 'delete')
+        else:
+            log.info("no unfound in %s", pg['pgid'])
+
+    manager.raw_cluster_cmd('tell', 'osd.0', 'debug', 'kick_recovery_wq', '5')
+    manager.raw_cluster_cmd('tell', 'osd.2', 'debug', 'kick_recovery_wq', '5')
+    manager.raw_cluster_cmd('tell', 'osd.3', 'debug', 'kick_recovery_wq', '5')
+    manager.flush_pg_stats([0, 2, 3])
+    manager.wait_for_recovery()
+
+    if not config.get('parallel_bench', True):
+        time.sleep(20)
+
+    # verify result: every get is expected to return a non-zero status
+    for f in range(1, 10):
+        err = rados(ctx, mon, ['-p', pool, 'get', 'new_%d' % f, '-'])
+        assert err
+        err = rados(ctx, mon, ['-p', pool, 'get', 'existed_%d' % f, '-'])
+        assert err
+        err = rados(ctx, mon, ['-p', pool, 'get', 'existing_%d' % f, '-'])
+        assert err
+
+    # see if osd.1 can cope
+    manager.revive_osd(1)
+    manager.wait_till_osd_is_up(1)
+    manager.wait_for_clean()
+    run.wait(procs)
diff --git a/src/ceph/qa/tasks/exec_on_cleanup.py b/src/ceph/qa/tasks/exec_on_cleanup.py
new file mode 100644
index 0000000..e3c09d5
--- /dev/null
+++ b/src/ceph/qa/tasks/exec_on_cleanup.py
@@ -0,0 +1,62 @@
+"""
+Execute custom commands during unwind/cleanup
+"""
+import logging
+import contextlib
+
+from teuthology import misc as teuthology
+from teuthology import contextutil
+
+log = logging.getLogger(__name__)
+
+@contextlib.contextmanager
+def task(ctx, config):
+    """
+    Execute commands on a given role during unwind/cleanup
+
+        tasks:
+        - ceph:
+        - kclient: [client.a]
+        - exec_on_cleanup:
+            client.a:
+              - "echo 'module libceph +p' > /sys/kernel/debug/dynamic_debug/control"
+              - "echo 'module ceph +p' > /sys/kernel/debug/dynamic_debug/control"
+        - interactive:
+
+    It stops and fails with the first command that does not succeed. It means
+    that if the first command fails, the second won't run at all.
+
+    To avoid confusion it is recommended to explicitly enclose the commands in
+    double quotes. For instance if the command is false (without double quotes) it will
+    be interpreted as a boolean by the YAML parser.
+
+    :param ctx: Context
+    :param config: Configuration
+    """
+    try:
+        yield
+    finally:
+        log.info('Executing custom commands...')
+        assert isinstance(config, dict), "task exec got invalid config"
+
+        testdir = teuthology.get_testdir(ctx)
+        # a lone 'all' key fans the same command list out to every role
+        if 'all' in config and len(config) == 1:
+            a = config['all']
+            roles = teuthology.all_roles(ctx.cluster)
+            config = dict((id_, a) for id_ in roles)
+
+        for role, ls in config.iteritems():
+            (remote,) = ctx.cluster.only(role).remotes.iterkeys()
+            log.info('Running commands on role %s host %s', role, remote.name)
+            for c in ls:
+                c = c.replace('$TESTDIR', testdir)
+                remote.run(
+                    args=[
+                        'sudo',
+                        'TESTDIR={tdir}'.format(tdir=testdir),
+                        'bash',
+                        '-c',
+                        c],
+                )
+
diff --git a/src/ceph/qa/tasks/filestore_idempotent.py b/src/ceph/qa/tasks/filestore_idempotent.py
new file mode 100644
index 0000000..4e2a228
--- /dev/null
+++ b/src/ceph/qa/tasks/filestore_idempotent.py
@@ -0,0 +1,81 @@
+"""
+Filestore/filejournal handler
+"""
+import logging
+from teuthology.orchestra import run
+import random
+
+from teuthology import misc as teuthology
+
+log = logging.getLogger(__name__)
+
+def task(ctx, config):
+    """
+    Exercise filestore/filejournal handling of non-idempotent events.
+
+    This is a kludge for now: the ceph task has to run before this one
+    so that the tarball needed to run the test binary is installed.
+
+    :param ctx: Context
+    :param config: Configuration
+    """
+    assert (config is None
+            or isinstance(config, (list, dict))), \
+        "task only supports a list or dictionary for configuration"
+    all_clients = ['client.{id}'.format(id=id_)
+                   for id_ in teuthology.all_roles_of_type(ctx.cluster, 'client')]
+    if config is None:
+        config = all_clients
+    if isinstance(config, list):
+        config = dict.fromkeys(config)
+
+    # only the first configured client is used
+    client = config.keys()[0]
+    (remote,) = ctx.cluster.only(client).remotes.iterkeys()
+
+    testdir = teuthology.get_testdir(ctx)
+
+    workdir = '%s/ceph.data/test.%s' % (testdir, client)
+
+    # starting seed handed to run_seed_to_range.sh
+    seed = str(int(random.uniform(1,100)))
+
+    try:
+        log.info('creating a working dir')
+        remote.run(args=['mkdir', workdir])
+        remote.run(
+            args=[
+                'cd', workdir,
+                run.Raw('&&'),
+                'wget','-q', '-Orun_seed_to.sh',
+                'http://git.ceph.com/?p=ceph.git;a=blob_plain;f=src/test/objectstore/run_seed_to.sh;hb=HEAD',
+                run.Raw('&&'),
+                'wget','-q', '-Orun_seed_to_range.sh',
+                'http://git.ceph.com/?p=ceph.git;a=blob_plain;f=src/test/objectstore/run_seed_to_range.sh;hb=HEAD',
+                run.Raw('&&'),
+                'chmod', '+x', 'run_seed_to.sh', 'run_seed_to_range.sh',
+                ])
+
+        log.info('running a series of tests')
+        proc = remote.run(
+            args=[
+                'cd', workdir,
+                run.Raw('&&'),
+                './run_seed_to_range.sh', seed, '50', '300',
+                ],
+            wait=False,
+            check_status=False)
+
+        # on failure, copy the working dir into the archive before raising
+        if proc.wait() != 0:
+            remote.run(
+                args=[
+                    'cp', '-a', workdir, '{tdir}/archive/idempotent_failure'.format(tdir=testdir),
+                    ])
+            raise Exception("./run_seed_to_range.sh errored out")
+
+    finally:
+        remote.run(args=[
+                'rm', '-rf', '--', workdir
+                ])
+
diff --git a/src/ceph/qa/tasks/kclient.py b/src/ceph/qa/tasks/kclient.py
new file mode 100644
index 0000000..7cc7ada
--- /dev/null
+++ b/src/ceph/qa/tasks/kclient.py
@@ -0,0 +1,137 @@
+"""
+Mount/unmount a ``kernel`` client.
+"""
+import contextlib
+import logging
+
+from teuthology.misc import deep_merge
+from teuthology.orchestra.run import CommandFailedError
+from teuthology import misc
+from teuthology.contextutil import MaxWhileTries
+from cephfs.kernel_mount import KernelMount
+
+log = logging.getLogger(__name__)
+
+@contextlib.contextmanager
+def task(ctx, config):
+    """
+    Mount/unmount a ``kernel`` client.
+
+    The config is optional and defaults to mounting on all clients. If
+    a config is given, it is expected to be a list of clients to do
+    this operation on. This lets you e.g. set up one client with
+    ``ceph-fuse`` and another with ``kclient``.
+
+    Example that mounts all clients::
+
+        tasks:
+        - ceph:
+        - kclient:
+        - interactive:
+
+    Example that uses both ``kclient`` and ``ceph-fuse``::
+
+        tasks:
+        - ceph:
+        - ceph-fuse: [client.0]
+        - kclient: [client.1]
+        - interactive:
+
+
+    Pass a dictionary instead of lists to specify per-client config:
+
+        tasks:
+        - kclient:
+            client.0:
+                debug: true
+
+    :param ctx: Context
+    :param config: Configuration
+    """
+    log.info('Mounting kernel clients...')
+    assert config is None or isinstance(config, list) or isinstance(config, dict), \
+        "task kclient got invalid config"
+
+    if config is None:
+        config = ['client.{id}'.format(id=id_)
+                  for id_ in misc.all_roles_of_type(ctx.cluster, 'client')]
+
+    if isinstance(config, list):
+        client_roles = config
+        config = dict([r, dict()] for r in client_roles)
+    elif isinstance(config, dict):
+        client_roles = filter(lambda x: 'client.' in x, config.keys())
+    else:
+        raise ValueError("Invalid config object: {0} ({1})".format(config, config.__class__))
+
+    # config has been converted to a dict by this point
+    overrides = ctx.config.get('overrides', {})
+    deep_merge(config, overrides.get('kclient', {}))
+
+    clients = list(misc.get_clients(ctx=ctx, roles=client_roles))
+
+    test_dir = misc.get_testdir(ctx)
+
+    # Assemble mon addresses
+    remotes_and_roles = ctx.cluster.remotes.items()
+    roles = [roles for (remote_, roles) in remotes_and_roles]
+    ips = [remote_.ssh.get_transport().getpeername()[0]
+           for (remote_, _) in remotes_and_roles]
+    mons = misc.get_mons(roles, ips).values()
+
+    mounts = {}
+    for id_, remote in clients:
+        client_config = config.get("client.%s" % id_)
+        if client_config is None:
+            client_config = {}
+
+        if config.get("disabled", False) or not client_config.get('mounted', True):
+            continue
+
+        kernel_mount = KernelMount(
+            mons,
+            test_dir,
+            id_,
+            remote,
+            ctx.teuthology_config.get('ipmi_user', None),
+            ctx.teuthology_config.get('ipmi_password', None),
+            ctx.teuthology_config.get('ipmi_domain', None)
+        )
+
+        mounts[id_] = kernel_mount
+
+        if client_config.get('debug', False):
+            remote.run(args=["sudo", "bash", "-c", "echo 'module ceph +p' > /sys/kernel/debug/dynamic_debug/control"])
+            remote.run(args=["sudo", "bash", "-c", "echo 'module libceph +p' > /sys/kernel/debug/dynamic_debug/control"])
+
+        kernel_mount.mount()
+
+
+    def umount_all():
+        log.info('Unmounting kernel clients...')
+
+        forced = False
+        for mount in mounts.values():
+            if mount.is_mounted():
+                try:
+                    mount.umount()
+                except (CommandFailedError, MaxWhileTries):
+                    log.warn("Ordinary umount failed, forcing...")
+                    forced = True
+                    mount.umount_wait(force=True)
+
+        return forced
+
+    ctx.mounts = mounts
+    try:
+        yield mounts
+    except:
+        umount_all()  # ignore forced retval, we are already in error handling
+        raise  # propagate the failure instead of swallowing it here
+    finally:
+        forced = umount_all()
+        if forced:
+            # The context managers within the kclient manager worked (i.e.
+            # the test workload passed) but for some reason we couldn't
+            # umount, so turn this into a test failure.
+            raise RuntimeError("Kernel mounts did not umount cleanly")
diff --git a/src/ceph/qa/tasks/locktest.py b/src/ceph/qa/tasks/locktest.py
new file mode 100755
index 0000000..9de5ba4
--- /dev/null
+++ b/src/ceph/qa/tasks/locktest.py
@@ -0,0 +1,134 @@
+"""
+locktests
+"""
+import logging
+
+from teuthology.orchestra import run
+from teuthology import misc as teuthology
+
+log = logging.getLogger(__name__)
+
+def task(ctx, config):
+    """
+    Run locktests, from the xfstests suite, on the given
+    clients. Whether the clients are ceph-fuse or kernel does not
+    matter, and the two clients can refer to the same mount.
+
+    The config is a list of two clients to run the locktest on. The
+    first client will be the host.
+
+    For example:
+       tasks:
+       - ceph:
+       - ceph-fuse: [client.0, client.1]
+       - locktest:
+           [client.0, client.1]
+
+    This task does not yield; there would be little point.
+
+    :param ctx: Context
+    :param config: Configuration
+    """
+
+    assert isinstance(config, list)
+    log.info('fetching and building locktests...')
+    (host,) = ctx.cluster.only(config[0]).remotes
+    (client,) = ctx.cluster.only(config[1]).remotes
+    ( _, _, host_id) = config[0].partition('.')
+    ( _, _, client_id) = config[1].partition('.')
+    testdir = teuthology.get_testdir(ctx)
+    hostmnt = '{tdir}/mnt.{id}'.format(tdir=testdir, id=host_id)
+    clientmnt = '{tdir}/mnt.{id}'.format(tdir=testdir, id=client_id)
+
+    try:
+        for client_name in config:
+            log.info('building on {client_}'.format(client_=client_name))
+            ctx.cluster.only(client_name).run(
+                args=[
+                    # explicitly does not support multiple autotest tasks
+                    # in a single run; the result archival would conflict
+                    'mkdir', '{tdir}/archive/locktest'.format(tdir=testdir),
+                    run.Raw('&&'),
+                    'mkdir', '{tdir}/locktest'.format(tdir=testdir),
+                    run.Raw('&&'),
+                    'wget',
+                    '-nv',
+                    'https://raw.github.com/gregsfortytwo/xfstests-ceph/master/src/locktest.c',
+                    '-O', '{tdir}/locktest/locktest.c'.format(tdir=testdir),
+                    run.Raw('&&'),
+                    'g++', '{tdir}/locktest/locktest.c'.format(tdir=testdir),
+                    '-o', '{tdir}/locktest/locktest'.format(tdir=testdir)
+                    ],
+                logger=log.getChild('locktest_client.{id}'.format(id=client_name)),
+                )
+
+        log.info('built locktest on each client')
+
+        host.run(args=['sudo', 'touch',
+                       '{mnt}/locktestfile'.format(mnt=hostmnt),
+                       run.Raw('&&'),
+                       'sudo', 'chown', 'ubuntu.ubuntu',
+                       '{mnt}/locktestfile'.format(mnt=hostmnt)
+                       ]
+                 )
+
+        log.info('starting on host')
+        hostproc = host.run(
+            args=[
+                '{tdir}/locktest/locktest'.format(tdir=testdir),
+                '-p', '6788',
+                '-d',
+                '{mnt}/locktestfile'.format(mnt=hostmnt),
+                ],
+            wait=False,
+            logger=log.getChild('locktest.host'),
+            )
+        log.info('starting on client')
+        (_,_,hostaddr) = host.name.partition('@')
+        clientproc = client.run(
+            args=[
+                '{tdir}/locktest/locktest'.format(tdir=testdir),
+                '-p', '6788',
+                '-d',
+                '-h', hostaddr,
+                '{mnt}/locktestfile'.format(mnt=clientmnt),
+                ],
+            logger=log.getChild('locktest.client'),
+            wait=False
+            )
+
+        hostresult = hostproc.wait()
+        clientresult = clientproc.wait()
+        if (hostresult != 0) or (clientresult != 0):
+            raise Exception("Did not pass locking test!")
+        log.info('finished locktest executable with results {r} and {s}'. \
+                     format(r=hostresult, s=clientresult))
+
+    finally:
+        log.info('cleaning up host dir')
+        host.run(
+            args=[
+                'mkdir', '-p', '{tdir}/locktest'.format(tdir=testdir),
+                run.Raw('&&'),
+                'rm', '-f', '{tdir}/locktest/locktest.c'.format(tdir=testdir),
+                run.Raw('&&'),
+                'rm', '-f', '{tdir}/locktest/locktest'.format(tdir=testdir),
+                run.Raw('&&'),
+                'rmdir', '{tdir}/locktest'
+                ],
+            logger=log.getChild('.{id}'.format(id=config[0])),
+            )
+        log.info('cleaning up client dir')
+        client.run(
+            args=[
+                'mkdir', '-p', '{tdir}/locktest'.format(tdir=testdir),
+                run.Raw('&&'),
+                'rm', '-f', '{tdir}/locktest/locktest.c'.format(tdir=testdir),
+                run.Raw('&&'),
+                'rm', '-f', '{tdir}/locktest/locktest'.format(tdir=testdir),
+                run.Raw('&&'),
+                'rmdir', '{tdir}/locktest'.format(tdir=testdir)
+                ],
+            logger=log.getChild('.{id}'.format(\
+                    id=config[1])),
+            )
diff --git a/src/ceph/qa/tasks/logrotate.conf b/src/ceph/qa/tasks/logrotate.conf
new file mode 100644
index 0000000..b0cb801
--- /dev/null
+++ b/src/ceph/qa/tasks/logrotate.conf
@@ -0,0 +1,13 @@
+/var/log/ceph/*{daemon_type}*.log {{
+    rotate 100
+    size {max_size}
+    compress
+    sharedscripts
+    postrotate
+        killall {daemon_type} -1 || true
+    endscript
+    missingok
+    notifempty
+    su root root
+}}
+
diff --git a/src/ceph/qa/tasks/lost_unfound.py b/src/ceph/qa/tasks/lost_unfound.py
new file mode 100644
index 0000000..1cc588b
--- /dev/null
+++ b/src/ceph/qa/tasks/lost_unfound.py
@@ -0,0 +1,176 @@
+"""
+Lost_unfound
+"""
+import logging
+import time
+import ceph_manager
+from teuthology import misc as teuthology
+from teuthology.orchestra import run
+from util.rados import rados
+
+log = logging.getLogger(__name__)
+
+def task(ctx, config):
+    """
+    Test handling of lost objects.
+
+    A pretty rigid cluster is brought up and tested by this task.
+
+    The task addresses osd.0, osd.1 and osd.2 by id and waits for at least
+    three OSDs to be up before starting. It writes objects while only some
+    of the OSDs are running, marks osd.1 lost, checks that unfound objects
+    are reported, reverts them with 'mark_unfound_lost revert' and verifies
+    which objects are still readable afterwards.
+
+    :param ctx: Context
+    :param config: optional dict; 'parallel_bench' (default True) runs a
+                   rados write bench in the background while objects are
+                   being marked lost
+    """
+    POOL = 'unfound_pool'
+    if config is None:
+        config = {}
+    assert isinstance(config, dict), \
+        'lost_unfound task only accepts a dict for configuration'
+    first_mon = teuthology.get_first_mon(ctx, config)
+    (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys()
+
+    manager = ceph_manager.CephManager(
+        mon,
+        ctx=ctx,
+        logger=log.getChild('ceph_manager'),
+        )
+
+    # poll every 10 seconds until at least three OSDs are up
+    while len(manager.get_osd_status()['up']) < 3:
+        time.sleep(10)
+
+    manager.wait_for_clean()
+
+    manager.create_pool(POOL)
+
+    # something that is always there
+    dummyfile = '/etc/fstab'
+
+    # take an osd out until the very end
+    manager.kill_osd(2)
+    manager.mark_down_osd(2)
+    manager.mark_out_osd(2)
+
+    # kludge to make sure they get a map
+    rados(ctx, mon, ['-p', POOL, 'put', 'dummy', dummyfile])
+
+    manager.flush_pg_stats([0, 1])
+    manager.wait_for_recovery()
+
+    # create old objects
+    # 'existing_N' is written and kept; 'existed_N' is written and removed
+    for f in range(1, 10):
+        rados(ctx, mon, ['-p', POOL, 'put', 'existing_%d' % f, dummyfile])
+        rados(ctx, mon, ['-p', POOL, 'put', 'existed_%d' % f, dummyfile])
+        rados(ctx, mon, ['-p', POOL, 'rm', 'existed_%d' % f])
+
+    # delay recovery, and make the pg log very long (to prevent backfill)
+    manager.raw_cluster_cmd(
+            'tell', 'osd.1',
+            'injectargs',
+            '--osd-recovery-delay-start 1000 --osd-min-pg-log-entries 100000000'
+            )
+
+    manager.kill_osd(0)
+    manager.mark_down_osd(0)
+    
+    # these writes happen while osd.0 and osd.2 are both down
+    for f in range(1, 10):
+        rados(ctx, mon, ['-p', POOL, 'put', 'new_%d' % f, dummyfile])
+        rados(ctx, mon, ['-p', POOL, 'put', 'existed_%d' % f, dummyfile])
+        rados(ctx, mon, ['-p', POOL, 'put', 'existing_%d' % f, dummyfile])
+
+    # bring osd.0 back up, let it peer, but don't replicate the new
+    # objects...
+    # NOTE(review): the first log line below always prints the literal 'foo';
+    # it looks like leftover debugging output.
+    log.info('osd.0 command_args is %s' % 'foo')
+    log.info(ctx.daemons.get_daemon('osd', 0).command_args)
+    ctx.daemons.get_daemon('osd', 0).command_kwargs['args'].extend([
+            '--osd-recovery-delay-start', '1000'
+            ])
+    manager.revive_osd(0)
+    manager.mark_in_osd(0)
+    manager.wait_till_osd_is_up(0)
+
+    manager.flush_pg_stats([1, 0])
+    manager.wait_till_active()
+
+    # take out osd.1 and the only copy of those objects.
+    manager.kill_osd(1)
+    manager.mark_down_osd(1)
+    manager.mark_out_osd(1)
+    manager.raw_cluster_cmd('osd', 'lost', '1', '--yes-i-really-mean-it')
+
+    # bring up osd.2 so that things would otherwise, in theory, recover fully
+    manager.revive_osd(2)
+    manager.mark_in_osd(2)
+    manager.wait_till_osd_is_up(2)
+
+    manager.flush_pg_stats([0, 2])
+    manager.wait_till_active()
+    manager.flush_pg_stats([0, 2])
+
+    # verify that there are unfound objects
+    unfound = manager.get_num_unfound_objects()
+    log.info("there are %d unfound objects" % unfound)
+    assert unfound
+
+    # optionally start a background write bench on the pool; it is started
+    # with wait=False and only collected by run.wait() at the very end
+    testdir = teuthology.get_testdir(ctx)
+    procs = []
+    if config.get('parallel_bench', True):
+        procs.append(mon.run(
+            args=[
+                "/bin/sh", "-c",
+                " ".join(['adjust-ulimits',
+                          'ceph-coverage',
+                          '{tdir}/archive/coverage',
+                          'rados',
+                          '--no-log-to-stderr',
+                          '--name', 'client.admin',
+                          '-b', str(4<<10),
+                          '-p' , POOL,
+                          '-t', '20',
+                          'bench', '240', 'write',
+                      ]).format(tdir=testdir),
+            ],
+            logger=log.getChild('radosbench.{id}'.format(id='client.admin')),
+            stdin=run.PIPE,
+            wait=False
+        ))
+    time.sleep(10)
+
+    # mark stuff lost
+    pgs = manager.get_pg_stats()
+    for pg in pgs:
+        if pg['stat_sum']['num_objects_unfound'] > 0:
+            primary = 'osd.%d' % pg['acting'][0]
+
+            # verify that i can list them direct from the osd
+            log.info('listing missing/lost in %s state %s', pg['pgid'],
+                     pg['state']);
+            m = manager.list_pg_missing(pg['pgid'])
+            #log.info('%s' % m)
+            assert m['num_unfound'] == pg['stat_sum']['num_objects_unfound']
+            # cross-check: count the listed objects that have no locations
+            num_unfound=0
+            for o in m['objects']:
+                if len(o['locations']) == 0:
+                    num_unfound += 1
+            assert m['num_unfound'] == num_unfound
+
+            log.info("reverting unfound in %s on %s", pg['pgid'], primary)
+            manager.raw_cluster_cmd('pg', pg['pgid'],
+                                    'mark_unfound_lost', 'revert')
+        else:
+            log.info("no unfound in %s", pg['pgid'])
+
+    manager.raw_cluster_cmd('tell', 'osd.0', 'debug', 'kick_recovery_wq', '5')
+    manager.raw_cluster_cmd('tell', 'osd.2', 'debug', 'kick_recovery_wq', '5')
+    manager.flush_pg_stats([0, 2])
+    manager.wait_for_recovery()
+
+    # verify result
+    # the asserts expect a truthy result from rados() when fetching 'new_N'
+    # and 'existed_N', and a falsy one when fetching 'existing_N'
+    for f in range(1, 10):
+        err = rados(ctx, mon, ['-p', POOL, 'get', 'new_%d' % f, '-'])
+        assert err
+        err = rados(ctx, mon, ['-p', POOL, 'get', 'existed_%d' % f, '-'])
+        assert err
+        err = rados(ctx, mon, ['-p', POOL, 'get', 'existing_%d' % f, '-'])
+        assert not err
+
+    # see if osd.1 can cope
+    manager.revive_osd(1)
+    manager.mark_in_osd(1)
+    manager.wait_till_osd_is_up(1)
+    manager.wait_for_clean()
+    run.wait(procs)
diff --git a/src/ceph/qa/tasks/manypools.py b/src/ceph/qa/tasks/manypools.py
new file mode 100644
index 0000000..1ddcba5
--- /dev/null
+++ b/src/ceph/qa/tasks/manypools.py
@@ -0,0 +1,73 @@
+"""
+Force pg creation on all osds
+"""
+from teuthology import misc as teuthology
+from teuthology.orchestra import run
+import logging
+
+log = logging.getLogger(__name__)
+
+def task(ctx, config):
+    """
+    Create the specified number of pools and write 16 objects to them (thereby forcing
+    the PG creation on each OSD). This task creates pools from all the clients,
+    in parallel. It is easy to add other daemon types which have the appropriate
+    permissions, but I don't think anything else does.
+    The config is just the number of pools to create. I recommend setting
+    "mon create pg interval" to a very low value in your ceph config to speed
+    this up.
+    
+    You probably want to do this to look at memory consumption, and
+    maybe to test how performance changes with the number of PGs. For example:
+    
+    tasks:
+    - ceph:
+        config:
+          mon:
+            mon create pg interval: 1
+    - manypools: 3000
+    - radosbench:
+        clients: [client.0]
+        time: 360
+    """
+    
+    log.info('creating {n} pools'.format(n=config))
+    
+    poolnum = int(config)
+    creator_remotes = []
+    client_roles = teuthology.all_roles_of_type(ctx.cluster, 'client')
+    log.info('got client_roles={client_roles_}'.format(client_roles_=client_roles))
+    for role in client_roles:
+        log.info('role={role_}'.format(role_=role))
+        (creator_remote, ) = ctx.cluster.only('client.{id}'.format(id=role)).remotes.iterkeys()
+        creator_remotes.append((creator_remote, 'client.{id}'.format(id=role)))
+
+    remaining_pools = poolnum
+    poolprocs=dict()
+    while (remaining_pools > 0):
+        log.info('{n} pools remaining to create'.format(n=remaining_pools))
+	for remote, role_ in creator_remotes:
+            poolnum = remaining_pools
+            remaining_pools -= 1
+            if remaining_pools < 0:
+                continue
+            log.info('creating pool{num} on {role}'.format(num=poolnum, role=role_))
+	    proc = remote.run(
+	        args=[
+		    'rados',
+		    '--name', role_,
+		    'mkpool', 'pool{num}'.format(num=poolnum), '-1',
+		    run.Raw('&&'),
+		    'rados',
+		    '--name', role_,
+		    '--pool', 'pool{num}'.format(num=poolnum),
+		    'bench', '0', 'write', '-t', '16', '--block-size', '1'
+		    ],
+		wait = False
+	    )
+            log.info('waiting for pool and object creates')
+	    poolprocs[remote] = proc
+        
+        run.wait(poolprocs.itervalues())
+    
+    log.info('created all {n} pools and wrote 16 objects to each'.format(n=poolnum))
diff --git a/src/ceph/qa/tasks/mds_creation_failure.py b/src/ceph/qa/tasks/mds_creation_failure.py
new file mode 100644
index 0000000..d1de156
--- /dev/null
+++ b/src/ceph/qa/tasks/mds_creation_failure.py
@@ -0,0 +1,85 @@
+
+import logging
+import contextlib
+import time
+import ceph_manager
+from teuthology import misc
+from teuthology.orchestra.run import CommandFailedError, Raw
+
+log = logging.getLogger(__name__)
+
+
+@contextlib.contextmanager
+def task(ctx, config):
+    """
+    Go through filesystem creation with a synthetic failure in an MDS
+    in its 'up:creating' state, to exercise the retry behaviour.
+    """
+    # Grab handles to the teuthology objects of interest
+    mdslist = list(misc.all_roles_of_type(ctx.cluster, 'mds'))
+    if len(mdslist) != 1:
+        # Require exactly one MDS, the code path for creation failure when
+        # a standby is available is different
+        raise RuntimeError("This task requires exactly one MDS")
+
+    mds_id = mdslist[0]
+    (mds_remote,) = ctx.cluster.only('mds.{_id}'.format(_id=mds_id)).remotes.iterkeys()
+    manager = ceph_manager.CephManager(
+        mds_remote, ctx=ctx, logger=log.getChild('ceph_manager'),
+    )
+
+    # Stop MDS
+    manager.raw_cluster_cmd('mds', 'set', "max_mds", "0")
+    mds = ctx.daemons.get_daemon('mds', mds_id)
+    mds.stop()
+    manager.raw_cluster_cmd('mds', 'fail', mds_id)
+
+    # Reset the filesystem so that next start will go into CREATING
+    manager.raw_cluster_cmd('fs', 'rm', "default", "--yes-i-really-mean-it")
+    manager.raw_cluster_cmd('fs', 'new', "default", "metadata", "data")
+
+    # Start the MDS with mds_kill_create_at set, it will crash during creation
+    mds.restart_with_args(["--mds_kill_create_at=1"])
+    try:
+        mds.wait_for_exit()
+    except CommandFailedError as e:
+        if e.exitstatus == 1:
+            log.info("MDS creation killed as expected")
+        else:
+            log.error("Unexpected status code %s" % e.exitstatus)
+            raise
+
+    # Since I have intentionally caused a crash, I will clean up the resulting core
+    # file to avoid task.internal.coredump seeing it as a failure.
+    log.info("Removing core file from synthetic MDS failure")
+    mds_remote.run(args=['rm', '-f', Raw("{archive}/coredump/*.core".format(archive=misc.get_archive_dir(ctx)))])
+
+    # It should have left the MDS map state still in CREATING
+    status = manager.get_mds_status(mds_id)
+    assert status['state'] == 'up:creating'
+
+    # Start the MDS again without the kill flag set, it should proceed with creation successfully
+    mds.restart()
+
+    # Wait for state ACTIVE
+    t = 0
+    create_timeout = 120
+    while True:
+        status = manager.get_mds_status(mds_id)
+        if status['state'] == 'up:active':
+            log.info("MDS creation completed successfully")
+            break
+        elif status['state'] == 'up:creating':
+            log.info("MDS still in creating state")
+            if t > create_timeout:
+                log.error("Creating did not complete within %ss" % create_timeout)
+                raise RuntimeError("Creating did not complete within %ss" % create_timeout)
+            t += 1
+            time.sleep(1)
+        else:
+            log.error("Unexpected MDS state: %s" % status['state'])
+            assert(status['state'] in ['up:active', 'up:creating'])
+
+    # The system should be back up in a happy healthy state, go ahead and run any further tasks
+    # inside this context.
+    yield
diff --git a/src/ceph/qa/tasks/mds_thrash.py b/src/ceph/qa/tasks/mds_thrash.py
new file mode 100644
index 0000000..75d236d
--- /dev/null
+++ b/src/ceph/qa/tasks/mds_thrash.py
@@ -0,0 +1,555 @@
+"""
+Thrash mds by simulating failures
+"""
+import logging
+import contextlib
+import ceph_manager
+import itertools
+import random
+import signal
+import time
+
+from gevent import sleep
+from gevent.greenlet import Greenlet
+from gevent.event import Event
+from teuthology import misc as teuthology
+
+from tasks.cephfs.filesystem import MDSCluster, Filesystem
+
+log = logging.getLogger(__name__)
+
+class DaemonWatchdog(Greenlet):
+    """
+    DaemonWatchdog::
+
+    Watch Ceph daemons for failures. If an extended failure is detected (i.e.
+    not intentional), then the watchdog will unmount file systems and send
+    SIGTERM to all daemons. The duration of an extended failure is configurable
+    with watchdog_daemon_timeout.
+
+    watchdog_daemon_timeout [default: 300]: number of seconds a daemon
+        is allowed to be failed before the watchdog will bark.
+    """
+
+    def __init__(self, ctx, manager, config, thrashers):
+        Greenlet.__init__(self)
+        self.ctx = ctx
+        self.config = config
+        self.e = None
+        self.logger = log.getChild('daemon_watchdog')
+        self.manager = manager
+        self.name = 'watchdog'
+        self.stopping = Event()
+        self.thrashers = thrashers
+
+    def _run(self):
+        try:
+            self.watch()
+        except Exception as e:
+            # See _run exception comment for MDSThrasher
+            self.e = e
+            self.logger.exception("exception:")
+            # allow successful completion so gevent doesn't see an exception...
+
+    def log(self, x):
+        """Write data to logger"""
+        self.logger.info(x)
+
+    def stop(self):
+        self.stopping.set()
+
+    def bark(self):
+        self.log("BARK! unmounting mounts and killing all daemons")
+        for mount in self.ctx.mounts.values():
+            try:
+                mount.umount_wait(force=True)
+            except:
+                self.logger.exception("ignoring exception:")
+        daemons = []
+        daemons.extend(filter(lambda daemon: daemon.running() and not daemon.proc.finished, self.ctx.daemons.iter_daemons_of_role('mds', cluster=self.manager.cluster)))
+        daemons.extend(filter(lambda daemon: daemon.running() and not daemon.proc.finished, self.ctx.daemons.iter_daemons_of_role('mon', cluster=self.manager.cluster)))
+        for daemon in daemons:
+            try:
+                daemon.signal(signal.SIGTERM)
+            except:
+                self.logger.exception("ignoring exception:")
+
+    def watch(self):
+        self.log("watchdog starting")
+        daemon_timeout = int(self.config.get('watchdog_daemon_timeout', 300))
+        daemon_failure_time = {}
+        while not self.stopping.is_set():
+            bark = False
+            now = time.time()
+
+            mons = self.ctx.daemons.iter_daemons_of_role('mon', cluster=self.manager.cluster)
+            mdss = self.ctx.daemons.iter_daemons_of_role('mds', cluster=self.manager.cluster)
+            clients = self.ctx.daemons.iter_daemons_of_role('client', cluster=self.manager.cluster)
+
+            #for daemon in mons:
+            #    self.log("mon daemon {role}.{id}: running={r}".format(role=daemon.role, id=daemon.id_, r=daemon.running() and not daemon.proc.finished))
+            #for daemon in mdss:
+            #    self.log("mds daemon {role}.{id}: running={r}".format(role=daemon.role, id=daemon.id_, r=daemon.running() and not daemon.proc.finished))
+
+            daemon_failures = []
+            daemon_failures.extend(filter(lambda daemon: daemon.running() and daemon.proc.finished, mons))
+            daemon_failures.extend(filter(lambda daemon: daemon.running() and daemon.proc.finished, mdss))
+            for daemon in daemon_failures:
+                name = daemon.role + '.' + daemon.id_
+                dt = daemon_failure_time.setdefault(name, (daemon, now))
+                assert dt[0] is daemon
+                delta = now-dt[1]
+                self.log("daemon {name} is failed for ~{t:.0f}s".format(name=name, t=delta))
+                if delta > daemon_timeout:
+                    bark = True
+
+            # If a daemon is no longer failed, remove it from tracking:
+            for name in daemon_failure_time.keys():
+                if name not in [d.role + '.' + d.id_ for d in daemon_failures]:
+                    self.log("daemon {name} has been restored".format(name=name))
+                    del daemon_failure_time[name]
+
+            for thrasher in self.thrashers:
+                if thrasher.e is not None:
+                    self.log("thrasher on fs.{name} failed".format(name=thrasher.fs.name))
+                    bark = True
+
+            if bark:
+                self.bark()
+                return
+
+            sleep(5)
+
+        self.log("watchdog finished")
+
+class MDSThrasher(Greenlet):
+    """
+    MDSThrasher::
+
+    The MDSThrasher thrashes MDSs during execution of other tasks (workunits, etc).
+
+    The config is optional.  Many of the config parameters are a maximum value
+    to use when selecting a random value from a range.  To always use the maximum
+    value, set randomize to false.  The config is a dict containing some or all of:
+
+    max_thrash: [default: 1] the maximum number of active MDSs per FS that will be thrashed at
+      any given time.
+
+    max_thrash_delay: [default: 120] maximum number of seconds to delay before
+      thrashing again.
+
+    max_replay_thrash_delay: [default: 4] maximum number of seconds to delay while in
+      the replay state before thrashing.
+
+    max_revive_delay: [default: 10] maximum number of seconds to delay before
+      bringing back a thrashed MDS.
+
+    randomize: [default: true] enables randomization and use the max/min values
+
+    seed: [no default] seed the random number generator
+
+    thrash_in_replay: [default: 0.0] likelihood that the MDS will be thrashed
+      during replay.  Value should be between 0.0 and 1.0.
+
+    thrash_max_mds: [default: 0.05] likelihood that the max_mds of the mds
+      cluster will be modified to a value [1, current) or (current, starting
+      max_mds]. When reduced, randomly selected MDSs other than rank 0 will be
+      deactivated to reach the new max_mds.  Value should be between 0.0 and 1.0.
+
+    thrash_while_stopping: [default: false] thrash an MDS while there
+      are MDS in up:stopping (because max_mds was changed and some
+      MDS were deactivated).
+
+    thrash_weights: allows specific MDSs to be thrashed more/less frequently.
+      This option overrides anything specified by max_thrash.  This option is a
+      dict containing mds.x: weight pairs.  For example, [mds.a: 0.7, mds.b:
+      0.3, mds.c: 0.0].  Each weight is a value from 0.0 to 1.0.  Any MDSs not
+      specified will be automatically given a weight of 0.0 (not thrashed).
+      For a given MDS, by default the thrasher delays for up to
+      max_thrash_delay, thrashes, waits for the MDS to recover, and iterates.
+      If a non-zero weight is specified for an MDS, for each iteration the
+      thrasher chooses whether to thrash during that iteration based on a
+      random value [0-1] not exceeding the weight of that MDS.
+
+    Examples::
+
+
+      The following example sets the likelihood that mds.a will be thrashed
+      to 80%, mds.b to 20%, and other MDSs will not be thrashed.  It also sets the
+      likelihood that an MDS will be thrashed in replay to 40%.
+      Thrash weights do not have to sum to 1.
+
+      tasks:
+      - ceph:
+      - mds_thrash:
+          thrash_weights:
+            - mds.a: 0.8
+            - mds.b: 0.2
+          thrash_in_replay: 0.4
+      - ceph-fuse:
+      - workunit:
+          clients:
+            all: [suites/fsx.sh]
+
+      The following example disables randomization, and uses the max delay values:
+
+      tasks:
+      - ceph:
+      - mds_thrash:
+          max_thrash_delay: 10
+          max_revive_delay: 1
+          max_replay_thrash_delay: 4
+
+    """
+
+    def __init__(self, ctx, manager, config, fs, max_mds):
+        Greenlet.__init__(self)
+
+        self.config = config
+        self.ctx = ctx
+        self.e = None
+        self.logger = log.getChild('fs.[{f}]'.format(f = fs.name))
+        self.fs = fs
+        self.manager = manager
+        self.max_mds = max_mds
+        self.name = 'thrasher.fs.[{f}]'.format(f = fs.name)
+        self.stopping = Event()
+
+        self.randomize = bool(self.config.get('randomize', True))
+        self.thrash_max_mds = float(self.config.get('thrash_max_mds', 0.05))
+        self.max_thrash = int(self.config.get('max_thrash', 1))
+        self.max_thrash_delay = float(self.config.get('thrash_delay', 120.0))
+        self.thrash_in_replay = float(self.config.get('thrash_in_replay', False))
+        assert self.thrash_in_replay >= 0.0 and self.thrash_in_replay <= 1.0, 'thrash_in_replay ({v}) must be between [0.0, 1.0]'.format(
+            v=self.thrash_in_replay)
+        self.max_replay_thrash_delay = float(self.config.get('max_replay_thrash_delay', 4.0))
+        self.max_revive_delay = float(self.config.get('max_revive_delay', 10.0))
+
+    def _run(self):
+        """
+        Greenlet entry point: run do_thrash() and, if it raises, store the
+        exception in self.e and log it instead of letting it propagate.
+        """
+        try:
+            self.do_thrash()
+        except Exception as e:
+            # Log exceptions here so we get the full backtrace (gevent loses them).
+            # Also allow successful completion as gevent exception handling is a broken mess:
+            #
+            # 2017-02-03T14:34:01.259 CRITICAL:root:  File "gevent.libev.corecext.pyx", line 367, in gevent.libev.corecext.loop.handle_error (src/gevent/libev/gevent.corecext.c:5051)
+            #   File "/home/teuthworker/src/git.ceph.com_git_teuthology_master/virtualenv/local/lib/python2.7/site-packages/gevent/hub.py", line 558, in handle_error
+            #     self.print_exception(context, type, value, tb)
+            #   File "/home/teuthworker/src/git.ceph.com_git_teuthology_master/virtualenv/local/lib/python2.7/site-packages/gevent/hub.py", line 605, in print_exception
+            #     traceback.print_exception(type, value, tb, file=errstream)
+            #   File "/usr/lib/python2.7/traceback.py", line 124, in print_exception
+            #     _print(file, 'Traceback (most recent call last):')
+            #   File "/usr/lib/python2.7/traceback.py", line 13, in _print
+            #     file.write(str+terminator)
+            # 2017-02-03T14:34:01.261 CRITICAL:root:IOError
+            self.e = e
+            self.logger.exception("exception:")
+            # allow successful completion so gevent doesn't see an exception...
+
+    def log(self, x):
+        """Write data to the logger assigned to this MDSThrasher (info level)"""
+        self.logger.info(x)
+
+    def stop(self):
+        """Set the stopping event that the thrash loop checks."""
+        self.stopping.set()
+
+    def kill_mds(self, mds):
+        if self.config.get('powercycle'):
+            (remote,) = (self.ctx.cluster.only('mds.{m}'.format(m=mds)).
+                         remotes.iterkeys())
+            self.log('kill_mds on mds.{m} doing powercycle of {s}'.
+                     format(m=mds, s=remote.name))
+            self._assert_ipmi(remote)
+            remote.console.power_off()
+        else:
+            self.ctx.daemons.get_daemon('mds', mds).stop()
+
+    @staticmethod
+    def _assert_ipmi(remote):
+        """
+        Assert that the remote's console has IPMI credentials, which the
+        powercycle code paths require before calling power_off()/power_on().
+        """
+        assert remote.console.has_ipmi_credentials, (
+            "powercycling requested but RemoteConsole is not "
+            "initialized.  Check ipmi config.")
+
+    def revive_mds(self, mds, standby_for_rank=None):
+        """
+        Revive mds -- do an ipmpi powercycle (if indicated by the config)
+        and then restart (using --hot-standby if specified.
+        """
+        if self.config.get('powercycle'):
+            (remote,) = (self.ctx.cluster.only('mds.{m}'.format(m=mds)).
+                         remotes.iterkeys())
+            self.log('revive_mds on mds.{m} doing powercycle of {s}'.
+                     format(m=mds, s=remote.name))
+            self._assert_ipmi(remote)
+            remote.console.power_on()
+            self.manager.make_admin_daemon_dir(self.ctx, remote)
+        args = []
+        if standby_for_rank:
+            args.extend(['--hot-standby', standby_for_rank])
+        self.ctx.daemons.get_daemon('mds', mds).restart(*args)
+
+    def wait_for_stable(self, rank = None, gid = None):
+        self.log('waiting for mds cluster to stabilize...')
+        for itercount in itertools.count():
+            status = self.fs.status()
+            max_mds = status.get_fsmap(self.fs.id)['mdsmap']['max_mds']
+            ranks = list(status.get_ranks(self.fs.id))
+            stopping = filter(lambda info: "up:stopping" == info['state'], ranks)
+            actives = filter(lambda info: "up:active" == info['state'] and "laggy_since" not in info, ranks)
+
+            if not bool(self.config.get('thrash_while_stopping', False)) and len(stopping) > 0:
+                if itercount % 5 == 0:
+                    self.log('cluster is considered unstable while MDS are in up:stopping (!thrash_while_stopping)')
+            else:
+                if rank is not None:
+                    try:
+                        info = status.get_rank(self.fs.id, rank)
+                        if info['gid'] != gid and "up:active" == info['state']:
+                            self.log('mds.{name} has gained rank={rank}, replacing gid={gid}'.format(name = info['name'], rank = rank, gid = gid))
+                            return status
+                    except:
+                        pass # no rank present
+                    if len(actives) >= max_mds:
+                        # no replacement can occur!
+                        self.log("cluster has %d actives (max_mds is %d), no MDS can replace rank %d".format(len(actives), max_mds, rank))
+                        return status
+                else:
+                    if len(actives) >= max_mds:
+                        self.log('mds cluster has {count} alive and active, now stable!'.format(count = len(actives)))
+                        return status, None
+            if itercount > 300/2: # 5 minutes
+                 raise RuntimeError('timeout waiting for cluster to stabilize')
+            elif itercount % 5 == 0:
+                self.log('mds map: {status}'.format(status=status))
+            else:
+                self.log('no change')
+            sleep(2)
+
+    def do_thrash(self):
+        """
+        Perform the random thrashing action
+        """
+
+        self.log('starting mds_do_thrash for fs {fs}'.format(fs = self.fs.name))
+        stats = {
+            "max_mds": 0,
+            "deactivate": 0,
+            "kill": 0,
+        }
+
+        while not self.stopping.is_set():
+            delay = self.max_thrash_delay
+            if self.randomize:
+                delay = random.randrange(0.0, self.max_thrash_delay)
+
+            if delay > 0.0:
+                self.log('waiting for {delay} secs before thrashing'.format(delay=delay))
+                self.stopping.wait(delay)
+                if self.stopping.is_set():
+                    continue
+
+            status = self.fs.status()
+
+            if random.random() <= self.thrash_max_mds:
+                max_mds = status.get_fsmap(self.fs.id)['mdsmap']['max_mds']
+                options = range(1, max_mds)+range(max_mds+1, self.max_mds+1)
+                if len(options) > 0:
+                    sample = random.sample(options, 1)
+                    new_max_mds = sample[0]
+                    self.log('thrashing max_mds: %d -> %d' % (max_mds, new_max_mds))
+                    self.fs.set_max_mds(new_max_mds)
+                    stats['max_mds'] += 1
+
+                    targets = filter(lambda r: r['rank'] >= new_max_mds, status.get_ranks(self.fs.id))
+                    if len(targets) > 0:
+                        # deactivate mds in decending order
+                        targets = sorted(targets, key=lambda r: r['rank'], reverse=True)
+                        for target in targets:
+                            self.log("deactivating rank %d" % target['rank'])
+                            self.fs.deactivate(target['rank'])
+                            stats['deactivate'] += 1
+                            status = self.wait_for_stable()[0]
+                    else:
+                        status = self.wait_for_stable()[0]
+
+            count = 0
+            for info in status.get_ranks(self.fs.id):
+                name = info['name']
+                label = 'mds.' + name
+                rank = info['rank']
+                gid = info['gid']
+
+                # if thrash_weights isn't specified and we've reached max_thrash,
+                # we're done
+                count = count + 1
+                if 'thrash_weights' not in self.config and count > self.max_thrash:
+                    break
+
+                weight = 1.0
+                if 'thrash_weights' in self.config:
+                    weight = self.config['thrash_weights'].get(label, '0.0')
+                skip = random.randrange(0.0, 1.0)
+                if weight <= skip:
+                    self.log('skipping thrash iteration with skip ({skip}) > weight ({weight})'.format(skip=skip, weight=weight))
+                    continue
+
+                self.log('kill {label} (rank={rank})'.format(label=label, rank=rank))
+                self.kill_mds(name)
+                stats['kill'] += 1
+
+                # wait for mon to report killed mds as crashed
+                last_laggy_since = None
+                itercount = 0
+                while True:
+                    status = self.fs.status()
+                    info = status.get_mds(name)
+                    if not info:
+                        break
+                    if 'laggy_since' in info:
+                        last_laggy_since = info['laggy_since']
+                        break
+                    if any([(f == name) for f in status.get_fsmap(self.fs.id)['mdsmap']['failed']]):
+                        break
+                    self.log(
+                        'waiting till mds map indicates {label} is laggy/crashed, in failed state, or {label} is removed from mdsmap'.format(
+                            label=label))
+                    itercount = itercount + 1
+                    if itercount > 10:
+                        self.log('mds map: {status}'.format(status=status))
+                    sleep(2)
+
+                if last_laggy_since:
+                    self.log(
+                        '{label} reported laggy/crashed since: {since}'.format(label=label, since=last_laggy_since))
+                else:
+                    self.log('{label} down, removed from mdsmap'.format(label=label, since=last_laggy_since))
+
+                # wait for a standby mds to takeover and become active
+                status = self.wait_for_stable(rank, gid)
+
+                # wait for a while before restarting old active to become new
+                # standby
+                delay = self.max_revive_delay
+                if self.randomize:
+                    delay = random.randrange(0.0, self.max_revive_delay)
+
+                self.log('waiting for {delay} secs before reviving {label}'.format(
+                    delay=delay, label=label))
+                sleep(delay)
+
+                self.log('reviving {label}'.format(label=label))
+                self.revive_mds(name)
+
+                for itercount in itertools.count():
+                    if itercount > 300/2: # 5 minutes
+                        raise RuntimeError('timeout waiting for MDS to revive')
+                    status = self.fs.status()
+                    info = status.get_mds(name)
+                    if info and info['state'] in ('up:standby', 'up:standby-replay', 'up:active'):
+                        self.log('{label} reported in {state} state'.format(label=label, state=info['state']))
+                        break
+                    self.log(
+                        'waiting till mds map indicates {label} is in active, standby or standby-replay'.format(label=label))
+                    sleep(2)
+
+        for stat in stats:
+            self.log("stat['{key}'] = {value}".format(key = stat, value = stats[stat]))
+
+             # don't do replay thrashing right now
+#            for info in status.get_replays(self.fs.id):
+#                # this might race with replay -> active transition...
+#                if status['state'] == 'up:replay' and random.randrange(0.0, 1.0) < self.thrash_in_replay:
+#                    delay = self.max_replay_thrash_delay
+#                    if self.randomize:
+#                        delay = random.randrange(0.0, self.max_replay_thrash_delay)
+#                sleep(delay)
+#                self.log('kill replaying mds.{id}'.format(id=self.to_kill))
+#                self.kill_mds(self.to_kill)
+#
+#                delay = self.max_revive_delay
+#                if self.randomize:
+#                    delay = random.randrange(0.0, self.max_revive_delay)
+#
+#                self.log('waiting for {delay} secs before reviving mds.{id}'.format(
+#                    delay=delay, id=self.to_kill))
+#                sleep(delay)
+#
+#                self.log('revive mds.{id}'.format(id=self.to_kill))
+#                self.revive_mds(self.to_kill)
+
+
+@contextlib.contextmanager
+def task(ctx, config):
+    """
+    Stress test the mds by thrashing while another task/workunit
+    is running.
+
+    Please refer to MDSThrasher class for further information on the
+    available options.
+    """
+
+    mds_cluster = MDSCluster(ctx)
+
+    if config is None:
+        config = {}
+    assert isinstance(config, dict), \
+        'mds_thrash task only accepts a dict for configuration'
+    mdslist = list(teuthology.all_roles_of_type(ctx.cluster, 'mds'))
+    assert len(mdslist) > 1, \
+        'mds_thrash task requires at least 2 metadata servers'
+
+    # choose random seed
+    if 'seed' in config:
+        seed = int(config['seed'])
+    else:
+        seed = int(time.time())
+    log.info('mds thrasher using random seed: {seed}'.format(seed=seed))
+    random.seed(seed)
+
+    (first,) = ctx.cluster.only('mds.{_id}'.format(_id=mdslist[0])).remotes.iterkeys()
+    manager = ceph_manager.CephManager(
+        first, ctx=ctx, logger=log.getChild('ceph_manager'),
+    )
+
+    # make sure everyone is in active, standby, or standby-replay
+    log.info('Wait for all MDSs to reach steady state...')
+    status = mds_cluster.status()
+    while True:
+        steady = True
+        for info in status.get_all():
+            state = info['state']
+            if state not in ('up:active', 'up:standby', 'up:standby-replay'):
+                steady = False
+                break
+        if steady:
+            break
+        sleep(2)
+        status = mds_cluster.status()
+    log.info('Ready to start thrashing')
+
+    thrashers = []
+
+    watchdog = DaemonWatchdog(ctx, manager, config, thrashers)
+    watchdog.start()
+
+    manager.wait_for_clean()
+    assert manager.is_clean()
+    for fs in status.get_filesystems():
+        thrasher = MDSThrasher(ctx, manager, config, Filesystem(ctx, fs['id']), fs['mdsmap']['max_mds'])
+        thrasher.start()
+        thrashers.append(thrasher)
+
+    try:
+        log.debug('Yielding')
+        yield
+    finally:
+        log.info('joining mds_thrashers')
+        for thrasher in thrashers:
+            thrasher.stop()
+            if thrasher.e:
+                raise RuntimeError('error during thrashing')
+            thrasher.join()
+        log.info('done joining')
+
+        watchdog.stop()
+        watchdog.join()
diff --git a/src/ceph/qa/tasks/metadata.yaml b/src/ceph/qa/tasks/metadata.yaml
new file mode 100644
index 0000000..ccdc3b0
--- /dev/null
+++ b/src/ceph/qa/tasks/metadata.yaml
@@ -0,0 +1,2 @@
+instance-id: test
+local-hostname: test
diff --git a/src/ceph/qa/tasks/mgr/__init__.py b/src/ceph/qa/tasks/mgr/__init__.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/src/ceph/qa/tasks/mgr/__init__.py
diff --git a/src/ceph/qa/tasks/mgr/mgr_test_case.py b/src/ceph/qa/tasks/mgr/mgr_test_case.py
new file mode 100644
index 0000000..ec3f98d
--- /dev/null
+++ b/src/ceph/qa/tasks/mgr/mgr_test_case.py
@@ -0,0 +1,170 @@
+
+from unittest import case
+import json
+import logging
+
+from teuthology import misc
+from tasks.ceph_test_case import CephTestCase
+
+# TODO move definition of CephCluster away from the CephFS stuff
+from tasks.cephfs.filesystem import CephCluster
+
+
+log = logging.getLogger(__name__)
+
+
+class MgrCluster(CephCluster):
+    def __init__(self, ctx):
+        super(MgrCluster, self).__init__(ctx)
+        self.mgr_ids = list(misc.all_roles_of_type(ctx.cluster, 'mgr'))
+
+        if len(self.mgr_ids) == 0:
+            raise RuntimeError(
+                "This task requires at least one manager daemon")
+
+        self.mgr_daemons = dict(
+            [(mgr_id, self._ctx.daemons.get_daemon('mgr', mgr_id)) for mgr_id
+             in self.mgr_ids])
+
+    def mgr_stop(self, mgr_id):
+        self.mgr_daemons[mgr_id].stop()
+
+    def mgr_fail(self, mgr_id):
+        self.mon_manager.raw_cluster_cmd("mgr", "fail", mgr_id)
+
+    def mgr_restart(self, mgr_id):
+        self.mgr_daemons[mgr_id].restart()
+
+    def get_mgr_map(self):
+        status = json.loads(
+            self.mon_manager.raw_cluster_cmd("status", "--format=json-pretty"))
+
+        return status["mgrmap"]
+
+    def get_active_id(self):
+        return self.get_mgr_map()["active_name"]
+
+    def get_standby_ids(self):
+        return [s['name'] for s in self.get_mgr_map()["standbys"]]
+
+    def set_module_localized_conf(self, module, mgr_id, key, val):
+        self.mon_manager.raw_cluster_cmd("config-key", "set",
+                                         "mgr/{0}/{1}/{2}".format(
+                                             module, mgr_id, key
+                                         ), val)
+
+
+class MgrTestCase(CephTestCase):
+    MGRS_REQUIRED = 1
+
+    def setUp(self):
+        super(MgrTestCase, self).setUp()
+
+        # The test runner should have populated this
+        assert self.mgr_cluster is not None
+
+        if len(self.mgr_cluster.mgr_ids) < self.MGRS_REQUIRED:
+            raise case.SkipTest("Only have {0} manager daemons, "
+                                "{1} are required".format(
+                len(self.mgr_cluster.mgr_ids), self.MGRS_REQUIRED))
+
+        # Restart all the daemons
+        for daemon in self.mgr_cluster.mgr_daemons.values():
+            daemon.stop()
+
+        for mgr_id in self.mgr_cluster.mgr_ids:
+            self.mgr_cluster.mgr_fail(mgr_id)
+
+        for daemon in self.mgr_cluster.mgr_daemons.values():
+            daemon.restart()
+
+        # Wait for an active to come up
+        self.wait_until_true(lambda: self.mgr_cluster.get_active_id() != "",
+                             timeout=20)
+
+        expect_standbys = set(self.mgr_cluster.mgr_ids) \
+                          - {self.mgr_cluster.get_active_id()}
+        self.wait_until_true(
+            lambda: set(self.mgr_cluster.get_standby_ids()) == expect_standbys,
+            timeout=20)
+
+    def _load_module(self, module_name):
+        loaded = json.loads(self.mgr_cluster.mon_manager.raw_cluster_cmd(
+                   "mgr", "module", "ls"))['enabled_modules']
+        if module_name in loaded:
+            # The enable command is idempotent, but our wait for a restart
+            # isn't, so let's return now if it's already loaded
+            return
+
+        initial_gid = self.mgr_cluster.get_mgr_map()['active_gid']
+        self.mgr_cluster.mon_manager.raw_cluster_cmd("mgr", "module", "enable",
+                                         module_name)
+
+        # Wait for the module to load
+        def has_restarted():
+            mgr_map = self.mgr_cluster.get_mgr_map()
+            done = mgr_map['active_gid'] != initial_gid and mgr_map['available']
+            if done:
+                log.info("Restarted after module load (new active {0}/{1})".format(
+                    mgr_map['active_name'] , mgr_map['active_gid']))
+            return done
+        self.wait_until_true(has_restarted, timeout=30)
+
+
+    def _get_uri(self, service_name):
+        # Little dict hack so that I can assign into this from
+        # the get_or_none function
+        mgr_map = {'x': None}
+
+        def _get_or_none():
+            mgr_map['x'] = self.mgr_cluster.get_mgr_map()
+            result = mgr_map['x']['services'].get(service_name, None)
+            return result
+
+        self.wait_until_true(lambda: _get_or_none() is not None, 30)
+
+        uri = mgr_map['x']['services'][service_name]
+
+        log.info("Found {0} at {1} (daemon {2}/{3})".format(
+            service_name, uri, mgr_map['x']['active_name'],
+            mgr_map['x']['active_gid']))
+
+        return uri
+
+
+    def _assign_ports(self, module_name, config_name, min_port=7789):
+        """
+        To avoid the need to run lots of hosts in teuthology tests to
+        get different URLs per mgr, we will hand out different ports
+        to each mgr here.
+
+        This is already taken care of for us when running in a vstart
+        environment.
+        """
+        # Start handing out ports well above Ceph's range.
+        assign_port = min_port
+
+        for mgr_id in self.mgr_cluster.mgr_ids:
+            self.mgr_cluster.mgr_stop(mgr_id)
+            self.mgr_cluster.mgr_fail(mgr_id)
+
+        for mgr_id in self.mgr_cluster.mgr_ids:
+            log.info("Using port {0} for {1} on mgr.{2}".format(
+                assign_port, module_name, mgr_id
+            ))
+            self.mgr_cluster.set_module_localized_conf(module_name, mgr_id,
+                                                       config_name,
+                                                       str(assign_port))
+            assign_port += 1
+
+        for mgr_id in self.mgr_cluster.mgr_ids:
+            self.mgr_cluster.mgr_restart(mgr_id)
+
+        def is_available():
+            mgr_map = self.mgr_cluster.get_mgr_map()
+            done = mgr_map['available']
+            if done:
+                log.info("Available after assign ports (new active {0}/{1})".format(
+                    mgr_map['active_name'] , mgr_map['active_gid']))
+            return done
+        self.wait_until_true(is_available, timeout=30)
diff --git a/src/ceph/qa/tasks/mgr/test_dashboard.py b/src/ceph/qa/tasks/mgr/test_dashboard.py
new file mode 100644
index 0000000..3b8a2cc
--- /dev/null
+++ b/src/ceph/qa/tasks/mgr/test_dashboard.py
@@ -0,0 +1,70 @@
+
+
+from mgr_test_case import MgrTestCase
+
+import logging
+import requests
+
+
+log = logging.getLogger(__name__)
+
+
+class TestDashboard(MgrTestCase):
+    MGRS_REQUIRED = 3
+
+    def test_standby(self):
+        self._assign_ports("dashboard", "server_port")
+        self._load_module("dashboard")
+
+        original_active = self.mgr_cluster.get_active_id()
+
+        original_uri = self._get_uri("dashboard")
+        log.info("Originally running at {0}".format(original_uri))
+
+        self.mgr_cluster.mgr_fail(original_active)
+
+        failed_over_uri = self._get_uri("dashboard")
+        log.info("After failover running at {0}".format(original_uri))
+
+        self.assertNotEqual(original_uri, failed_over_uri)
+
+        # The original active daemon should have come back up as a standby
+        # and be doing redirects to the new active daemon
+        r = requests.get(original_uri, allow_redirects=False)
+        self.assertEqual(r.status_code, 303)
+        self.assertEqual(r.headers['Location'], failed_over_uri)
+
+    def test_urls(self):
+        self._assign_ports("dashboard", "server_port")
+        self._load_module("dashboard")
+
+        base_uri = self._get_uri("dashboard")
+
+        # This is a very simple smoke test to check that the dashboard can
+        # give us a 200 response to requests.  We're not testing that
+        # the content is correct or even renders!
+
+        urls = [
+            "/health",
+            "/servers",
+            "/osd/",
+            "/osd/perf/0",
+            "/rbd_mirroring",
+            "/rbd_iscsi"
+        ]
+
+        failures = []
+
+        for url in urls:
+            r = requests.get(base_uri + url, allow_redirects=False)
+            if r.status_code >= 300 and r.status_code < 400:
+                log.error("Unexpected redirect to: {0} (from {1})".format(
+                    r.headers['Location'], base_uri))
+            if r.status_code != 200:
+                failures.append(url)
+
+            log.info("{0}: {1} ({2} bytes)".format(
+                url, r.status_code, len(r.content)
+            ))
+
+        self.assertListEqual(failures, [])
diff --git a/src/ceph/qa/tasks/mgr/test_failover.py b/src/ceph/qa/tasks/mgr/test_failover.py
new file mode 100644
index 0000000..0dd9cb7
--- /dev/null
+++ b/src/ceph/qa/tasks/mgr/test_failover.py
@@ -0,0 +1,144 @@
+
+import logging
+import json
+
+from tasks.mgr.mgr_test_case import MgrTestCase
+
+
+log = logging.getLogger(__name__)
+
+
+class TestFailover(MgrTestCase):
+    MGRS_REQUIRED = 2
+
+    def test_timeout(self):
+        """
+        That when an active mgr stops responding, a standby is promoted
+        after mon_mgr_beacon_grace.
+        """
+
+        # Query which mgr is active
+        original_active = self.mgr_cluster.get_active_id()
+        original_standbys = self.mgr_cluster.get_standby_ids()
+
+        # Stop that daemon
+        self.mgr_cluster.mgr_stop(original_active)
+
+        # Assert that the other mgr becomes active
+        self.wait_until_true(
+            lambda: self.mgr_cluster.get_active_id() in original_standbys,
+            timeout=60
+        )
+
+        self.mgr_cluster.mgr_restart(original_active)
+        self.wait_until_true(
+            lambda: original_active in self.mgr_cluster.get_standby_ids(),
+            timeout=10
+        )
+
+    def test_timeout_nostandby(self):
+        """
+        That when an active mgr stop responding, and no standby is
+        available, the active mgr is removed from the map anyway.
+        """
+        # Query which mgr is active
+        original_active = self.mgr_cluster.get_active_id()
+        original_standbys = self.mgr_cluster.get_standby_ids()
+
+        for s in original_standbys:
+            self.mgr_cluster.mgr_stop(s)
+            self.mgr_cluster.mgr_fail(s)
+
+        self.assertListEqual(self.mgr_cluster.get_standby_ids(), [])
+        self.assertEqual(self.mgr_cluster.get_active_id(), original_active)
+
+        grace = int(self.mgr_cluster.get_config("mon_mgr_beacon_grace"))
+        log.info("Should time out in about {0} seconds".format(grace))
+
+        self.mgr_cluster.mgr_stop(original_active)
+
+        # Now wait for the mon to notice the mgr is gone and remove it
+        # from the map.
+        self.wait_until_equal(
+            lambda: self.mgr_cluster.get_active_id(),
+            "",
+            timeout=grace * 2
+        )
+
+        self.assertListEqual(self.mgr_cluster.get_standby_ids(), [])
+        self.assertEqual(self.mgr_cluster.get_active_id(), "")
+
+    def test_explicit_fail(self):
+        """
+        That when a user explicitly fails a daemon, a standby immediately
+        replaces it.
+        :return:
+        """
+        # Query which mgr is active
+        original_active = self.mgr_cluster.get_active_id()
+        original_standbys = self.mgr_cluster.get_standby_ids()
+
+        self.mgr_cluster.mgr_fail(original_active)
+
+        # A standby should take over
+        self.wait_until_true(
+            lambda: self.mgr_cluster.get_active_id() in original_standbys,
+            timeout=60
+        )
+
+        # The one we failed should come back as a standby (he isn't
+        # really dead)
+        self.wait_until_true(
+            lambda: original_active in self.mgr_cluster.get_standby_ids(),
+            timeout=10
+        )
+
+        # Both daemons should have fully populated metadata
+        # (regression test for http://tracker.ceph.com/issues/21260)
+        meta = json.loads(self.mgr_cluster.mon_manager.raw_cluster_cmd(
+            "mgr", "metadata"))
+        id_to_meta = dict([(i['id'], i) for i in meta])
+        for i in [original_active] + original_standbys:
+            self.assertIn(i, id_to_meta)
+            self.assertIn('ceph_version', id_to_meta[i])
+
+        # We should be able to fail back over again: the exercises
+        # our re-initialization of the python runtime within
+        # a single process lifetime.
+
+        # Get rid of any bystander standbys so that the original_active
+        # will be selected as next active.
+        new_active = self.mgr_cluster.get_active_id()
+        for daemon in original_standbys:
+            if daemon != new_active:
+                self.mgr_cluster.mgr_stop(daemon)
+                self.mgr_cluster.mgr_fail(daemon)
+
+        self.assertListEqual(self.mgr_cluster.get_standby_ids(),
+                             [original_active])
+
+        self.mgr_cluster.mgr_stop(new_active)
+        self.mgr_cluster.mgr_fail(new_active)
+
+        self.assertEqual(self.mgr_cluster.get_active_id(), original_active)
+        self.assertEqual(self.mgr_cluster.get_standby_ids(), [])
+
+    def test_standby_timeout(self):
+        """
+        That when a standby daemon stops sending beacons, it is
+        removed from the list of standbys
+        :return:
+        """
+        original_active = self.mgr_cluster.get_active_id()
+        original_standbys = self.mgr_cluster.get_standby_ids()
+
+        victim = original_standbys[0]
+        self.mgr_cluster.mgr_stop(victim)
+
+        expect_standbys = set(original_standbys) - {victim}
+
+        self.wait_until_true(
+            lambda: set(self.mgr_cluster.get_standby_ids()) == expect_standbys,
+            timeout=60
+        )
+        self.assertEqual(self.mgr_cluster.get_active_id(), original_active)
diff --git a/src/ceph/qa/tasks/mgr/test_module_selftest.py b/src/ceph/qa/tasks/mgr/test_module_selftest.py
new file mode 100644
index 0000000..2776fb8
--- /dev/null
+++ b/src/ceph/qa/tasks/mgr/test_module_selftest.py
@@ -0,0 +1,74 @@
+
+import time
+import requests
+
+from tasks.mgr.mgr_test_case import MgrTestCase
+
+
+class TestModuleSelftest(MgrTestCase):
+    """
+    That modules with a self-test command can be loaded and execute it
+    without errors.
+
+    This is not a substitute for really testing the modules, but it
+    is quick and is designed to catch regressions that could occur
+    if data structures change in a way that breaks how the modules
+    touch them.
+    """
+    MGRS_REQUIRED = 1
+
+    def _selftest_plugin(self, module_name):
+        self._load_module(module_name)
+
+        # Execute the module's self-test routine
+        self.mgr_cluster.mon_manager.raw_cluster_cmd(module_name, "self-test")
+
+    def test_zabbix(self):
+        self._selftest_plugin("zabbix")
+
+    def test_prometheus(self):
+        self._selftest_plugin("prometheus")
+
+    def test_influx(self):
+        self._selftest_plugin("influx")
+
+    def test_selftest_run(self):
+        self._load_module("selftest")
+        self.mgr_cluster.mon_manager.raw_cluster_cmd("mgr", "self-test", "run")
+
+    def test_selftest_command_spam(self):
+        # Use the selftest module to stress the mgr daemon
+        self._load_module("selftest")
+
+        # Use the dashboard to test that the mgr is still able to do its job
+        self._assign_ports("dashboard", "server_port")
+        self._load_module("dashboard")
+
+        original_active = self.mgr_cluster.get_active_id()
+        original_standbys = self.mgr_cluster.get_standby_ids()
+
+        self.mgr_cluster.mon_manager.raw_cluster_cmd("mgr", "self-test",
+                                                     "background", "start",
+                                                     "command_spam")
+
+        dashboard_uri = self._get_uri("dashboard")
+
+        delay = 10
+        periods = 10
+        for i in range(0, periods):
+            t1 = time.time()
+            # Check that an HTTP module remains responsive
+            r = requests.get(dashboard_uri)
+            self.assertEqual(r.status_code, 200)
+
+            # Check that a native non-module command remains responsive
+            self.mgr_cluster.mon_manager.raw_cluster_cmd("osd", "df")
+
+            time.sleep(delay - (time.time() - t1))
+
+        self.mgr_cluster.mon_manager.raw_cluster_cmd("mgr", "self-test",
+                                                     "background", "stop")
+
+        # Check that all mgr daemons are still running
+        self.assertEqual(original_active, self.mgr_cluster.get_active_id())
+        self.assertEqual(original_standbys, self.mgr_cluster.get_standby_ids())
diff --git a/src/ceph/qa/tasks/mon_clock_skew_check.py b/src/ceph/qa/tasks/mon_clock_skew_check.py
new file mode 100644
index 0000000..547339f
--- /dev/null
+++ b/src/ceph/qa/tasks/mon_clock_skew_check.py
@@ -0,0 +1,76 @@
+"""
+Handle clock skews in monitors.
+"""
+import logging
+import contextlib
+import ceph_manager
+import time
+import gevent
+from StringIO import StringIO
+from teuthology import misc as teuthology
+
+log = logging.getLogger(__name__)
+
+class ClockSkewCheck:
+    """
+    Check if there are any clock skews among the monitors in the
+    quorum.
+
+    This task accepts the following options:
+
+    interval     amount of seconds to wait before check. (default: 30.0)
+    expect-skew  'true' or 'false', to indicate whether to expect a skew during
+                 the run or not. If 'true', the test will fail if no skew is
+                 found, and succeed if a skew is indeed found; if 'false', it's
+                 the other way around. (default: false)
+
+    - mon_clock_skew_check:
+        expect-skew: true
+    """
+
+    def __init__(self, ctx, manager, config, logger):
+        self.ctx = ctx
+        self.manager = manager
+
+        self.stopping = False
+        self.logger = logger
+        self.config = config
+
+        if self.config is None:
+            self.config = dict()
+
+
+def task(ctx, config):
+    if config is None:
+        config = {}
+    assert isinstance(config, dict), \
+        'mon_clock_skew_check task only accepts a dict for configuration'
+    interval = float(config.get('interval', 30.0))
+    expect_skew = config.get('expect-skew', False)
+
+    log.info('Beginning mon_clock_skew_check...')
+    first_mon = teuthology.get_first_mon(ctx, config)
+    (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys()
+    manager = ceph_manager.CephManager(
+        mon,
+        ctx=ctx,
+        logger=log.getChild('ceph_manager'),
+        )
+
+    quorum_size = len(teuthology.get_mon_names(ctx))
+    manager.wait_for_mon_quorum_size(quorum_size)
+
+    # wait a bit
+    log.info('sleeping for {s} seconds'.format(
+        s=interval))
+    time.sleep(interval)
+
+    health = manager.get_mon_health(True)
+    log.info('got health %s' % health)
+    if expect_skew:
+        if 'MON_CLOCK_SKEW' not in health['checks']:
+            raise RuntimeError('expected MON_CLOCK_SKEW but got none')
+    else:
+        if 'MON_CLOCK_SKEW' in health['checks']:
+            raise RuntimeError('got MON_CLOCK_SKEW but expected none')
+
diff --git a/src/ceph/qa/tasks/mon_recovery.py b/src/ceph/qa/tasks/mon_recovery.py
new file mode 100644
index 0000000..bfa2cdf
--- /dev/null
+++ b/src/ceph/qa/tasks/mon_recovery.py
@@ -0,0 +1,80 @@
+"""
+Monitor recovery
+"""
+import logging
+import ceph_manager
+from teuthology import misc as teuthology
+
+
+log = logging.getLogger(__name__)
+
+def task(ctx, config):
+    """
+    Test monitor recovery.
+    """
+    if config is None:
+        config = {}
+    assert isinstance(config, dict), \
+        'task only accepts a dict for configuration'
+    first_mon = teuthology.get_first_mon(ctx, config)
+    (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys()
+
+    manager = ceph_manager.CephManager(
+        mon,
+        ctx=ctx,
+        logger=log.getChild('ceph_manager'),
+        )
+
+    mons = [f.split('.')[1] for f in teuthology.get_mon_names(ctx)]
+    log.info("mon ids = %s" % mons)
+
+    manager.wait_for_mon_quorum_size(len(mons))
+
+    log.info('verifying all monitors are in the quorum')
+    for m in mons:
+        s = manager.get_mon_status(m)
+        assert s['state'] == 'leader' or s['state'] == 'peon'
+        assert len(s['quorum']) == len(mons)
+
+    log.info('restarting each monitor in turn')
+    for m in mons:
+        # stop a monitor
+        manager.kill_mon(m)
+        manager.wait_for_mon_quorum_size(len(mons) - 1)
+
+        # restart
+        manager.revive_mon(m)
+        manager.wait_for_mon_quorum_size(len(mons))
+
+    # in forward and reverse order,
+    rmons = mons
+    rmons.reverse()
+    for mons in mons, rmons:
+        log.info('stopping all monitors')
+        for m in mons:
+            manager.kill_mon(m)
+
+        log.info('forming a minimal quorum for %s, then adding monitors' % mons)
+        qnum = (len(mons) / 2) + 1
+        num = 0
+        for m in mons:
+            manager.revive_mon(m)
+            num += 1
+            if num >= qnum:
+                manager.wait_for_mon_quorum_size(num)
+
+    # on both leader and non-leader ranks...
+    for rank in [0, 1]:
+        # take one out
+        log.info('removing mon %s' % mons[rank])
+        manager.kill_mon(mons[rank])
+        manager.wait_for_mon_quorum_size(len(mons) - 1)
+
+        log.info('causing some monitor log activity')
+        m = 30
+        for n in range(1, m):
+            manager.raw_cluster_cmd('log', '%d of %d' % (n, m))
+
+        log.info('adding mon %s back in' % mons[rank])
+        manager.revive_mon(mons[rank])
+        manager.wait_for_mon_quorum_size(len(mons))
diff --git a/src/ceph/qa/tasks/mon_seesaw.py b/src/ceph/qa/tasks/mon_seesaw.py
new file mode 100644
index 0000000..b101c0e
--- /dev/null
+++ b/src/ceph/qa/tasks/mon_seesaw.py
@@ -0,0 +1,198 @@
+from cStringIO import StringIO
+
+import contextlib
+import logging
+import random
+
+from teuthology import misc as teuthology
+from teuthology.orchestra import run
+
+from ceph_manager import CephManager, write_conf
+
+
+log = logging.getLogger(__name__)
+
+
+def _get_mons(ctx):
+    """Return every monitor name known to ctx with the 'mon.' prefix cut."""
+    prefix_len = len('mon.')
+    mon_ids = []
+    for mon_name in teuthology.get_mon_names(ctx):
+        mon_ids.append(mon_name[prefix_len:])
+    return mon_ids
+
+
+# teuthology prepares the monitor IPs (and ports) in get_mons(), we can
+# enumerate all monitor ports ([6789..]), and find the next available one.
+def _get_next_port(ctx, ip, cluster):
+    """
+    Return the lowest port >= 6789 not used by a configured monitor on ip.
+
+    :param ctx: teuthology context; ctx.ceph[cluster].conf is consulted for
+                the 'mon addr' of each monitor
+    :param ip: address of the host the new monitor is going to run on
+    :param cluster: name of the cluster whose monitors are inspected
+    :returns: the first unused port, counting up from 6789
+    """
+    # assuming we have only one cluster here.
+    used = set()
+    for name in teuthology.get_mon_names(ctx, cluster):
+        addr = ctx.ceph[cluster].conf[name]['mon addr']
+        # split on the last ':' only, so an address that itself contains
+        # colons does not break the unpacking
+        mon_ip, mon_port = addr.rsplit(':', 1)
+        if mon_ip == ip:
+            used.add(int(mon_port))
+    # Probe upwards rather than walking the sorted list: a monitor that
+    # listens below 6789 must not make us hand out 6789 while it is taken.
+    port = 6789
+    while port in used:
+        port += 1
+    return port
+
+
+def _setup_mon(ctx, manager, remote, mon, name, data_path, conf_path):
+    """
+    Create the store of a new monitor on a host that already runs one.
+
+    The current monmap is fetched through the manager, copied over to remote
+    when remote is not the manager's controller, and handed to
+    'ceph-mon --mkfs'.  When conf_path is set, a 'mon addr' entry for name is
+    added to the cluster configuration, which is then written out again.
+    """
+    cluster = manager.cluster
+    remote.run(args=['sudo', 'mkdir', '-p', data_path])
+    keyring_path = '/etc/ceph/{cluster}.keyring'.format(cluster=cluster)
+    monmap_path = '{tdir}/{cluster}.monmap'.format(
+        tdir=teuthology.get_testdir(ctx), cluster=cluster)
+    manager.raw_cluster_cmd('mon', 'getmap', '-o', monmap_path)
+    # the monmap was dumped on the controller; ship it if mkfs runs elsewhere
+    needs_copy = manager.controller != remote
+    if needs_copy:
+        monmap = teuthology.get_file(manager.controller, monmap_path)
+        teuthology.write_file(remote, monmap_path, StringIO(monmap))
+    mkfs_cmd = ['sudo', 'ceph-mon']
+    mkfs_cmd += ['--cluster', cluster]
+    mkfs_cmd += ['--mkfs']
+    mkfs_cmd += ['-i', mon]
+    mkfs_cmd += ['--monmap', monmap_path]
+    mkfs_cmd += ['--keyring', keyring_path]
+    remote.run(args=mkfs_cmd)
+    if needs_copy:
+        teuthology.delete_file(remote, monmap_path)
+    # raw_cluster_cmd() is performed using sudo, so sudo here also.
+    teuthology.delete_file(manager.controller, monmap_path, sudo=True)
+    if not conf_path:
+        return
+    # update ceph.conf so that the ceph CLI is able to connect to the cluster
+    ip = remote.ip_address
+    mon_addr = '{ip}:{port}'.format(ip=ip,
+                                    port=_get_next_port(ctx, ip, cluster))
+    ctx.ceph[cluster].conf[name] = {'mon addr': mon_addr}
+    write_conf(ctx, conf_path, cluster)
+
+
+def _teardown_mon(ctx, manager, remote, name, data_path, conf_path):
+    """
+    Drop name from the cluster configuration, rewrite the configuration at
+    conf_path, and remove data_path on remote.
+    """
+    cluster_name = manager.cluster
+    cluster_conf = ctx.ceph[cluster_name].conf
+    del cluster_conf[name]
+    write_conf(ctx, conf_path, cluster_name)
+    cleanup_cmd = ['sudo', 'rm', '-rf', data_path]
+    remote.run(args=cleanup_cmd)
+
+
+@contextlib.contextmanager
+def _prepare_mon(ctx, manager, remote, mon):
+    """
+    Context manager that provisions monitor 'mon' on remote for the duration
+    of the with-block.
+
+    On entry the monitor store is created and the cluster configuration is
+    updated via _setup_mon(); on exit both are reverted via _teardown_mon().
+
+    :param ctx: teuthology context
+    :param manager: CephManager of the cluster the monitor joins
+    :param remote: host on which the monitor store is created
+    :param mon: id of the new monitor (without the 'mon.' prefix)
+    """
+    cluster = manager.cluster
+    data_path = '/var/lib/ceph/mon/{cluster}-{id}'.format(
+        cluster=cluster, id=mon)
+    conf_path = '/etc/ceph/{cluster}.conf'.format(cluster=cluster)
+    name = 'mon.{0}'.format(mon)
+    _setup_mon(ctx, manager, remote, mon, name, data_path, conf_path)
+    try:
+        yield
+    finally:
+        # revert even when the with-block raises, otherwise the conf entry
+        # and the data directory of the temporary monitor are left behind
+        _teardown_mon(ctx, manager, remote, name,
+                      data_path, conf_path)
+
+
+# run_daemon() in ceph.py starts a herd of daemons of the same type, but
+# _run_daemon() starts only one instance.
+@contextlib.contextmanager
+def _run_daemon(ctx, remote, cluster, type_, id_):
+    """
+    Context manager that runs a single ceph daemon for the duration of the
+    with-block.
+
+    The daemon is registered through ctx.daemons.add_daemon() and yielded to
+    the caller; it is stopped when the with-block is left.
+
+    :param ctx: teuthology context
+    :param remote: host to start the daemon on
+    :param cluster: cluster name passed to the daemon via --cluster
+    :param type_: daemon type; the executable run is 'ceph-<type_>'
+    :param id_: daemon id passed via -i
+    """
+    testdir = teuthology.get_testdir(ctx)
+    coverage_dir = '{tdir}/archive/coverage'.format(tdir=testdir)
+    daemon_signal = 'kill'
+    run_cmd = [
+        'sudo',
+        'adjust-ulimits',
+        'ceph-coverage',
+        coverage_dir,
+        'daemon-helper',
+        daemon_signal,
+    ]
+    run_cmd_tail = [
+        'ceph-%s' % (type_),
+        '-f',
+        '--cluster', cluster,
+        '-i', id_]
+    run_cmd.extend(run_cmd_tail)
+    ctx.daemons.add_daemon(remote, type_, id_,
+                           cluster=cluster,
+                           args=run_cmd,
+                           logger=log.getChild(type_),
+                           stdin=run.PIPE,
+                           wait=False)
+    daemon = ctx.daemons.get_daemon(type_, id_, cluster)
+    try:
+        yield daemon
+    finally:
+        # stop the daemon even when the with-block raises, so that a failed
+        # test does not leave it running
+        daemon.stop()
+
+
+@contextlib.contextmanager
+def task(ctx, config):
+    """
+    replace a monitor with a newly added one, and then revert this change
+
+    How it works::
+    1. add a mon with specified id (mon.replacer)
+    2. wait for quorum
+    3. remove a monitor with specified id (mon.victim), mon.victim will commit
+       suicide
+    4. wait for quorum
+    5. <yield>
+    6. add mon.victim back, and start it
+    7. wait for quorum
+    8. remove mon.replacer
+
+    Options::
+    victim       the id of the mon to be removed (pick a random mon by default)
+    replacer     the id of the new mon (use "${victim}_prime" if not specified)
+
+    Values found under overrides/mon_seesaw in ctx.config are merged into
+    config before the options are read.
+    """
+    first_mon = teuthology.get_first_mon(ctx, config)
+    (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys()
+    manager = CephManager(mon, ctx=ctx, logger=log.getChild('ceph_manager'))
+
+    if config is None:
+        config = {}
+    assert isinstance(config, dict), \
+        "mon_seesaw task only supports a dictionary for configuration"
+    overrides = ctx.config.get('overrides', {})
+    teuthology.deep_merge(config, overrides.get('mon_seesaw', {}))
+    victim = config.get('victim', random.choice(_get_mons(ctx)))
+    replacer = config.get('replacer', '{0}_prime'.format(victim))
+    # the replacer is co-located with the victim
+    remote = manager.find_remote('mon', victim)
+    # quorum as it is before any change; its size is the baseline below
+    quorum = manager.get_mon_quorum()
+    cluster = manager.cluster
+    log.info('replacing {victim} with {replacer}'.format(victim=victim,
+                                                         replacer=replacer))
+    with _prepare_mon(ctx, manager, remote, replacer):
+        with _run_daemon(ctx, remote, cluster, 'mon', replacer):
+            # replacer will join the quorum automatically
+            manager.wait_for_mon_quorum_size(len(quorum) + 1, 10)
+            # if we don't remove the victim from monmap, there is chance that
+            # we are leaving the new joiner with a monmap of 2 mon, and it will
+            # not able to reach the other one, it will be keeping probing for
+            # ever.
+            log.info('removing {mon}'.format(mon=victim))
+            manager.raw_cluster_cmd('mon', 'remove', victim)
+            manager.wait_for_mon_quorum_size(len(quorum), 10)
+            # the victim will commit suicide after being removed from
+            # monmap, let's wait until it stops.
+            ctx.daemons.get_daemon('mon', victim, cluster).wait(10)
+            try:
+                # perform other tasks
+                yield
+            finally:
+                # bring the victim back online
+                # nuke the monstore of victim, otherwise it will refuse to boot
+                # with following message:
+                #
+                # not in monmap and have been in a quorum before; must have
+                # been removed
+                log.info('re-adding {mon}'.format(mon=victim))
+                data_path = '/var/lib/ceph/mon/{cluster}-{id}'.format(
+                    cluster=cluster, id=victim)
+                remote.run(args=['sudo', 'rm', '-rf', data_path])
+                name = 'mon.{0}'.format(victim)
+                # conf_path is None: the configuration is not rewritten here
+                _setup_mon(ctx, manager, remote, victim, name, data_path, None)
+                log.info('reviving {mon}'.format(mon=victim))
+                manager.revive_mon(victim)
+                manager.wait_for_mon_quorum_size(len(quorum) + 1, 10)
+                manager.raw_cluster_cmd('mon', 'remove', replacer)
+                manager.wait_for_mon_quorum_size(len(quorum), 10)
diff --git a/src/ceph/qa/tasks/mon_thrash.py b/src/ceph/qa/tasks/mon_thrash.py
new file mode 100644
index 0000000..0754bcd
--- /dev/null
+++ b/src/ceph/qa/tasks/mon_thrash.py
@@ -0,0 +1,343 @@
+"""
+Monitor thrash
+"""
+import logging
+import contextlib
+import ceph_manager
+import random
+import time
+import gevent
+import json
+import math
+from teuthology import misc as teuthology
+
+log = logging.getLogger(__name__)
+
+def _get_mons(ctx):
+    """
+    Return the monitor names found in ctx, each with the leading 'mon.'
+    stripped.
+    """
+    prefix_len = len('mon.')
+    return [name[prefix_len:] for name in teuthology.get_mon_names(ctx)]
+
+class MonitorThrasher:
+    """
+    Thrash monitors from a background greenlet spawned by the constructor.
+
+    How it works::
+
+    - pick a monitor
+    - kill it
+    - wait for quorum to be formed
+    - sleep for 'revive_delay' seconds
+    - revive monitor
+    - wait for quorum to be formed
+    - sleep for 'thrash_delay' seconds
+
+    Options::
+
+    seed                Seed to use on the RNG to reproduce a previous
+                        behaviour (default: None; the current time is
+                        used when not set)
+    revive_delay        Number of seconds to wait before reviving
+                        the monitor (default: 10)
+    thrash_delay        Number of seconds to wait in-between
+                        test iterations (default: 0)
+    store_thrash        Thrash monitor store before killing the monitor
+                        being thrashed (default: False)
+    store_thrash_probability  Probability of thrashing a monitor's store
+                              (default: 50)
+    thrash_many         Thrash multiple monitors instead of just one. If
+                        'maintain_quorum' is set to False, then we will
+                        thrash up to as many monitors as there are
+                        available. (default: False)
+                        NOTE(review): the value is stored and logged, but
+                        do_thrash() does not consult it when choosing how
+                        many monitors to kill.
+    maintain_quorum     Always maintain quorum, taking care on how many
+                        monitors we kill during the thrashing. If we
+                        happen to only have one or two monitors configured,
+                        if this option is set to True, then we won't run
+                        this task as we cannot guarantee maintenance of
+                        quorum. Setting it to false however would allow the
+                        task to run with as many as just one single monitor.
+                        (default: True)
+    freeze_mon_probability: how often to freeze the mon instead of killing it,
+                        in % (default: 10)
+    freeze_mon_duration: how many seconds to freeze the mon (default: 15)
+    scrub               Scrub after each iteration (default: True)
+
+    Note: if 'store_thrash' is set to True, then 'maintain_quorum' must also
+          be set to True.
+
+    For example::
+
+    tasks:
+    - ceph:
+    - mon_thrash:
+        revive_delay: 20
+        thrash_delay: 1
+        store_thrash: true
+        store_thrash_probability: 40
+        seed: 31337
+        maintain_quorum: true
+        thrash_many: true
+    - ceph-fuse:
+    - workunit:
+        clients:
+          all:
+            - mon/workloadgen.sh
+    """
+    def __init__(self, ctx, manager, config, logger):
+        """
+        Read the options from config and start the thrashing greenlet.
+
+        :param ctx: teuthology context
+        :param manager: CephManager used to control the monitors
+        :param config: dict of options (see the class docstring) or None
+        :param logger: logger used by self.log()
+        """
+        self.ctx = ctx
+        self.manager = manager
+        self.manager.wait_for_clean()
+
+        self.stopping = False
+        self.logger = logger
+        self.config = config
+
+        if self.config is None:
+            self.config = dict()
+
+        # Test reproducibility
+        self.random_seed = self.config.get('seed', None)
+
+        if self.random_seed is None:
+            self.random_seed = int(time.time())
+
+        self.rng = random.Random()
+        self.rng.seed(int(self.random_seed))
+
+        # Monitor thrashing
+        self.revive_delay = float(self.config.get('revive_delay', 10.0))
+        self.thrash_delay = float(self.config.get('thrash_delay', 0.0))
+
+        self.thrash_many = self.config.get('thrash_many', False)
+        self.maintain_quorum = self.config.get('maintain_quorum', True)
+
+        self.scrub = self.config.get('scrub', True)
+
+        self.freeze_mon_probability = float(self.config.get('freeze_mon_probability', 10))
+        self.freeze_mon_duration = float(self.config.get('freeze_mon_duration', 15.0))
+
+        assert self.max_killable() > 0, \
+            'Unable to kill at least one monitor with the current config.'
+
+        # Store thrashing
+        self.store_thrash = self.config.get('store_thrash', False)
+        self.store_thrash_probability = int(
+            self.config.get('store_thrash_probability', 50))
+        if self.store_thrash:
+            assert self.store_thrash_probability > 0, \
+                'store_thrash is set, probability must be > 0'
+            assert self.maintain_quorum, \
+                'store_thrash = true must imply maintain_quorum = true'
+
+        self.thread = gevent.spawn(self.do_thrash)
+
+    def log(self, x):
+        """
+        locally log info messages
+        """
+        self.logger.info(x)
+
+    def do_join(self):
+        """
+        Ask the thrashing loop to stop and wait for its greenlet to finish.
+        """
+        self.stopping = True
+        self.thread.get()
+
+    def should_thrash_store(self):
+        """
+        If allowed, indicate that we should thrash a certain percentage of
+        the time as determined by the store_thrash_probability value.
+        """
+        if not self.store_thrash:
+            return False
+        return self.rng.randrange(0, 101) < self.store_thrash_probability
+
+    def thrash_store(self, mon):
+        """
+        Force a store sync on the monitor specified.
+        :param mon: monitor to thrash
+        """
+        addr = self.ctx.ceph['ceph'].conf['mon.%s' % mon]['mon addr']
+        self.log('thrashing mon.{id}@{addr} store'.format(id=mon, addr=addr))
+        out = self.manager.raw_cluster_cmd('-m', addr, 'sync', 'force')
+        j = json.loads(out)
+        assert j['ret'] == 0, \
+            'error forcing store sync on mon.{id}:\n{ret}'.format(
+                id=mon,ret=out)
+
+    def should_freeze_mon(self):
+        """
+        Indicate that we should freeze a certain percentage of the time
+        as determined by the freeze_mon_probability value.
+        """
+        return self.rng.randrange(0, 101) < self.freeze_mon_probability
+
+    def freeze_mon(self, mon):
+        """
+        Send STOP signal to freeze the monitor.
+        """
+        log.info('Sending STOP to mon %s', mon)
+        self.manager.signal_mon(mon, 19)  # STOP
+
+    def unfreeze_mon(self, mon):
+        """
+        Send CONT signal to unfreeze the monitor.
+        """
+        log.info('Sending CONT to mon %s', mon)
+        self.manager.signal_mon(mon, 18)  # CONT
+
+    def kill_mon(self, mon):
+        """
+        Kill the monitor specified
+        """
+        self.log('killing mon.{id}'.format(id=mon))
+        self.manager.kill_mon(mon)
+
+    def revive_mon(self, mon):
+        """
+        Revive the monitor specified
+        """
+        self.log('reviving mon.{id}'.format(id=mon))
+        self.manager.revive_mon(mon)
+
+    def max_killable(self):
+        """
+        Return the maximum number of monitors we can kill.
+
+        With maintain_quorum set this is the largest count that still leaves
+        a strict majority alive; otherwise it is the number of monitors.
+        """
+        m = len(_get_mons(self.ctx))
+        if self.maintain_quorum:
+            # math.ceil() yields a float on Python 2; callers want a count
+            return max(int(math.ceil(m/2.0))-1, 0)
+        else:
+            return m
+
+    def _assert_in_quorum(self, mons, quorum_size):
+        """
+        Assert each monitor in mons is leader or peon in a quorum of
+        quorum_size members.
+        """
+        for m in mons:
+            s = self.manager.get_mon_status(m)
+            assert s['state'] in ('leader', 'peon')
+            assert len(s['quorum']) == quorum_size
+
+    def _freeze_then_unfreeze(self, mons_to_freeze):
+        """
+        Freeze the given monitors, sleep freeze_mon_duration seconds, and
+        unfreeze them again.  Does nothing for an empty list.
+        """
+        if not mons_to_freeze:
+            return
+        for mon in mons_to_freeze:
+            self.freeze_mon(mon)
+        self.log('waiting for {delay} secs to unfreeze mons'.format(
+            delay=self.freeze_mon_duration))
+        time.sleep(self.freeze_mon_duration)
+        for mon in mons_to_freeze:
+            self.unfreeze_mon(mon)
+
+    def do_thrash(self):
+        """
+        Continuously loop and thrash the monitors.
+        """
+        self.log('start thrashing')
+        self.log('seed: {s}, revive delay: {r}, thrash delay: {t} '\
+                   'thrash many: {tm}, maintain quorum: {mq} '\
+                   'store thrash: {st}, probability: {stp} '\
+                   'freeze mon: prob {fp} duration {fd}'.format(
+                s=self.random_seed,r=self.revive_delay,t=self.thrash_delay,
+                tm=self.thrash_many, mq=self.maintain_quorum,
+                st=self.store_thrash,stp=self.store_thrash_probability,
+                fp=self.freeze_mon_probability,fd=self.freeze_mon_duration,
+                ))
+
+        while not self.stopping:
+            mons = _get_mons(self.ctx)
+            self.manager.wait_for_mon_quorum_size(len(mons))
+            self.log('making sure all monitors are in the quorum')
+            self._assert_in_quorum(mons, len(mons))
+
+            kill_up_to = self.rng.randrange(1, self.max_killable()+1)
+            mons_to_kill = self.rng.sample(mons, kill_up_to)
+            self.log('monitors to thrash: {m}'.format(m=mons_to_kill))
+
+            # only monitors that stay alive are candidates for freezing; the
+            # RNG is consulted once per surviving monitor, in list order
+            mons_to_freeze = [
+                mon for mon in mons
+                if mon not in mons_to_kill and self.should_freeze_mon()]
+            self.log('monitors to freeze: {m}'.format(m=mons_to_freeze))
+
+            for mon in mons_to_kill:
+                self.log('thrashing mon.{m}'.format(m=mon))
+
+                # we only thrash stores if we are maintaining quorum
+                if self.should_thrash_store() and self.maintain_quorum:
+                    self.thrash_store(mon)
+
+                self.kill_mon(mon)
+
+            self._freeze_then_unfreeze(mons_to_freeze)
+
+            if self.maintain_quorum:
+                survivors = [m for m in mons if m not in mons_to_kill]
+                self.manager.wait_for_mon_quorum_size(len(mons)-len(mons_to_kill))
+                self._assert_in_quorum(survivors,
+                                       len(mons)-len(mons_to_kill))
+
+            self.log('waiting for {delay} secs before reviving monitors'.format(
+                delay=self.revive_delay))
+            time.sleep(self.revive_delay)
+
+            for mon in mons_to_kill:
+                self.revive_mon(mon)
+            # do more freezes
+            self._freeze_then_unfreeze(mons_to_freeze)
+
+            self.manager.wait_for_mon_quorum_size(len(mons))
+            self._assert_in_quorum(mons, len(mons))
+
+            if self.scrub:
+                self.log('triggering scrub')
+                try:
+                    self.manager.raw_cluster_cmd('scrub')
+                except Exception:
+                    # a failed scrub is logged but does not stop thrashing
+                    log.exception("Saw exception while triggering scrub")
+
+            if self.thrash_delay > 0.0:
+                self.log('waiting for {delay} secs before continuing thrashing'.format(
+                    delay=self.thrash_delay))
+                time.sleep(self.thrash_delay)
+
+@contextlib.contextmanager
+def task(ctx, config):
+    """
+    Thrash the monitors in the background while another task/workunit
+    is running, and wait for full quorum again on the way out.
+
+    See the MonitorThrasher class for the available options.
+    """
+    config = {} if config is None else config
+    assert isinstance(config, dict), \
+        'mon_thrash task only accepts a dict for configuration'
+    mon_count = len(_get_mons(ctx))
+    assert mon_count > 2, \
+        'mon_thrash task requires at least 3 monitors'
+    log.info('Beginning mon_thrash...')
+    first_mon = teuthology.get_first_mon(ctx, config)
+    (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys()
+    manager_logger = log.getChild('ceph_manager')
+    manager = ceph_manager.CephManager(mon, ctx=ctx, logger=manager_logger)
+    thrasher_logger = log.getChild('mon_thrasher')
+    # constructing the thrasher already spawns its greenlet
+    thrasher = MonitorThrasher(ctx, manager, config, logger=thrasher_logger)
+    try:
+        log.debug('Yielding')
+        yield
+    finally:
+        log.info('joining mon_thrasher')
+        thrasher.do_join()
+        expected_quorum = len(_get_mons(ctx))
+        manager.wait_for_mon_quorum_size(expected_quorum)
diff --git a/src/ceph/qa/tasks/multibench.py b/src/ceph/qa/tasks/multibench.py
new file mode 100644
index 0000000..53b1aa5
--- /dev/null
+++ b/src/ceph/qa/tasks/multibench.py
@@ -0,0 +1,60 @@
+"""
+Multibench testing
+"""
+import contextlib
+import logging
+import radosbench
+import time
+import copy
+import gevent
+
+log = logging.getLogger(__name__)
+
+@contextlib.contextmanager
+def task(ctx, config):
+    """
+    Run multibench
+
+    Spawns 'segments' greenlets; each one runs the radosbench task over and
+    over, against a pool named "<segment>-<iteration>", until 'time' seconds
+    have passed.
+
+    The config should be as follows:
+
+    multibench:
+        time: <seconds to run total> (default: 600)
+        segments: <number of concurrent benches> (default: 3)
+        radosbench: <config for radosbench>
+
+    example:
+
+    tasks:
+    - ceph:
+    - multibench:
+        clients: [client.0]
+        time: 360
+    - interactive:
+    """
+    log.info('Beginning multibench...')
+    assert isinstance(config, dict), \
+        "please list clients to run on"
+
+    def run_one(num):
+        """Run test spawn from gevent"""
+        start = time.time()
+        if not config.get('radosbench'):
+            benchcontext = {}
+        else:
+            benchcontext = copy.copy(config.get('radosbench'))
+        # the deadline does not change while the segment is running
+        total_time = int(config.get('time', 600))
+        iterations = 0
+        while time.time() - start < total_time:
+            log.info("Starting iteration %s of segment %s"%(iterations, num))
+            benchcontext['pool'] = str(num) + "-" + str(iterations)
+            with radosbench.task(ctx, benchcontext):
+                # Nothing to do while the bench runs.  The stdlib
+                # time.sleep() requires an argument, so pass 0 explicitly
+                # instead of relying on a patched sleep() with a default.
+                time.sleep(0)
+            iterations += 1
+    log.info("Starting %s threads"%(str(config.get('segments', 3)),))
+    segments = [
+        gevent.spawn(run_one, i)
+        for i in range(0, int(config.get('segments', 3)))]
+
+    try:
+        yield
+    finally:
+        # get() waits for the segment and re-raises whatever it raised
+        for segment in segments:
+            segment.get()
diff --git a/src/ceph/qa/tasks/object_source_down.py b/src/ceph/qa/tasks/object_source_down.py
new file mode 100644
index 0000000..9705d7c
--- /dev/null
+++ b/src/ceph/qa/tasks/object_source_down.py
@@ -0,0 +1,101 @@
+"""
+Test Object locations going down
+"""
+import logging
+import ceph_manager
+import time
+from teuthology import misc as teuthology
+from util.rados import rados
+
+log = logging.getLogger(__name__)
+
+def task(ctx, config):
+    """
+    Test handling of object location going down
+
+    Objects are written while osd.0 and osd.1 are marked out, the in/out
+    set is then shuffled, and the task asserts that no object is unfound
+    before osd.2 and osd.3 are killed and that some are unfound afterwards.
+
+    :param ctx: teuthology context
+    :param config: dict of options or None
+    """
+    if config is None:
+        config = {}
+    assert isinstance(config, dict), \
+        'object_source_down task only accepts a dict for configuration'
+    first_mon = teuthology.get_first_mon(ctx, config)
+    (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys()
+
+    manager = ceph_manager.CephManager(
+        mon,
+        ctx=ctx,
+        logger=log.getChild('ceph_manager'),
+        )
+
+    while len(manager.get_osd_status()['up']) < 3:
+        time.sleep(10)
+    manager.wait_for_clean()
+
+    # something that is always there
+    dummyfile = '/etc/fstab'
+
+    # take 0, 1 out
+    manager.mark_out_osd(0)
+    manager.mark_out_osd(1)
+    manager.wait_for_clean()
+
+    # delay recovery, and make the pg log very long (to prevent backfill),
+    # on each of osd.0 .. osd.3
+    for osd_id in range(4):
+        manager.raw_cluster_cmd(
+            'tell', 'osd.%d' % osd_id,
+            'injectargs',
+            '--osd-recovery-delay-start 10000 --osd-min-pg-log-entries 100000000'
+            )
+
+    # kludge to make sure they get a map
+    rados(ctx, mon, ['-p', 'data', 'put', 'dummy', dummyfile])
+
+    # create old objects
+    for f in range(1, 10):
+        rados(ctx, mon, ['-p', 'data', 'put', 'existing_%d' % f, dummyfile])
+
+    manager.mark_out_osd(3)
+    manager.wait_till_active()
+
+    manager.mark_in_osd(0)
+    manager.wait_till_active()
+
+    manager.flush_pg_stats([2, 0])
+
+    manager.mark_out_osd(2)
+    manager.wait_till_active()
+
+    # bring up 1
+    manager.mark_in_osd(1)
+    manager.wait_till_active()
+
+    manager.flush_pg_stats([0, 1])
+    log.info("Getting unfound objects")
+    unfound = manager.get_num_unfound_objects()
+    assert not unfound
+
+    manager.kill_osd(2)
+    manager.mark_down_osd(2)
+    manager.kill_osd(3)
+    manager.mark_down_osd(3)
+
+    manager.flush_pg_stats([0, 1])
+    log.info("Getting unfound objects")
+    unfound = manager.get_num_unfound_objects()
+    assert unfound
diff --git a/src/ceph/qa/tasks/omapbench.py b/src/ceph/qa/tasks/omapbench.py
new file mode 100644
index 0000000..e026c74
--- /dev/null
+++ b/src/ceph/qa/tasks/omapbench.py
@@ -0,0 +1,83 @@
+"""
+Run omapbench executable within teuthology
+"""
+import contextlib
+import logging
+
+from teuthology.orchestra import run
+from teuthology import misc as teuthology
+
+log = logging.getLogger(__name__)
+
+@contextlib.contextmanager
+def task(ctx, config):
+    """
+    Run omapbench
+
+    One omapbench process is started per listed client when the task is
+    entered; the processes are waited for when the task is left.
+
+    The config should be as follows::
+
+        omapbench:
+            clients: [client list]
+            threads: <threads at once>
+            objects: <number of objects to write>
+            entries: <number of entries per object map>
+            keysize: <number of characters per object map key>
+            valsize: <number of characters per object map val>
+            increment: <interval to show in histogram (in ms)>
+            omaptype: <how the omaps should be generated>
+
+    Defaults: clients ['client.0'], threads 30, objects 1000, entries 10,
+    keysize 10, valsize 1000, increment 10, omaptype uniform.
+
+    example::
+
+        tasks:
+        - ceph:
+        - omapbench:
+            clients: [client.0]
+            threads: 30
+            objects: 1000
+            entries: 10
+            keysize: 10
+            valsize: 100
+            increment: 100
+            omaptype: uniform
+        - interactive:
+    """
+    log.info('Beginning omapbench...')
+    assert isinstance(config, dict), \
+        "please list clients to run on"
+    omapbench = {}
+    testdir = teuthology.get_testdir(ctx)
+    # report through the task logger rather than printing to stdout
+    log.debug('omapbench increment: %s', config.get('increment', 10))
+    for role in config.get('clients', ['client.0']):
+        assert isinstance(role, basestring)
+        PREFIX = 'client.'
+        assert role.startswith(PREFIX)
+        id_ = role[len(PREFIX):]
+        (remote,) = ctx.cluster.only(role).remotes.iterkeys()
+        proc = remote.run(
+            args=[
+                "/bin/sh", "-c",
+                " ".join(['adjust-ulimits',
+                          'ceph-coverage',
+                          '{tdir}/archive/coverage',
+                          'omapbench',
+                          '--name', id_,
+                          '-t', str(config.get('threads', 30)),
+                          '-o', str(config.get('objects', 1000)),
+                          '--entries', str(config.get('entries',10)),
+                          '--keysize', str(config.get('keysize',10)),
+                          '--valsize', str(config.get('valsize',1000)),
+                          '--inc', str(config.get('increment',10)),
+                          '--omaptype', str(config.get('omaptype','uniform'))
+                          ]).format(tdir=testdir),
+                ],
+            logger=log.getChild('omapbench.{id}'.format(id=id_)),
+            stdin=run.PIPE,
+            wait=False
+            )
+        omapbench[id_] = proc
+
+    try:
+        yield
+    finally:
+        log.info('joining omapbench')
+        run.wait(omapbench.itervalues())
diff --git a/src/ceph/qa/tasks/osd_backfill.py b/src/ceph/qa/tasks/osd_backfill.py
new file mode 100644
index 0000000..04658d2
--- /dev/null
+++ b/src/ceph/qa/tasks/osd_backfill.py
@@ -0,0 +1,104 @@
+"""
+Osd backfill test
+"""
+import logging
+import ceph_manager
+import time
+from teuthology import misc as teuthology
+
+
+log = logging.getLogger(__name__)
+
+
+def rados_start(ctx, remote, cmd):
+    """
+    Start 'rados <cmd>' on remote without waiting for it, and return the
+    process object handed back by remote.run().
+    """
+    log.info("rados %s" % ' '.join(cmd))
+    coverage_dir = '{tdir}/archive/coverage'.format(
+        tdir=teuthology.get_testdir(ctx))
+    full_cmd = ['adjust-ulimits', 'ceph-coverage', coverage_dir, 'rados']
+    full_cmd.extend(cmd)
+    return remote.run(args=full_cmd, wait=False)
+
+def task(ctx, config):
+    """
+    Test backfill
+
+    Requires exactly 3 osds.  Data is written, osd.0 is taken out and down,
+    more data is written while osd.1 is blackholed and restarted, and the
+    cluster is expected to recover and, once osd.0 is revived, become clean.
+
+    :param ctx: teuthology context
+    :param config: dict of options or None
+    """
+    if config is None:
+        config = {}
+    assert isinstance(config, dict), \
+        'osd_backfill task only accepts a dict for configuration'
+    first_mon = teuthology.get_first_mon(ctx, config)
+    (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys()
+
+    num_osds = teuthology.num_instances_of_type(ctx.cluster, 'osd')
+    log.info('num_osds is %s' % num_osds)
+    assert num_osds == 3
+
+    manager = ceph_manager.CephManager(
+        mon,
+        ctx=ctx,
+        logger=log.getChild('ceph_manager'),
+        )
+
+    while len(manager.get_osd_status()['up']) < 3:
+        time.sleep(10)
+    manager.flush_pg_stats([0, 1, 2])
+    manager.wait_for_clean()
+
+    # write some data
+    p = rados_start(ctx, mon, ['-p', 'rbd', 'bench', '15', 'write', '-b', '4096',
+                          '--no-cleanup'])
+    err = p.wait()
+    log.info('err is %d' % err)
+
+    # mark osd.0 out to trigger a rebalance/backfill
+    manager.mark_out_osd(0)
+
+    # also mark it down so it won't be included in pg_temps
+    manager.kill_osd(0)
+    manager.mark_down_osd(0)
+
+    # wait for everything to peer and be happy...
+    manager.flush_pg_stats([1, 2])
+    manager.wait_for_recovery()
+
+    # write some new data
+    p = rados_start(ctx, mon, ['-p', 'rbd', 'bench', '30', 'write', '-b', '4096',
+                          '--no-cleanup'])
+
+    time.sleep(15)
+
+    # blackhole + restart osd.1
+    # this triggers a divergent backfill target
+    manager.blackhole_kill_osd(1)
+    time.sleep(2)
+    manager.revive_osd(1)
+
+    # wait for our writes to complete + succeed
+    err = p.wait()
+    log.info('err is %d' % err)
+
+    # wait for osd.1 and osd.2 to be up
+    manager.wait_till_osd_is_up(1)
+    manager.wait_till_osd_is_up(2)
+
+    # cluster must recover
+    manager.flush_pg_stats([1, 2])
+    manager.wait_for_recovery()
+
+    # re-add osd.0
+    manager.revive_osd(0)
+    manager.flush_pg_stats([1, 2])
+    manager.wait_for_clean()
+
+
diff --git a/src/ceph/qa/tasks/osd_failsafe_enospc.py b/src/ceph/qa/tasks/osd_failsafe_enospc.py
new file mode 100644
index 0000000..6910854
--- /dev/null
+++ b/src/ceph/qa/tasks/osd_failsafe_enospc.py
@@ -0,0 +1,218 @@
+"""
+Handle osdfailsafe configuration settings (nearfull ratio and full ratio)
+"""
+from cStringIO import StringIO
+import logging
+import time
+
+from teuthology.orchestra import run
+from util.rados import rados
+from teuthology import misc as teuthology
+
+log = logging.getLogger(__name__)
+
+def _watch_cluster_log(mon, manager, sleep_time, injectargs=None):
+    """
+    Capture ``ceph -w`` output on ``mon`` for ``sleep_time`` seconds and
+    count the failsafe messages that show up in it.
+
+    :param mon: remote on which ``ceph -w`` is run
+    :param manager: cluster manager used to send ``injectargs`` to osd.0
+    :param sleep_time: seconds to leave the watcher running
+    :param injectargs: optional argument string injected into osd.0 once
+                       the watcher has been started
+    :returns: tuple ``(near_full, full)`` holding the number of captured
+              lines containing '[WRN] OSD near full' and
+              '[ERR] OSD full dropping all updates' respectively
+    """
+    proc = mon.run(
+             args=[
+                 'sudo',
+                 'daemon-helper',
+                 'kill',
+                 'ceph', '-w'
+             ],
+             stdin=run.PIPE,
+             stdout=StringIO(),
+             wait=False,
+        )
+
+    # Only change the setting once the watcher is running, so that the
+    # messages it provokes end up in the captured output.
+    if injectargs is not None:
+        manager.raw_cluster_cmd('tell', 'osd.0', 'injectargs', injectargs)
+
+    time.sleep(sleep_time)
+    proc.stdin.close() # causes daemon-helper to send SIGKILL to ceph -w
+    proc.wait()
+
+    lines = proc.stdout.getvalue().split('\n')
+    near_full = sum(1 for line in lines if '[WRN] OSD near full' in line)
+    full = sum(1 for line in lines
+               if '[ERR] OSD full dropping all updates' in line)
+    return near_full, full
+
+
+def task(ctx, config):
+    """
+    Test handling of osd_failsafe_nearfull_ratio and osd_failsafe_full_ratio
+    configuration settings
+
+    Every OSD except osd.0 is marked out and a pool "foo" of size 1 is
+    created; the failsafe ratios of osd.0 are then lowered and restored
+    while the ``ceph -w`` output and rados writes are checked:
+
+    1. lowering nearfull_ratio produces 'OSD near full' warnings
+    2. lowering full_ratio produces 'OSD full dropping all updates' errors
+    3. a write to the pool fails while full_ratio is exceeded
+    4. a write succeeds once full_ratio is back at .97
+    5. the nearfull warnings are still being reported
+    6. with nearfull_ratio back at .90, lowering full_ratio again produces
+       errors and no warnings
+    7. with full_ratio back at .97 neither message is reported
+
+    Finally the pool is removed and the OSDs are marked in again.
+
+    In order for test to pass must use log-whitelist as follows
+
+        tasks:
+            - chef:
+            - install:
+            - ceph:
+                log-whitelist: ['OSD near full', 'OSD full dropping all updates']
+            - osd_failsafe_enospc:
+
+    :param ctx: teuthology context; ``ctx.managers['ceph']`` and
+                ``ctx.cluster`` are used
+    :param config: dict or None (treated as an empty dict)
+    """
+    if config is None:
+        config = {}
+    assert isinstance(config, dict), \
+        'osd_failsafe_enospc task only accepts a dict for configuration'
+
+    # Give 2 seconds for injectargs + osd_op_complaint_time (30) + 2 * osd_heartbeat_interval (6) + 6 padding
+    sleep_time = 50
+
+    # something that is always there
+    dummyfile = '/etc/fstab'
+    dummyfile2 = '/etc/resolv.conf'
+
+    manager = ctx.managers['ceph']
+
+    # create 1 pg pool with 1 rep which can only be on osd.0
+    osds = manager.get_osd_dump()
+    for osd in osds:
+        if osd['osd'] != 0:
+            manager.mark_out_osd(osd['osd'])
+
+    log.info('creating pool foo')
+    manager.create_pool("foo")
+    manager.raw_cluster_cmd('osd', 'pool', 'set', 'foo', 'size', '1')
+
+    # State NONE -> NEAR
+    log.info('1. Verify warning messages when exceeding nearfull_ratio')
+
+    first_mon = teuthology.get_first_mon(ctx, config)
+    (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys()
+
+    near, full = _watch_cluster_log(
+        mon, manager, sleep_time,
+        injectargs='--osd_failsafe_nearfull_ratio .00001')
+    assert near == 2, 'Incorrect number of warning messages expected 2 got %d' % near
+    assert full == 0, 'Incorrect number of error messages expected 0 got %d' % full
+
+    # State NEAR -> FULL
+    log.info('2. Verify error messages when exceeding full_ratio')
+
+    # only the error count is checked in this step
+    near, full = _watch_cluster_log(
+        mon, manager, sleep_time,
+        injectargs='--osd_failsafe_full_ratio .00001')
+    assert full == 2, 'Incorrect number of error messages expected 2 got %d' % full
+
+    log.info('3. Verify write failure when exceeding full_ratio')
+
+    # Write data should fail
+    ret = rados(ctx, mon, ['-p', 'foo', 'put', 'newfile1', dummyfile])
+    assert ret != 0, 'Expected write failure but it succeeded with exit status 0'
+
+    # Put back default
+    manager.raw_cluster_cmd('tell', 'osd.0', 'injectargs', '--osd_failsafe_full_ratio .97')
+    time.sleep(10)
+
+    # State FULL -> NEAR
+    log.info('4. Verify write success when NOT exceeding full_ratio')
+
+    # Write should succeed
+    ret = rados(ctx, mon, ['-p', 'foo', 'put', 'newfile2', dummyfile2])
+    assert ret == 0, 'Expected write to succeed, but got exit status %d' % ret
+
+    log.info('5. Verify warning messages again when exceeding nearfull_ratio')
+
+    # nothing is injected here: nearfull_ratio is still at .00001
+    near, full = _watch_cluster_log(mon, manager, sleep_time)
+    assert near == 1 or near == 2, 'Incorrect number of warning messages expected 1 or 2 got %d' % near
+    assert full == 0, 'Incorrect number of error messages expected 0 got %d' % full
+
+    manager.raw_cluster_cmd('tell', 'osd.0', 'injectargs', '--osd_failsafe_nearfull_ratio .90')
+    time.sleep(10)
+
+    # State NONE -> FULL
+    log.info('6. Verify error messages again when exceeding full_ratio')
+
+    near, full = _watch_cluster_log(
+        mon, manager, sleep_time,
+        injectargs='--osd_failsafe_full_ratio .00001')
+    assert near == 0, 'Incorrect number of warning messages expected 0 got %d' % near
+    assert full == 2, 'Incorrect number of error messages expected 2 got %d' % full
+
+    # State FULL -> NONE
+    log.info('7. Verify no messages settings back to default')
+
+    manager.raw_cluster_cmd('tell', 'osd.0', 'injectargs', '--osd_failsafe_full_ratio .97')
+    time.sleep(10)
+
+    near, full = _watch_cluster_log(mon, manager, sleep_time)
+    assert near == 0, 'Incorrect number of warning messages expected 0 got %d' % near
+    assert full == 0, 'Incorrect number of error messages expected 0 got %d' % full
+
+    log.info('Test Passed')
+
+    # Bring all OSDs back in
+    manager.remove_pool("foo")
+    for osd in osds:
+        if osd['osd'] != 0:
+            manager.mark_in_osd(osd['osd'])
diff --git a/src/ceph/qa/tasks/osd_max_pg_per_osd.py b/src/ceph/qa/tasks/osd_max_pg_per_osd.py
new file mode 100644
index 0000000..b4e2aa4
--- /dev/null
+++ b/src/ceph/qa/tasks/osd_max_pg_per_osd.py
@@ -0,0 +1,126 @@
+import logging
+import random
+
+
+log = logging.getLogger(__name__)
+
+
+def pg_num_in_all_states(pgs, *states):
+    """
+    Count the values of ``pgs`` that contain every one of ``states``.
+
+    With no ``states`` given every entry of ``pgs`` is counted.
+    """
+    matching = [pg_state for pg_state in pgs.itervalues()
+                if all(wanted in pg_state for wanted in states)]
+    return len(matching)
+
+
+def pg_num_in_any_state(pgs, *states):
+    """
+    Count the values of ``pgs`` that contain at least one of ``states``.
+
+    With no ``states`` given nothing matches and 0 is returned.
+    """
+    matching = [pg_state for pg_state in pgs.itervalues()
+                if any(wanted in pg_state for wanted in states)]
+    return len(matching)
+
+
+def test_create_from_mon(ctx, config):
+    """
+    An osd should stop creating new pgs once the number of pgs it serves
+    exceeds the max-pg-per-osd setting, and it should resume the previously
+    suspended pg creations once its pg number drops back below the setting.
+
+    How it works::
+    1. create pool.a with pg_num pgs (``config['pg_num']``, default 2)
+       # all pgs should be active+clean
+    2. create pool.b with pg_num pgs
+       # new pgs belonging to this pool should be unknown (the primary osd
+       reaches the limit) or creating (replica osd reaches the limit)
+    3. remove pool.a
+    4. all pgs belonging to pool.b should be active+clean
+
+    The hard limit of pg-per-osd is not set by this function; presumably
+    the suite configuration sets it to match pg_num -- confirm.
+    """
+    pg_num = config.get('pg_num', 2)
+    cluster = ctx.managers['ceph']
+
+    log.info('1. creating pool.a')
+    first_pool = cluster.create_pool_with_unique_name(pg_num)
+    cluster.wait_for_clean()
+    assert cluster.get_num_active_clean() == pg_num
+
+    log.info('2. creating pool.b')
+    second_pool = cluster.create_pool_with_unique_name(pg_num)
+    states = cluster.wait_till_pg_convergence(300)
+    # exactly pg_num pgs are up, and exactly pg_num are held back
+    assert pg_num_in_all_states(states, 'active', 'clean') == pg_num
+    assert pg_num_in_any_state(states, 'unknown', 'creating') == pg_num
+
+    log.info('3. removing pool.a')
+    cluster.remove_pool(first_pool)
+    states = cluster.wait_till_pg_convergence(300)
+    # only the pgs of pool.b are left and all of them got created
+    assert len(states) == pg_num
+    assert pg_num_in_all_states(states, 'active', 'clean') == pg_num
+
+    # cleanup
+    cluster.remove_pool(second_pool)
+
+
+def test_create_from_peer(ctx, config):
+    """
+    An osd should stop creating new pgs once the number of pgs it serves
+    exceeds the max-pg-per-osd setting, and it should resume the previously
+    suspended pg creations once its pg number drops back below the setting.
+
+    How it works::
+    0. create 4 OSDs.
+    1. create pool.a with pg_num=1, size=2
+       pg will be mapped to osd.0, and osd.1, and it should be active+clean
+    2. create pool.b with pg_num=1, size=2.
+       if the pgs are stuck in creating, delete the pool and try again;
+       eventually we'll get the pool to land on the other 2 osds that
+       aren't occupied by pool.a. (this will also verify that pgs for deleted
+       pools get cleaned out of the creating wait list.)
+    3. mark an osd out. verify that some pgs get stuck activating, stale
+       or peering.
+    4. delete pool.b, verify the cluster goes clean.
+
+    Config keys read here: ``pg_num`` (default 1) and ``from_primary``
+    (default True).  Neither the number of OSDs nor the pool size is set
+    by this function; presumably both come from the suite configuration.
+    """
+    pg_num = config.get('pg_num', 1)
+    from_primary = config.get('from_primary', True)
+
+    manager = ctx.managers['ceph']
+    log.info('1. creating pool.a')
+    pool_a = manager.create_pool_with_unique_name(pg_num)
+    manager.wait_for_clean()
+    assert manager.get_num_active_clean() == pg_num
+
+    log.info('2. creating pool.b')
+    # keep recreating pool.b until all of its pgs get created as well
+    while True:
+        pool_b = manager.create_pool_with_unique_name(pg_num)
+        pg_states = manager.wait_till_pg_convergence(300)
+        pg_created = pg_num_in_all_states(pg_states, 'active', 'clean')
+        assert pg_created >= pg_num
+        pg_pending = pg_num_in_any_state(pg_states, 'unknown', 'creating')
+        assert pg_pending == pg_num * 2 - pg_created
+        if pg_created == pg_num * 2:
+            break
+        manager.remove_pool(pool_b)
+
+    log.info('3. mark an osd out')
+    pg_stats = manager.get_pg_stats()
+    pg = random.choice(pg_stats)
+    # from_primary picks the last osd of the acting set as the victim,
+    # otherwise the first one is taken
+    if from_primary:
+        victim = pg['acting'][-1]
+    else:
+        victim = pg['acting'][0]
+    manager.mark_out_osd(victim)
+    pg_states = manager.wait_till_pg_convergence(300)
+    pg_stuck = pg_num_in_any_state(pg_states, 'activating', 'stale', 'peering')
+    assert pg_stuck > 0
+
+    log.info('4. removing pool.b')
+    manager.remove_pool(pool_b)
+    manager.wait_for_clean(30)
+
+    # cleanup
+    manager.remove_pool(pool_a)
+
+
+def task(ctx, config):
+    """
+    Run one of the max-pg-per-osd tests.
+
+    :param ctx: teuthology context, handed through to the selected test
+    :param config: dict of options; ``None`` is treated as an empty dict.
+                   ``test_create_from_mon`` (default True) selects
+                   test_create_from_mon(), a false value selects
+                   test_create_from_peer().  The remaining keys are read
+                   by the selected test.
+    """
+    if config is None:
+        config = {}
+    assert isinstance(config, dict), \
+        'osd_max_pg_per_osd task only accepts a dict for config'
+    if config.get('test_create_from_mon', True):
+        test_create_from_mon(ctx, config)
+    else:
+        test_create_from_peer(ctx, config)
diff --git a/src/ceph/qa/tasks/osd_recovery.py b/src/ceph/qa/tasks/osd_recovery.py
new file mode 100644
index 0000000..41e86d6
--- /dev/null
+++ b/src/ceph/qa/tasks/osd_recovery.py
@@ -0,0 +1,193 @@
+"""
+osd recovery
+"""
+import logging
+import ceph_manager
+import time
+from teuthology import misc as teuthology
+
+
+log = logging.getLogger(__name__)
+
+
+def rados_start(testdir, remote, cmd):
+    """
+    Start ``rados`` with the arguments in ``cmd`` on ``remote`` without
+    waiting for it to finish, and return the process object.
+
+    The command is prefixed with adjust-ulimits and ceph-coverage, the
+    coverage directory being ``<testdir>/archive/coverage``.
+    """
+    log.info("rados %s" % ' '.join(cmd))
+    coverage_dir = '{tdir}/archive/coverage'.format(tdir=testdir)
+    full_args = ['adjust-ulimits', 'ceph-coverage', coverage_dir, 'rados']
+    full_args.extend(cmd)
+    return remote.run(args=full_args, wait=False)
+
+def task(ctx, config):
+    """
+    Test (non-backfill) recovery
+
+    Requires exactly 3 osds.  While a rados bench write is running, osd.1
+    is blackholed and restarted and osd.2 is killed; more data is written
+    without osd.2, which is then revived, and the cluster has to become
+    clean again.
+    """
+    if config is None:
+        config = {}
+    assert isinstance(config, dict), \
+        'task only accepts a dict for configuration'
+    testdir = teuthology.get_testdir(ctx)
+    first_mon = teuthology.get_first_mon(ctx, config)
+    (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys()
+
+    osd_count = teuthology.num_instances_of_type(ctx.cluster, 'osd')
+    log.info('num_osds is %s' % osd_count)
+    assert osd_count == 3
+
+    manager = ceph_manager.CephManager(
+        mon, ctx=ctx, logger=log.getChild('ceph_manager'))
+
+    while len(manager.get_osd_status()['up']) < 3:
+        time.sleep(10)
+    manager.flush_pg_stats([0, 1, 2])
+    manager.wait_for_clean()
+
+    # test some osdmap flags: set all of them, then unset them again
+    osdmap_flags = ('noin', 'noout', 'noup', 'nodown')
+    for action in ('set', 'unset'):
+        for flag in osdmap_flags:
+            manager.raw_cluster_cmd('osd', action, flag)
+
+    # write some new data
+    first_write = ['-p', 'rbd', 'bench', '20', 'write', '-b', '4096',
+                   '--no-cleanup']
+    bench = rados_start(testdir, mon, first_write)
+
+    time.sleep(15)
+
+    # trigger a divergent target:
+    #  blackhole + restart osd.1 (shorter log)
+    manager.blackhole_kill_osd(1)
+    #  kill osd.2 (longer log... we'll make it divergent below)
+    manager.kill_osd(2)
+    time.sleep(2)
+    manager.revive_osd(1)
+
+    # wait for our writes to complete + succeed
+    err = bench.wait()
+    log.info('err is %d' % err)
+
+    # cluster must repeer
+    manager.flush_pg_stats([0, 1])
+    manager.wait_for_active_or_down()
+
+    # write some more (make sure osd.2 really is divergent)
+    second_write = ['-p', 'rbd', 'bench', '15', 'write', '-b', '4096']
+    rados_start(testdir, mon, second_write).wait()
+
+    # revive divergent osd
+    manager.revive_osd(2)
+
+    while len(manager.get_osd_status()['up']) < 3:
+        log.info('waiting a bit...')
+        time.sleep(2)
+    log.info('3 are up!')
+
+    # cluster must recover
+    manager.flush_pg_stats([0, 1, 2])
+    manager.wait_for_clean()
+
+
+def test_incomplete_pgs(ctx, config):
+    """
+    Test handling of incomplete pgs.  Requires 4 osds.
+
+    Data is written while osd.0 and osd.1 are out, the roles are then
+    swapped, and osd.2 and osd.3 are killed before the cluster has
+    recovered, which is expected to leave some pgs down.  Once both osds
+    are revived the cluster has to become clean again.
+
+    :param ctx: teuthology context
+    :param config: dict or None (treated as an empty dict)
+    """
+    testdir = teuthology.get_testdir(ctx)
+    if config is None:
+        config = {}
+    assert isinstance(config, dict), \
+        'task only accepts a dict for configuration'
+    first_mon = teuthology.get_first_mon(ctx, config)
+    (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys()
+
+    num_osds = teuthology.num_instances_of_type(ctx.cluster, 'osd')
+    log.info('num_osds is %s' % num_osds)
+    assert num_osds == 4
+
+    manager = ceph_manager.CephManager(
+        mon,
+        ctx=ctx,
+        logger=log.getChild('ceph_manager'),
+        )
+
+    # wait until all 4 osds are reported up
+    while len(manager.get_osd_status()['up']) < 4:
+        time.sleep(10)
+
+    manager.flush_pg_stats([0, 1, 2, 3])
+    manager.wait_for_clean()
+
+    log.info('Testing incomplete pgs...')
+
+    # presumably postpones the start of recovery on every osd so that the
+    # cluster is still unrecovered for the checks below -- confirm
+    for i in range(4):
+        manager.set_config(
+            i,
+            osd_recovery_delay_start=1000)
+
+    # move data off of osd.0, osd.1
+    manager.raw_cluster_cmd('osd', 'out', '0', '1')
+    manager.flush_pg_stats([0, 1, 2, 3], [0, 1])
+    manager.wait_for_clean()
+
+    # lots of objects in rbd (no pg log, will backfill)
+    p = rados_start(testdir, mon,
+                    ['-p', 'rbd', 'bench', '20', 'write', '-b', '1',
+                     '--no-cleanup'])
+    p.wait()
+
+    # few objects in rbd pool (with pg log, normal recovery)
+    for f in range(1, 20):
+        p = rados_start(testdir, mon, ['-p', 'rbd', 'put',
+                              'foo.%d' % f, '/etc/passwd'])
+        p.wait()
+
+    # move it back
+    manager.raw_cluster_cmd('osd', 'in', '0', '1')
+    manager.raw_cluster_cmd('osd', 'out', '2', '3')
+    time.sleep(10)
+    manager.flush_pg_stats([0, 1, 2, 3], [2, 3])
+    time.sleep(10)
+    manager.wait_for_active()
+
+    # the pgs are active, but the data must not have been moved back yet
+    assert not manager.is_clean()
+    assert not manager.is_recovered()
+
+    # kill 2 + 3
+    log.info('stopping 2,3')
+    manager.kill_osd(2)
+    manager.kill_osd(3)
+    log.info('...')
+    manager.raw_cluster_cmd('osd', 'down', '2', '3')
+    manager.flush_pg_stats([0, 1])
+    manager.wait_for_active_or_down()
+
+    # at least one pg must be reported down now
+    assert manager.get_num_down() > 0
+
+    # revive 2 + 3
+    manager.revive_osd(2)
+    manager.revive_osd(3)
+    while len(manager.get_osd_status()['up']) < 4:
+        log.info('waiting a bit...')
+        time.sleep(2)
+    log.info('all are up!')
+
+    for i in range(4):
+        manager.kick_recovery_wq(i)
+
+    # cluster must recover
+    manager.wait_for_clean()
diff --git a/src/ceph/qa/tasks/peer.py b/src/ceph/qa/tasks/peer.py
new file mode 100644
index 0000000..9850da1
--- /dev/null
+++ b/src/ceph/qa/tasks/peer.py
@@ -0,0 +1,90 @@
+"""
+Peer test (Single test, not much configurable here)
+"""
+import logging
+import json
+import time
+
+import ceph_manager
+from teuthology import misc as teuthology
+from util.rados import rados
+
+log = logging.getLogger(__name__)
+
+def task(ctx, config):
+    """
+    Test peering.
+
+    osd.2 is taken down and the cluster is left to recover; osd.1 is then
+    taken down and osd.2 revived, so that some pgs cannot peer.  Every pg
+    reported as down must be blocked on osd.1 according to ``pg query``.
+    Finally osd.1 is revived and the cluster has to become clean again.
+
+    :param ctx: teuthology context
+    :param config: dict or None (treated as an empty dict)
+    """
+    if config is None:
+        config = {}
+    assert isinstance(config, dict), \
+        'peer task only accepts a dict for configuration'
+    first_mon = teuthology.get_first_mon(ctx, config)
+    (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys()
+
+    manager = ceph_manager.CephManager(
+        mon,
+        ctx=ctx,
+        logger=log.getChild('ceph_manager'),
+        )
+
+    while len(manager.get_osd_status()['up']) < 3:
+        time.sleep(10)
+    manager.flush_pg_stats([0, 1, 2])
+    manager.wait_for_clean()
+
+    for i in range(3):
+        manager.set_config(
+            i,
+            osd_recovery_delay_start=120)
+
+    # take one osd down
+    manager.kill_osd(2)
+    manager.mark_down_osd(2)
+
+    # kludge to make sure they get a map
+    rados(ctx, mon, ['-p', 'data', 'get', 'dummy', '-'])
+
+    manager.flush_pg_stats([0, 1])
+    manager.wait_for_recovery()
+
+    # kill another and revive 2, so that some pgs can't peer.
+    manager.kill_osd(1)
+    manager.mark_down_osd(1)
+    manager.revive_osd(2)
+    manager.wait_till_osd_is_up(2)
+
+    manager.flush_pg_stats([0, 2])
+
+    manager.wait_for_active_or_down()
+
+    manager.flush_pg_stats([0, 2])
+
+    # look for down pgs
+    num_down_pgs = 0
+    pgs = manager.get_pg_stats()
+    for pg in pgs:
+        # every pg is queried, not only the down ones, so that its state
+        # ends up in the log
+        out = manager.raw_cluster_cmd('pg', pg['pgid'], 'query')
+        log.debug("out string %s", out)
+        j = json.loads(out)
+        log.info("pg is %s, query json is %s", pg, j)
+
+        if 'down' in pg['state']:
+            num_down_pgs += 1
+            # verify that it is blocked on osd.1
+            rs = j['recovery_state']
+            assert len(rs) >= 2
+            assert rs[0]['name'] == 'Started/Primary/Peering/Down'
+            assert rs[1]['name'] == 'Started/Primary/Peering'
+            assert rs[1]['blocked']
+            assert rs[1]['down_osds_we_would_probe'] == [1]
+            assert len(rs[1]['peering_blocked_by']) == 1
+            assert rs[1]['peering_blocked_by'][0]['osd'] == 1
+
+    assert num_down_pgs > 0
+
+    # bring it all back
+    manager.revive_osd(1)
+    manager.wait_till_osd_is_up(1)
+    manager.flush_pg_stats([0, 1, 2])
+    manager.wait_for_clean()
diff --git a/src/ceph/qa/tasks/peering_speed_test.py b/src/ceph/qa/tasks/peering_speed_test.py
new file mode 100644
index 0000000..ab53238
--- /dev/null
+++ b/src/ceph/qa/tasks/peering_speed_test.py
@@ -0,0 +1,87 @@
+"""
+Remotely run peering tests.
+"""
+import logging
+import time
+
+log = logging.getLogger(__name__)
+
+from args import argify
+
+# Name of the pool that setup() creates and fills and do_run() writes to.
+POOLNAME = "POOLNAME"
+# Options handed to the argify decorator on task().  Each entry looks like
+# (name, description, default, type) -- presumably the layout argify
+# expects; confirm against args.argify.
+ARGS = [
+    ('num_pgs', 'number of pgs to create', 256, int),
+    # a false value (the default 0) makes do_run() skip its time check
+    ('max_time', 'seconds to complete peering', 0, int),
+    ('runs', 'trials to run', 10, int),
+    ('num_objects', 'objects to create', 256 * 1024, int),
+    ('object_size', 'size in bytes for objects', 64, int),
+    ('creation_time_limit', 'time limit for pool population', 60*60, int),
+    ('create_threads', 'concurrent writes for create', 256, int)
+    ]
+
+def setup(ctx, config):
+    """
+    Prepare the pool used by the peering test.
+
+    Clears the existing pools, creates POOLNAME with ``config.num_pgs``
+    pgs and writes ``config.num_objects`` objects of
+    ``config.object_size`` into it.
+    """
+    cluster = ctx.managers['ceph']
+    cluster.clear_pools()
+    cluster.create_pool(POOLNAME, config.num_pgs)
+
+    log.info("populating pool")
+    write_args = (
+        config.num_objects,
+        config.object_size,
+        config.creation_time_limit,
+        config.create_threads,
+    )
+    cluster.rados_write_objects(POOLNAME, *write_args)
+    log.info("done populating pool")
+
+def do_run(ctx, config):
+    """
+    Run one trial: mark osd.0 in, write objects, wait for clean.
+
+    Returns a dict with 'time_to_active' (seconds until the writes issued
+    after marking osd.0 in had finished) and 'time_to_clean' (seconds
+    until the cluster was clean), both measured from the start of the
+    trial.  If ``config.max_time`` is set, 'time_to_active' must stay
+    below it.  osd.0 is marked out again before returning.
+    """
+    started = time.time()
+    cluster = ctx.managers['ceph']
+    # mark in osd
+    cluster.mark_in_osd(0)
+
+    log.info("writing out objects")
+    cluster.rados_write_objects(
+        POOLNAME,
+        config.num_pgs, # write 1 object per pg or so
+        1,
+        config.creation_time_limit,
+        config.num_pgs, # lots of concurrency
+        cleanup=True)
+    time_to_active = time.time() - started
+
+    log.info("peering done, waiting on recovery")
+    cluster.wait_for_clean()
+
+    log.info("recovery done")
+    time_to_clean = time.time() - started
+    if config.max_time:
+        assert(time_to_active < config.max_time)
+
+    cluster.mark_out_osd(0)
+    cluster.wait_for_clean()
+    return {
+        'time_to_active': time_to_active,
+        'time_to_clean': time_to_clean,
+        }
+
+@argify("peering_speed_test", ARGS)
+def task(ctx, config):
+    """
+    Peering speed test
+
+    Populates the test pool, marks osd.0 out and performs ``config.runs``
+    trials with do_run().  The per-trial results are stored under
+    ``ctx.summary['recovery_times']['runs']`` and osd.0 is marked in again.
+    """
+    setup(ctx, config)
+    cluster = ctx.managers['ceph']
+    cluster.mark_out_osd(0)
+    cluster.wait_for_clean()
+
+    results = []
+    for run_number in range(config.runs):
+        log.info("Run {i}".format(i=run_number))
+        results.append(do_run(ctx, config))
+
+    cluster.mark_in_osd(0)
+    ctx.summary['recovery_times'] = {'runs': results}
diff --git a/src/ceph/qa/tasks/populate_rbd_pool.py b/src/ceph/qa/tasks/populate_rbd_pool.py
new file mode 100644
index 0000000..db67d60
--- /dev/null
+++ b/src/ceph/qa/tasks/populate_rbd_pool.py
@@ -0,0 +1,82 @@
+"""
+Populate rbd pools
+"""
+import contextlib
+import logging
+
+log = logging.getLogger(__name__)
+
+@contextlib.contextmanager
+def task(ctx, config):
+    """
+    Populate <num_pools> pools with prefix <pool_prefix> with <num_images>
+    rbd images at <num_snaps> snaps
+
+    The config could be as follows::
+
+        populate_rbd_pool:
+          client: <client>
+          pool_prefix: foo
+          num_pools: 5
+          num_images: 10
+          num_snaps: 3
+          image_size: 10737418240
+
+    Defaults: client.0, prefix "foo", 2 pools, 20 images, 4 snaps and an
+    image_size of 100.  image_size is passed as is to ``rbd create --size``.
+    NOTE(review): the 10737418240 in the example looks like a byte count;
+    confirm the unit ``rbd create --size`` expects.
+
+    Every image is written to with ``rbd bench-write`` once after it has
+    been created and once after each snapshot; ``write_size`` (default
+    1024*1024), ``write_threads`` (default 10) and ``write_total_per_snap``
+    (default 1024*1024*30) are passed as --io-size, --io-threads and
+    --io-total.
+
+    All of the work happens before the task yields; nothing is removed
+    afterwards.
+    """
+    if config is None:
+        config = {}
+    client = config.get("client", "client.0")
+    pool_prefix = config.get("pool_prefix", "foo")
+    num_pools = config.get("num_pools", 2)
+    num_images = config.get("num_images", 20)
+    num_snaps = config.get("num_snaps", 4)
+    image_size = config.get("image_size", 100)
+    write_size = config.get("write_size", 1024*1024)
+    write_threads = config.get("write_threads", 10)
+    write_total_per_snap = config.get("write_total_per_snap", 1024*1024*30)
+
+    (remote,) = ctx.cluster.only(client).remotes.iterkeys()
+
+    def run_rbd(*rbd_args):
+        # run one rbd command on the client and wait for it
+        remote.run(args=["rbd"] + list(rbd_args))
+
+    def bench_write(pool, image):
+        # random writes to the given image
+        run_rbd(
+            "bench-write",
+            image,
+            "--pool", pool,
+            "--io-size", str(write_size),
+            "--io-threads", str(write_threads),
+            "--io-total", str(write_total_per_snap),
+            "--io-pattern", "rand")
+
+    for poolid in range(num_pools):
+        poolname = "%s-%s" % (pool_prefix, str(poolid))
+        log.info("Creating pool %s" % (poolname,))
+        ctx.managers['ceph'].create_pool(poolname)
+
+        for imageid in range(num_images):
+            imagename = "rbd-%s" % (str(imageid),)
+            log.info("Creating imagename %s" % (imagename,))
+            run_rbd(
+                "create",
+                imagename,
+                "--image-format", "1",
+                "--size", str(image_size),
+                "--pool", str(poolname))
+
+            log.info("imagename %s first bench" % (imagename,))
+            bench_write(poolname, imagename)
+
+            for snapid in range(num_snaps):
+                snapname = "snap-%s" % (str(snapid),)
+                log.info("imagename %s creating snap %s" % (imagename, snapname))
+                run_rbd(
+                    "snap", "create",
+                    "--pool", poolname,
+                    "--snap", snapname,
+                    imagename)
+                bench_write(poolname, imagename)
+
+    try:
+        yield
+    finally:
+        log.info('done')
diff --git a/src/ceph/qa/tasks/qemu.py b/src/ceph/qa/tasks/qemu.py
new file mode 100644
index 0000000..82252e1
--- /dev/null
+++ b/src/ceph/qa/tasks/qemu.py
@@ -0,0 +1,577 @@
+"""
+Qemu task
+"""
+from cStringIO import StringIO
+
+import contextlib
+import logging
+import os
+import yaml
+
+from teuthology import misc as teuthology
+from teuthology import contextutil
+from tasks import rbd
+from teuthology.orchestra import run
+from teuthology.config import config as teuth_config
+
+log = logging.getLogger(__name__)
+
+DEFAULT_NUM_DISKS = 2
+DEFAULT_IMAGE_URL = 'http://download.ceph.com/qa/ubuntu-12.04.qcow2'
+DEFAULT_IMAGE_SIZE = 10240 # in megabytes
+DEFAULT_CPUS = 1
+DEFAULT_MEM = 4096 # in megabytes
+
+def create_images(ctx, config, managers):
+    """Queue an rbd.create_image manager for every disk after the first."""
+    for client, client_config in config.iteritems():
+        disks = client_config.get('disks', DEFAULT_NUM_DISKS)
+        if not isinstance(disks, list):
+            disks = [{} for _ in range(int(disks))]
+        assert disks, 'at least one rbd device must be used'
+        image_format = 2 if client_config.get('clone', False) else 1
+        # disk 0 is not created here, so numbering starts at 1
+        for num, disk in enumerate(disks[1:], 1):
+            image_size = (disk or {}).get('image_size', DEFAULT_IMAGE_SIZE)
+            create_config = {
+                client: {
+                    'image_name': '{client}.{num}'.format(client=client, num=num),
+                    'image_format': image_format,
+                    'image_size': image_size,
+                    },
+                }
+            managers.append(
+                lambda create_config=create_config:
+                rbd.create_image(ctx=ctx, config=create_config))
+
+def create_clones(ctx, config, managers):
+    """Queue an rbd.clone_image manager per disk for clients with clone set."""
+    for client, client_config in config.iteritems():
+        if not client_config.get('clone', False):
+            continue
+        num_disks = client_config.get('disks', DEFAULT_NUM_DISKS)
+        if isinstance(num_disks, list):
+            num_disks = len(num_disks)
+        for num in xrange(num_disks):
+            parent = '{client}.{num}'.format(client=client, num=num)
+            create_config = {
+                client: {
+                    # the clone is named after its parent image
+                    'image_name': parent + '-clone',
+                    'parent_name': parent,
+                    },
+                }
+            managers.append(
+                lambda create_config=create_config:
+                rbd.clone_image(ctx=ctx, config=create_config))
+
+@contextlib.contextmanager
+def create_dirs(ctx, config):
+    """
+    Create the qemu working and archive directories on each client's
+    remote, and remove the working directory again on exit.
+    """
+    testdir = teuthology.get_testdir(ctx)
+    qemu_dir = '{tdir}/qemu'.format(tdir=testdir)
+    archive_dir = '{tdir}/archive/qemu'.format(tdir=testdir)
+    for client, client_config in config.iteritems():
+        assert 'test' in client_config, 'You must specify a test to run'
+        (remote,) = ctx.cluster.only(client).remotes.keys()
+        remote.run(
+            args=[
+                'install', '-d', '-m0755', '--', qemu_dir, archive_dir,
+                ],
+            )
+    try:
+        yield
+    finally:
+        for client, client_config in config.iteritems():
+            assert 'test' in client_config, 'You must specify a test to run'
+            (remote,) = ctx.cluster.only(client).remotes.keys()
+            # rmdir only removes an empty directory; '|| true' ignores failure
+            remote.run(
+                args=['rmdir', qemu_dir, run.Raw('||'), 'true'],
+                )
+
+@contextlib.contextmanager
+def generate_iso(ctx, config):
+    """Build a cloud-init iso for each client; delete the generated files on exit"""
+    log.info('generating iso...')
+    testdir = teuthology.get_testdir(ctx)
+
+    # use ctx.config instead of config, because config has been
+    # through teuthology.replace_all_with_clients()
+    # the first of branch, tag and sha1 that is set wins; otherwise HEAD
+    refspec = 'HEAD'
+    for key in ('branch', 'tag', 'sha1'):
+        candidate = ctx.config.get(key)
+        if candidate is not None:
+            refspec = candidate
+            break
+
+    # hack: the git_url is always ceph-ci or ceph
+    git_url = teuth_config.get_ceph_git_url()
+    repo_name = 'ceph-ci.git' if 'ceph-ci' in git_url else 'ceph.git'
+    src_dir = os.path.dirname(__file__)
+    qemu_dir = os.path.join(testdir, 'qemu')
+
+    for client, client_config in config.iteritems():
+        assert 'test' in client_config, 'You must specify a test to run'
+        test_url = client_config['test'].format(repo=repo_name, branch=refspec)
+        (remote,) = ctx.cluster.only(client).remotes.keys()
+        # files generated for this client under the qemu working directory
+        userdata_path = os.path.join(qemu_dir, 'userdata.' + client)
+        metadata_path = os.path.join(qemu_dir, 'metadata.' + client)
+
+        with file(os.path.join(src_dir, 'userdata_setup.yaml'), 'rb') as f:
+            test_setup = f.read()
+        # fill in the nfs mount commands of the setup template with this
+        # client's export directory
+        test_setup = test_setup.format(
+            mnt_dir="/export/{client}".format(client=client),
+        )
+
+        with file(os.path.join(src_dir, 'userdata_teardown.yaml'), 'rb') as f:
+            test_teardown = f.read()
+
+        user_data = test_setup
+        if client_config.get('type', 'filesystem') == 'filesystem':
+            num_disks = client_config.get('disks', DEFAULT_NUM_DISKS)
+            if isinstance(num_disks, list):
+                num_disks = len(num_disks)
+            for disk_index in xrange(1, num_disks):
+                dev_letter = chr(ord('a') + disk_index)
+                user_data += """
+- |
+  #!/bin/bash
+  mkdir /mnt/test_{dev_letter}
+  mkfs -t xfs /dev/vd{dev_letter}
+  mount -t xfs /dev/vd{dev_letter} /mnt/test_{dev_letter}
+""".format(dev_letter=dev_letter)
+
+        user_data += """
+- |
+  #!/bin/bash
+  test -d /etc/ceph || mkdir /etc/ceph
+  cp /mnt/cdrom/ceph.* /etc/ceph/
+"""
+
+        cloud_config_archive = client_config.get('cloud_config_archive', [])
+        if cloud_config_archive:
+            user_data += yaml.safe_dump(cloud_config_archive, default_style='|',
+                                        default_flow_style=False)
+
+        # this may change later to pass the directories as args to the
+        # script or something. xfstests needs that.
+        user_data += """
+- |
+  #!/bin/bash
+  test -d /mnt/test_b && cd /mnt/test_b
+  /mnt/cdrom/test.sh > /mnt/log/test.log 2>&1 && touch /mnt/log/success
+""" + test_teardown
+
+        user_data = user_data.format(
+            ceph_branch=ctx.config.get('branch'),
+            ceph_sha1=ctx.config.get('sha1'))
+        teuthology.write_file(remote, userdata_path, StringIO(user_data))
+
+        with file(os.path.join(src_dir, 'metadata.yaml'), 'rb') as f:
+            teuthology.write_file(remote, metadata_path, f)
+
+        test_file = '{tdir}/qemu/{client}.test.sh'.format(tdir=testdir, client=client)
+
+        log.info('fetching test %s for %s', test_url, client)
+        remote.run(
+            args=[
+                'wget', '-nv', '-O', test_file,
+                test_url,
+                run.Raw('&&'),
+                'chmod', '755', test_file,
+                ],
+            )
+        remote.run(
+            args=[
+                'genisoimage', '-quiet', '-input-charset', 'utf-8',
+                '-volid', 'cidata', '-joliet', '-rock',
+                '-o', '{tdir}/qemu/{client}.iso'.format(tdir=testdir, client=client),
+                '-graft-points',
+                'user-data={userdata}'.format(userdata=userdata_path),
+                'meta-data={metadata}'.format(metadata=metadata_path),
+                'ceph.conf=/etc/ceph/ceph.conf',
+                'ceph.keyring=/etc/ceph/ceph.keyring',
+                'test.sh={file}'.format(file=test_file),
+                ],
+            )
+    try:
+        yield
+    finally:
+        for client in config.iterkeys():
+            (remote,) = ctx.cluster.only(client).remotes.keys()
+            remote.run(
+                args=[
+                    'rm', '-f',
+                    '{tdir}/qemu/{client}.iso'.format(tdir=testdir, client=client),
+                    os.path.join(qemu_dir, 'userdata.' + client),
+                    os.path.join(qemu_dir, 'metadata.' + client),
+                    '{tdir}/qemu/{client}.test.sh'.format(tdir=testdir, client=client),
+                    ],
+                )
+
+@contextlib.contextmanager
+def download_image(ctx, config):
+    """Download base image, remove image file when done"""
+    log.info('downloading base image')
+    testdir = teuthology.get_testdir(ctx)
+    base_file_fmt = '{tdir}/qemu/base.{client}.qcow2'
+    for client, client_config in config.iteritems():
+        (remote,) = ctx.cluster.only(client).remotes.keys()
+        base_file = base_file_fmt.format(tdir=testdir, client=client)
+        image_url = client_config.get('image_url', DEFAULT_IMAGE_URL)
+        remote.run(
+            args=['wget', '-nv', '-O', base_file, image_url],
+            )
+
+        # only an explicit list of disks can carry a size for the first
+        # image; otherwise the default size is used
+        disks = client_config.get('disks', None)
+        first_disk = disks[0] if isinstance(disks, list) else {}
+        image_size = (first_disk or {}).get('image_size', DEFAULT_IMAGE_SIZE)
+        image_name = '{client}.0'.format(client=client)
+
+        # import the downloaded qcow2 file as a raw rbd image ...
+        remote.run(
+            args=[
+                'qemu-img', 'convert', '-f', 'qcow2', '-O', 'raw',
+                base_file, 'rbd:rbd/' + image_name,
+                ],
+            )
+        # ... and then set the image to the requested size
+        remote.run(
+            args=[
+                'rbd', 'resize',
+                '--size={image_size}M'.format(image_size=image_size),
+                image_name,
+                ],
+            )
+    try:
+        yield
+    finally:
+        log.debug('cleaning up base image files')
+        for client in config.iterkeys():
+            (remote,) = ctx.cluster.only(client).remotes.keys()
+            remote.run(
+                args=[
+                    'rm', '-f',
+                    base_file_fmt.format(tdir=testdir, client=client),
+                    ],
+                )
+
+
+def _setup_nfs_mount(remote, client, mount_dir):
+    """
+    Export mount_dir over nfs from the remote so that the guest can
+    store its logs there. This nfs mount is also used to touch a file
+    at the end of the test to indicate whether the test was successful
+    or not.
+    """
+    export_dir = "/export/{client}".format(client=client)
+    export = "{dir} *(rw,no_root_squash,no_subtree_check,insecure)".format(
+        dir=export_dir
+    )
+    log.info("Creating the nfs export directory...")
+    remote.run(args=['sudo', 'mkdir', '-p', export_dir])
+    log.info("Mounting the test directory...")
+    remote.run(args=[
+        'sudo', 'mount', '--bind', mount_dir, export_dir,
+    ])
+    log.info("Adding mount to /etc/exports...")
+    # drop every existing /export/ entry before appending the new one
+    remote.run(args=[
+        'sudo', 'sed', '-i', '/^\/export\//d', "/etc/exports",
+    ])
+    remote.run(args=[
+        'echo', export, run.Raw("|"),
+        'sudo', 'tee', '-a', "/etc/exports",
+    ])
+    log.info("Restarting NFS...")
+    if remote.os.package_type == "deb":
+        restart_cmd = ['sudo', 'service', 'nfs-kernel-server', 'restart']
+    else:
+        restart_cmd = ['sudo', 'systemctl', 'restart', 'nfs']
+    remote.run(args=restart_cmd)
+
+
+def _teardown_nfs_mount(remote, client):
+    """
+    Tear down the nfs mount on the remote that was used for logging and
+    for reporting the status of the tests run in the guest. The nfs
+    service is stopped while the export is removed and started again
+    afterwards.
+    """
+    def nfs_service(action):
+        """Return the command applying action to the remote's nfs service."""
+        if remote.os.package_type == "deb":
+            return ['sudo', 'service', 'nfs-kernel-server', action]
+        else:
+            return ['sudo', 'systemctl', action, 'nfs']
+
+    log.info("Tearing down the nfs mount for {remote}".format(remote=remote))
+    export_dir = "/export/{client}".format(client=client)
+    log.info("Stopping NFS...")
+    remote.run(args=nfs_service('stop'))
+
+    log.info("Unmounting exported directory...")
+    remote.run(args=[
+        'sudo', 'umount', export_dir
+    ])
+    log.info("Deleting exported directory...")
+    # note that this removes the whole /export tree, not only this
+    # client's export_dir
+    remote.run(args=[
+        'sudo', 'rm', '-r', '/export'
+    ])
+    log.info("Deleting export from /etc/exports...")
+    # '$ d' deletes the last line of the file
+    remote.run(args=[
+        'sudo', 'sed', '-i', '$ d', '/etc/exports'
+    ])
+
+    log.info("Starting NFS...")
+    remote.run(args=nfs_service('start'))
+
+
+@contextlib.contextmanager
+def run_qemu(ctx, config):
+    """
+    Start one qemu guest per client on that client's remote.
+
+    On exit, wait for every guest to finish and check with 'test -f'
+    that the 'success' file exists in the client's log directory.
+    """
+    procs = []
+    testdir = teuthology.get_testdir(ctx)
+    for client, client_config in config.iteritems():
+        (remote,) = ctx.cluster.only(client).remotes.keys()
+        log_dir = '{tdir}/archive/qemu/{client}'.format(tdir=testdir, client=client)
+        remote.run(
+            args=[
+                'mkdir', log_dir, run.Raw('&&'),
+                'sudo', 'modprobe', 'kvm',
+                ]
+            )
+
+        # make an nfs mount to use for logging and to
+        # allow the test to tell teuthology the test's outcome
+        _setup_nfs_mount(remote, client, log_dir)
+
+        # Hack to make sure /dev/kvm permissions are set correctly
+        # See http://tracker.ceph.com/issues/17977 and
+        # https://bugzilla.redhat.com/show_bug.cgi?id=1333159
+        remote.run(args='sudo udevadm control --reload')
+        remote.run(args='sudo udevadm trigger /dev/kvm')
+        remote.run(args='ls -l /dev/kvm')
+
+        qemu_cmd = 'qemu-system-x86_64'
+        if remote.os.package_type == "rpm":
+            qemu_cmd = "/usr/libexec/qemu-kvm"
+        args=[
+            'adjust-ulimits',
+            'ceph-coverage',
+            '{tdir}/archive/coverage'.format(tdir=testdir),
+            'daemon-helper',
+            'term',
+            qemu_cmd, '-enable-kvm', '-nographic', '-cpu', 'host',
+            '-smp', str(client_config.get('cpus', DEFAULT_CPUS)),
+            '-m', str(client_config.get('memory', DEFAULT_MEM)),
+            # cd holding metadata for cloud-init
+            '-cdrom', '{tdir}/qemu/{client}.iso'.format(tdir=testdir, client=client),
+            ]
+
+        cachemode = 'none'
+        # copy the [global] section first: updating the dict returned by
+        # conf.get() in place would write client options into [global]
+        ceph_config = dict(ctx.ceph['ceph'].conf.get('global', {}))
+        ceph_config.update(ctx.ceph['ceph'].conf.get('client', {}))
+        ceph_config.update(ctx.ceph['ceph'].conf.get(client, {}))
+        if ceph_config.get('rbd cache', True):
+            if ceph_config.get('rbd cache max dirty', 1) > 0:
+                cachemode = 'writeback'
+            else:
+                cachemode = 'writethrough'
+
+        clone = client_config.get('clone', False)
+        num_disks = client_config.get('disks', DEFAULT_NUM_DISKS)
+        if isinstance(num_disks, list):
+            num_disks = len(num_disks)
+        for i in xrange(num_disks):
+            suffix = '-clone' if clone else ''
+            args.extend([
+                '-drive',
+                'file=rbd:rbd/{img}:id={id},format=raw,if=virtio,cache={cachemode}'.format(
+                    img='{client}.{num}{suffix}'.format(client=client, num=i,
+                                                        suffix=suffix),
+                    id=client[len('client.'):],
+                    cachemode=cachemode,
+                    ),
+                ])
+
+        log.info('starting qemu...')
+        procs.append(
+            remote.run(
+                args=args,
+                logger=log.getChild(client),
+                stdin=run.PIPE,
+                wait=False,
+                )
+            )
+
+    try:
+        yield
+    finally:
+        log.info('waiting for qemu tests to finish...')
+        run.wait(procs)
+
+        log.debug('checking that qemu tests succeeded...')
+        for client in config.iterkeys():
+            (remote,) = ctx.cluster.only(client).remotes.keys()
+
+            # ensure we have permissions to all the logs
+            log_dir = '{tdir}/archive/qemu/{client}'.format(tdir=testdir, client=client)
+            remote.run(args=['sudo', 'chmod', 'a+rw', '-R', log_dir])
+
+            # teardown nfs mount
+            _teardown_nfs_mount(remote, client)
+            # check for test status
+            remote.run(
+                args=[
+                    'test', '-f',
+                    '{tdir}/archive/qemu/{client}/success'.format(
+                        tdir=testdir, client=client),
+                    ],
+                )
+
+
+@contextlib.contextmanager
+def task(ctx, config):
+    """
+    Run a test inside of QEMU on top of rbd. Only one test
+    is supported per client.
+
+    For example, you can specify which clients to run on::
+
+        tasks:
+        - ceph:
+        - qemu:
+            client.0:
+              test: http://download.ceph.com/qa/test.sh
+            client.1:
+              test: http://download.ceph.com/qa/test2.sh
+
+    Or use the same settings on all clients::
+
+        tasks:
+        - ceph:
+        - qemu:
+            all:
+              test: http://download.ceph.com/qa/test.sh
+
+    For tests that don't need a filesystem, set type to block::
+
+        tasks:
+        - ceph:
+        - qemu:
+            client.0:
+              test: http://download.ceph.com/qa/test.sh
+              type: block
+
+    The test should be configured to run on /dev/vdb and later
+    devices.
+
+    If you want to run a test that uses more than one rbd image,
+    specify how many images to use::
+
+        tasks:
+        - ceph:
+        - qemu:
+            client.0:
+              test: http://download.ceph.com/qa/test.sh
+              type: block
+              disks: 2
+
+    - or -
+
+        tasks:
+        - ceph:
+        - qemu:
+            client.0:
+              test: http://ceph.com/qa/test.sh
+              type: block
+              disks:
+                - image_size: 1024
+                - image_size: 2048
+
+    You can set the number of CPUs and the memory the VM has (default is
+    1 CPU and 4096 MB)::
+
+        tasks:
+        - ceph:
+        - qemu:
+            client.0:
+              test: http://download.ceph.com/qa/test.sh
+              cpus: 4
+              memory: 512 # megabytes
+
+    If you want to run a test against a cloned rbd image, set clone to true::
+
+        tasks:
+        - ceph:
+        - qemu:
+            client.0:
+              test: http://download.ceph.com/qa/test.sh
+              clone: true
+
+    If you need to configure additional cloud-config options, set
+    cloud_config_archive to the required data set::
+
+        tasks:
+        - ceph:
+        - qemu:
+            client.0:
+                test: http://ceph.com/qa/test.sh
+                cloud_config_archive:
+                    - |
+                      #!/bin/bash
+                      touch foo1
+                    - content: |
+                        test data
+                      type: text/plain
+                      filename: /tmp/data
+
+    If you need to override the default cloud image, set image_url::
+
+        tasks:
+        - ceph:
+        - qemu:
+            client.0:
+                test: http://ceph.com/qa/test.sh
+                image_url: https://cloud-images.ubuntu.com/releases/16.04/release/ubuntu-16.04-server-cloudimg-amd64-disk1.img
+    """
+    assert isinstance(config, dict), \
+           "task qemu only supports a dictionary for configuration"
+
+    config = teuthology.replace_all_with_clients(ctx.cluster, config)
+
+    managers = []
+    create_images(ctx=ctx, config=config, managers=managers)
+    managers.extend([
+        lambda: create_dirs(ctx=ctx, config=config),
+        lambda: generate_iso(ctx=ctx, config=config),
+        lambda: download_image(ctx=ctx, config=config),
+        ])
+    create_clones(ctx=ctx, config=config, managers=managers)
+    managers.append(
+        lambda: run_qemu(ctx=ctx, config=config),
+        )
+
+    with contextutil.nested(*managers):
+        yield
diff --git a/src/ceph/qa/tasks/rados.py b/src/ceph/qa/tasks/rados.py
new file mode 100644
index 0000000..3ab93d6
--- /dev/null
+++ b/src/ceph/qa/tasks/rados.py
@@ -0,0 +1,266 @@
+"""
+Rados model-based integration tests
+"""
+import contextlib
+import logging
+import gevent
+from teuthology import misc as teuthology
+
+from teuthology.orchestra import run
+
+log = logging.getLogger(__name__)
+
+@contextlib.contextmanager
+def task(ctx, config):
+    """
+    Run RadosModel-based integration tests.
+
+    The config should be as follows::
+
+        rados:
+          clients: [client list]
+          ops: <number of ops>
+          objects: <number of objects to use>
+          max_in_flight: <max number of operations in flight>
+          object_size: <size of objects in bytes>
+          min_stride_size: <minimum write stride size in bytes>
+          max_stride_size: <maximum write stride size in bytes>
+          op_weights: <dictionary mapping operation type to integer weight>
+          runs: <number of times to run> - the pool is remade between runs
+          ec_pool: use an ec pool
+          erasure_code_profile: profile to use with the erasure coded pool
+          fast_read: enable ec_pool's fast_read
+          min_size: set the min_size of created pool
+          pool_snaps: use pool snapshots instead of selfmanaged snapshots
+          write_fadvise_dontneed: write behavior like with LIBRADOS_OP_FLAG_FADVISE_DONTNEED.
+                                  This means the data will not be accessed in the near
+                                  future, so the osd backend need not keep it in cache.
+
+    For example::
+
+        tasks:
+        - ceph:
+        - rados:
+            clients: [client.0]
+            ops: 1000
+            max_seconds: 0   # 0 for no limit
+            objects: 25
+            max_in_flight: 16
+            object_size: 4000000
+            min_stride_size: 1024
+            max_stride_size: 4096
+            op_weights:
+              read: 20
+              write: 10
+              delete: 2
+              snap_create: 3
+              rollback: 2
+              snap_remove: 0
+            ec_pool: create an ec pool, defaults to False
+            erasure_code_use_overwrites: test overwrites, default false
+            erasure_code_profile:
+              name: teuthologyprofile
+              k: 2
+              m: 1
+              crush-failure-domain: osd
+            pool_snaps: true
+            write_fadvise_dontneed: true
+            runs: 10
+        - interactive:
+
+    Optionally, you can provide the pool name to run against:
+
+        tasks:
+        - ceph:
+        - exec:
+            client.0:
+              - ceph osd pool create foo
+        - rados:
+            clients: [client.0]
+            pools: [foo]
+            ...
+
+    Alternatively, you can provide a pool prefix:
+
+        tasks:
+        - ceph:
+        - exec:
+            client.0:
+              - ceph osd pool create foo.client.0
+        - rados:
+            clients: [client.0]
+            pool_prefix: foo
+            ...
+
+    The tests are run asynchronously, they are not complete when the task
+    returns. For instance:
+
+        - rados:
+            clients: [client.0]
+            pools: [ecbase]
+            ops: 4000
+            objects: 500
+            op_weights:
+              read: 100
+              write: 100
+              delete: 50
+              copy_from: 50
+        - print: "**** done rados ec-cache-agent (part 2)"
+
+    will run the print task immediately after the rados task begins but
+    not after it completes. To make the rados task a blocking / sequential
+    task, use:
+
+        - sequential:
+          - rados:
+              clients: [client.0]
+              pools: [ecbase]
+              ops: 4000
+              objects: 500
+              op_weights:
+                read: 100
+                write: 100
+                delete: 50
+                copy_from: 50
+        - print: "**** done rados ec-cache-agent (part 2)"
+
+    """
+    log.info('Beginning rados...')
+    assert isinstance(config, dict), \
+        "please list clients to run on"
+
+    object_size = int(config.get('object_size', 4000000))
+    op_weights = config.get('op_weights', {})
+    testdir = teuthology.get_testdir(ctx)
+    args = [
+        'adjust-ulimits',
+        'ceph-coverage',
+        '{tdir}/archive/coverage'.format(tdir=testdir),
+        'ceph_test_rados']
+    if config.get('ec_pool', False):
+        args.extend(['--no-omap'])
+        if not config.get('erasure_code_use_overwrites', False):
+            args.extend(['--ec-pool'])
+    if config.get('write_fadvise_dontneed', False):
+        args.extend(['--write-fadvise-dontneed'])
+    if config.get('set_redirect', False):
+        args.extend(['--set_redirect'])
+    if config.get('pool_snaps', False):
+        args.extend(['--pool-snaps'])
+    args.extend([
+        '--max-ops', str(config.get('ops', 10000)),
+        '--objects', str(config.get('objects', 500)),
+        '--max-in-flight', str(config.get('max_in_flight', 16)),
+        '--size', str(object_size),
+        '--min-stride-size', str(config.get('min_stride_size', object_size // 10)),
+        '--max-stride-size', str(config.get('max_stride_size', object_size // 5)),
+        '--max-seconds', str(config.get('max_seconds', 0))
+        ])
+
+    weights = {}
+    weights['read'] = 100
+    weights['write'] = 100
+    weights['delete'] = 10
+    # Parallel of the op_types in test/osd/TestRados.cc
+    for field in [
+        # read handled above
+        # write handled above
+        # delete handled above
+        "snap_create",
+        "snap_remove",
+        "rollback",
+        "setattr",
+        "rmattr",
+        "watch",
+        "copy_from",
+        "hit_set_list",
+        "is_dirty",
+        "undirty",
+        "cache_flush",
+        "cache_try_flush",
+        "cache_evict",
+        "append",
+        "write",
+        "read",
+        "delete"
+        ]:
+        if field in op_weights:
+            weights[field] = op_weights[field]
+
+    if config.get('write_append_excl', True):
+        if 'write' in weights:
+            weights['write'] = weights['write'] / 2
+            weights['write_excl'] = weights['write']
+
+        if 'append' in weights:
+            weights['append'] = weights['append'] / 2
+            weights['append_excl'] = weights['append']
+
+    for op, weight in weights.iteritems():
+        args.extend(['--op', op, str(weight)])
+
+
+    def thread():
+        """Thread spawned by gevent"""
+        clients = ['client.{id}'.format(id=id_) for id_ in teuthology.all_roles_of_type(ctx.cluster, 'client')]
+        log.info('clients are %s' % clients)
+        manager = ctx.managers['ceph']
+        if config.get('ec_pool', False):
+            profile = config.get('erasure_code_profile', {})
+            profile_name = profile.get('name', 'teuthologyprofile')
+            manager.create_erasure_code_profile(profile_name, profile)
+        else:
+            profile_name = None
+        for i in range(int(config.get('runs', '1'))):
+            log.info("starting run %s out of %s", str(i), config.get('runs', '1'))
+            tests = {}
+            # work on a copy: pop() below must not consume the configured
+            # pool list, or later runs would find it empty
+            existing_pools = list(config.get('pools', []))
+            created_pools = []
+            for role in config.get('clients', clients):
+                assert isinstance(role, basestring)
+                PREFIX = 'client.'
+                assert role.startswith(PREFIX)
+                id_ = role[len(PREFIX):]
+
+                pool = config.get('pool', None)
+                if not pool and existing_pools:
+                    pool = existing_pools.pop()
+                else:
+                    pool = manager.create_pool_with_unique_name(
+                        erasure_code_profile_name=profile_name,
+                        erasure_code_use_overwrites=
+                          config.get('erasure_code_use_overwrites', False)
+                    )
+                    created_pools.append(pool)
+                    if config.get('fast_read', False):
+                        manager.raw_cluster_cmd(
+                            'osd', 'pool', 'set', pool, 'fast_read', 'true')
+                    min_size = config.get('min_size', None)
+                    if min_size is not None:
+                        manager.raw_cluster_cmd(
+                            'osd', 'pool', 'set', pool, 'min_size', str(min_size))
+
+                (remote,) = ctx.cluster.only(role).remotes.iterkeys()
+                proc = remote.run(
+                    args=["CEPH_CLIENT_ID={id_}".format(id_=id_)] + args +
+                    ["--pool", pool],
+                    logger=log.getChild("rados.{id}".format(id=id_)),
+                    stdin=run.PIPE,
+                    wait=False
+                    )
+                tests[id_] = proc
+            run.wait(tests.itervalues())
+
+            for pool in created_pools:
+                manager.wait_snap_trimming_complete(pool)
+                manager.remove_pool(pool)
+
+    running = gevent.spawn(thread)
+
+    try:
+        yield
+    finally:
+        log.info('joining rados')
+        running.get()
diff --git a/src/ceph/qa/tasks/radosbench.py b/src/ceph/qa/tasks/radosbench.py
new file mode 100644
index 0000000..530a6f1
--- /dev/null
+++ b/src/ceph/qa/tasks/radosbench.py
@@ -0,0 +1,135 @@
+"""
+Rados benchmarking
+"""
+import contextlib
+import logging
+
+from teuthology.orchestra import run
+from teuthology import misc as teuthology
+
+log = logging.getLogger(__name__)
+
+@contextlib.contextmanager
+def task(ctx, config):
+    """
+    Run radosbench
+
+    The config should be as follows:
+
+    radosbench:
+        clients: [client list]
+        time: <seconds to run>
+        pool: <pool to use>
+        size: write size to use
+        objectsize: object size to use
+        unique_pool: use a unique pool, defaults to False
+        ec_pool: create an ec pool, defaults to False
+        create_pool: create pool, defaults to True
+        erasure_code_profile:
+          name: teuthologyprofile
+          k: 2
+          m: 1
+          crush-failure-domain: osd
+        cleanup: false (defaults to true)
+        type: <write|seq|rand> (defaults to write)
+    example:
+
+    tasks:
+    - ceph:
+    - radosbench:
+        clients: [client.0]
+        time: 360
+    - interactive:
+    """
+    log.info('Beginning radosbench...')
+    assert isinstance(config, dict), \
+        "please list clients to run on"
+    radosbench = {}
+
+    testdir = teuthology.get_testdir(ctx)
+    manager = ctx.managers['ceph']
+    runtype = config.get('type', 'write')
+
+    create_pool = config.get('create_pool', True)
+    for role in config.get('clients', ['client.0']):
+        assert isinstance(role, basestring)
+        PREFIX = 'client.'
+        assert role.startswith(PREFIX)
+        id_ = role[len(PREFIX):]
+        (remote,) = ctx.cluster.only(role).remotes.iterkeys()
+
+        if config.get('ec_pool', False):
+            profile = config.get('erasure_code_profile', {})
+            profile_name = profile.get('name', 'teuthologyprofile')
+            manager.create_erasure_code_profile(profile_name, profile)
+        else:
+            profile_name = None
+
+        cleanup = [] if config.get('cleanup', True) else ['--no-cleanup']
+
+        pool = config.get('pool', 'data')
+        if create_pool:
+            if pool != 'data':
+                manager.create_pool(pool, erasure_code_profile_name=profile_name)
+            else:
+                pool = manager.create_pool_with_unique_name(erasure_code_profile_name=profile_name)
+
+        # an objectsize of 0 (the default) means: do not pass -o to rados
+        osize = config.get('objectsize', 0)
+        if osize == 0:
+            objectsize = []
+        else:
+            objectsize = ['-o', str(osize)]
+        size = ['-b', str(config.get('size', 4<<20))]
+        # If doing a reading run then populate data
+        if runtype != "write":
+            proc = remote.run(
+                args=[
+                    "/bin/sh", "-c",
+                    " ".join(['adjust-ulimits',
+                              'ceph-coverage',
+                              '{tdir}/archive/coverage',
+                              'rados',
+                              '--no-log-to-stderr',
+                              '--name', role]
+                             + size + objectsize +
+                             ['-p' , pool,
+                              'bench', str(60), "write", "--no-cleanup"
+                              ]).format(tdir=testdir),
+                ],
+                logger=log.getChild('radosbench.{id}'.format(id=id_)),
+                wait=True
+            )
+            size = []
+            objectsize = []
+
+        proc = remote.run(
+            args=[
+                "/bin/sh", "-c",
+                " ".join(['adjust-ulimits',
+                          'ceph-coverage',
+                          '{tdir}/archive/coverage',
+                          'rados',
+                          '--no-log-to-stderr',
+                          '--name', role]
+                          + size + objectsize +
+                          ['-p' , pool,
+                          'bench', str(config.get('time', 360)), runtype,
+                          ] + cleanup).format(tdir=testdir),
+                ],
+            logger=log.getChild('radosbench.{id}'.format(id=id_)),
+            stdin=run.PIPE,
+            wait=False
+            )
+        radosbench[id_] = proc
+
+    try:
+        yield
+    finally:
+        timeout = config.get('time', 360) * 30 + 300
+        log.info('joining radosbench (timing out after %ss)', timeout)
+        run.wait(radosbench.itervalues(), timeout=timeout)
+
+        # NOTE(review): only the pool of the last client in the loop is removed
+        if pool != 'data' and create_pool:
+            manager.remove_pool(pool)
diff --git a/src/ceph/qa/tasks/radosbenchsweep.py b/src/ceph/qa/tasks/radosbenchsweep.py
new file mode 100644
index 0000000..cda106a
--- /dev/null
+++ b/src/ceph/qa/tasks/radosbenchsweep.py
@@ -0,0 +1,221 @@
+"""
+Rados benchmarking sweep
+"""
+import contextlib
+import logging
+import re
+
+from cStringIO import StringIO
+from itertools import product
+
+from teuthology.orchestra import run
+from teuthology import misc as teuthology
+
+log = logging.getLogger(__name__)
+
+
+@contextlib.contextmanager
+def task(ctx, config):
+    """
+    Execute a radosbench parameter sweep
+
+    Puts radosbench in a loop, taking values from the given config at each
+    iteration. If given, the min and max values below create a range, e.g.
+    min_num_replicas=1 and max_num_replicas=3 sweeps over 1-3 replicas.
+
+    Parameters:
+
+        clients: [client list]
+        time: seconds to run (default=120)
+        sizes: [list of object sizes] (default=[4M]); 'size' also accepted
+        mode: <write|read|seq> (default=write)
+        repetitions: execute the same configuration multiple times (default=1)
+        min_num_replicas: minimum number of replicas to use (default = 3)
+        max_num_replicas: maximum number of replicas to use (default = 3)
+        min_num_osds: the minimum number of OSDs in a pool (default=all)
+        max_num_osds: the maximum number of OSDs in a pool (default=all)
+        file: name of CSV-formatted output file (default='radosbench.csv')
+        columns: columns to include (default=all)
+          - rep: execution number (takes values from 'repetitions')
+          - num_osd: number of osds for pool
+          - num_replica: number of replicas
+          - avg_throughput: throughput
+          - avg_latency: latency
+          - stdev_throughput:
+          - stdev_latency:
+
+    Example:
+    - radosbenchsweep:
+        columns: [rep, num_osd, num_replica, avg_throughput, stdev_throughput]
+    """
+    log.info('Beginning radosbenchsweep...')
+    assert isinstance(config, dict), 'expecting dictionary for configuration'
+
+    # get and validate config values
+
+    # only one client supported for now
+    if len(config.get('clients', [])) != 1:
+        raise Exception("Only one client can be specified")
+
+    # only write mode
+    if config.get('mode', 'write') != 'write':
+        raise Exception("Only 'write' mode supported for now.")
+
+    # OSDs
+    total_osds_in_cluster = teuthology.num_instances_of_type(ctx.cluster, 'osd')
+    min_num_osds = config.get('min_num_osds', total_osds_in_cluster)
+    max_num_osds = config.get('max_num_osds', total_osds_in_cluster)
+
+    if max_num_osds > total_osds_in_cluster:
+        raise Exception('max_num_osds cannot be greater than total in cluster')
+    if min_num_osds < 1:
+        raise Exception('min_num_osds cannot be less than 1')
+    if min_num_osds > max_num_osds:
+        raise Exception('min_num_osds cannot be greater than max_num_osds')
+    # number of OSDs marked out at each step; 0 means the full cluster
+    osds = range(0, (total_osds_in_cluster + 1))
+
+    # replicas
+    min_num_replicas = config.get('min_num_replicas', 3)
+    max_num_replicas = config.get('max_num_replicas', 3)
+
+    if min_num_replicas < 1:
+        raise Exception('min_num_replicas cannot be less than 1')
+    if min_num_replicas > max_num_replicas:
+        raise Exception(
+            'min_num_replicas cannot be greater than max_num_replicas')
+    if max_num_replicas > max_num_osds:
+        raise Exception('max_num_replicas cannot be greater than max_num_osds')
+    replicas = range(min_num_replicas, (max_num_replicas + 1))
+
+    # object size: 'sizes' is the documented key, 'size' the legacy one
+    sizes = config.get('sizes', config.get('size', [4 << 20]))
+
+    # repetitions
+    reps = range(config.get('repetitions', 1))
+
+    # file; the with-block closes it even if a benchmark run raises
+    fname = config.get('file', 'radosbench.csv')
+    with open('{}/{}'.format(ctx.archive, fname), 'w') as f:
+        f.write(get_csv_header(config) + '\n')
+
+        # set default pools size=1 to avoid 'unhealthy' issues
+        ctx.manager.set_pool_property('data', 'size', 1)
+        ctx.manager.set_pool_property('metadata', 'size', 1)
+        ctx.manager.set_pool_property('rbd', 'size', 1)
+
+        current_osds_out = 0
+
+        # sweep through all parameters
+        for osds_out, size, replica, rep in product(
+                osds, sizes, replicas, reps):
+
+            osds_in = total_osds_in_cluster - osds_out
+
+            if osds_in == 0:
+                # we're done
+                break
+
+            if current_osds_out != osds_out:
+                # take an osd out
+                ctx.manager.raw_cluster_cmd(
+                    'osd', 'reweight', str(osds_out-1), '0.0')
+                wait_until_healthy(ctx, config)
+                current_osds_out = osds_out
+
+            if not min_num_osds <= osds_in <= max_num_osds:
+                # no need to execute with a number of osds that wasn't
+                # requested
+                continue
+
+            if osds_in < replica:
+                # cannot execute with more replicas than available osds
+                continue
+
+            run_radosbench(ctx, config, f, osds_in, size, replica, rep)
+
+    yield
+
+
+def get_csv_header(conf):
+    """Return the CSV header row for the configured columns.
+
+    An absent or empty conf['columns'] is replaced, in conf itself, by the
+    full column list. Raises Exception on an unrecognised column name.
+    """
+    known = ['rep', 'num_osd', 'num_replica', 'avg_throughput',
+             'avg_latency', 'stdev_throughput', 'stdev_latency']
+    if not conf.get('columns', None):
+        conf['columns'] = known
+    for name in conf['columns']:
+        if name not in known:
+            raise Exception('Unknown column ' + name)
+    return ','.join(conf['columns'])
+
+
+def run_radosbench(ctx, config, f, num_osds, size, replica, rep):
+    """Benchmark one sweep point in a freshly created pool; append a CSV row.
+
+    The row written to f holds the columns named in config['columns']. The
+    pool is removed even if the run or the parsing of its output fails.
+    """
+    def summary(out, label):
+        # value after "<label>:" ("(unit)" suffix allowed), padding stripped
+        found = re.search(
+            re.escape(label) + r'(?:\([^)]*\))?:[ \t]*(.*)', out)
+        if found is None:
+            raise Exception('no "%s" in radosbench output' % label)
+        return found.group(1).strip()
+
+    pool = ctx.manager.create_pool_with_unique_name()
+    try:
+        ctx.manager.set_pool_property(pool, 'size', replica)
+        wait_until_healthy(ctx, config)
+
+        log.info('Executing with parameters: ')
+        log.info('  num_osd =' + str(num_osds))
+        log.info('  size =' + str(size))
+        log.info('  num_replicas =' + str(replica))
+        log.info('  repetition =' + str(rep))
+
+        for role in config.get('clients', ['client.0']):
+            assert isinstance(role, basestring)
+            PREFIX = 'client.'
+            assert role.startswith(PREFIX)
+            id_ = role[len(PREFIX):]
+            (remote,) = ctx.cluster.only(role).remotes.iterkeys()
+
+            proc = remote.run(
+                args=[
+                    'adjust-ulimits', 'ceph-coverage',
+                    '{}/archive/coverage'.format(teuthology.get_testdir(ctx)),
+                    'rados', '--no-log-to-stderr',
+                    '--name', role, '-b', str(size), '-p', pool,
+                    'bench', str(config.get('time', 120)), 'write',
+                ],
+                logger=log.getChild('radosbench.{id}'.format(id=id_)),
+                stdin=run.PIPE,
+                stdout=StringIO(),
+                wait=False
+            )
+            # parse output to get summary and format it as CSV
+            proc.wait()
+            out = proc.stdout.getvalue()
+            all_values = {
+                'stdev_throughput': summary(out, 'Stddev Bandwidth'),
+                'stdev_latency': summary(out, 'Stddev Latency'),
+                'avg_throughput': summary(out, 'Bandwidth (MB/sec)'),
+                'avg_latency': summary(out, 'Average Latency'),
+                'rep': str(rep),
+                'num_osd': str(num_osds),
+                'num_replica': str(replica)
+            }
+            f.write(','.join(all_values[c] for c in config['columns']) + '\n')
+    finally:
+        ctx.manager.remove_pool(pool)
+
+
+def wait_until_healthy(ctx, config):
+    """Run teuthology's wait_until_healthy on the first monitor's remote."""
+    (mon,) = ctx.cluster.only(teuthology.get_first_mon(ctx, config)).remotes
+    teuthology.wait_until_healthy(ctx, mon)
diff --git a/src/ceph/qa/tasks/radosgw_admin.py b/src/ceph/qa/tasks/radosgw_admin.py
new file mode 100644
index 0000000..8e744e3
--- /dev/null
+++ b/src/ceph/qa/tasks/radosgw_admin.py
@@ -0,0 +1,955 @@
+"""
+Rgw admin testing against a running instance
+"""
+# The test cases in this file have been annotated for inventory.
+# To extract the inventory (in csv format) use the command:
+#
+#   grep '^ *# TESTCASE' | sed 's/^ *# TESTCASE //'
+#
+# to run this standalone:
+#	python qa/tasks/radosgw_admin.py [USER] HOSTNAME
+#
+
+import copy
+import json
+import logging
+import time
+import datetime
+import Queue
+import bunch
+
+import sys
+
+from cStringIO import StringIO
+
+import boto.exception
+import boto.s3.connection
+import boto.s3.acl
+from boto.utils import RequestHook
+
+import httplib2
+
+import util.rgw as rgw_utils
+
+from util.rgw import rgwadmin, get_user_summary, get_user_successful_ops
+
+log = logging.getLogger(__name__)
+
+def usage_acc_findentry2(entries, user, add=True):
+    """Return user's record from entries.
+    If absent, append and return a blank record when add, else None."""
+    # lazy scan: stops at the first match
+    found = next((rec for rec in entries if rec['user'] == user), None)
+    if found is None and add:
+        found = {'user': user, 'buckets': []}
+        entries.append(found)
+    return found
+def usage_acc_findsum2(summaries, user, add=True):
+    """Return user's summary from summaries.
+    If absent, append and return a zeroed summary when add, else None."""
+    # lazy scan: stops at the first match
+    found = next((s for s in summaries if s['user'] == user), None)
+    if found is None and add:
+        zero_total = {'bytes_received': 0, 'bytes_sent': 0,
+                      'ops': 0, 'successful_ops': 0}
+        found = {'user': user, 'categories': [], 'total': zero_total}
+        summaries.append(found)
+    return found
+def usage_acc_update2(x, out, b_in, err):
+    """Count one request in x: b_in -> bytes_sent, out -> bytes_received."""
+    for k, n in (('bytes_sent', b_in), ('bytes_received', out), ('ops', 1)):
+        x[k] += n
+    if not err:
+        x['successful_ops'] += 1
+def usage_acc_validate_fields(r, x, x2, what):
+    short = []  # one note per counter whose value in x2 is below x's
+    for field in ('bytes_sent', 'bytes_received', 'ops', 'successful_ops'):
+        try:
+            if x2[field] < x[field]:
+                short.append("field %s: %d < %d" % (field, x2[field], x[field]))
+        except Exception as ex:
+            r.append("missing/bad field " + field + " in " + what + " " + str(ex))
+            return  # stop at the first unreadable field
+    if short:
+        r.append("incomplete counts in " + what + ": " + ", ".join(short))
+class usage_acc:
+    """Expected usage totals, built up one request at a time by make_entry."""
+    def __init__(self):
+        self.results = {'entries': [], 'summary': []}
+    def findentry(self, user):
+        return usage_acc_findentry2(self.results['entries'], user)
+    def findsum(self, user):
+        return usage_acc_findsum2(self.results['summary'], user)
+    def e2b(self, e, bucket, add=True):
+        """Return bucket's record in entry e; when absent add one, or None."""
+        for b in e['buckets']:
+            if b['bucket'] == bucket:
+                return b
+        if not add:
+            return None
+        b = {'bucket': bucket, 'categories': []}
+        e['buckets'].append(b)
+        return b
+    def c2x(self, c, cat, add=True):
+        """Return cat's counters in list c; when absent add them, or None."""
+        for x in c:
+            if x['category'] == cat:
+                return x
+        if not add:
+            return None
+        x = {'bytes_received': 0, 'category': cat,
+             'bytes_sent': 0, 'ops': 0, 'successful_ops': 0}
+        c.append(x)
+        return x
+    def update(self, c, cat, user, out, b_in, err):
+        x = self.c2x(c, cat)
+        usage_acc_update2(x, out, b_in, err)
+        if not err and cat == 'create_bucket' and 'owner' not in x:
+            x['owner'] = user
+    def make_entry(self, cat, bucket, user, out, b_in, err):
+        """Count one request in user's bucket entry and in user's summary."""
+        if cat == 'create_bucket' and err:
+            return
+        e = self.findentry(user)
+        b = self.e2b(e, bucket)
+        self.update(b['categories'], cat, user, out, b_in, err)
+        s = self.findsum(user)
+        usage_acc_update2(self.c2x(s['categories'], cat), out, b_in, err)
+        usage_acc_update2(s['total'], out, b_in, err)
+    def generate_make_entry(self):
+        return lambda cat,bucket,user,out,b_in,err: self.make_entry(cat, bucket, user, out, b_in, err)
+    def get_usage(self):
+        return self.results
+    def compare_results(self, results):
+        """Return a list of messages where results lacks or undercounts ours."""
+        if 'entries' not in results or 'summary' not in results:
+            return ['Missing entries or summary']
+        r = []
+        for e in self.results['entries']:
+            try:
+                e2 = usage_acc_findentry2(results['entries'], e['user'], False)
+            except Exception as ex:
+                r.append("malformed entry looking for user "
+                    + e['user'] + " " + str(ex))
+                break
+            if e2 is None:
+                r.append("missing entry for user " + e['user'])
+                continue
+            for b in e['buckets']:
+                c = b['categories']
+                try:
+                    b2 = self.e2b(e2, b['bucket'], False)
+                    if b2 is not None:
+                        c2 = b2['categories']
+                except Exception as ex:
+                    r.append("malformed entry looking for bucket "
+                        + b['bucket'] + " in user " + e['user'] + " " + str(ex))
+                    break
+                if b2 is None:
+                    r.append("can't find bucket " + b['bucket']
+                        + " in user " + e['user'])
+                    continue
+                for x in c:
+                    try:
+                        x2 = self.c2x(c2, x['category'], False)
+                    except Exception as ex:
+                        r.append("malformed entry looking for "
+                            + x['category'] + " in bucket " + b['bucket']
+                            + " user " + e['user'] + " " + str(ex))
+                        break
+                    usage_acc_validate_fields(r, x, x2, "entry: category "
+                        + x['category'] + " bucket " + b['bucket']
+                        + " in user " + e['user'])
+        for s in self.results['summary']:
+            try:
+                s2 = usage_acc_findsum2(results['summary'], s['user'], False)
+            except Exception as ex:
+                r.append("malformed summary looking for user " + s['user']
+                    + " " + str(ex))
+                break
+            if s2 is None:
+                r.append("missing summary for user " + s['user'])
+                continue
+            try:
+                c2 = s2['categories']
+            except Exception as ex:
+                r.append("malformed summary missing categories for user "
+                    + s['user'] + " " + str(ex))
+                break
+            for x in s['categories']:
+                try:
+                    x2 = self.c2x(c2, x['category'], False)
+                except Exception as ex:
+                    r.append("malformed summary looking for "
+                        + x['category'] + " user " + s['user'] + " " + str(ex))
+                    break
+                usage_acc_validate_fields(r, x, x2, "summary: category "
+                    + x['category'] + " in user " + s['user'])
+            x = s['total']
+            try:
+                x2 = s2['total']
+            except Exception as ex:
+                r.append("malformed summary looking for totals for user "
+                    + s['user'] + " " + str(ex))
+                break
+            usage_acc_validate_fields(r, x, x2, "summary: totals for user " + s['user'])
+        return r
+
+def ignore_this_entry(cat, bucket, user, out, b_in, err):
+    """No-op taking the same six arguments as usage_acc.make_entry."""
+class requestlog_queue():
+    """boto request hook that queues each request for later accounting."""
+    def __init__(self, add):
+        self.q = Queue.Queue(1000)
+        self.adder = add
+    def handle_request_data(self, request, response, error=False):
+        now = datetime.datetime.now()
+        if not error and (response.status < 200 or response.status >= 400):
+            error = True
+        self.q.put(bunch.Bunch({'t': now, 'o': request, 'i': response, 'e': error}))
+    def clear(self):
+        with self.q.mutex:
+            self.q.queue.clear()
+    def log_and_clear(self, cat, bucket, user, add_entry = None):
+        """Drain the queue, logging each request and handing it to add_entry."""
+        if add_entry is None:
+            add_entry = self.adder
+        while not self.q.empty():
+            j = self.q.get()
+            bytes_out = 0
+            if 'Content-Length' in j.o.headers:
+                bytes_out = int(j.o.headers['Content-Length'])
+            bytes_in = 0
+            if 'content-length' in j.i.msg.dict:
+                bytes_in = int(j.i.msg.dict['content-length'])
+            log.info('RL: %s %s %s bytes_out=%d bytes_in=%d failed=%r'
+                % (cat, bucket, user, bytes_out, bytes_in, j.e))
+            add_entry(cat, bucket, user, bytes_out, bytes_in, j.e)
+
+def create_presigned_url(conn, method, bucket_name, key_name, expiration):
+    """Ask conn for a query-authenticated URL to key_name in bucket_name."""
+    signing_args = {
+        'expires_in': expiration, 'method': method,
+        'bucket': bucket_name, 'key': key_name, 'query_auth': True,
+    }
+    return conn.generate_url(**signing_args)
+
+def send_raw_http_request(conn, method, bucket_name, key_name, follow_redirects = False):
+    url = create_presigned_url(conn, method, bucket_name, key_name, 3600)
+    log.debug('presigned url: %s', url)  # via the task logger, not stdout
+    h = httplib2.Http()
+    h.follow_redirects = follow_redirects
+    return h.request(url, method)  # httplib2 gives back (response, content)
+
+
+def get_acl(key):
+    """
+    Fetch the XML ACL of ``key`` and return it without the XML declaration.
+
+    Leading and trailing newlines are stripped first; the result is
+    whatever follows the last '<?xml version="1.0" encoding="UTF-8"?>'
+    in the remaining text (all of it when no declaration is present).
+    Newlines that directly follow the declaration are therefore kept.
+    """
+    xml_declaration = '<?xml version="1.0" encoding="UTF-8"?>'
+
+    # strip the outer newlines before cutting off the declaration
+    trimmed = key.get_xml_acl().strip('\n')
+
+    # split() leaves the text after the final declaration as the last
+    # element, or the whole string when the declaration does not occur
+    pieces = trimmed.split(xml_declaration)
+    return pieces[-1]
+
+def task(ctx, config):
+    """
+    Test radosgw-admin functionality against a running rgw instance.
+    """
+    global log
+
+    assert ctx.rgw.config, \
+        "radosgw_admin task needs a config passed from the rgw task"
+    config = ctx.rgw.config
+    log.debug('config is: %r', config)
+
+    clients_from_config = config.keys()
+
+    # choose first client as default
+    client = clients_from_config[0]
+
+    # once the client is chosen, pull the host name and  assigned port out of
+    # the role_endpoints that were assigned by the rgw task
+    (remote_host, remote_port) = ctx.rgw.role_endpoints[client]
+
+    ##
+    user1='foo'
+    user2='fud'
+    subuser1='foo:foo1'
+    subuser2='foo:foo2'
+    display_name1='Foo'
+    display_name2='Fud'
+    email='foo@foo.com'
+    email2='bar@bar.com'
+    access_key='9te6NH5mcdcq0Tc5i8i1'
+    secret_key='Ny4IOauQoL18Gp2zM7lC1vLmoawgqcYP/YGcWfXu'
+    access_key2='p5YnriCv1nAtykxBrupQ'
+    secret_key2='Q8Tk6Q/27hfbFSYdSkPtUqhqx1GgzvpXa4WARozh'
+    swift_secret1='gpS2G9RREMrnbqlp29PP2D36kgPR1tm72n5fPYfL'
+    swift_secret2='ri2VJQcKSYATOY6uaDUX7pxgkW+W1YmC6OCxPHwy'
+
+    bucket_name='myfoo'
+    bucket_name2='mybar'
+
+    # connect to rgw
+    connection = boto.s3.connection.S3Connection(
+        aws_access_key_id=access_key,
+        aws_secret_access_key=secret_key,
+        is_secure=False,
+        port=remote_port,
+        host=remote_host,
+        calling_format=boto.s3.connection.OrdinaryCallingFormat(),
+        )
+    connection2 = boto.s3.connection.S3Connection(
+        aws_access_key_id=access_key2,
+        aws_secret_access_key=secret_key2,
+        is_secure=False,
+        port=remote_port,
+        host=remote_host,
+        calling_format=boto.s3.connection.OrdinaryCallingFormat(),
+        )
+
+    acc = usage_acc()
+    rl = requestlog_queue(acc.generate_make_entry())
+    connection.set_request_hook(rl)
+    connection2.set_request_hook(rl)
+
+    # legend (test cases can be easily grep-ed out)
+    # TESTCASE 'testname','object','method','operation','assertion'
+
+    # TESTCASE 'usage-show0' 'usage' 'show' 'all usage' 'succeeds'
+    (err, summary0) = rgwadmin(ctx, client, ['usage', 'show'], check_status=True)
+
+    # TESTCASE 'info-nosuch','user','info','non-existent user','fails'
+    (err, out) = rgwadmin(ctx, client, ['user', 'info', '--uid', user1])
+    assert err
+
+    # TESTCASE 'create-ok','user','create','w/all valid info','succeeds'
+    (err, out) = rgwadmin(ctx, client, [
+            'user', 'create',
+            '--uid', user1,
+            '--display-name', display_name1,
+            '--email', email,
+            '--access-key', access_key,
+            '--secret', secret_key,
+            '--max-buckets', '4'
+            ],
+            check_status=True)
+
+    # TESTCASE 'duplicate email','user','create','existing user email','fails'
+    (err, out) = rgwadmin(ctx, client, [
+            'user', 'create',
+            '--uid', user2,
+            '--display-name', display_name2,
+            '--email', email,
+            ])
+    assert err
+
+    # TESTCASE 'info-existing','user','info','existing user','returns correct info'
+    (err, out) = rgwadmin(ctx, client, ['user', 'info', '--uid', user1], check_status=True)
+    assert out['user_id'] == user1
+    assert out['email'] == email
+    assert out['display_name'] == display_name1
+    assert len(out['keys']) == 1
+    assert out['keys'][0]['access_key'] == access_key
+    assert out['keys'][0]['secret_key'] == secret_key
+    assert not out['suspended']
+
+    # TESTCASE 'suspend-ok','user','suspend','active user','succeeds'
+    (err, out) = rgwadmin(ctx, client, ['user', 'suspend', '--uid', user1],
+        check_status=True)
+
+    # TESTCASE 'suspend-suspended','user','suspend','suspended user','succeeds w/advisory'
+    (err, out) = rgwadmin(ctx, client, ['user', 'info', '--uid', user1], check_status=True)
+    assert out['suspended']
+
+    # TESTCASE 're-enable','user','enable','suspended user','succeeds'
+    (err, out) = rgwadmin(ctx, client, ['user', 'enable', '--uid', user1], check_status=True)
+
+    # TESTCASE 'info-re-enabled','user','info','re-enabled user','no longer suspended'
+    (err, out) = rgwadmin(ctx, client, ['user', 'info', '--uid', user1], check_status=True)
+    assert not out['suspended']
+
+    # TESTCASE 'add-keys','key','create','w/valid info','succeeds'
+    (err, out) = rgwadmin(ctx, client, [
+            'key', 'create', '--uid', user1,
+            '--access-key', access_key2, '--secret', secret_key2,
+            ], check_status=True)
+
+    # TESTCASE 'info-new-key','user','info','after key addition','returns all keys'
+    (err, out) = rgwadmin(ctx, client, ['user', 'info', '--uid', user1],
+        check_status=True)
+    assert len(out['keys']) == 2
+    assert out['keys'][0]['access_key'] == access_key2 or out['keys'][1]['access_key'] == access_key2
+    assert out['keys'][0]['secret_key'] == secret_key2 or out['keys'][1]['secret_key'] == secret_key2
+
+    # TESTCASE 'rm-key','key','rm','newly added key','succeeds, key is removed'
+    (err, out) = rgwadmin(ctx, client, [
+            'key', 'rm', '--uid', user1,
+            '--access-key', access_key2,
+            ], check_status=True)
+    assert len(out['keys']) == 1
+    assert out['keys'][0]['access_key'] == access_key
+    assert out['keys'][0]['secret_key'] == secret_key
+
+    # TESTCASE 'add-swift-key','key','create','swift key','succeeds'
+    subuser_access = 'full'
+    subuser_perm = 'full-control'
+
+    (err, out) = rgwadmin(ctx, client, [
+            'subuser', 'create', '--subuser', subuser1,
+            '--access', subuser_access
+            ], check_status=True)
+
+    # TESTCASE 'add-swift-key','key','create','swift key','succeeds'
+    (err, out) = rgwadmin(ctx, client, [
+            'subuser', 'modify', '--subuser', subuser1,
+            '--secret', swift_secret1,
+            '--key-type', 'swift',
+            ], check_status=True)
+
+    # TESTCASE 'subuser-perm-mask', 'subuser', 'info', 'test subuser perm mask durability', 'succeeds'
+    (err, out) = rgwadmin(ctx, client, ['user', 'info', '--uid', user1])
+
+    assert out['subusers'][0]['permissions'] == subuser_perm
+
+    # TESTCASE 'info-swift-key','user','info','after key addition','returns all keys'
+    (err, out) = rgwadmin(ctx, client, ['user', 'info', '--uid', user1], check_status=True)
+    assert len(out['swift_keys']) == 1
+    assert out['swift_keys'][0]['user'] == subuser1
+    assert out['swift_keys'][0]['secret_key'] == swift_secret1
+
+    # TESTCASE 'add-swift-subuser','key','create','swift sub-user key','succeeds'
+    (err, out) = rgwadmin(ctx, client, [
+            'subuser', 'create', '--subuser', subuser2,
+            '--secret', swift_secret2,
+            '--key-type', 'swift',
+            ], check_status=True)
+
+    # TESTCASE 'info-swift-subuser','user','info','after key addition','returns all sub-users/keys'
+    (err, out) = rgwadmin(ctx, client, ['user', 'info', '--uid', user1], check_status=True)
+    assert len(out['swift_keys']) == 2
+    assert out['swift_keys'][0]['user'] == subuser2 or out['swift_keys'][1]['user'] == subuser2
+    assert out['swift_keys'][0]['secret_key'] == swift_secret2 or out['swift_keys'][1]['secret_key'] == swift_secret2
+
+    # TESTCASE 'rm-swift-key1','key','rm','subuser','succeeds, one key is removed'
+    (err, out) = rgwadmin(ctx, client, [
+            'key', 'rm', '--subuser', subuser1,
+            '--key-type', 'swift',
+            ], check_status=True)
+    assert len(out['swift_keys']) == 1
+
+    # TESTCASE 'rm-subuser','subuser','rm','subuser','success, subuser is removed'
+    (err, out) = rgwadmin(ctx, client, [
+            'subuser', 'rm', '--subuser', subuser1,
+            ], check_status=True)
+    assert len(out['subusers']) == 1
+
+    # TESTCASE 'rm-subuser-with-keys','subuser','rm','subuser','succeeds, second subuser and key are removed'
+    (err, out) = rgwadmin(ctx, client, [
+            'subuser', 'rm', '--subuser', subuser2,
+            '--key-type', 'swift', '--purge-keys',
+            ], check_status=True)
+    assert len(out['swift_keys']) == 0
+    assert len(out['subusers']) == 0
+
+    # TESTCASE 'bucket-stats','bucket','stats','no session/buckets','succeeds, empty list'
+    (err, out) = rgwadmin(ctx, client, ['bucket', 'stats', '--uid', user1],
+        check_status=True)
+    assert len(out) == 0
+
+    # TESTCASE 'bucket-stats2','bucket','stats','no buckets','succeeds, empty list'
+    (err, out) = rgwadmin(ctx, client, ['bucket', 'list', '--uid', user1], check_status=True)
+    assert len(out) == 0
+
+    # create a first bucket
+    bucket = connection.create_bucket(bucket_name)
+
+    rl.log_and_clear("create_bucket", bucket_name, user1)
+
+    # TESTCASE 'bucket-list','bucket','list','one bucket','succeeds, expected list'
+    (err, out) = rgwadmin(ctx, client, ['bucket', 'list', '--uid', user1], check_status=True)
+    assert len(out) == 1
+    assert out[0] == bucket_name
+
+    bucket_list = connection.get_all_buckets()
+    assert len(bucket_list) == 1
+    assert bucket_list[0].name == bucket_name
+
+    rl.log_and_clear("list_buckets", '', user1)
+
+    # TESTCASE 'bucket-list-all','bucket','list','all buckets','succeeds, expected list'
+    (err, out) = rgwadmin(ctx, client, ['bucket', 'list'], check_status=True)
+    assert len(out) >= 1
+    assert bucket_name in out;
+
+    # TESTCASE 'max-bucket-limit','bucket','create','4 buckets','5th bucket fails due to max buckets == 4'
+    bucket2 = connection.create_bucket(bucket_name + '2')
+    rl.log_and_clear("create_bucket", bucket_name + '2', user1)
+    bucket3 = connection.create_bucket(bucket_name + '3')
+    rl.log_and_clear("create_bucket", bucket_name + '3', user1)
+    bucket4 = connection.create_bucket(bucket_name + '4')
+    rl.log_and_clear("create_bucket", bucket_name + '4', user1)
+    # the 5th should fail.
+    failed = False
+    try:
+        connection.create_bucket(bucket_name + '5')
+    except Exception:
+        failed = True
+    assert failed
+    rl.log_and_clear("create_bucket", bucket_name + '5', user1)
+
+    # delete the buckets
+    bucket2.delete()
+    rl.log_and_clear("delete_bucket", bucket_name + '2', user1)
+    bucket3.delete()
+    rl.log_and_clear("delete_bucket", bucket_name + '3', user1)
+    bucket4.delete()
+    rl.log_and_clear("delete_bucket", bucket_name + '4', user1)
+
+    # TESTCASE 'bucket-stats3','bucket','stats','new empty bucket','succeeds, empty list'
+    (err, out) = rgwadmin(ctx, client, [
+            'bucket', 'stats', '--bucket', bucket_name], check_status=True)
+    assert out['owner'] == user1
+    bucket_id = out['id']
+
+    # TESTCASE 'bucket-stats4','bucket','stats','new empty bucket','succeeds, expected bucket ID'
+    (err, out) = rgwadmin(ctx, client, ['bucket', 'stats', '--uid', user1], check_status=True)
+    assert len(out) == 1
+    assert out[0]['id'] == bucket_id    # does it return the same ID twice in a row?
+
+    # use some space
+    key = boto.s3.key.Key(bucket)
+    key.set_contents_from_string('one')
+    rl.log_and_clear("put_obj", bucket_name, user1)
+
+    # TESTCASE 'bucket-stats5','bucket','stats','after creating key','succeeds, lists one non-empty object'
+    (err, out) = rgwadmin(ctx, client, [
+            'bucket', 'stats', '--bucket', bucket_name], check_status=True)
+    assert out['id'] == bucket_id
+    assert out['usage']['rgw.main']['num_objects'] == 1
+    assert out['usage']['rgw.main']['size_kb'] > 0
+
+    # reclaim it
+    key.delete()
+    rl.log_and_clear("delete_obj", bucket_name, user1)
+
+    # TESTCASE 'bucket unlink', 'bucket', 'unlink', 'unlink bucket from user', 'fails', 'access denied error'
+    (err, out) = rgwadmin(ctx, client,
+        ['bucket', 'unlink', '--uid', user1, '--bucket', bucket_name],
+        check_status=True)
+
+    # create a second user to link the bucket to
+    (err, out) = rgwadmin(ctx, client, [
+            'user', 'create',
+            '--uid', user2,
+            '--display-name', display_name2,
+            '--access-key', access_key2,
+            '--secret', secret_key2,
+            '--max-buckets', '1',
+            ],
+            check_status=True)
+
+    # try creating an object with the first user before the bucket is relinked
+    denied = False
+    key = boto.s3.key.Key(bucket)
+
+    try:
+        key.set_contents_from_string('two')
+    except boto.exception.S3ResponseError:
+        denied = True
+
+    assert not denied
+    rl.log_and_clear("put_obj", bucket_name, user1)
+
+    # delete the object
+    key.delete()
+    rl.log_and_clear("delete_obj", bucket_name, user1)
+
+    # link the bucket to another user
+    (err, out) = rgwadmin(ctx, client, ['metadata', 'get', 'bucket:{n}'.format(n=bucket_name)],
+        check_status=True)
+
+    bucket_data = out['data']
+    assert bucket_data['bucket']['name'] == bucket_name
+
+    bucket_id = bucket_data['bucket']['bucket_id']
+
+    # link the bucket to another user
+    (err, out) = rgwadmin(ctx, client, ['bucket', 'link', '--uid', user2, '--bucket', bucket_name, '--bucket-id', bucket_id],
+        check_status=True)
+
+    # try to remove user, should fail (has a linked bucket)
+    (err, out) = rgwadmin(ctx, client, ['user', 'rm', '--uid', user2])
+    assert err
+
+    # TESTCASE 'bucket unlink', 'bucket', 'unlink', 'unlink bucket from user', 'succeeds, bucket unlinked'
+    (err, out) = rgwadmin(ctx, client, ['bucket', 'unlink', '--uid', user2, '--bucket', bucket_name],
+        check_status=True)
+
+    # relink the bucket to the first user and delete the second user
+    (err, out) = rgwadmin(ctx, client,
+        ['bucket', 'link', '--uid', user1, '--bucket', bucket_name, '--bucket-id', bucket_id],
+        check_status=True)
+
+    (err, out) = rgwadmin(ctx, client, ['user', 'rm', '--uid', user2],
+        check_status=True)
+
+    # TESTCASE 'object-rm', 'object', 'rm', 'remove object', 'succeeds, object is removed'
+
+    # upload an object
+    object_name = 'four'
+    key = boto.s3.key.Key(bucket, object_name)
+    key.set_contents_from_string(object_name)
+    rl.log_and_clear("put_obj", bucket_name, user1)
+
+    # fetch it too (for usage stats presently)
+    s = key.get_contents_as_string()
+    rl.log_and_clear("get_obj", bucket_name, user1)
+    assert s == object_name
+    # list bucket too (for usage stats presently)
+    keys = list(bucket.list())
+    rl.log_and_clear("list_bucket", bucket_name, user1)
+    assert len(keys) == 1
+    assert keys[0].name == object_name
+
+    # now delete it
+    (err, out) = rgwadmin(ctx, client,
+        ['object', 'rm', '--bucket', bucket_name, '--object', object_name],
+        check_status=True)
+
+    # TESTCASE 'bucket-stats6','bucket','stats','after deleting key','succeeds, lists one no objects'
+    (err, out) = rgwadmin(ctx, client, [
+            'bucket', 'stats', '--bucket', bucket_name],
+            check_status=True)
+    assert out['id'] == bucket_id
+    assert out['usage']['rgw.main']['num_objects'] == 0
+
+    # list log objects
+    # TESTCASE 'log-list','log','list','after activity','succeeds, lists one no objects'
+    (err, out) = rgwadmin(ctx, client, ['log', 'list'], check_status=True)
+    assert len(out) > 0
+
+    for obj in out:
+        # TESTCASE 'log-show','log','show','after activity','returns expected info'
+        if obj[:4] == 'meta' or obj[:4] == 'data' or obj[:18] == 'obj_delete_at_hint':
+            continue
+
+        (err, rgwlog) = rgwadmin(ctx, client, ['log', 'show', '--object', obj],
+            check_status=True)
+        assert len(rgwlog) > 0
+
+        # exempt bucket_name2 from checking as it was only used for multi-region tests
+        assert rgwlog['bucket'].find(bucket_name) == 0 or rgwlog['bucket'].find(bucket_name2) == 0
+        assert rgwlog['bucket'] != bucket_name or rgwlog['bucket_id'] == bucket_id
+        assert rgwlog['bucket_owner'] == user1 or rgwlog['bucket'] == bucket_name + '5' or rgwlog['bucket'] == bucket_name2
+        for entry in rgwlog['log_entries']:
+            log.debug('checking log entry: ', entry)
+            assert entry['bucket'] == rgwlog['bucket']
+            possible_buckets = [bucket_name + '5', bucket_name2]
+            user = entry['user']
+            assert user == user1 or user.endswith('system-user') or \
+                rgwlog['bucket'] in possible_buckets
+
+        # TESTCASE 'log-rm','log','rm','delete log objects','succeeds'
+        (err, out) = rgwadmin(ctx, client, ['log', 'rm', '--object', obj],
+            check_status=True)
+
+    # TODO: show log by bucket+date
+
+    # TESTCASE 'user-suspend2','user','suspend','existing user','succeeds'
+    (err, out) = rgwadmin(ctx, client, ['user', 'suspend', '--uid', user1],
+        check_status=True)
+
+    # TESTCASE 'user-suspend3','user','suspend','suspended user','cannot write objects'
+    denied = False
+    try:
+        key = boto.s3.key.Key(bucket)
+        key.set_contents_from_string('five')
+    except boto.exception.S3ResponseError as e:
+        denied = True
+        assert e.status == 403
+
+    assert denied
+    rl.log_and_clear("put_obj", bucket_name, user1)
+
+    # TESTCASE 'user-renable2','user','enable','suspended user','succeeds'
+    (err, out) = rgwadmin(ctx, client, ['user', 'enable', '--uid', user1],
+        check_status=True)
+
+    # TESTCASE 'user-renable3','user','enable','reenabled user','can write objects'
+    key = boto.s3.key.Key(bucket)
+    key.set_contents_from_string('six')
+    rl.log_and_clear("put_obj", bucket_name, user1)
+
+    # TESTCASE 'gc-list', 'gc', 'list', 'get list of objects ready for garbage collection'
+
+    # create an object large enough to be split into multiple parts
+    test_string = 'foo'*10000000
+
+    big_key = boto.s3.key.Key(bucket)
+    big_key.set_contents_from_string(test_string)
+    rl.log_and_clear("put_obj", bucket_name, user1)
+
+    # now delete the head
+    big_key.delete()
+    rl.log_and_clear("delete_obj", bucket_name, user1)
+
+    # wait a bit to give the garbage collector time to cycle
+    time.sleep(15)
+
+    (err, out) = rgwadmin(ctx, client, ['gc', 'list'])
+
+    assert len(out) > 0
+
+    # TESTCASE 'gc-process', 'gc', 'process', 'manually collect garbage'
+    (err, out) = rgwadmin(ctx, client, ['gc', 'process'], check_status=True)
+
+    #confirm
+    (err, out) = rgwadmin(ctx, client, ['gc', 'list'])
+
+    assert len(out) == 0
+
+    # TESTCASE 'rm-user-buckets','user','rm','existing user','fails, still has buckets'
+    (err, out) = rgwadmin(ctx, client, ['user', 'rm', '--uid', user1])
+    assert err
+
+    # delete should fail because ``key`` still exists
+    try:
+        bucket.delete()
+    except boto.exception.S3ResponseError as e:
+        assert e.status == 409
+    rl.log_and_clear("delete_bucket", bucket_name, user1)
+
+    key.delete()
+    rl.log_and_clear("delete_obj", bucket_name, user1)
+    bucket.delete()
+    rl.log_and_clear("delete_bucket", bucket_name, user1)
+
+    # TESTCASE 'policy', 'bucket', 'policy', 'get bucket policy', 'returns S3 policy'
+    bucket = connection.create_bucket(bucket_name)
+    rl.log_and_clear("create_bucket", bucket_name, user1)
+
+    # create an object
+    key = boto.s3.key.Key(bucket)
+    key.set_contents_from_string('seven')
+    rl.log_and_clear("put_obj", bucket_name, user1)
+
+    # should be private already but guarantee it
+    key.set_acl('private')
+    rl.log_and_clear("put_acls", bucket_name, user1)
+
+    (err, out) = rgwadmin(ctx, client,
+        ['policy', '--bucket', bucket.name, '--object', key.key],
+        check_status=True, format='xml')
+
+    acl = get_acl(key)
+    rl.log_and_clear("get_acls", bucket_name, user1)
+
+    assert acl == out.strip('\n')
+
+    # add another grantee by making the object public read
+    key.set_acl('public-read')
+    rl.log_and_clear("put_acls", bucket_name, user1)
+
+    (err, out) = rgwadmin(ctx, client,
+        ['policy', '--bucket', bucket.name, '--object', key.key],
+        check_status=True, format='xml')
+
+    acl = get_acl(key)
+    rl.log_and_clear("get_acls", bucket_name, user1)
+
+    assert acl == out.strip('\n')
+
+    # TESTCASE 'rm-bucket', 'bucket', 'rm', 'bucket with objects', 'succeeds'
+    bucket = connection.create_bucket(bucket_name)
+    rl.log_and_clear("create_bucket", bucket_name, user1)
+    key_name = ['eight', 'nine', 'ten', 'eleven']
+    for i in range(4):
+        key = boto.s3.key.Key(bucket)
+        key.set_contents_from_string(key_name[i])
+    rl.log_and_clear("put_obj", bucket_name, user1)
+
+    (err, out) = rgwadmin(ctx, client,
+        ['bucket', 'rm', '--bucket', bucket_name, '--purge-objects'],
+        check_status=True)
+
+    # TESTCASE 'caps-add', 'caps', 'add', 'add user cap', 'succeeds'
+    caps='user=read'
+    (err, out) = rgwadmin(ctx, client, ['caps', 'add', '--uid', user1, '--caps', caps])
+
+    assert out['caps'][0]['perm'] == 'read'
+
+    # TESTCASE 'caps-rm', 'caps', 'rm', 'remove existing cap from user', 'succeeds'
+    (err, out) = rgwadmin(ctx, client, ['caps', 'rm', '--uid', user1, '--caps', caps])
+
+    assert not out['caps']
+
+    # TESTCASE 'rm-user','user','rm','existing user','fails, still has buckets'
+    bucket = connection.create_bucket(bucket_name)
+    rl.log_and_clear("create_bucket", bucket_name, user1)
+    key = boto.s3.key.Key(bucket)
+
+    (err, out) = rgwadmin(ctx, client, ['user', 'rm', '--uid', user1])
+    assert err
+
+    # TESTCASE 'rm-user2', 'user', 'rm', 'user with data', 'succeeds'
+    bucket = connection.create_bucket(bucket_name)
+    rl.log_and_clear("create_bucket", bucket_name, user1)
+    key = boto.s3.key.Key(bucket)
+    key.set_contents_from_string('twelve')
+    rl.log_and_clear("put_obj", bucket_name, user1)
+
+    time.sleep(35)
+
+    # need to wait for all usage data to get flushed, should take up to 30 seconds
+    timestamp = time.time()
+    while time.time() - timestamp <= (2 * 60):      # wait up to 20 minutes
+        (err, out) = rgwadmin(ctx, client, ['usage', 'show', '--categories', 'delete_obj'])  # one of the operations we did is delete_obj, should be present.
+        if get_user_successful_ops(out, user1) > 0:
+            break
+        time.sleep(1)
+
+    assert time.time() - timestamp <= (20 * 60)
+
+    # TESTCASE 'usage-show' 'usage' 'show' 'all usage' 'succeeds'
+    (err, out) = rgwadmin(ctx, client, ['usage', 'show'], check_status=True)
+    assert len(out['entries']) > 0
+    assert len(out['summary']) > 0
+
+    r = acc.compare_results(out)
+    if len(r) != 0:
+        sys.stderr.write(("\n".join(r))+"\n")
+        assert(len(r) == 0)
+
+    user_summary = get_user_summary(out, user1)
+
+    total = user_summary['total']
+    assert total['successful_ops'] > 0
+
+    # TESTCASE 'usage-show2' 'usage' 'show' 'user usage' 'succeeds'
+    (err, out) = rgwadmin(ctx, client, ['usage', 'show', '--uid', user1],
+        check_status=True)
+    assert len(out['entries']) > 0
+    assert len(out['summary']) > 0
+    user_summary = out['summary'][0]
+    for entry in user_summary['categories']:
+        assert entry['successful_ops'] > 0
+    assert user_summary['user'] == user1
+
+    # TESTCASE 'usage-show3' 'usage' 'show' 'user usage categories' 'succeeds'
+    test_categories = ['create_bucket', 'put_obj', 'delete_obj', 'delete_bucket']
+    for cat in test_categories:
+        (err, out) = rgwadmin(ctx, client, ['usage', 'show', '--uid', user1, '--categories', cat],
+            check_status=True)
+        assert len(out['summary']) > 0
+        user_summary = out['summary'][0]
+        assert user_summary['user'] == user1
+        assert len(user_summary['categories']) == 1
+        entry = user_summary['categories'][0]
+        assert entry['category'] == cat
+        assert entry['successful_ops'] > 0
+
+    # should be all through with connection. (anything using connection
+    #  should be BEFORE the usage stuff above.)
+    rl.log_and_clear("(before-close)", '-', '-', ignore_this_entry)
+    connection.close()
+    connection = None
+
+    # the usage flush interval is 30 seconds, wait that much an then some
+    # to make sure everything has been flushed
+    time.sleep(35)
+
+    # TESTCASE 'usage-trim' 'usage' 'trim' 'user usage' 'succeeds, usage removed'
+    (err, out) = rgwadmin(ctx, client, ['usage', 'trim', '--uid', user1],
+        check_status=True)
+    (err, out) = rgwadmin(ctx, client, ['usage', 'show', '--uid', user1],
+        check_status=True)
+    assert len(out['entries']) == 0
+    assert len(out['summary']) == 0
+
+    (err, out) = rgwadmin(ctx, client,
+        ['user', 'rm', '--uid', user1, '--purge-data' ],
+        check_status=True)
+
+    # TESTCASE 'rm-user3','user','rm','deleted user','fails'
+    (err, out) = rgwadmin(ctx, client, ['user', 'info', '--uid', user1])
+    assert err
+
+    # TESTCASE 'zone-info', 'zone', 'get', 'get zone info', 'succeeds, has default placement rule'
+    #
+
+    (err, out) = rgwadmin(ctx, client, ['zone', 'get','--rgw-zone','default'])
+    orig_placement_pools = len(out['placement_pools'])
+
+    # removed this test, it is not correct to assume that zone has default placement, it really
+    # depends on how we set it up before
+    #
+    # assert len(out) > 0
+    # assert len(out['placement_pools']) == 1
+
+    # default_rule = out['placement_pools'][0]
+    # assert default_rule['key'] == 'default-placement'
+
+    rule={'key': 'new-placement', 'val': {'data_pool': '.rgw.buckets.2', 'index_pool': '.rgw.buckets.index.2'}}
+
+    out['placement_pools'].append(rule)
+
+    (err, out) = rgwadmin(ctx, client, ['zone', 'set'],
+        stdin=StringIO(json.dumps(out)),
+        check_status=True)
+
+    (err, out) = rgwadmin(ctx, client, ['zone', 'get','--rgw-zone','default'])
+    assert len(out) > 0
+    assert len(out['placement_pools']) == orig_placement_pools + 1
+
+    zonecmd = ['zone', 'placement', 'rm',
+	'--rgw-zone', 'default',
+	'--placement-id', 'new-placement']
+
+    (err, out) = rgwadmin(ctx, client, zonecmd, check_status=True)
+
+import sys
+from tasks.radosgw_admin import task
+from teuthology.config import config
+from teuthology.orchestra import cluster, remote
+import argparse;
+
+def main():
+    """Run the radosgw_admin task standalone: radosgw_admin.py [user] host."""
+    args = sys.argv[1:]
+    if len(args) == 2:
+        user, host = args[0] + "@", args[1]
+    elif len(args) == 1:
+        user, host = "", args[0]
+    else:
+        sys.stderr.write("usage: radosgw_admin.py [user] host\n")
+        # sys.exit rather than the site-provided exit(), which is meant for interactive use
+        sys.exit(1)
+    role = 'ceph.client.rgw.%s' % host
+    # The imported teuthology config object doubles as the task context.
+    ctx = config
+    ctx.cluster = cluster.Cluster(remotes=[(remote.Remote(user + host), [role])])
+    ctx.rgw = argparse.Namespace()
+    ctx.rgw.role_endpoints = {role: (host, 80)}
+    ctx.rgw.realm = None
+    ctx.rgw.regions = {'region0': {
+        'api name': 'api1',
+        'is master': True, 'master zone': 'r0z0',
+        'zones': ['r0z0', 'r0z1'],
+    }}
+    ctx.rgw.config = {role: {'system user': {'name': '%s-system-user' % host}}}
+    task(ctx, None)
+    sys.exit()
+
+if __name__ == '__main__':
+    main()
diff --git a/src/ceph/qa/tasks/radosgw_admin_rest.py b/src/ceph/qa/tasks/radosgw_admin_rest.py
new file mode 100644
index 0000000..7bd72d1
--- /dev/null
+++ b/src/ceph/qa/tasks/radosgw_admin_rest.py
@@ -0,0 +1,668 @@
+"""
+Run a series of rgw admin commands through the rest interface.
+
+The test cases in this file have been annotated for inventory.
+To extract the inventory (in csv format) use the command:
+
+   grep '^ *# TESTCASE' | sed 's/^ *# TESTCASE //'
+
+"""
+from cStringIO import StringIO
+import logging
+import json
+
+import boto.exception
+import boto.s3.connection
+import boto.s3.acl
+
+import requests
+import time
+
+from boto.connection import AWSAuthConnection
+from teuthology import misc as teuthology
+from util.rgw import get_user_summary, get_user_successful_ops
+
+log = logging.getLogger(__name__)
+
+def rgwadmin(ctx, client, cmd):
+    """
+    Run a radosgw-admin command on the remote that holds ``client``.
+
+    :param ctx: task context; supplies the test dir and the cluster.
+    :param client: role name; must resolve to exactly one remote.
+    :param cmd: iterable of radosgw-admin arguments (subcommand, flags).
+    :return: (exit status, result): parsed JSON, raw stdout when it is not
+        JSON, or None if the command failed (stderr is logged) or was silent.
+    """
+    log.info('radosgw-admin: %s', cmd)
+    testdir = teuthology.get_testdir(ctx)
+    args = [
+        'adjust-ulimits',
+        'ceph-coverage',
+        '{tdir}/archive/coverage'.format(tdir=testdir),
+        'radosgw-admin',
+        '--log-to-stderr',
+        '--format', 'json',
+        ] + list(cmd)
+    (remote,) = ctx.cluster.only(client).remotes.iterkeys()
+    stderr = StringIO()
+    proc = remote.run(args=args, check_status=False,
+                      stdout=StringIO(), stderr=stderr)
+    status = proc.exitstatus
+    out = proc.stdout.getvalue()
+    result = None
+    if status:
+        log.info(' exit status %s, stderr: %s', status, stderr.getvalue())
+    elif out != '':
+        try:
+            result = json.loads(out)
+            log.info(' json result: %s', result)
+        except ValueError:
+            result = out
+            log.info(' raw result: %s', result)
+    return (status, result)
+
+
+def rgwadmin_rest(connection, cmd, params=None, headers=None, raw=False):
+    """
+    Issue one radosgw admin operation through the REST interface.
+
+    :param connection: boto S3 connection carrying the admin credentials;
+        used to build the request path/host and to sign it.
+    :param cmd: sequence whose first item is the resource (or sub-resource)
+        and second item the verb, e.g. ``['user', 'info']``.
+    :param params: optional dict sent as the query parameters.
+    :param headers: optional dict of extra request headers.
+    :param raw: if true, return the body as text instead of decoded JSON.
+    :return: (HTTP status code, response body).
+    :raises ValueError: if the verb or the resource is not recognised.
+    """
+    log.info('radosgw-admin-rest: %s %s' % (cmd, params))
+    handlers = {'PUT': requests.put, 'POST': requests.post,
+                'DELETE': requests.delete, 'GET': requests.get}
+    methods_by_verb = {
+        'create': 'PUT', 'link': 'PUT', 'add': 'PUT',
+        'unlink': 'POST', 'modify': 'POST',
+        'trim': 'DELETE', 'rm': 'DELETE', 'process': 'DELETE',
+        'check': 'GET', 'info': 'GET', 'show': 'GET', 'list': 'GET',
+        }
+    # sub-resource -> the top-level admin resource it is addressed under
+    parents = {
+        'object': 'bucket', 'policy': 'bucket', 'index': 'bucket',
+        'subuser': 'user', 'key': 'user', 'caps': 'user',
+        'pool': 'zone', 'log': 'zone', 'garbage': 'zone',
+        }
+    top_level = ('bucket', 'user', 'usage', 'zone')
+
+    def get_cmd_method_and_handler(cmd):
+        """Map the verb in cmd[1] to (HTTP method name, requests function)."""
+        try:
+            method = methods_by_verb[cmd[1]]
+        except KeyError:
+            raise ValueError('unsupported admin verb: %r' % (cmd[1],))
+        return method, handlers[method]
+
+    def get_resource(cmd):
+        """Map cmd[0] to (resource, sub-resource query string)."""
+        name = cmd[0]
+        if name in top_level:
+            return name, ''
+        if name in parents:
+            return parents[name], name
+        raise ValueError('unsupported admin resource: %r' % (name,))
+
+    def build_admin_request(conn, method, resource='', headers=None, data='',
+            query_args=None, params=None):
+        """
+        Build an administrative request, adapted from the build_request()
+        method of boto.connection.
+        """
+        path = conn.calling_format.build_path_base('admin', resource)
+        auth_path = conn.calling_format.build_auth_path('admin', resource)
+        host = conn.calling_format.build_host(conn.server_name(), 'admin')
+        if query_args:
+            path += '?' + query_args
+            boto.log.debug('path=%s' % path)
+            auth_path += '?' + query_args
+            boto.log.debug('auth_path=%s' % auth_path)
+        return AWSAuthConnection.build_base_http_request(conn, method, path,
+                auth_path, params, headers, data, host)
+
+    method, handler = get_cmd_method_and_handler(cmd)
+    resource, query_args = get_resource(cmd)
+    request = build_admin_request(connection, method, resource,
+            query_args=query_args, headers=headers)
+
+    url = '{protocol}://{host}{path}'.format(protocol=request.protocol,
+            host=request.host, path=request.path)
+
+    # authorize() signs the request; send it with the resulting headers.
+    request.authorize(connection=connection)
+    result = handler(url, params=params, headers=request.headers)
+
+    if raw:
+        # requests exposes the body as ``text``; ``txt`` does not exist
+        body = result.text
+        log.info(' text result: %s' % body)
+    else:
+        body = result.json()
+        log.info(' json result: %s' % body)
+    return result.status_code, body
+
+
+def task(ctx, config):
+    """
+    Test radosgw-admin functionality through the RESTful interface
+    """
+    assert config is None or isinstance(config, list) \
+        or isinstance(config, dict), \
+        "task s3tests only supports a list or dictionary for configuration"
+    all_clients = ['client.{id}'.format(id=id_)
+                   for id_ in teuthology.all_roles_of_type(ctx.cluster, 'client')]
+    if config is None:
+        config = all_clients
+    if isinstance(config, list):
+        config = dict.fromkeys(config)
+    clients = config.keys()
+
+    # just use the first client...
+    client = clients[0]
+
+    ##
+    admin_user = 'ada'
+    admin_display_name = 'Ms. Admin User'
+    admin_access_key = 'MH1WC2XQ1S8UISFDZC8W'
+    admin_secret_key = 'dQyrTPA0s248YeN5bBv4ukvKU0kh54LWWywkrpoG'
+    admin_caps = 'users=read, write; usage=read, write; buckets=read, write; zone=read, write'
+
+    user1 = 'foo'
+    user2 = 'fud'
+    subuser1 = 'foo:foo1'
+    subuser2 = 'foo:foo2'
+    display_name1 = 'Foo'
+    display_name2 = 'Fud'
+    email = 'foo@foo.com'
+    access_key = '9te6NH5mcdcq0Tc5i8i1'
+    secret_key = 'Ny4IOauQoL18Gp2zM7lC1vLmoawgqcYP/YGcWfXu'
+    access_key2 = 'p5YnriCv1nAtykxBrupQ'
+    secret_key2 = 'Q8Tk6Q/27hfbFSYdSkPtUqhqx1GgzvpXa4WARozh'
+    swift_secret1 = 'gpS2G9RREMrnbqlp29PP2D36kgPR1tm72n5fPYfL'
+    swift_secret2 = 'ri2VJQcKSYATOY6uaDUX7pxgkW+W1YmC6OCxPHwy'
+
+    bucket_name = 'myfoo'
+
+    # legend (test cases can be easily grep-ed out)
+    # TESTCASE 'testname','object','method','operation','assertion'
+    # TESTCASE 'create-admin-user','user','create','administrative user','succeeds'
+    (err, out) = rgwadmin(ctx, client, [
+            'user', 'create',
+            '--uid', admin_user,
+            '--display-name', admin_display_name,
+            '--access-key', admin_access_key,
+            '--secret', admin_secret_key,
+            '--max-buckets', '0',
+            '--caps', admin_caps
+            ])
+    logging.error(out)
+    logging.error(err)
+    assert not err
+
+    (remote,) = ctx.cluster.only(client).remotes.iterkeys()
+    remote_host = remote.name.split('@')[1]
+    admin_conn = boto.s3.connection.S3Connection(
+        aws_access_key_id=admin_access_key,
+        aws_secret_access_key=admin_secret_key,
+        is_secure=False,
+        port=7280,
+        host=remote_host,
+        calling_format=boto.s3.connection.OrdinaryCallingFormat(),
+        )
+
+    # TESTCASE 'info-nosuch','user','info','non-existent user','fails'
+    (ret, out) = rgwadmin_rest(admin_conn, ['user', 'info'], {"uid": user1})
+    assert ret == 404
+
+    # TESTCASE 'create-ok','user','create','w/all valid info','succeeds'
+    (ret, out) = rgwadmin_rest(admin_conn,
+            ['user', 'create'],
+            {'uid' : user1,
+             'display-name' :  display_name1,
+             'email' : email,
+             'access-key' : access_key,
+             'secret-key' : secret_key,
+             'max-buckets' : '4'
+            })
+
+    assert ret == 200
+
+    # TESTCASE 'info-existing','user','info','existing user','returns correct info'
+    (ret, out) = rgwadmin_rest(admin_conn, ['user', 'info'], {'uid' : user1})
+
+    assert out['user_id'] == user1
+    assert out['email'] == email
+    assert out['display_name'] == display_name1
+    assert len(out['keys']) == 1
+    assert out['keys'][0]['access_key'] == access_key
+    assert out['keys'][0]['secret_key'] == secret_key
+    assert not out['suspended']
+
+    # TESTCASE 'suspend-ok','user','suspend','active user','succeeds'
+    (ret, out) = rgwadmin_rest(admin_conn, ['user', 'modify'], {'uid' : user1, 'suspended' : True})
+    assert ret == 200
+
+    # TESTCASE 'suspend-suspended','user','suspend','suspended user','succeeds w/advisory'
+    (ret, out) = rgwadmin_rest(admin_conn, ['user', 'info'], {'uid' : user1})
+    assert ret == 200
+    assert out['suspended']
+
+    # TESTCASE 're-enable','user','enable','suspended user','succeeds'
+    (ret, out) = rgwadmin_rest(admin_conn, ['user', 'modify'], {'uid' : user1, 'suspended' : 'false'})
+    assert not err
+
+    # TESTCASE 'info-re-enabled','user','info','re-enabled user','no longer suspended'
+    (ret, out) = rgwadmin_rest(admin_conn, ['user', 'info'], {'uid' : user1})
+    assert ret == 200
+    assert not out['suspended']
+
+    # TESTCASE 'add-keys','key','create','w/valid info','succeeds'
+    (ret, out) = rgwadmin_rest(admin_conn,
+            ['key', 'create'],
+            {'uid' : user1,
+             'access-key' : access_key2,
+             'secret-key' : secret_key2
+            })
+
+
+    assert ret == 200
+
+    # TESTCASE 'info-new-key','user','info','after key addition','returns all keys'
+    (ret, out) = rgwadmin_rest(admin_conn, ['user', 'info'], {'uid' : user1})
+    assert ret == 200
+    assert len(out['keys']) == 2
+    assert out['keys'][0]['access_key'] == access_key2 or out['keys'][1]['access_key'] == access_key2
+    assert out['keys'][0]['secret_key'] == secret_key2 or out['keys'][1]['secret_key'] == secret_key2
+
+    # TESTCASE 'rm-key','key','rm','newly added key','succeeds, key is removed'
+    (ret, out) = rgwadmin_rest(admin_conn,
+            ['key', 'rm'],
+            {'uid' : user1,
+             'access-key' : access_key2
+            })
+
+    assert ret == 200
+
+    (ret, out) = rgwadmin_rest(admin_conn, ['user', 'info'], {'uid' : user1})
+
+    assert len(out['keys']) == 1
+    assert out['keys'][0]['access_key'] == access_key
+    assert out['keys'][0]['secret_key'] == secret_key
+
+    # TESTCASE 'add-swift-key','key','create','swift key','succeeds'
+    (ret, out) = rgwadmin_rest(admin_conn,
+            ['subuser', 'create'],
+            {'subuser' : subuser1,
+             'secret-key' : swift_secret1,
+             'key-type' : 'swift'
+            })
+
+    assert ret == 200
+
+    # TESTCASE 'info-swift-key','user','info','after key addition','returns all keys'
+    (ret, out) = rgwadmin_rest(admin_conn, ['user', 'info'], {'uid' : user1})
+    assert ret == 200
+    assert len(out['swift_keys']) == 1
+    assert out['swift_keys'][0]['user'] == subuser1
+    assert out['swift_keys'][0]['secret_key'] == swift_secret1
+
+    # TESTCASE 'add-swift-subuser','key','create','swift sub-user key','succeeds'
+    (ret, out) = rgwadmin_rest(admin_conn,
+            ['subuser', 'create'],
+            {'subuser' : subuser2,
+             'secret-key' : swift_secret2,
+             'key-type' : 'swift'
+            })
+
+    assert ret == 200
+
+    # TESTCASE 'info-swift-subuser','user','info','after key addition','returns all sub-users/keys'
+    (ret, out) = rgwadmin_rest(admin_conn, ['user', 'info'], {'uid' :  user1})
+    assert ret == 200
+    assert len(out['swift_keys']) == 2
+    assert out['swift_keys'][0]['user'] == subuser2 or out['swift_keys'][1]['user'] == subuser2
+    assert out['swift_keys'][0]['secret_key'] == swift_secret2 or out['swift_keys'][1]['secret_key'] == swift_secret2
+
+    # TESTCASE 'rm-swift-key1','key','rm','subuser','succeeds, one key is removed'
+    (ret, out) = rgwadmin_rest(admin_conn,
+            ['key', 'rm'],
+            {'subuser' : subuser1,
+             'key-type' :'swift'
+            })
+
+    assert ret == 200
+
+    (ret, out) = rgwadmin_rest(admin_conn, ['user', 'info'], {'uid' :  user1})
+    assert len(out['swift_keys']) == 1
+
+    # TESTCASE 'rm-subuser','subuser','rm','subuser','success, subuser is removed'
+    (ret, out) = rgwadmin_rest(admin_conn,
+            ['subuser', 'rm'],
+            {'subuser' : subuser1
+            })
+
+    assert ret == 200
+
+    (ret, out) = rgwadmin_rest(admin_conn, ['user', 'info'], {'uid' :  user1})
+    assert len(out['subusers']) == 1
+
+    # TESTCASE 'rm-subuser-with-keys','subuser','rm','subuser','succeeds, second subser and key is removed'
+    (ret, out) = rgwadmin_rest(admin_conn,
+            ['subuser', 'rm'],
+            {'subuser' : subuser2,
+             'key-type' : 'swift',
+             '{purge-keys' :True
+            })
+
+    assert ret == 200
+
+    (ret, out) = rgwadmin_rest(admin_conn, ['user', 'info'], {'uid' :  user1})
+    assert len(out['swift_keys']) == 0
+    assert len(out['subusers']) == 0
+
+    # TESTCASE 'bucket-stats','bucket','info','no session/buckets','succeeds, empty list'
+    (ret, out) = rgwadmin_rest(admin_conn, ['bucket', 'info'], {'uid' :  user1})
+    assert ret == 200
+    assert len(out) == 0
+
+    # connect to rgw
+    connection = boto.s3.connection.S3Connection(
+        aws_access_key_id=access_key,
+        aws_secret_access_key=secret_key,
+        is_secure=False,
+        port=7280,
+        host=remote_host,
+        calling_format=boto.s3.connection.OrdinaryCallingFormat(),
+        )
+
+    # TESTCASE 'bucket-stats2','bucket','stats','no buckets','succeeds, empty list'
+    (ret, out) = rgwadmin_rest(admin_conn, ['bucket', 'info'], {'uid' : user1, 'stats' : True})
+    assert ret == 200
+    assert len(out) == 0
+
+    # create a first bucket
+    bucket = connection.create_bucket(bucket_name)
+
+    # TESTCASE 'bucket-list','bucket','list','one bucket','succeeds, expected list'
+    (ret, out) = rgwadmin_rest(admin_conn, ['bucket', 'info'], {'uid' : user1})
+    assert ret == 200
+    assert len(out) == 1
+    assert out[0] == bucket_name
+
+    # TESTCASE 'bucket-stats3','bucket','stats','new empty bucket','succeeds, empty list'
+    (ret, out) = rgwadmin_rest(admin_conn,
+            ['bucket', 'info'], {'bucket' : bucket_name, 'stats' : True})
+
+    assert ret == 200
+    assert out['owner'] == user1
+    bucket_id = out['id']
+
+    # TESTCASE 'bucket-stats4','bucket','stats','new empty bucket','succeeds, expected bucket ID'
+    (ret, out) = rgwadmin_rest(admin_conn, ['bucket', 'info'], {'uid' : user1, 'stats' : True})
+    assert ret == 200
+    assert len(out) == 1
+    assert out[0]['id'] == bucket_id    # does it return the same ID twice in a row?
+
+    # use some space
+    key = boto.s3.key.Key(bucket)
+    key.set_contents_from_string('one')
+
+    # TESTCASE 'bucket-stats5','bucket','stats','after creating key','succeeds, lists one non-empty object'
+    (ret, out) = rgwadmin_rest(admin_conn, ['bucket', 'info'], {'bucket' : bucket_name, 'stats' : True})
+    assert ret == 200
+    assert out['id'] == bucket_id
+    assert out['usage']['rgw.main']['num_objects'] == 1
+    assert out['usage']['rgw.main']['size_kb'] > 0
+
+    # reclaim it
+    key.delete()
+
+    # TESTCASE 'bucket unlink', 'bucket', 'unlink', 'unlink bucket from user', 'fails', 'access denied error'
+    (ret, out) = rgwadmin_rest(admin_conn, ['bucket', 'unlink'], {'uid' : user1, 'bucket' : bucket_name})
+
+    assert ret == 200
+
+    # create a second user to link the bucket to
+    (ret, out) = rgwadmin_rest(admin_conn,
+            ['user', 'create'],
+            {'uid' : user2,
+            'display-name' :  display_name2,
+            'access-key' : access_key2,
+            'secret-key' : secret_key2,
+            'max-buckets' : '1',
+            })
+
+    assert ret == 200
+
+    # try creating an object with the first user before the bucket is relinked
+    denied = False
+    key = boto.s3.key.Key(bucket)
+
+    try:
+        key.set_contents_from_string('two')
+    except boto.exception.S3ResponseError:
+        denied = True
+
+    assert not denied
+
+    # delete the object
+    key.delete()
+
+    # link the bucket to another user
+    (ret, out) = rgwadmin_rest(admin_conn, ['bucket', 'link'], {'uid' : user2, 'bucket' : bucket_name})
+
+    assert ret == 200
+
+    # try creating an object with the first user which should cause an error
+    key = boto.s3.key.Key(bucket)
+
+    try:
+        key.set_contents_from_string('three')
+    except boto.exception.S3ResponseError:
+        denied = True
+
+    assert denied
+
+    # relink the bucket to the first user and delete the second user
+    (ret, out) = rgwadmin_rest(admin_conn, ['bucket', 'link'], {'uid' : user1, 'bucket' : bucket_name})
+    assert ret == 200
+
+    (ret, out) = rgwadmin_rest(admin_conn, ['user', 'rm'], {'uid' : user2})
+    assert ret == 200
+
+    # TESTCASE 'object-rm', 'object', 'rm', 'remove object', 'succeeds, object is removed'
+
+    # upload an object
+    object_name = 'four'
+    key = boto.s3.key.Key(bucket, object_name)
+    key.set_contents_from_string(object_name)
+
+    # now delete it
+    (ret, out) = rgwadmin_rest(admin_conn, ['object', 'rm'], {'bucket' : bucket_name, 'object' : object_name})
+    assert ret == 200
+
+    # TESTCASE 'bucket-stats6','bucket','stats','after deleting key','succeeds, lists one no objects'
+    (ret, out) = rgwadmin_rest(admin_conn, ['bucket', 'info'], {'bucket' : bucket_name, 'stats' : True})
+    assert ret == 200
+    assert out['id'] == bucket_id
+    assert out['usage']['rgw.main']['num_objects'] == 0
+
+    # create a bucket for deletion stats
+    useless_bucket = connection.create_bucket('useless_bucket')
+    useless_key = useless_bucket.new_key('useless_key')
+    useless_key.set_contents_from_string('useless string')
+
+    # delete it
+    useless_key.delete()
+    useless_bucket.delete()
+
+    # wait for the statistics to flush
+    time.sleep(60)
+
+    # need to wait for all usage data to get flushed, should take up to 30 seconds
+    timestamp = time.time()
+    while time.time() - timestamp <= (20 * 60):      # wait up to 20 minutes
+        (ret, out) = rgwadmin_rest(admin_conn, ['usage', 'show'], {'categories' : 'delete_obj'})  # last operation we did is delete obj, wait for it to flush
+
+        if get_user_successful_ops(out, user1) > 0:
+            break
+        time.sleep(1)
+
+    assert time.time() - timestamp <= (20 * 60)
+
+    # TESTCASE 'usage-show' 'usage' 'show' 'all usage' 'succeeds'
+    (ret, out) = rgwadmin_rest(admin_conn, ['usage', 'show'])
+    assert ret == 200
+    assert len(out['entries']) > 0
+    assert len(out['summary']) > 0
+    user_summary = get_user_summary(out, user1)
+    total = user_summary['total']
+    assert total['successful_ops'] > 0
+
+    # TESTCASE 'usage-show2' 'usage' 'show' 'user usage' 'succeeds'
+    (ret, out) = rgwadmin_rest(admin_conn, ['usage', 'show'], {'uid' : user1})
+    assert ret == 200
+    assert len(out['entries']) > 0
+    assert len(out['summary']) > 0
+    user_summary = out['summary'][0]
+    for entry in user_summary['categories']:
+        assert entry['successful_ops'] > 0
+    assert user_summary['user'] == user1
+
+    # TESTCASE 'usage-show3' 'usage' 'show' 'user usage categories' 'succeeds'
+    test_categories = ['create_bucket', 'put_obj', 'delete_obj', 'delete_bucket']
+    for cat in test_categories:
+        (ret, out) = rgwadmin_rest(admin_conn, ['usage', 'show'], {'uid' : user1, 'categories' : cat})
+        assert ret == 200
+        assert len(out['summary']) > 0
+        user_summary = out['summary'][0]
+        assert user_summary['user'] == user1
+        assert len(user_summary['categories']) == 1
+        entry = user_summary['categories'][0]
+        assert entry['category'] == cat
+        assert entry['successful_ops'] > 0
+
+    # TESTCASE 'usage-trim' 'usage' 'trim' 'user usage' 'succeeds, usage removed'
+    (ret, out) = rgwadmin_rest(admin_conn, ['usage', 'trim'], {'uid' : user1})
+    assert ret == 200
+    (ret, out) = rgwadmin_rest(admin_conn, ['usage', 'show'], {'uid' : user1})
+    assert ret == 200
+    assert len(out['entries']) == 0
+    assert len(out['summary']) == 0
+
+    # TESTCASE 'user-suspend2','user','suspend','existing user','succeeds'
+    (ret, out) = rgwadmin_rest(admin_conn, ['user', 'modify'], {'uid' : user1, 'suspended' : True})
+    assert ret == 200
+
+    # TESTCASE 'user-suspend3','user','suspend','suspended user','cannot write objects'
+    try:
+        key = boto.s3.key.Key(bucket)
+        key.set_contents_from_string('five')
+    except boto.exception.S3ResponseError as e:
+        assert e.status == 403
+
+    # TESTCASE 'user-renable2','user','enable','suspended user','succeeds'
+    (ret, out) = rgwadmin_rest(admin_conn, ['user', 'modify'], {'uid' :  user1, 'suspended' : 'false'})
+    assert ret == 200
+
+    # TESTCASE 'user-renable3','user','enable','reenabled user','can write objects'
+    key = boto.s3.key.Key(bucket)
+    key.set_contents_from_string('six')
+
+    # TESTCASE 'garbage-list', 'garbage', 'list', 'get list of objects ready for garbage collection'
+
+    # create an object large enough to be split into multiple parts
+    test_string = 'foo'*10000000
+
+    big_key = boto.s3.key.Key(bucket)
+    big_key.set_contents_from_string(test_string)
+
+    # now delete the head
+    big_key.delete()
+
+    # TESTCASE 'rm-user-buckets','user','rm','existing user','fails, still has buckets'
+    (ret, out) = rgwadmin_rest(admin_conn, ['user', 'rm'], {'uid' : user1})
+    assert ret == 409
+
+    # delete should fail because ``key`` still exists
+    try:
+        bucket.delete()
+    except boto.exception.S3ResponseError as e:
+        assert e.status == 409
+
+    key.delete()
+    bucket.delete()
+
+    # TESTCASE 'policy', 'bucket', 'policy', 'get bucket policy', 'returns S3 policy'
+    bucket = connection.create_bucket(bucket_name)
+
+    # create an object
+    key = boto.s3.key.Key(bucket)
+    key.set_contents_from_string('seven')
+
+    # should be private already but guarantee it
+    key.set_acl('private')
+
+    (ret, out) = rgwadmin_rest(admin_conn, ['policy', 'show'], {'bucket' : bucket.name, 'object' : key.key})
+    assert ret == 200
+
+    acl = key.get_xml_acl()
+    assert acl == out.strip('\n')
+
+    # add another grantee by making the object public read
+    key.set_acl('public-read')
+
+    (ret, out) = rgwadmin_rest(admin_conn, ['policy', 'show'], {'bucket' : bucket.name, 'object' : key.key})
+    assert ret == 200
+
+    acl = key.get_xml_acl()
+    assert acl == out.strip('\n')
+
+    # TESTCASE 'rm-bucket', 'bucket', 'rm', 'bucket with objects', 'succeeds'
+    bucket = connection.create_bucket(bucket_name)
+    key_name = ['eight', 'nine', 'ten', 'eleven']
+    for i in range(4):
+        key = boto.s3.key.Key(bucket)
+        key.set_contents_from_string(key_name[i])
+
+    (ret, out) = rgwadmin_rest(admin_conn, ['bucket', 'rm'], {'bucket' : bucket_name, 'purge-objects' : True})
+    assert ret == 200
+
+    # TESTCASE 'caps-add', 'caps', 'add', 'add user cap', 'succeeds'
+    caps = 'usage=read'
+    (ret, out) = rgwadmin_rest(admin_conn, ['caps', 'add'], {'uid' :  user1, 'user-caps' : caps})
+    assert ret == 200
+    assert out[0]['perm'] == 'read'
+
+    # TESTCASE 'caps-rm', 'caps', 'rm', 'remove existing cap from user', 'succeeds'
+    (ret, out) = rgwadmin_rest(admin_conn, ['caps', 'rm'], {'uid' :  user1, 'user-caps' : caps})
+    assert ret == 200
+    assert not out
+
+    # TESTCASE 'rm-user','user','rm','existing user','fails, still has buckets'
+    bucket = connection.create_bucket(bucket_name)
+    key = boto.s3.key.Key(bucket)
+
+    (ret, out) = rgwadmin_rest(admin_conn, ['user', 'rm'], {'uid' : user1})
+    assert ret == 409
+
+    # TESTCASE 'rm-user2', 'user', 'rm', 'user with data', 'succeeds'
+    bucket = connection.create_bucket(bucket_name)
+    key = boto.s3.key.Key(bucket)
+    key.set_contents_from_string('twelve')
+
+    (ret, out) = rgwadmin_rest(admin_conn, ['user', 'rm'], {'uid' : user1, 'purge-data' : True})
+    assert ret == 200
+
+    # TESTCASE 'rm-user3','user','info','deleted user','fails'
+    (ret, out) = rgwadmin_rest(admin_conn, ['user', 'info'], {'uid' :  user1})
+    assert ret == 404
+
diff --git a/src/ceph/qa/tasks/rbd.py b/src/ceph/qa/tasks/rbd.py
new file mode 100644
index 0000000..d45636a
--- /dev/null
+++ b/src/ceph/qa/tasks/rbd.py
@@ -0,0 +1,612 @@
+"""
+Rbd testing task
+"""
+import contextlib
+import logging
+import os
+import tempfile
+
+from cStringIO import StringIO
+from teuthology.orchestra import run
+from teuthology import misc as teuthology
+from teuthology import contextutil
+from teuthology.parallel import parallel
+from teuthology.task.common_fs_utils import generic_mkfs
+from teuthology.task.common_fs_utils import generic_mount
+from teuthology.task.common_fs_utils import default_image_name
+
+log = logging.getLogger(__name__)
+
+@contextlib.contextmanager
+def create_image(ctx, config):
+    """
+    Create one rbd image per role, removing the images again on exit.
+
+    For example::
+
+        tasks:
+        - ceph:
+        - rbd.create_image:
+            client.0:
+                image_name: testimage
+                image_size: 100
+                image_format: 1
+            client.1:
+
+    ``image_size`` is a number of megabytes and defaults to 10240;
+    ``image_format`` must be 1 or 2 and defaults to 1.  A bare list of
+    roles, or a role mapped to nothing, uses the defaults throughout.
+
+    Images are created in (and later removed from) the ``rbd`` pool.
+    """
+    assert isinstance(config, (dict, list)), \
+        "task create_image only supports a list or dictionary for configuration"
+
+    # normalise both accepted config shapes to (role, settings) pairs
+    if isinstance(config, dict):
+        images = config.items()
+    else:
+        images = [(role, None) for role in config]
+
+    testdir = teuthology.get_testdir(ctx)
+    coverage_dir = '{tdir}/archive/coverage'.format(tdir=testdir)
+    for role, settings in images:
+        settings = {} if settings is None else settings
+        name = settings.get('image_name', default_image_name(role))
+        size = settings.get('image_size', 10240)
+        fmt = settings.get('image_format', 1)
+        (remote,) = ctx.cluster.only(role).remotes.keys()
+        log.info('Creating image {name} with size {size}'.format(name=name,
+                                                                 size=size))
+        create_cmd = [
+            'adjust-ulimits',
+            'ceph-coverage',
+            coverage_dir,
+            'rbd',
+            '-p', 'rbd',
+            'create',
+            '--size', str(size),
+            name,
+        ]
+        # the default (format 1) is left implicit, since old versions
+        # (presumably of rbd) don't support the option
+        if int(fmt) != 1:
+            create_cmd.extend(['--image-format', str(fmt)])
+        remote.run(args=create_cmd)
+
+    try:
+        yield
+    finally:
+        log.info('Deleting rbd images...')
+        for role, settings in images:
+            settings = {} if settings is None else settings
+            name = settings.get('image_name', default_image_name(role))
+            (remote,) = ctx.cluster.only(role).remotes.keys()
+            remove_cmd = [
+                'adjust-ulimits',
+                'ceph-coverage',
+                coverage_dir,
+                'rbd',
+                '-p', 'rbd',
+                'rm',
+                name,
+            ]
+            remote.run(args=remove_cmd)
+
+@contextlib.contextmanager
+def clone_image(ctx, config):
+    """
+    Clone a parent image, and tear the clone down again on exit.
+
+    For example::
+
+        tasks:
+        - ceph:
+        - rbd.clone_image:
+            client.0:
+                parent_name: testimage
+                image_name: cloneimage
+
+    ``parent_name`` is required.  The parent snapshot that backs the
+    clone is named after the clone itself (``<parent>@<image_name>``).
+    """
+    assert isinstance(config, (dict, list)), \
+        "task clone_image only supports a list or dictionary for configuration"
+
+    if isinstance(config, dict):
+        images = config.items()
+    else:
+        images = [(role, None) for role in config]
+
+    testdir = teuthology.get_testdir(ctx)
+    # every rbd invocation below shares this wrapper prefix
+    rbd_prefix = [
+        'adjust-ulimits',
+        'ceph-coverage',
+        '{tdir}/archive/coverage'.format(tdir=testdir),
+        'rbd', '-p', 'rbd',
+    ]
+    for role, settings in images:
+        settings = {} if settings is None else settings
+        name = settings.get('image_name', default_image_name(role))
+        parent_name = settings.get('parent_name')
+        assert parent_name is not None, \
+            "parent_name is required"
+        parent_spec = '{name}@{snap}'.format(name=parent_name, snap=name)
+
+        (remote,) = ctx.cluster.only(role).remotes.keys()
+        log.info('Clone image {parent} to {child}'.format(parent=parent_name,
+                                                          child=name))
+        # snapshot the parent, protect the snapshot, then clone from it
+        setup_steps = (
+            ('snap', 'create', parent_spec),
+            ('snap', 'protect', parent_spec),
+            ('clone', parent_spec, name),
+        )
+        for step in setup_steps:
+            remote.run(args=rbd_prefix + list(step))
+
+    try:
+        yield
+    finally:
+        log.info('Deleting rbd clones...')
+        for role, settings in images:
+            settings = {} if settings is None else settings
+            name = settings.get('image_name', default_image_name(role))
+            parent_name = settings.get('parent_name')
+            parent_spec = '{name}@{snap}'.format(name=parent_name, snap=name)
+
+            (remote,) = ctx.cluster.only(role).remotes.keys()
+            # reverse of the setup: the clone goes first, then the snapshot
+            teardown_steps = (
+                ('rm', name),
+                ('snap', 'unprotect', parent_spec),
+                ('snap', 'rm', parent_spec),
+            )
+            for step in teardown_steps:
+                remote.run(args=rbd_prefix + list(step))
+
+@contextlib.contextmanager
+def modprobe(ctx, config):
+    """
+    Load the rbd kernel module, and unload it again on exit.
+
+    For example::
+
+        tasks:
+        - ceph:
+        - rbd.create_image: [client.0]
+        - rbd.modprobe: [client.0]
+
+    ``config`` is only iterated for role names, so either a list of
+    roles or a dict keyed by role works.
+    """
+    load_cmd = ['sudo', 'modprobe', 'rbd']
+    # Failures while unloading are deliberately ignored ("|| true"):
+    # if more than one device was created, the module may not be
+    # quite ready to go the first time through.
+    unload_cmd = [
+        'sudo',
+        'modprobe',
+        '-r',
+        'rbd',
+        run.Raw('||'),
+        'true',
+    ]
+
+    log.info('Loading rbd kernel module...')
+    for role in config:
+        (remote,) = ctx.cluster.only(role).remotes.keys()
+        # hand each call its own copy of the argument list
+        remote.run(args=list(load_cmd))
+
+    try:
+        yield
+    finally:
+        log.info('Unloading rbd kernel module...')
+        for role in config:
+            (remote,) = ctx.cluster.only(role).remotes.keys()
+            # hand each call its own copy of the argument list
+            remote.run(args=list(unload_cmd))
+
+@contextlib.contextmanager
+def dev_create(ctx, config):
+    """
+    Map rbd images to block devices, and unmap them again on exit.
+
+    For example::
+
+        tasks:
+        - ceph:
+        - rbd.create_image: [client.0]
+        - rbd.modprobe: [client.0]
+        - rbd.dev_create:
+            client.0: testimage.client.0
+
+    A role listed without an image name gets its default image name.
+    """
+    assert isinstance(config, (dict, list)), \
+        "task dev_create only supports a list or dictionary for configuration"
+
+    if isinstance(config, dict):
+        role_images = config.items()
+    else:
+        role_images = [(role, None) for role in config]
+
+    log.info('Creating rbd block devices...')
+
+    testdir = teuthology.get_testdir(ctx)
+    coverage_dir = '{tdir}/archive/coverage'.format(tdir=testdir)
+
+    for role, image in role_images:
+        if image is None:
+            image = default_image_name(role)
+        (remote,) = ctx.cluster.only(role).remotes.keys()
+        dev_link = '/dev/rbd/rbd/{image}'.format(image=image)
+        # the text after the last '.' of the role is used as the rbd user
+        rbd_user = role.rsplit('.')[-1]
+
+        map_cmd = [
+            'sudo',
+            'adjust-ulimits',
+            'ceph-coverage',
+            coverage_dir,
+            'rbd',
+            '--user', rbd_user,
+            '-p', 'rbd',
+            'map',
+            image,
+            run.Raw('&&'),
+            # wait for the symlink to be created by udev
+            'while', 'test', '!', '-e', dev_link, run.Raw(';'), 'do',
+            'sleep', '1', run.Raw(';'),
+            'done',
+        ]
+        remote.run(args=map_cmd)
+    try:
+        yield
+    finally:
+        log.info('Unmapping rbd devices...')
+        for role, image in role_images:
+            if image is None:
+                image = default_image_name(role)
+            (remote,) = ctx.cluster.only(role).remotes.keys()
+            dev_link = '/dev/rbd/rbd/{image}'.format(image=image)
+            unmap_cmd = [
+                'LD_LIBRARY_PATH={tdir}/binary/usr/local/lib'.format(tdir=testdir),
+                'sudo',
+                'adjust-ulimits',
+                'ceph-coverage',
+                coverage_dir,
+                'rbd', '-p', 'rbd', 'unmap', dev_link,
+                run.Raw('&&'),
+                # wait for the symlink to be deleted by udev
+                'while', 'test', '-e', dev_link, run.Raw(';'), 'do',
+                'sleep', '1', run.Raw(';'),
+                'done',
+            ]
+            remote.run(args=unmap_cmd)
+
+
+def rbd_devname_rtn(ctx, image):  # path under /dev/rbd/rbd for image; ctx is unused
+    return '/dev/rbd/rbd/{0}'.format(image)
+
+def canonical_path(ctx, role, path):
+    """
+    Resolve ``path`` on the host(s) holding ``role`` with
+    ``readlink -f`` and return the result: a canonical path with
+    no . or .. components and no symbolic links.  Trailing newlines
+    of the command output are stripped.
+    """
+    # the command's stdout is collected in an in-memory buffer
+    captured = StringIO()
+    role_cluster = ctx.cluster.only(role)
+    role_cluster.run(args=['readlink', '-f', path], stdout=captured)
+    resolved = captured.getvalue().rstrip('\n')
+    captured.close()
+    return resolved
+
+@contextlib.contextmanager
+def run_xfstests(ctx, config):
+    """
+    Run xfstests on the given devices, one spawned run per role.
+
+    Warning: the test and scratch devices named here will both be
+    overwritten.  xfstests itself normally modifies the test device
+    without destroying it, but the run script used here currently
+    re-makes both filesystems.
+
+    Note: a host can only cope with one xfstests instance at a
+    time; nothing here enforces that.
+
+    Limitations: every role in the config is assumed to be a client,
+    and the config must map each role to its key/value settings.
+    The yield happens only after the ``parallel`` block has exited.
+    For now please use the xfstests() interface, below.
+
+    For example::
+
+        tasks:
+        - ceph:
+        - rbd.run_xfstests:
+            client.0:
+                count: 2
+                test_dev: 'test_dev'
+                scratch_dev: 'scratch_dev'
+                fs_type: 'xfs'
+                tests: 'generic/100 xfs/003 xfs/005 xfs/006 generic/015'
+                exclude:
+                - generic/42
+                randomize: true
+    """
+    with parallel() as runners:
+        for role, settings in config.items():
+            runners.spawn(run_xfstests_one_client, ctx, role, settings)
+    yield
+
+def run_xfstests_one_client(ctx, role, properties):
+    """
+    Spawned routine to handle xfs tests for a single client.
+
+    Fetches run_xfstests.sh from ``xfstests_url`` onto the role's
+    remote and runs it on ``test_dev`` and ``scratch_dev`` (all three
+    are required).  The fetched script and any exclude file copied
+    to the remote are removed again afterwards, even on failure.
+    """
+    testdir = teuthology.get_testdir(ctx)
+    count = properties.get('count')
+    fs_type = properties.get('fs_type')
+    tests = properties.get('tests')
+    # a missing exclude list means "exclude nothing"
+    exclude_list = properties.get('exclude') or []
+    randomize = properties.get('randomize')
+
+    # Validate before touching the remote, so that a bad config shows
+    # up as its own assertion instead of a NameError from the cleanup.
+    test_dev = properties.get('test_dev')
+    assert test_dev is not None, \
+        "task run_xfstests requires test_dev to be defined"
+    scratch_dev = properties.get('scratch_dev')
+    assert scratch_dev is not None, \
+        "task run_xfstests requires scratch_dev to be defined"
+    xfstests_url = properties.get('xfstests_url')
+    assert xfstests_url is not None, \
+        "task run_xfstests requires xfstests_url to be defined"
+
+    # Device paths are resolved with readlink -f so that they match
+    # what the kernel remembers.
+    test_dev = canonical_path(ctx, role, test_dev)
+    scratch_dev = canonical_path(ctx, role, scratch_dev)
+
+    (remote,) = ctx.cluster.only(role).remotes.keys()
+    test_script = 'run_xfstests.sh'
+    test_path = os.path.join(testdir, test_script)
+    xfstests_krbd_url = xfstests_url + '/' + test_script
+    exclude_path = None
+
+    try:
+        log.info('Fetching {script} for {role} from {url}'.format(
+            script=test_script, role=role, url=xfstests_krbd_url))
+        remote.run(args=['wget', '-O', test_path, '--', xfstests_krbd_url])
+        log.info('Running xfstests on {role}:'.format(role=role))
+        log.info('   iteration count: {count}:'.format(count=count))
+        log.info('       test device: {dev}'.format(dev=test_dev))
+        log.info('    scratch device: {dev}'.format(dev=scratch_dev))
+        log.info('     using fs_type: {fs_type}'.format(fs_type=fs_type))
+        log.info('      tests to run: {tests}'.format(tests=tests))
+        log.info('      exclude list: {}'.format(' '.join(exclude_list)))
+        log.info('         randomize: {randomize}'.format(randomize=randomize))
+
+        if exclude_list:
+            with tempfile.NamedTemporaryFile(bufsize=0, prefix='exclude') as exclude_file:
+                for test in exclude_list:
+                    exclude_file.write("{}\n".format(test))
+                exclude_path = exclude_file.name
+                remote.put_file(exclude_path, exclude_path)
+
+        args = [
+            '/usr/bin/sudo',
+            'TESTDIR={tdir}'.format(tdir=testdir),
+            'adjust-ulimits',
+            'ceph-coverage',
+            '{tdir}/archive/coverage'.format(tdir=testdir),
+            '/bin/bash',
+            test_path,
+            '-c', str(count),
+            '-f', fs_type,
+            '-t', test_dev,
+            '-s', scratch_dev,
+            ]
+        if exclude_path is not None:
+            args.extend(['-x', exclude_path])
+        if randomize:
+            args.append('-r')
+        if tests:
+            args.extend(['--', tests])
+        remote.run(args=args, logger=log.getChild(role))
+    finally:
+        log.info('Removing {script} on {role}'.format(script=test_script,
+                                                      role=role))
+        stale = [test_path] + ([exclude_path] if exclude_path else [])
+        remote.run(args=['rm', '-f'] + stale)
+
+@contextlib.contextmanager
+def xfstests(ctx, config):
+    """
+    Run xfstests over rbd devices, creating whatever is needed.
+
+    Anything not given in the config is filled in with a default.
+    Only one xfstests instance may run on a host at a time.  The
+    chosen set of tests runs once unless a (non-zero) count is
+    supplied, in which case it is run that number of times.
+
+    For example::
+
+        tasks:
+        - ceph:
+        # Image sizes are in MB
+        - rbd.xfstests:
+            client.0:
+                count: 3
+                test_image: 'test_image'
+                test_size: 250
+                test_format: 2
+                scratch_image: 'scratch_image'
+                scratch_size: 250
+                scratch_format: 1
+                fs_type: 'xfs'
+                tests: 'generic/100 xfs/003 xfs/005 xfs/006 generic/015'
+                exclude:
+                - generic/42
+                randomize: true
+                xfstests_branch: master
+                xfstests_url: 'https://raw.github.com/ceph/ceph/master/qa'
+    """
+    if config is None:
+        config = {'all': None}
+    assert isinstance(config, (dict, list)), \
+        "task xfstests only supports a list or dictionary for configuration"
+    if isinstance(config, dict):
+        config = teuthology.replace_all_with_clients(ctx.cluster, config)
+        runs = config.items()
+    else:
+        runs = [(role, None) for role in config]
+
+    # hosts that already have an xfstests run assigned to them
+    busy_hosts = set()
+    for role, _ in runs:
+        assert role.startswith('client.'), \
+            "task xfstests can only run on client nodes"
+        for host, host_roles in ctx.cluster.remotes.items():
+            if role not in host_roles:
+                continue
+            assert host not in busy_hosts, \
+                "task xfstests allows only one instance at a time per host"
+            busy_hosts.add(host)
+
+    images_config = {}
+    scratch_config = {}
+    modprobe_config = {}
+    image_map_config = {}
+    scratch_map_config = {}
+    xfstests_config = {}
+    for role, settings in runs:
+        settings = {} if settings is None else settings
+
+        test_image = settings.get('test_image', 'test_image.{role}'.format(role=role))
+        test_size = settings.get('test_size', 10000) # 10G
+        test_fmt = settings.get('test_format', 1)
+        scratch_image = settings.get('scratch_image', 'scratch_image.{role}'.format(role=role))
+        scratch_size = settings.get('scratch_size', 10000) # 10G
+        scratch_fmt = settings.get('scratch_format', 1)
+
+        images_config[role] = {
+            'image_name': test_image,
+            'image_size': test_size,
+            'image_format': test_fmt,
+        }
+        scratch_config[role] = {
+            'image_name': scratch_image,
+            'image_size': scratch_size,
+            'image_format': scratch_fmt,
+        }
+
+        branch = settings.get('xfstests_branch', 'master')
+        default_url = 'https://raw.github.com/ceph/ceph/{branch}/qa'.format(branch=branch)
+
+        xfstests_config[role] = {
+            'count': settings.get('count', 1),
+            'test_dev': '/dev/rbd/rbd/{image}'.format(image=test_image),
+            'scratch_dev': '/dev/rbd/rbd/{image}'.format(image=scratch_image),
+            'fs_type': settings.get('fs_type', 'xfs'),
+            'randomize': settings.get('randomize', False),
+            'tests': settings.get('tests'),
+            'exclude': settings.get('exclude', []),
+            'xfstests_url': settings.get('xfstests_url', default_url),
+        }
+
+        log.info('Setting up xfstests using RBD images:')
+        log.info('      test ({size} MB): {image}'.format(size=test_size,
+                                                        image=test_image))
+        log.info('   scratch ({size} MB): {image}'.format(size=scratch_size,
+                                                        image=scratch_image))
+        modprobe_config[role] = None
+        image_map_config[role] = test_image
+        scratch_map_config[role] = scratch_image
+
+    with contextutil.nested(
+        lambda: create_image(ctx=ctx, config=images_config),
+        lambda: create_image(ctx=ctx, config=scratch_config),
+        lambda: modprobe(ctx=ctx, config=modprobe_config),
+        lambda: dev_create(ctx=ctx, config=image_map_config),
+        lambda: dev_create(ctx=ctx, config=scratch_map_config),
+        lambda: run_xfstests(ctx=ctx, config=xfstests_config),
+        ):
+        yield
+
+
+@contextlib.contextmanager
+def task(ctx, config):
+    """
+    Create an rbd image, map it, make a filesystem on it and mount it.
+
+    The clients to run on can be given as a plain list::
+
+        tasks:
+        - ceph:
+        - rbd: [client.0, client.1]
+
+    Per-client image options can be given as well::
+
+        tasks:
+        - ceph:
+        - rbd:
+            client.0: # uses defaults
+            client.1:
+                image_name: foo
+                image_size: 2048
+                image_format: 2
+                fs_type: xfs
+
+    Default options on every client (also used when no config is given)::
+
+        tasks:
+        - ceph:
+        - rbd:
+            all:
+
+    20GiB images formatted with xfs on every client::
+
+        tasks:
+        - ceph:
+        - rbd:
+            all:
+              image_size: 20480
+              fs_type: xfs
+    """
+    if config is None:
+        config = {'all': None}
+    if isinstance(config, dict):
+        norm_config = teuthology.replace_all_with_clients(ctx.cluster, config)
+    else:
+        norm_config = config
+    if isinstance(norm_config, dict):
+        role_images = {}
+        for role, settings in norm_config.iteritems():
+            image_opts = {} if settings is None else settings
+            role_images[role] = image_opts.get('image_name')
+    else:
+        role_images = norm_config
+
+    log.debug('rbd config is: %s', norm_config)
+
+    with contextutil.nested(
+        lambda: create_image(ctx=ctx, config=norm_config),
+        lambda: modprobe(ctx=ctx, config=norm_config),
+        lambda: dev_create(ctx=ctx, config=role_images),
+        lambda: generic_mkfs(ctx=ctx, config=norm_config,
+                devname_rtn=rbd_devname_rtn),
+        lambda: generic_mount(ctx=ctx, config=role_images,
+                devname_rtn=rbd_devname_rtn),
+        ):
+        yield
diff --git a/src/ceph/qa/tasks/rbd_fio.py b/src/ceph/qa/tasks/rbd_fio.py
new file mode 100644
index 0000000..663e8f5
--- /dev/null
+++ b/src/ceph/qa/tasks/rbd_fio.py
@@ -0,0 +1,226 @@
+"""
+ Long running fio tests on rbd mapped devices for format/features provided in config
+ Many fio parameters can be configured so that this task can be used along with thrash/power-cut tests
+ and exercise IO on full disk for all format/features
+  - This test should not be run on VM due to heavy use of resource
+
+"""
+import contextlib
+import json
+import logging
+import os
+import StringIO
+
+from teuthology.parallel import parallel
+from teuthology import misc as teuthology
+from tempfile import NamedTemporaryFile
+from teuthology.orchestra import run
+from teuthology.packaging import install_package, remove_package
+
+log = logging.getLogger(__name__)
+
+@contextlib.contextmanager
+def task(ctx, config):
+    """
+    client.0:
+       fio-io-size: 100g or 80% or 100m
+       fio-version: 2.2.9
+       formats: [2]
+       features: [[layering],[striping],[layering,exclusive-lock,object-map]]
+       test-clone-io: 1  #remove this option to not run create rbd clone and not run io on clone
+       io-engine: "sync or rbd or any io-engine"
+       rw: randrw
+    client.1:
+       fio-io-size: 100g
+       fio-version: 2.2.9
+       rw: read
+       image_size: 20480
+
+or
+    all:
+       fio-io-size: 400g
+       rw: randrw
+       formats: [2]
+       features: [[layering],[striping]]
+       io-engine: libaio
+
+    Create rbd image + device and exercise IO for the formats/features given.
+    Either give one config per client role, or a single ``all`` config that
+    is used for every client remote.
+    """
+    all_config = config.get('all')
+    clients = ctx.cluster.only(teuthology.is_type('client'))
+    rbd_test_dir = teuthology.get_testdir(ctx) + "/rbd_fio_test"
+    for remote, roles in clients.remotes.iteritems():
+        if all_config:
+            jobs = [all_config]
+        else:
+            jobs = [config[name] for name in config if name in roles]
+        # NOTE(review): each spawn sits in its own parallel() block, so jobs
+        # presumably still run one after another - confirm that is intended.
+        for job_config in jobs:
+            with parallel() as p:
+                p.spawn(run_fio, remote, job_config, rbd_test_dir)
+
+    yield
+
+
+def get_ioengine_package_name(ioengine, remote):
+    """Return the package name to install for ``ioengine`` on ``remote``, or None."""
+    is_rpm = teuthology.get_system_type(remote) == 'rpm'
+    if ioengine == 'rbd':
+        return 'librbd1-devel' if is_rpm else 'librbd-dev'
+    if ioengine == 'libaio':
+        return 'libaio-devel' if is_rpm else 'libaio-dev'
+    return None
+
+
+def run_rbd_map(remote, image, iodepth):
+    """Run ``rbd map`` for ``image`` on ``remote``; return its stdout (used as the device path)."""
+    depth = max(iodepth, 128)  # never below 128 (RBD_QUEUE_DEPTH_DEFAULT)
+    captured = StringIO.StringIO()
+    map_cmd = ['sudo', 'rbd', 'map', '-o', 'queue_depth={}'.format(depth), image]
+    remote.run(args=map_cmd, stdout=captured)
+    dev = captured.getvalue().rstrip('\n')
+    nr_requests = '/sys/block/{}/queue/nr_requests'.format(os.path.basename(dev))
+    teuthology.sudo_write_file(remote, nr_requests, str(depth))
+    return dev
+
+
+def run_fio(remote, config, rbd_test_dir):
+    """
+    Build a fio job file from config, create (and, for ioengines other
+    than rbd, map) one image per format/feature combination, then fetch
+    and build fio under rbd_test_dir on the remote and run the job file.
+
+    Defaults used when a key is absent or falsy: io-engine sync, bs 4k,
+    fio-io-size 100m, runtime 1800, rw randrw, image_size 10240,
+    formats [1, 2], fio-version 2.21; io-depth defaults to 2.
+
+    :param remote: remote on which the images are created and fio is run
+    :param config: dict of task options (the keys listed above, plus
+                   features and test-clone-io)
+    :param rbd_test_dir: remote directory fio is built in; it is created
+                         here and removed again during cleanup
+    """
+    # delete=False: the job file has to survive close() so it can be copied
+    fio_config=NamedTemporaryFile(prefix='fio_rbd_', dir='/tmp/', delete=False)
+    # [global] holds the options shared by every job section written below
+    fio_config.write('[global]\n')
+    # Bind the default to a name as well: ioengine is read again further
+    # down, so leaving it unset without 'io-engine' raised a NameError.
+    ioengine = config.get('io-engine') or 'sync'
+    fio_config.write('ioengine={ioe}\n'.format(ioe=ioengine))
+    bs = config.get('bs') or '4k'
+    fio_config.write('bs={bs}\n'.format(bs=bs))
+    iodepth = config.get('io-depth', 2)
+    fio_config.write('iodepth={iod}\n'.format(iod=iodepth))
+    size = config.get('fio-io-size') or '100m'
+    fio_config.write('size={size}\n'.format(size=size))
+
+    fio_config.write('time_based\n')
+    runtime = config.get('runtime') or 1800
+    fio_config.write('runtime={runtime}\n'.format(runtime=runtime))
+    fio_config.write('allow_file_create=0\n')
+    image_size = config.get('image_size') or 10240
+
+    # image matrix: every format is combined with every feature list
+    formats = config.get('formats') or [1,2]
+    features = (config.get('features') or
+                [['layering'],['striping'],['exclusive-lock','object-map']])
+    fio_version = config.get('fio-version') or '2.21'
+
+    # handle package required for ioengine, if any
+    sn=remote.shortname
+    ioengine_pkg = get_ioengine_package_name(ioengine, remote)
+    if ioengine_pkg:
+        install_package(ioengine_pkg, remote)
+
+    fio_config.write('norandommap\n')
+    if ioengine == 'rbd':
+        fio_config.write('clientname=admin\n')
+        fio_config.write('pool=rbd\n')
+        fio_config.write('invalidate=0\n')
+    elif ioengine == 'libaio':
+        fio_config.write('direct=1\n')
+
+    # Resolve rw once, up front: the clone sections reuse it, and it used
+    # to be unbound there whenever config carried no 'rw'.
+    rw = config.get('rw') or 'randrw'
+    for frmt in formats:
+        for feature in features:
+            log.info("Creating rbd images on {sn}".format(sn=sn))
+            feature_name = '-'.join(feature)
+            rbd_name = 'i{i}f{f}{sn}'.format(i=frmt,f=feature_name,sn=sn)
+            rbd_snap_name = 'i{i}f{f}{sn}@i{i}f{f}{sn}Snap'.format(i=frmt,f=feature_name,sn=sn)
+            rbd_clone_name = 'i{i}f{f}{sn}Clone'.format(i=frmt,f=feature_name,sn=sn)
+            create_args=['rbd', 'create',
+                         '--size', '{size}'.format(size=image_size),
+                         '--image', rbd_name,
+                         '--image-format', '{f}'.format(f=frmt)]
+            # one --image-feature flag per feature of this combination
+            for image_feature in feature:
+                create_args.extend(['--image-feature', image_feature])
+            remote.run(args=create_args)
+            remote.run(args=['rbd', 'info', rbd_name])
+            if ioengine != 'rbd':
+                # fio is pointed at the device path that rbd map printed
+                rbd_dev = run_rbd_map(remote, rbd_name, iodepth)
+                if config.get('test-clone-io'):
+                    log.info("Testing clones using fio")
+                    remote.run(args=['rbd', 'snap', 'create', rbd_snap_name])
+                    remote.run(args=['rbd', 'snap', 'protect', rbd_snap_name])
+                    remote.run(args=['rbd', 'clone', rbd_snap_name, rbd_clone_name])
+                    rbd_clone_dev = run_rbd_map(remote, rbd_clone_name, iodepth)
+                fio_config.write('[{rbd_dev}]\n'.format(rbd_dev=rbd_dev))
+                fio_config.write('rw={rw}\n'.format(rw=rw))
+                fio_config.write('filename={rbd_dev}\n'.format(rbd_dev=rbd_dev))
+                if config.get('test-clone-io'):
+                    fio_config.write('[{rbd_clone_dev}]\n'.format(rbd_clone_dev=rbd_clone_dev))
+                    fio_config.write('rw={rw}\n'.format(rw=rw))
+                    fio_config.write('filename={rbd_clone_dev}\n'.format(rbd_clone_dev=rbd_clone_dev))
+            else:
+                # with the rbd ioengine fio is given the image name instead
+                if config.get('test-clone-io'):
+                    log.info("Testing clones using fio")
+                    remote.run(args=['rbd', 'snap', 'create', rbd_snap_name])
+                    remote.run(args=['rbd', 'snap', 'protect', rbd_snap_name])
+                    remote.run(args=['rbd', 'clone', rbd_snap_name, rbd_clone_name])
+                fio_config.write('[{img_name}]\n'.format(img_name=rbd_name))
+                fio_config.write('rw={rw}\n'.format(rw=rw))
+                fio_config.write('rbdname={img_name}\n'.format(img_name=rbd_name))
+                if config.get('test-clone-io'):
+                    fio_config.write('[{clone_img_name}]\n'.format(clone_img_name=rbd_clone_name))
+                    fio_config.write('rw={rw}\n'.format(rw=rw))
+                    fio_config.write('rbdname={clone_img_name}\n'.format(clone_img_name=rbd_clone_name))
+
+    fio_config.close()
+    # the job file is copied to the same path on the remote
+    remote.put_file(fio_config.name,fio_config.name)
+    try:
+        log.info("Running rbd feature - fio test on {sn}".format(sn=sn))
+        fio = "https://github.com/axboe/fio/archive/fio-" + fio_version + ".tar.gz"
+        remote.run(args=['mkdir', run.Raw(rbd_test_dir),])
+        # fetch the fio release tarball, unpack it and build it in place
+        remote.run(args=['cd' , run.Raw(rbd_test_dir),
+                         run.Raw(';'), 'wget' , fio , run.Raw(';'), run.Raw('tar -xvf fio*tar.gz'), run.Raw(';'),
+                         run.Raw('cd fio-fio*'), 'configure', run.Raw(';') ,'make'])
+        remote.run(args=['ceph', '-s'])
+        # --showcmd: print the job file as command line options
+        remote.run(args=[run.Raw('{tdir}/fio-fio-{v}/fio --showcmd {f}'.format(tdir=rbd_test_dir,v=fio_version,f=fio_config.name))])
+        remote.run(args=['sudo', run.Raw('{tdir}/fio-fio-{v}/fio {f}'.format(tdir=rbd_test_dir,v=fio_version,f=fio_config.name))])
+        remote.run(args=['ceph', '-s'])
+    finally:
+        # cleanup runs even when fio failed: unmap every image that rbd
+        # reports as mapped, drop the build dir, remove the ioengine package
+        out=StringIO.StringIO()
+        remote.run(args=['rbd','showmapped', '--format=json'], stdout=out)
+        mapped_images = json.loads(out.getvalue())
+        if mapped_images:
+            log.info("Unmapping rbd images on {sn}".format(sn=sn))
+            for image in mapped_images.itervalues():
+                remote.run(args=['sudo', 'rbd', 'unmap', str(image['device'])])
+        log.info("Cleaning up fio install")
+        remote.run(args=['rm','-rf', run.Raw(rbd_test_dir)])
+        if ioengine_pkg:
+            remove_package(ioengine_pkg, remote)
diff --git a/src/ceph/qa/tasks/rbd_fsx.py b/src/ceph/qa/tasks/rbd_fsx.py
new file mode 100644
index 0000000..ab1a47f
--- /dev/null
+++ b/src/ceph/qa/tasks/rbd_fsx.py
@@ -0,0 +1,102 @@
+"""
+Run fsx on an rbd image
+"""
+import contextlib
+import logging
+
+from teuthology.parallel import parallel
+from teuthology import misc as teuthology
+
+log = logging.getLogger(__name__)
+
+@contextlib.contextmanager
+def task(ctx, config):
+    """
+    Run ceph_test_librbd_fsx against an rbd image, once per listed client.
+
+    Every client is spawned through teuthology's parallel helper before
+    this task yields. Currently this requires running as client.admin
+    to create a pool.
+
+    Specify which clients to run on as a list::
+
+      tasks:
+        ceph:
+        rbd_fsx:
+          clients: [client.0, client.1]
+
+    Some properties of fsx can optionally be changed (same level as
+    clients)::
+
+          seed: <random seed number, or 0 to use the time>
+          ops: <number of operations to do>
+          size: <maximum image size in bytes>
+          valgrind: [--tool=<valgrind tool>]
+    """
+    log.info('starting rbd_fsx...')
+    with parallel() as runner:
+        # one spawned fsx run per client role
+        for client_role in config['clients']:
+            runner.spawn(_run_one_client, ctx, config, client_role)
+    # nothing to tear down afterwards
+    yield
+
+def _run_one_client(ctx, config, role):
+    """
+    Spawned task that runs ceph_test_librbd_fsx for one client role.
+
+    :param ctx: run context; its config overrides and cluster are used
+    :param config: rbd_fsx task config; the overrides are merged into it
+    :param role: client role; also names the pool and image given to fsx
+    """
+    # Merge the overrides before reading any option, so that krbd and nbd
+    # honour them just like every other key does.
+    overrides = ctx.config.get('overrides', {})
+    teuthology.deep_merge(config, overrides.get('rbd_fsx', {}))
+
+    krbd = config.get('krbd', False)
+    nbd = config.get('nbd', False)
+    testdir = teuthology.get_testdir(ctx)
+    (remote,) = ctx.cluster.only(role).remotes.iterkeys()
+
+    args = []
+    if krbd or nbd:
+        args.append('sudo') # rbd(-nbd) map/unmap need privileges
+    args.extend(['adjust-ulimits', 'ceph-coverage',
+                 '{tdir}/archive/coverage'.format(tdir=testdir)])
+
+    if config.get('valgrind'):
+        args = teuthology.get_valgrind_args(
+            testdir, 'fsx_{id}'.format(id=role), args,
+            config.get('valgrind'))
+
+    args.extend([
+        'ceph_test_librbd_fsx',
+        '-d', # debug output for all operations
+        '-W', '-R', # mmap doesn't work with rbd
+        '-p', str(config.get('progress_interval', 100)), # show progress
+        '-P', '{tdir}/archive'.format(tdir=testdir),
+        '-r', str(config.get('readbdy',1)),
+        '-w', str(config.get('writebdy',1)),
+        '-t', str(config.get('truncbdy',1)),
+        '-h', str(config.get('holebdy',1)),
+        '-l', str(config.get('size', 250000000)),
+        '-S', str(config.get('seed', 0)),
+        '-N', str(config.get('ops', 1000)),
+    ])
+    if krbd:
+        args.append('-K') # -K enables krbd mode
+    if nbd:
+        args.append('-M') # -M enables nbd mode
+    if config.get('direct_io', False):
+        args.append('-Z') # -Z use direct IO
+    if not config.get('randomized_striping', True):
+        args.append('-U') # -U disables randomized striping
+    if not config.get('punch_holes', True):
+        args.append('-H') # -H disables discard ops
+    if config.get('journal_replay', False):
+        args.append('-j') # -j replay all IO events from journal
+    args.extend(['pool_{pool}'.format(pool=role),
+                 'image_{image}'.format(image=role)])
+
+    remote.run(args=args)
diff --git a/src/ceph/qa/tasks/rbd_mirror.py b/src/ceph/qa/tasks/rbd_mirror.py
new file mode 100644
index 0000000..851b64f
--- /dev/null
+++ b/src/ceph/qa/tasks/rbd_mirror.py
@@ -0,0 +1,117 @@
+"""
+Task for running rbd mirroring daemons and configuring mirroring
+"""
+
+import logging
+
+from teuthology.orchestra import run
+from teuthology import misc
+from teuthology.exceptions import ConfigError
+from teuthology.task import Task
+from util import get_remote_for_role
+
+log = logging.getLogger(__name__)
+
+
+class RBDMirror(Task):
+    """
+    Run an rbd-mirror daemon that syncs rbd images between clusters.
+
+    Two clients (one from each cluster) on the same host are needed
+    to connect with. Later test scripts are expected to adjust the pool
+    configuration to include the remote client and cluster name; this
+    task only needs to know how to connect to the local cluster.
+
+    For example:
+
+        roles:
+        - [primary.mon.a, primary.osd.0, primary.osd.1, primary.osd.2]
+        - [secondary.mon.a, secondary.osd.0, secondary.osd.1, secondary.osd.2]
+        - [primary.client.mirror, secondary.client.mirror]
+        tasks:
+        - ceph:
+            cluster: primary
+        - ceph:
+            cluster: secondary
+        - rbd-mirror:
+            client: primary.client.mirror
+
+    Add another rbd_mirror instance to mirror back to the primary
+    cluster as well:
+
+        - rbd-mirror:
+            client: secondary.client.mirror
+
+    Options understood by this task:
+
+        client: role - ceph client to connect as (required)
+        valgrind: [--tool=<valgrind tool>] - none by default
+        coverage: bool - whether this run may be collecting coverage data
+    """
+    def __init__(self, ctx, config):
+        super(RBDMirror, self).__init__(ctx, config)
+        self.log = log  # module-level logger
+
+    def setup(self):
+        """Validate the configured client role and look up its remote."""
+        super(RBDMirror, self).setup()
+        if 'client' not in self.config:
+            raise ConfigError('rbd-mirror requires a client to connect with')
+        self.client = self.config['client']
+
+        role_parts = misc.split_role(self.client)
+        self.cluster_name, role_type, self.client_id = role_parts
+        if role_type != 'client':
+            raise ConfigError(
+                'client role ({0}) must be a client'.format(self.client))
+
+        self.remote = get_remote_for_role(self.ctx, self.client)
+
+    def begin(self):
+        """Assemble the rbd-mirror command line and register the daemon."""
+        super(RBDMirror, self).begin()
+        testdir = misc.get_testdir(self.ctx)
+
+        # daemon-helper is given 'term' instead of 'kill' as soon as a
+        # coverage or valgrind key is present in the config
+        instrumented = ('coverage' in self.config or
+                        'valgrind' in self.config)
+        daemon_signal = 'term' if instrumented else 'kill'
+
+        coverage_dir = '{tdir}/archive/coverage'.format(tdir=testdir)
+        args = [
+            'adjust-ulimits', 'ceph-coverage', coverage_dir,
+            'daemon-helper', daemon_signal,
+        ]
+
+        if 'valgrind' in self.config:
+            # get_valgrind_args returns the argument list to carry on with
+            valgrind_name = 'rbd-mirror-{id}'.format(id=self.client)
+            args = misc.get_valgrind_args(
+                testdir, valgrind_name, args, self.config.get('valgrind'))
+
+        args.extend(['rbd-mirror', '--foreground'])
+        args.extend(['--cluster', self.cluster_name])
+        args.extend(['--id', self.client_id])
+
+        # keyword options for the daemon registration
+        daemon_options = dict(
+            cluster=self.cluster_name,
+            args=args,
+            logger=self.log.getChild(self.client),
+            stdin=run.PIPE,
+            wait=False,
+        )
+        self.ctx.daemons.add_daemon(
+            self.remote, 'rbd-mirror', self.client,
+            **daemon_options
+        )
+
+    def end(self):
+        """Stop the rbd-mirror daemon, then run the base class end()."""
+        daemon = self.ctx.daemons.get_daemon(
+            'rbd-mirror', self.client, self.cluster_name)
+        daemon.stop()
+        super(RBDMirror, self).end()
+
+task = RBDMirror
diff --git a/src/ceph/qa/tasks/rebuild_mondb.py b/src/ceph/qa/tasks/rebuild_mondb.py
new file mode 100644
index 0000000..900bd16
--- /dev/null
+++ b/src/ceph/qa/tasks/rebuild_mondb.py
@@ -0,0 +1,216 @@
+"""
+Test that the monitor leveldb can be recovered from the OSDs after the
+leveldbs of all monitors have been corrupted
+"""
+
+import logging
+import os.path
+import shutil
+import tempfile
+
+import ceph_manager
+from teuthology import misc as teuthology
+
+log = logging.getLogger(__name__)
+
+
+def _push_directory(path, remote, remote_dir):
+    """
+    Copy the contents of the local directory path into remote_dir on
+    remote as a gzipped tarball; remote_dir is not created here. Roughly:
+
+    local_temp_path=`mktemp`
+    tar czf $local_temp_path -C $path -- .
+    scp $local_temp_path remote:$remote_temp_path
+    rm $local_temp_path
+    ssh remote sudo tar xzf $remote_temp_path -C $remote_dir
+    ssh remote sudo rm -fr $remote_temp_path
+    """
+    fd, local_temp_path = tempfile.mkstemp(suffix='.tgz',
+                                           prefix='rebuild_mondb-')
+    os.close(fd)
+    cmd = ' '.join(['tar', 'cz', '-f', local_temp_path,
+                    '-C', path, '--', '.'])
+    teuthology.sh(cmd)
+    # mkstemp is used only to pick a unique name for the remote copy; drop
+    # the local placeholder right away instead of leaking it.
+    fd, remote_temp_path = tempfile.mkstemp(suffix='.tgz',
+                                            prefix='rebuild_mondb-')
+    os.close(fd)
+    os.remove(remote_temp_path)
+    remote.put_file(local_temp_path, remote_temp_path)
+    os.remove(local_temp_path)
+    remote.run(args=['sudo', 'tar', 'xz',
+                     '-C', remote_dir,
+                     '-f', remote_temp_path])
+    remote.run(args=['sudo', 'rm', '-fr', remote_temp_path])
+
+
+def _nuke_mons(manager, mons, mon_id):
+    """
+    Kill every monitor in mons and delete its data directory.
+
+    For mon_id only the store.db inside the data directory is removed.
+    """
+    assert mons
+    is_mon = teuthology.is_type('mon')
+    for remote, roles in mons.remotes.iteritems():
+        for mon_role in (r for r in roles if is_mon(r)):
+            cluster, _, name = teuthology.split_role(mon_role)
+            log.info('killing {cluster}:mon.{mon}'.format(cluster=cluster,
+                                                          mon=name))
+            manager.kill_mon(name)
+            victim = os.path.join('/var/lib/ceph/mon/',
+                                  '{0}-{1}'.format(cluster, name))
+            if name == mon_id:
+                # Keep the rest of this monitor's data directory: only its
+                # store.db has to be recreated later, which is easier than
+                # running mkfs and then swapping in the recovered store.
+                victim = os.path.join(victim, 'store.db')
+            remote.run(args=['sudo', 'rm', '-r', victim])
+
+
+def _rebuild_db(ctx, manager, cluster_name, mon, mon_id, keyring_path):
+    """
+    Rebuild the monitor store from the OSDs and install it on mon: every
+    OSD in turn updates the store with --op update-mon-db, then the result
+    is pushed into mon_id's data directory and the keyring caps are set.
+    """
+    local_mstore = tempfile.mkdtemp()
+
+    # collect the maps from all OSDs
+    is_osd = teuthology.is_type('osd')
+    osds = ctx.cluster.only(is_osd)
+    assert osds
+    for osd, roles in osds.remotes.iteritems():
+        for role in roles:
+            if not is_osd(role):
+                continue
+            cluster, _, osd_id = teuthology.split_role(role)
+            assert cluster_name == cluster
+            log.info('collecting maps from {cluster}:osd.{osd}'.format(
+                cluster=cluster, osd=osd_id))
+            # push leveldb to OSD
+            osd_mstore = os.path.join(teuthology.get_testdir(ctx), 'mon-store')
+            osd.run(args=['sudo', 'mkdir', '-m', 'o+x', '-p', osd_mstore])
+
+            _push_directory(local_mstore, osd, osd_mstore)
+            log.info('rm -rf {0}'.format(local_mstore))
+            shutil.rmtree(local_mstore)
+            # update leveldb with OSD data
+            options = '--op update-mon-db --mon-store-path {0}'
+            log.info('cot {0}'.format(osd_mstore))
+            manager.objectstore_tool(pool=None,
+                                     options=options.format(osd_mstore),
+                                     args='', osd=osd_id,
+                                     do_revive=False)
+            # pull the updated mon db into a fresh directory; create it
+            # first so that the log line names the real destination
+            local_mstore = tempfile.mkdtemp()
+            log.info('pull dir {0} -> {1}'.format(osd_mstore, local_mstore))
+            teuthology.pull_directory(osd, osd_mstore, local_mstore)
+            log.info('rm -rf osd:{0}'.format(osd_mstore))
+            osd.run(args=['sudo', 'rm', '-fr', osd_mstore])
+
+    # recover the first_mon with re-built mon db
+    # pull from recovered leveldb from client
+    mon_store_dir = os.path.join('/var/lib/ceph/mon',
+                                 '{0}-{1}'.format(cluster_name, mon_id))
+    _push_directory(local_mstore, mon, mon_store_dir)
+    mon.run(args=['sudo', 'chown', '-R', 'ceph:ceph', mon_store_dir])
+    shutil.rmtree(local_mstore)
+
+    # fill up the caps in the keyring file
+    mon.run(args=['sudo', 'ceph-authtool', keyring_path,
+                  '-n', 'mon.', '--cap', 'mon', 'allow *'])
+    mon.run(args=['sudo',
+                  'ceph-authtool', keyring_path,
+                  '-n', 'client.admin',
+                  '--cap', 'mon', 'allow *',
+                  '--cap', 'osd', 'allow *',
+                  '--cap', 'mds', 'allow *',
+                  '--cap', 'mgr', 'allow *'])
+    mon.run(args=['sudo', '-u', 'ceph', 'ceph-monstore-tool', mon_store_dir,
+                  'rebuild', '--', '--keyring', keyring_path])
+
+
+def _revive_mons(manager, mons, recovered, keyring_path):
+    """
+    Restart all monitors, running mkfs first on every one but recovered,
+    then wait for a quorum of all of them.
+    """
+    # the initial monmap is in the ceph.conf, so we are good.
+    is_mon = teuthology.is_type('mon')
+    revived = 0
+    for remote, roles in mons.remotes.iteritems():
+        for mon_role in (r for r in roles if is_mon(r)):
+            cluster, _, name = teuthology.split_role(mon_role)
+            needs_mkfs = recovered != name
+            if needs_mkfs:
+                log.info('running mkfs on {cluster}:mon.{mon}'.format(
+                    cluster=cluster, mon=name))
+                mkfs_cmd = ['sudo', 'ceph-mon',
+                            '--cluster', cluster,
+                            '--mkfs',
+                            '-i', name,
+                            '--keyring', keyring_path]
+                remote.run(args=mkfs_cmd)
+            log.info('reviving mon.{0}'.format(name))
+            manager.revive_mon(name)
+            revived = revived + 1
+    # wait until as many monitors as were revived form the quorum
+    manager.wait_for_mon_quorum_size(revived, timeout=30)
+
+
+def _revive_mgrs(ctx, manager):
+    """
+    Revive every mgr daemon that has a role in the cluster.
+    """
+    is_mgr = teuthology.is_type('mgr')
+    for _, roles in ctx.cluster.only(is_mgr).remotes.iteritems():
+        for mgr_role in (r for r in roles if is_mgr(r)):
+            _, _, name = teuthology.split_role(mgr_role)
+            log.info('reviving mgr.{0}'.format(name))
+            manager.revive_mgr(name)
+
+
+def _revive_osds(ctx, manager):
+    """
+    Revive every osd daemon that has a role in the cluster.
+    """
+    is_osd = teuthology.is_type('osd')
+    for _, roles in ctx.cluster.only(is_osd).remotes.iteritems():
+        for osd_role in (r for r in roles if is_osd(r)):
+            _, _, name = teuthology.split_role(osd_role)
+            log.info('reviving osd.{0}'.format(name))
+            manager.revive_osd(name)
+
+
+def task(ctx, config):
+    """
+    Test monitor recovery from OSD: remove the monitor stores, rebuild the
+    first monitor's store from the OSDs, then revive mons, mgrs and osds.
+
+    config may carry keyring_path (default /etc/ceph/<cluster>.keyring).
+    """
+    config = {} if config is None else config
+    assert isinstance(config, dict), \
+        'task only accepts a dict for configuration'
+
+    first_mon = teuthology.get_first_mon(ctx, config)
+    [mon] = ctx.cluster.only(first_mon).remotes.iterkeys()
+    manager = ceph_manager.CephManager(
+        mon, ctx=ctx, logger=log.getChild('ceph_manager'))
+
+    # remember the first monitor's cluster name and id: that monitor is
+    # the one whose store gets recovered
+    cluster_name, _, mon_id = teuthology.split_role(first_mon)
+    all_mons = ctx.cluster.only(teuthology.is_type('mon'))
+    _nuke_mons(manager, all_mons, mon_id)
+    keyring_path = config.get(
+        'keyring_path',
+        '/etc/ceph/{cluster}.keyring'.format(cluster=cluster_name))
+    _rebuild_db(ctx, manager, cluster_name, mon, mon_id, keyring_path)
+    _revive_mons(manager, all_mons, mon_id, keyring_path)
+    _revive_mgrs(ctx, manager)
+    _revive_osds(ctx, manager)
diff --git a/src/ceph/qa/tasks/recovery_bench.py b/src/ceph/qa/tasks/recovery_bench.py
new file mode 100644
index 0000000..5eb9fd2
--- /dev/null
+++ b/src/ceph/qa/tasks/recovery_bench.py
@@ -0,0 +1,208 @@
+"""
+Recovery system benchmarking
+"""
+from cStringIO import StringIO
+
+import contextlib
+import gevent
+import json
+import logging
+import random
+import time
+
+import ceph_manager
+from teuthology import misc as teuthology
+
+log = logging.getLogger(__name__)
+
+@contextlib.contextmanager
+def task(ctx, config):
+    """
+    Benchmark the recovery system.
+
+    Objects are generated with smalliobench, which is then run normally
+    for a baseline measurement; afterwards an OSD is marked out and the
+    run is repeated to measure performance during recovery.
+
+    The config should be as follows:
+
+    recovery_bench:
+        duration: <seconds for each measurement run>
+        num_objects: <number of objects>
+        io_size: <io size in bytes>
+
+    example:
+
+    tasks:
+    - ceph:
+    - recovery_bench:
+        duration: 60
+        num_objects: 500
+        io_size: 4096
+    """
+    config = {} if config is None else config
+    assert isinstance(config, dict), \
+        'recovery_bench task only accepts a dict for configuration'
+
+    log.info('Beginning recovery bench...')
+
+    first_mon = teuthology.get_first_mon(ctx, config)
+    [mon] = ctx.cluster.only(first_mon).remotes.iterkeys()
+    manager = ceph_manager.CephManager(
+        mon, ctx=ctx,
+        logger=log.getChild('ceph_manager'))
+
+    # poll every 10 seconds until all OSDs are reported up
+    num_osds = teuthology.num_instances_of_type(ctx.cluster, 'osd')
+    while True:
+        up_osds = manager.get_osd_status()['up']
+        if len(up_osds) >= num_osds:
+            break
+        time.sleep(10)
+
+    # the bencher spawns its benchmark greenlet as soon as it is
+    # constructed
+    bencher = RecoveryBencher(manager, config)
+    try:
+        yield
+    finally:
+        # wait for the benchmark to finish before the task ends
+        log.info('joining recovery bencher')
+        bencher.do_join()
+
+class RecoveryBencher:
+    """
+    Runs smalliobench on the host of one randomly chosen up OSD, first as
+    a baseline and then again after marking that OSD out, and logs the
+    median and 99% latency per sample type for both runs.
+
+    The benchmark is spawned with gevent from the constructor; do_join()
+    waits for it.
+    """
+    def __init__(self, manager, config):
+        """
+        :param manager: cluster manager used to query and drive the OSDs
+        :param config: recovery_bench task config dict, or None
+        """
+        self.ceph_manager = manager
+        self.ceph_manager.wait_for_clean()
+
+        osd_status = self.ceph_manager.get_osd_status()
+        self.osds = osd_status['up']
+
+        self.config = config
+        if self.config is None:
+            self.config = dict()
+        else:
+            def tmp(x):
+                """Local wrapper to print value."""
+                print x
+            self.log = tmp
+
+        log.info("spawning thread")
+
+        self.thread = gevent.spawn(self.do_bench)
+
+    def do_join(self):
+        """
+        Join the recovery bencher.  This is called after the main
+        task exits.
+        """
+        self.thread.get()
+
+    def do_bench(self):
+        """
+        Do the benchmarking: create the objects, measure a baseline, mark
+        the chosen OSD out, measure again, then mark the OSD back in.
+        """
+        duration = self.config.get("duration", 60)
+        num_objects = self.config.get("num_objects", 500)
+        io_size = self.config.get("io_size", 4096)
+
+        osd = str(random.choice(self.osds))
+        (osd_remote,) = self.ceph_manager.ctx.cluster.only('osd.%s' % osd).remotes.iterkeys()
+
+        testdir = teuthology.get_testdir(self.ceph_manager.ctx)
+
+        # create the objects
+        osd_remote.run(
+            args=[
+                'adjust-ulimits',
+                'ceph-coverage',
+                '{tdir}/archive/coverage'.format(tdir=testdir),
+                'smalliobench',
+                '--use-prefix', 'recovery_bench',
+                '--init-only', '1',
+                '--num-objects', str(num_objects),
+                '--io-size', str(io_size),
+                ],
+            wait=True,
+        )
+
+        # baseline bench
+        log.info('non-recovery (baseline)')
+        self._measure(osd_remote, testdir, duration, io_size)
+
+        self.ceph_manager.raw_cluster_cmd('osd', 'out', osd)
+        time.sleep(5)
+
+        # recovery bench
+        log.info('recovery active')
+        self._measure(osd_remote, testdir, duration, io_size)
+
+        self.ceph_manager.raw_cluster_cmd('osd', 'in', osd)
+
+    def _measure(self, osd_remote, testdir, duration, io_size):
+        """
+        Run smalliobench once against the already created objects and
+        hand its stderr output to process_samples().
+        """
+        p = osd_remote.run(
+            args=[
+                'adjust-ulimits',
+                'ceph-coverage',
+                '{tdir}/archive/coverage'.format(tdir=testdir),
+                'smalliobench',
+                '--use-prefix', 'recovery_bench',
+                '--do-not-init', '1',
+                '--duration', str(duration),
+                '--io-size', str(io_size),
+                ],
+            stdout=StringIO(),
+            stderr=StringIO(),
+            wait=True,
+        )
+        self.process_samples(p.stderr.getvalue())
+
+    def process_samples(self, input):
+        """
+        Extract samples from the input and log, for every sample type,
+        the median and the 99% latency.
+
+        :param input: input lines in JSON format; lines that do not parse
+                      or lack 'type' or 'latency' are skipped
+        """
+        lat = {}
+        for line in input.split('\n'):
+            try:
+                sample = json.loads(line)
+                # read the latency first, so that a type is only registered
+                # once it has a sample (an empty list would break below)
+                latency = float(sample['latency'])
+                lat.setdefault(sample['type'], []).append(latency)
+            except Exception:
+                continue
+
+        for sample_type in lat:
+            samples = lat[sample_type]
+            samples.sort()
+            num = len(samples)
+
+            # median
+            if num & 1 == 1: # odd number of samples
+                median = samples[num // 2]
+            else:
+                median = (samples[num // 2] + samples[num // 2 - 1]) / 2
+
+            # 99%
+            ninety_nine = samples[int(num * 0.99)]
+            log.info("%s: median %f, 99%% %f" % (sample_type, median, ninety_nine))
diff --git a/src/ceph/qa/tasks/reg11184.py b/src/ceph/qa/tasks/reg11184.py
new file mode 100644
index 0000000..f248623
--- /dev/null
+++ b/src/ceph/qa/tasks/reg11184.py
@@ -0,0 +1,241 @@
+"""
+Special regression test for tracker #11184
+
+Synopsis: osd/SnapMapper.cc: 282: FAILED assert(check(oid))
+
+This is accomplished by moving a pg that wasn't part of a split and still
+includes divergent priors.
+"""
+import logging
+import time
+from cStringIO import StringIO
+
+from teuthology.orchestra import run
+from teuthology import misc as teuthology
+from util.rados import rados
+import os
+
+
+log = logging.getLogger(__name__)
+
+
+def task(ctx, config):
+    """
+    Test handling of divergent entries during export / import of a pg,
+    as a regression test for tracker #11184.
+
+    overrides:
+      ceph:
+        conf:
+          osd:
+            debug osd: 5
+
+    Requires 3 osds (ids 0, 1 and 2) on a single test node.
+    """
+    if config is None:
+        config = {}
+    assert isinstance(config, dict), \
+        'reg11184 task only accepts a dict for configuration'
+
+    manager = ctx.managers['ceph']
+
+    while len(manager.get_osd_status()['up']) < 3:
+        time.sleep(10)
+    osds = [0, 1, 2]
+    manager.flush_pg_stats(osds)
+    manager.raw_cluster_cmd('osd', 'set', 'noout')
+    manager.raw_cluster_cmd('osd', 'set', 'noin')
+    manager.raw_cluster_cmd('osd', 'set', 'nodown')
+    manager.wait_for_clean()
+
+    # something that is always there
+    dummyfile = '/etc/fstab'
+    dummyfile2 = '/etc/resolv.conf'
+    testdir = teuthology.get_testdir(ctx)
+
+    # create 1 pg pool
+    log.info('creating foo')
+    manager.raw_cluster_cmd('osd', 'pool', 'create', 'foo', '1')
+    manager.raw_cluster_cmd(
+        'osd', 'pool', 'application', 'enable',
+        'foo', 'rados', run.Raw('||'), 'true')
+
+    # Remove extra pool to simplify log output
+    manager.raw_cluster_cmd('osd', 'pool', 'delete', 'rbd', 'rbd', '--yes-i-really-really-mean-it')
+
+    for i in osds:
+        manager.set_config(i, osd_min_pg_log_entries=10)
+        manager.set_config(i, osd_max_pg_log_entries=10)
+        manager.set_config(i, osd_pg_log_trim_min=5)
+
+    # determine primary
+    divergent = manager.get_pg_primary('foo', 0)
+    log.info("primary and soon to be divergent is %d", divergent)
+    non_divergent = list(osds)
+    non_divergent.remove(divergent)
+
+    log.info('writing initial objects')
+    first_mon = teuthology.get_first_mon(ctx, config)
+    (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys()
+    # write 100 objects
+    for i in range(100):
+        rados(ctx, mon, ['-p', 'foo', 'put', 'existing_%d' % i, dummyfile])
+
+    manager.wait_for_clean()
+
+    # blackhole non_divergent
+    log.info("blackholing osds %s", str(non_divergent))
+    for i in non_divergent:
+        manager.set_config(i, objectstore_blackhole=1)
+
+    DIVERGENT_WRITE = 5
+    DIVERGENT_REMOVE = 5
+    # Write some soon to be divergent
+    log.info('writing divergent objects')
+    for i in range(DIVERGENT_WRITE):
+        rados(ctx, mon, ['-p', 'foo', 'put', 'existing_%d' % i,
+                         dummyfile2], wait=False)
+    # Remove some soon to be divergent
+    log.info('remove divergent objects')
+    for i in range(DIVERGENT_REMOVE):
+        rados(ctx, mon, ['-p', 'foo', 'rm',
+                         'existing_%d' % (i + DIVERGENT_WRITE)], wait=False)
+    time.sleep(10)
+    mon.run(
+        args=['killall', '-9', 'rados'],
+        wait=True,
+        check_status=False)
+
+    # kill all the osds but leave divergent in
+    log.info('killing all the osds')
+    for i in osds:
+        manager.kill_osd(i)
+    for i in osds:
+        manager.mark_down_osd(i)
+    for i in non_divergent:
+        manager.mark_out_osd(i)
+
+    # bring up non-divergent
+    log.info("bringing up non_divergent %s", str(non_divergent))
+    for i in non_divergent:
+        manager.revive_osd(i)
+    for i in non_divergent:
+        manager.mark_in_osd(i)
+
+    # write 1 non-divergent object (ensure that old divergent one is divergent)
+    objname = "existing_%d" % (DIVERGENT_WRITE + DIVERGENT_REMOVE)
+    log.info('writing non-divergent object ' + objname)
+    rados(ctx, mon, ['-p', 'foo', 'put', objname, dummyfile2])
+
+    manager.wait_for_recovery()
+
+    # ensure no recovery of up osds first
+    log.info('delay recovery')
+    for i in non_divergent:
+        manager.wait_run_admin_socket(
+            'osd', i, ['set_recovery_delay', '100000'])
+
+    # bring in our divergent friend
+    log.info("revive divergent %d", divergent)
+    manager.raw_cluster_cmd('osd', 'set', 'noup')
+    manager.revive_osd(divergent)
+
+    log.info('delay recovery divergent')
+    manager.wait_run_admin_socket(
+        'osd', divergent, ['set_recovery_delay', '100000'])
+
+    manager.raw_cluster_cmd('osd', 'unset', 'noup')
+    while len(manager.get_osd_status()['up']) < 3:
+        time.sleep(10)
+
+    log.info('wait for peering')
+    rados(ctx, mon, ['-p', 'foo', 'put', 'foo', dummyfile])
+
+    # At this point the divergent_priors should have been detected
+
+    log.info("killing divergent %d", divergent)
+    manager.kill_osd(divergent)
+
+    # Split pgs for pool foo
+    manager.raw_cluster_cmd('osd', 'pool', 'set', 'foo', 'pg_num', '2')
+    time.sleep(5)
+
+    manager.raw_cluster_cmd('pg','dump')
+
+    # Export a pg
+    (exp_remote,) = ctx.\
+        cluster.only('osd.{o}'.format(o=divergent)).remotes.iterkeys()
+    FSPATH = manager.get_filepath()
+    JPATH = os.path.join(FSPATH, "journal")
+    prefix = ("sudo adjust-ulimits ceph-objectstore-tool "
+              "--data-path {fpath} --journal-path {jpath} "
+              "--log-file="
+              "/var/log/ceph/objectstore_tool.$$.log ".
+              format(fpath=FSPATH, jpath=JPATH))
+    pid = os.getpid()
+    expfile = os.path.join(testdir, "exp.{pid}.out".format(pid=pid))
+    cmd = ((prefix + "--op export-remove --pgid 2.0 --file {file}").
+           format(id=divergent, file=expfile))
+    proc = exp_remote.run(args=cmd, wait=True,
+                          check_status=False, stdout=StringIO())
+    assert proc.exitstatus == 0
+
+    # Kill one of non-divergent OSDs
+    log.info('killing osd.%d' % non_divergent[0])
+    manager.kill_osd(non_divergent[0])
+    manager.mark_down_osd(non_divergent[0])
+    # manager.mark_out_osd(non_divergent[0])
+
+    # An empty collection for pg 2.0 might need to be cleaned up
+    cmd = ((prefix + "--force --op remove --pgid 2.0").
+           format(id=non_divergent[0]))
+    proc = exp_remote.run(args=cmd, wait=True,
+                          check_status=False, stdout=StringIO())
+
+    cmd = ((prefix + "--op import --file {file}").
+           format(id=non_divergent[0], file=expfile))
+    proc = exp_remote.run(args=cmd, wait=True,
+                          check_status=False, stdout=StringIO())
+    assert proc.exitstatus == 0
+
+    # bring in our divergent friend and other node
+    log.info("revive divergent %d", divergent)
+    manager.revive_osd(divergent)
+    manager.mark_in_osd(divergent)
+    log.info("revive %d", non_divergent[0])
+    manager.revive_osd(non_divergent[0])
+
+    while len(manager.get_osd_status()['up']) < 3:
+        time.sleep(10)
+
+    log.info('delay recovery divergent')
+    manager.set_config(divergent, osd_recovery_delay_start=100000)
+    log.info('mark divergent in')
+    manager.mark_in_osd(divergent)
+
+    log.info('wait for peering')
+    rados(ctx, mon, ['-p', 'foo', 'put', 'foo', dummyfile])
+
+    log.info("killing divergent %d", divergent)
+    manager.kill_osd(divergent)
+    log.info("reviving divergent %d", divergent)
+    manager.revive_osd(divergent)
+    time.sleep(3)
+
+    log.info('allowing recovery')
+    # Set osd_recovery_delay_start back to 0 and kick the queue
+    for i in osds:
+        manager.raw_cluster_cmd('tell', 'osd.%d' % i, 'debug',
+                                'kick_recovery_wq', ' 0')
+
+    log.info('reading divergent objects')
+    for i in range(DIVERGENT_WRITE + DIVERGENT_REMOVE):
+        exit_status = rados(ctx, mon, ['-p', 'foo', 'get', 'existing_%d' % i,
+                                       '/tmp/existing'])
+        # compare by value: identity against an int literal is not reliable
+        assert exit_status == 0
+
+    (remote,) = ctx.\
+        cluster.only('osd.{o}'.format(o=divergent)).remotes.iterkeys()
+    cmd = 'rm {file}'.format(file=expfile)
+    remote.run(args=cmd, wait=True)
+    log.info("success")
diff --git a/src/ceph/qa/tasks/rep_lost_unfound_delete.py b/src/ceph/qa/tasks/rep_lost_unfound_delete.py
new file mode 100644
index 0000000..4e5678d
--- /dev/null
+++ b/src/ceph/qa/tasks/rep_lost_unfound_delete.py
@@ -0,0 +1,177 @@
+"""
+Lost_unfound
+"""
+import logging
+from teuthology.orchestra import run
+import ceph_manager
+import time
+from teuthology import misc as teuthology
+from util.rados import rados
+
+log = logging.getLogger(__name__)
+
+def task(ctx, config):
+    """
+    Test handling of lost objects: unfound objects are marked lost (delete).
+    Needs osd.0-2; config is an optional dict ('parallel_bench', default True)
+    toggling a concurrent rados bench write while objects are marked lost.
+    """
+    POOL = 'unfounddel_pool'
+    if config is None:
+        config = {}
+    assert isinstance(config, dict), \
+        'lost_unfound task only accepts a dict for configuration'
+    first_mon = teuthology.get_first_mon(ctx, config)
+    (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys()
+
+    manager = ceph_manager.CephManager(
+        mon,
+        ctx=ctx,
+        logger=log.getChild('ceph_manager'),
+        )
+
+    while len(manager.get_osd_status()['up']) < 3:
+        time.sleep(10)
+    manager.flush_pg_stats([0, 1, 2])
+    manager.wait_for_clean()
+
+    manager.create_pool(POOL)
+
+    # something that is always there
+    dummyfile = '/etc/fstab'
+
+    # take an osd out until the very end
+    manager.kill_osd(2)
+    manager.mark_down_osd(2)
+    manager.mark_out_osd(2)
+
+    # kludge to make sure they get a map
+    rados(ctx, mon, ['-p', POOL, 'put', 'dummy', dummyfile])
+
+    manager.flush_pg_stats([0, 1])
+    manager.wait_for_recovery()
+
+    # create old objects
+    for f in range(1, 10):
+        rados(ctx, mon, ['-p', POOL, 'put', 'existing_%d' % f, dummyfile])
+        rados(ctx, mon, ['-p', POOL, 'put', 'existed_%d' % f, dummyfile])
+        rados(ctx, mon, ['-p', POOL, 'rm', 'existed_%d' % f])
+
+    # delay recovery, and make the pg log very long (to prevent backfill)
+    manager.raw_cluster_cmd(
+            'tell', 'osd.1',
+            'injectargs',
+            '--osd-recovery-delay-start 1000 --osd-min-pg-log-entries 100000000'
+            )
+
+    manager.kill_osd(0)
+    manager.mark_down_osd(0)
+
+    for f in range(1, 10):
+        rados(ctx, mon, ['-p', POOL, 'put', 'new_%d' % f, dummyfile])
+        rados(ctx, mon, ['-p', POOL, 'put', 'existed_%d' % f, dummyfile])
+        rados(ctx, mon, ['-p', POOL, 'put', 'existing_%d' % f, dummyfile])
+
+    # bring osd.0 back up, let it peer, but don't replicate the new
+    # objects...
+    log.info('osd.0 command_args is %s',
+             ctx.daemons.get_daemon('osd', 0).command_args)
+    ctx.daemons.get_daemon('osd', 0).command_kwargs['args'].extend([
+            '--osd-recovery-delay-start', '1000'
+            ])
+    manager.revive_osd(0)
+    manager.mark_in_osd(0)
+    manager.wait_till_osd_is_up(0)
+
+    manager.flush_pg_stats([0, 1])
+    manager.wait_till_active()
+
+    # take out osd.1 and the only copy of those objects.
+    manager.kill_osd(1)
+    manager.mark_down_osd(1)
+    manager.mark_out_osd(1)
+    manager.raw_cluster_cmd('osd', 'lost', '1', '--yes-i-really-mean-it')
+
+    # bring up osd.2 so that things would otherwise, in theory, recover fully
+    manager.revive_osd(2)
+    manager.mark_in_osd(2)
+    manager.wait_till_osd_is_up(2)
+
+    manager.flush_pg_stats([0, 2])
+    manager.wait_till_active()
+    manager.flush_pg_stats([0, 2])
+
+    # verify that there are unfound objects
+    unfound = manager.get_num_unfound_objects()
+    log.info("there are %d unfound objects", unfound)
+    assert unfound
+
+    testdir = teuthology.get_testdir(ctx)
+    procs = []
+    if config.get('parallel_bench', True):
+        procs.append(mon.run(
+            args=[
+                "/bin/sh", "-c",
+                " ".join(['adjust-ulimits',
+                          'ceph-coverage',
+                          '{tdir}/archive/coverage',
+                          'rados',
+                          '--no-log-to-stderr',
+                          '--name', 'client.admin',
+                          '-b', str(4<<10),
+                          '-p', POOL,
+                          '-t', '20',
+                          'bench', '240', 'write',
+                      ]).format(tdir=testdir),
+            ],
+            logger=log.getChild('radosbench.{id}'.format(id='client.admin')),
+            stdin=run.PIPE,
+            wait=False
+        ))
+    time.sleep(10)
+
+    # mark stuff lost
+    pgs = manager.get_pg_stats()
+    for pg in pgs:
+        if pg['stat_sum']['num_objects_unfound'] > 0:
+            primary = 'osd.%d' % pg['acting'][0]
+
+            # verify that i can list them direct from the osd
+            log.info('listing missing/lost in %s state %s', pg['pgid'],
+                     pg['state'])
+            m = manager.list_pg_missing(pg['pgid'])
+            # the osd's own count must agree with the pg stats...
+            assert m['num_unfound'] == pg['stat_sum']['num_objects_unfound']
+            num_unfound = 0
+            for o in m['objects']:
+                if len(o['locations']) == 0:
+                    num_unfound += 1
+            assert m['num_unfound'] == num_unfound
+
+            log.info("reverting unfound in %s on %s", pg['pgid'], primary)
+            manager.raw_cluster_cmd('pg', pg['pgid'],
+                                    'mark_unfound_lost', 'delete')
+        else:
+            log.info("no unfound in %s", pg['pgid'])
+
+    manager.raw_cluster_cmd('tell', 'osd.0', 'debug', 'kick_recovery_wq', '5')
+    manager.raw_cluster_cmd('tell', 'osd.2', 'debug', 'kick_recovery_wq', '5')
+    manager.flush_pg_stats([0, 2])
+    manager.wait_for_recovery()
+
+    # verify result
+    for f in range(1, 10):
+        err = rados(ctx, mon, ['-p', POOL, 'get', 'new_%d' % f, '-'])
+        assert err
+        err = rados(ctx, mon, ['-p', POOL, 'get', 'existed_%d' % f, '-'])
+        assert err
+        err = rados(ctx, mon, ['-p', POOL, 'get', 'existing_%d' % f, '-'])
+        assert err
+
+    # see if osd.1 can cope
+    manager.revive_osd(1)
+    manager.mark_in_osd(1)
+    manager.wait_till_osd_is_up(1)
+    manager.wait_for_clean()
+    run.wait(procs)
+
diff --git a/src/ceph/qa/tasks/repair_test.py b/src/ceph/qa/tasks/repair_test.py
new file mode 100644
index 0000000..5a63bd6
--- /dev/null
+++ b/src/ceph/qa/tasks/repair_test.py
@@ -0,0 +1,308 @@
+"""
+Test pool repairing after objects are damaged.
+"""
+import logging
+import time
+
+from teuthology import misc as teuthology
+
+log = logging.getLogger(__name__)
+
+
+def choose_primary(manager, pool, num):
+    """Return the osd to test on: the primary per manager.get_pg_primary."""
+    log.info("Choosing primary")
+    # the manager resolves (pool, num) to the primary osd
+    primary_osd = manager.get_pg_primary(pool, num)
+    return primary_osd
+
+
+def choose_replica(manager, pool, num):
+    """Return the osd to test on: a replica per manager.get_pg_replica."""
+    log.info("Choosing replica")
+    # the manager resolves (pool, num) to a replica osd
+    replica_osd = manager.get_pg_replica(pool, num)
+    return replica_osd
+
+
+def trunc(manager, osd, pool, obj):
+    """
+    Truncate an object through the admin socket of the given osd.
+    """
+    log.info("truncating object")
+    # admin socket command: truncobj <pool> <obj> 1
+    truncate_cmd = ['truncobj', pool, obj, '1']
+    return manager.osd_admin_socket(osd, truncate_cmd)
+
+
+def dataerr(manager, osd, pool, obj):
+    """
+    Inject a data error on an object through the osd admin socket.
+    """
+    log.info("injecting data err on object")
+    # admin socket command: injectdataerr <pool> <obj>
+    inject_cmd = ['injectdataerr', pool, obj]
+    return manager.osd_admin_socket(osd, inject_cmd)
+
+
+def mdataerr(manager, osd, pool, obj):
+    """
+    Inject a metadata error on an object through the osd admin socket.
+    """
+    log.info("injecting mdata err on object")
+    # admin socket command: injectmdataerr <pool> <obj>
+    inject_cmd = ['injectmdataerr', pool, obj]
+    return manager.osd_admin_socket(osd, inject_cmd)
+
+
+def omaperr(manager, osd, pool, obj):
+    """
+    Cause an omap error by setting omap key 'badkey' to 'badval' on obj.
+    """
+    log.info("injecting omap err on object")
+    omap_cmd = ['setomapval', pool, obj, 'badkey', 'badval']
+    return manager.osd_admin_socket(osd, omap_cmd)
+
+
+def repair_test_1(manager, corrupter, chooser, scrub_type):
+    """
+    Create an object in pool "repair_pool_1", corrupt it, scrub it and verify
+    that pg 0 is inconsistent; then repair, rescrub and verify consistency.
+
+    :param manager: cluster manager used to drive the pool, scrubs and checks
+    :param corrupter: error generating function (truncate, data-error, or
+     meta-data error, for example), called as corrupter(manager, osd, pool, obj)
+    :param chooser: osd chooser, called as chooser(manager, pool, 0)
+    :param scrub_type: scrub kind passed to manager.do_pg_scrub
+    """
+    pool = "repair_pool_1"
+    manager.wait_for_clean()
+    with manager.pool(pool, 1):
+
+        log.info("starting repair test type 1")
+        victim_osd = chooser(manager, pool, 0)
+
+        # create object
+        log.info("doing put")
+        manager.do_put(pool, 'repair_test_obj', '/etc/hosts')
+
+        # corrupt object on the chosen osd only
+        log.info("corrupting object")
+        corrupter(manager, victim_osd, pool, 'repair_test_obj')
+
+        # verify inconsistent
+        log.info("scrubbing")
+        manager.do_pg_scrub(pool, 0, scrub_type)
+
+        manager.with_pg_state(pool, 0, lambda s: 'inconsistent' in s)
+
+        # repair
+        log.info("repairing")
+        manager.do_pg_scrub(pool, 0, "repair")
+
+        log.info("re-scrubbing")
+        manager.do_pg_scrub(pool, 0, scrub_type)
+
+        # verify consistent
+        manager.with_pg_state(pool, 0, lambda s: 'inconsistent' not in s)
+        log.info("done")
+
+
+def repair_test_2(ctx, manager, config, chooser):
+    """
+    Create a set of objects (two with an omap value), corrupt one object,
+    run a deep-scrub and a scrub, then corrupt more objects.  After that,
+    repair the pg and check it is consistent, also after a deep-scrub.
+    :param ctx: test context; its cluster is used to find the first mon
+    :param manager: cluster manager driving the pool, scrubs and checks
+    :param config: task config, passed to teuthology.get_first_mon
+    :param chooser: primary or replica selection routine.
+    """
+    pool = "repair_pool_2"
+    manager.wait_for_clean()
+    with manager.pool(pool, 1):
+        log.info("starting repair test type 2")
+        victim_osd = chooser(manager, pool, 0)
+        first_mon = teuthology.get_first_mon(ctx, config)
+        (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys()
+
+        # create objects; file1 and file5 also get an omap key
+        log.info("doing put and setomapval")
+        manager.do_put(pool, 'file1', '/etc/hosts')
+        manager.do_rados(mon, ['-p', pool, 'setomapval', 'file1',
+                                   'key', 'val'])
+        manager.do_put(pool, 'file2', '/etc/hosts')
+        manager.do_put(pool, 'file3', '/etc/hosts')
+        manager.do_put(pool, 'file4', '/etc/hosts')
+        manager.do_put(pool, 'file5', '/etc/hosts')
+        manager.do_rados(mon, ['-p', pool, 'setomapval', 'file5',
+                                   'key', 'val'])
+        manager.do_put(pool, 'file6', '/etc/hosts')
+
+        # corrupt object
+        log.info("corrupting object")
+        omaperr(manager, victim_osd, pool, 'file1')
+
+        # verify inconsistent
+        log.info("scrubbing")
+        manager.do_pg_scrub(pool, 0, 'deep-scrub')
+
+        manager.with_pg_state(pool, 0, lambda s: 'inconsistent' in s)
+
+        # Regression test for bug #4778, should still
+        # be inconsistent after scrub
+        manager.do_pg_scrub(pool, 0, 'scrub')
+
+        manager.with_pg_state(pool, 0, lambda s: 'inconsistent' in s)
+
+        # Additional corruptions including 2 types for file1
+        log.info("corrupting more objects")
+        dataerr(manager, victim_osd, pool, 'file1')
+        mdataerr(manager, victim_osd, pool, 'file2')
+        trunc(manager, victim_osd, pool, 'file3')
+        omaperr(manager, victim_osd, pool, 'file6')
+
+        # see still inconsistent
+        log.info("scrubbing")
+        manager.do_pg_scrub(pool, 0, 'deep-scrub')
+
+        manager.with_pg_state(pool, 0, lambda s: 'inconsistent' in s)
+
+        # repair
+        log.info("repairing")
+        manager.do_pg_scrub(pool, 0, "repair")
+
+        # Let repair clear inconsistent flag
+        time.sleep(10)
+
+        # verify consistent
+        manager.with_pg_state(pool, 0, lambda s: 'inconsistent' not in s)
+
+        # In the future repair might determine state of
+        # inconsistency itself, verify with a deep-scrub
+        log.info("scrubbing")
+        manager.do_pg_scrub(pool, 0, 'deep-scrub')
+
+        # verify consistent
+        manager.with_pg_state(pool, 0, lambda s: 'inconsistent' not in s)
+
+        log.info("done")
+
+
+def hinfoerr(manager, victim, pool, obj):
+    """
+    Cause an error in the hinfo_key by removing that attribute from obj.
+    """
+    log.info("remove the hinfo_key")
+    manager.objectstore_tool(pool,
+                             osd=victim,
+                             object_name=obj,
+                             options='',
+                             args='rm-attr hinfo_key')
+
+
+def repair_test_erasure_code(manager, corrupter, victim, scrub_type):
+    """
+    Creates an object in an erasure coded pool ("repair_pool_3"), corrupts it,
+    scrubs it, and verifies that the pool is inconsistent.  It then repairs
+    the pool, rescrubs it, and verifies that the pool is consistent
+
+    :param corrupter: error generating function.
+    :param victim: osd to corrupt, handed to corrupter (task passes 'primary')
+    :param scrub_type: regular scrub or deep-scrub
+    """
+    pool = "repair_pool_3"
+    manager.wait_for_clean()
+    with manager.pool(pool_name=pool, pg_num=1,
+                          erasure_code_profile_name='default'):
+
+        log.info("starting repair test for erasure code")
+
+        # create object
+        log.info("doing put")
+        manager.do_put(pool, 'repair_test_obj', '/etc/hosts')
+
+        # corrupt object
+        log.info("corrupting object")
+        corrupter(manager, victim, pool, 'repair_test_obj')
+
+        # verify inconsistent
+        log.info("scrubbing")
+        manager.do_pg_scrub(pool, 0, scrub_type)
+
+        manager.with_pg_state(pool, 0, lambda s: 'inconsistent' in s)
+
+        # repair
+        log.info("repairing")
+        manager.do_pg_scrub(pool, 0, "repair")
+
+        log.info("re-scrubbing")
+        manager.do_pg_scrub(pool, 0, scrub_type)
+
+        # verify consistent
+        manager.with_pg_state(pool, 0, lambda s: 'inconsistent' not in s)
+        log.info("done")
+
+
+def task(ctx, config):
+    """
+    Test [deep] repair in several situations:
+      Repair [Truncate, Data EIO, MData EIO] on [Primary|Replica]
+
+    The config should be as follows:
+
+      Must include the log-whitelist below
+      Must enable filestore_debug_inject_read_err config
+
+    example:
+
+    tasks:
+    - chef:
+    - install:
+    - ceph:
+        log-whitelist:
+          - 'candidate had a stat error'
+          - 'candidate had a read error'
+          - 'deep-scrub 0 missing, 1 inconsistent objects'
+          - 'deep-scrub 0 missing, 4 inconsistent objects'
+          - 'deep-scrub [0-9]+ errors'
+          - '!= omap_digest'
+          - '!= data_digest'
+          - 'repair 0 missing, 1 inconsistent objects'
+          - 'repair 0 missing, 4 inconsistent objects'
+          - 'repair [0-9]+ errors, [0-9]+ fixed'
+          - 'scrub 0 missing, 1 inconsistent objects'
+          - 'scrub [0-9]+ errors'
+          - 'size 1 != size'
+          - 'attr name mismatch'
+          - 'Regular scrub request, deep-scrub details will be lost'
+        conf:
+          osd:
+            filestore debug inject read err: true
+    - repair_test:
+
+    """
+    if config is None:
+        config = {}
+    assert isinstance(config, dict), \
+        'repair_test task only accepts a dict for config'
+
+    manager = ctx.managers['ceph']
+    manager.wait_for_all_osds_up()
+    # only the scrubs requested by the tests should run; restored in finally
+    manager.raw_cluster_cmd('osd', 'set', 'noscrub')
+    manager.raw_cluster_cmd('osd', 'set', 'nodeep-scrub')
+    try:
+        repair_test_1(manager, mdataerr, choose_primary, "scrub")
+        repair_test_1(manager, mdataerr, choose_replica, "scrub")
+        repair_test_1(manager, dataerr, choose_primary, "deep-scrub")
+        repair_test_1(manager, dataerr, choose_replica, "deep-scrub")
+        repair_test_1(manager, trunc, choose_primary, "scrub")
+        repair_test_1(manager, trunc, choose_replica, "scrub")
+        repair_test_2(ctx, manager, config, choose_primary)
+        repair_test_2(ctx, manager, config, choose_replica)
+
+        repair_test_erasure_code(manager, hinfoerr, 'primary', "deep-scrub")
+    finally:  # do not leave the cluster flags set when a repair test fails
+        manager.raw_cluster_cmd('osd', 'unset', 'noscrub')
+        manager.raw_cluster_cmd('osd', 'unset', 'nodeep-scrub')
diff --git a/src/ceph/qa/tasks/resolve_stuck_peering.py b/src/ceph/qa/tasks/resolve_stuck_peering.py
new file mode 100644
index 0000000..bdf86e9
--- /dev/null
+++ b/src/ceph/qa/tasks/resolve_stuck_peering.py
@@ -0,0 +1,112 @@
+"""
+Resolve stuck peering
+"""
+import logging
+import time
+
+from teuthology import misc as teuthology
+from util.rados import rados
+
+log = logging.getLogger(__name__)
+
+def task(ctx, config):
+    """
+    Test resolving stuck peering: a 1-pg pool is written while only its
+    primary is up, then the primary is killed and marked lost and the pg
+    must leave the down state.  Requires 3 osds on a single test node.
+    """
+    if config is None:
+        config = {}
+    assert isinstance(config, dict), \
+        'Resolve stuck peering only accepts a dict for config'
+
+    manager = ctx.managers['ceph']
+
+    while len(manager.get_osd_status()['up']) < 3:
+        time.sleep(10)
+
+    manager.wait_for_clean()
+
+    dummyfile = '/etc/fstab'
+    dummyfile1 = '/etc/resolv.conf'
+
+    # create 1 PG pool
+    pool = 'foo'
+    log.info('creating pool %s', pool)
+    manager.raw_cluster_cmd('osd', 'pool', 'create', '%s' % pool, '1')
+
+    # set min_size of the pool to 1
+    # so that we can continue with I/O
+    # when 2 osds are down
+    manager.set_pool_property(pool, "min_size", 1)
+
+    osds = [0, 1, 2]
+
+    primary = manager.get_pg_primary(pool, 0)
+    log.info("primary osd is %d", primary)
+
+    others = list(osds)
+    others.remove(primary)
+
+    log.info('writing initial objects')
+    first_mon = teuthology.get_first_mon(ctx, config)
+    (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys()
+    # create a few objects
+    for i in range(100):
+        rados(ctx, mon, ['-p', pool, 'put', 'existing_%d' % i, dummyfile])
+
+    manager.wait_for_clean()
+
+    # kill other osds except primary
+    log.info('killing other osds except primary')
+    for i in others:
+        manager.kill_osd(i)
+    for i in others:
+        manager.mark_down_osd(i)
+
+    # these writes happen while only the primary is up
+    for i in range(100):
+        rados(ctx, mon, ['-p', pool, 'put', 'new_%d' % i, dummyfile1])
+
+    # kill primary osd
+    manager.kill_osd(primary)
+    manager.mark_down_osd(primary)
+
+    # revive other 2 osds
+    for i in others:
+        manager.revive_osd(i)
+
+    # make sure that pg is down
+    # Assuming pg number for single pg pool will start from 0
+    pgnum = 0
+    pgstr = manager.get_pgid(pool, pgnum)
+    stats = manager.get_single_pg_stats(pgstr)
+    log.info('pg %s state: %s', pgstr, stats['state'])
+
+    timeout = 60
+    start = time.time()
+
+    while 'down' not in stats['state']:
+        assert time.time() - start < timeout, \
+            'failed to reach down state before timeout expired'
+        time.sleep(1)  # poll instead of spinning on the cluster
+        stats = manager.get_single_pg_stats(pgstr)
+
+    # mark primary as lost
+    manager.raw_cluster_cmd('osd', 'lost', '%d' % primary,
+                            '--yes-i-really-mean-it')
+
+    # expect the pg status to be active+undersized+degraded
+    # pg should recover and become active+clean within timeout
+    stats = manager.get_single_pg_stats(pgstr)
+    log.info('pg %s state: %s', pgstr, stats['state'])
+
+    timeout = 10
+    start = time.time()
+
+    while manager.get_num_down():
+        assert time.time() - start < timeout, \
+            'failed to recover before timeout expired'
+        time.sleep(1)  # poll instead of spinning on the cluster
+
+    manager.revive_osd(primary)
diff --git a/src/ceph/qa/tasks/rest_api.py b/src/ceph/qa/tasks/rest_api.py
new file mode 100644
index 0000000..e86f77e
--- /dev/null
+++ b/src/ceph/qa/tasks/rest_api.py
@@ -0,0 +1,184 @@
+"""
+Rest Api
+"""
+import logging
+import contextlib
+import time
+
+from teuthology import misc as teuthology
+from teuthology import contextutil
+from teuthology.orchestra import run
+from teuthology.orchestra.daemon import DaemonGroup
+
+log = logging.getLogger(__name__)
+
+
+@contextlib.contextmanager
+def run_rest_api_daemon(ctx, api_clients):
+    """
+    Start a ceph-rest-api daemon for every client role in api_clients, wait
+    until each answers on http://localhost:5000, and stop them all on exit.
+    Raises RuntimeError if a daemon cannot be contacted.
+    """
+    if not hasattr(ctx, 'daemons'):
+        ctx.daemons = DaemonGroup()
+    remotes = ctx.cluster.only(teuthology.is_type('client')).remotes
+    for rems, roles in remotes.iteritems():
+        for whole_id_ in roles:
+            if whole_id_ in api_clients:
+                # roles look like 'client.<id>'; keep only the id
+                id_ = whole_id_[len('client.'):]
+                cl_rest_id = 'client.rest{id}'.format(id=id_)
+                run_cmd = [
+                    'sudo',
+                    'daemon-helper',
+                    'kill',
+                    'ceph-rest-api',
+                    '-n',
+                    cl_rest_id, ]
+                ctx.daemons.add_daemon(rems, 'restapi',
+                    cl_rest_id,
+                    args=run_cmd,
+                    logger=log.getChild(cl_rest_id),
+                    stdin=run.PIPE,
+                    wait=False,
+                    )
+                # poll the status url (up to 11 tries, 5s apart)
+                probe_cmd = [
+                    'wget',
+                    '-O',
+                    '/dev/null',
+                    '-q',
+                    'http://localhost:5000/api/v0.1/status'
+                ]
+                for i in range(1, 12):
+                    log.info('testing for ceph-rest-api try {0}'.format(i))
+                    proc = rems.run(args=probe_cmd, check_status=False)
+                    if proc.exitstatus == 0:
+                        break
+                    time.sleep(5)
+                if proc.exitstatus != 0:
+                    raise RuntimeError('Cannot contact ceph-rest-api')
+    try:
+        yield
+
+    finally:
+        # TODO: destroy daemons started -- modify iter_daemons_of_role
+        # (for now every daemon of type 'restapi' is stopped)
+        teuthology.stop_daemons_of_type(ctx, 'restapi')
+
+@contextlib.contextmanager
+def task(ctx, config):
+    """
+    Start up rest-api.
+
+    To start on all clients::
+
+        tasks:
+        - ceph:
+        - rest-api:
+
+    To only run on certain clients::
+
+        tasks:
+        - ceph:
+        - rest-api: [client.0, client.3]
+
+    or
+
+        tasks:
+        - ceph:
+        - rest-api:
+            client.0:
+            client.3:
+
+    The general flow of things here is:
+        1. Find clients on which rest-api is supposed to run (api_clients)
+        2. Generate keyring values
+        3. Start up ceph-rest-api daemons
+    On cleanup:
+        4. Stop the daemons
+        (keyring files and ceph.conf additions are left in place)
+    """
+    api_clients = []
+    remotes = ctx.cluster.only(teuthology.is_type('client')).remotes
+    log.info(remotes)
+    if config is None:
+        api_clients = ['client.{id}'.format(id=id_)
+            for id_ in teuthology.all_roles_of_type(ctx.cluster, 'client')]
+    else:
+        api_clients = config
+    log.info(api_clients)
+    testdir = teuthology.get_testdir(ctx)
+    coverage_dir = '{tdir}/archive/coverage'.format(tdir=testdir)
+    for rems, roles in remotes.iteritems():
+        for whole_id_ in roles:
+            if whole_id_ in api_clients:
+                id_ = whole_id_[len('client.'):]
+                keyring = '/etc/ceph/ceph.client.rest{id}.keyring'.format(
+                        id=id_)
+                rems.run(
+                    args=[
+                        'sudo',
+                        'adjust-ulimits',
+                        'ceph-coverage',
+                        coverage_dir,
+                        'ceph-authtool',
+                        '--create-keyring',
+                        '--gen-key',
+                        '--name=client.rest{id}'.format(id=id_),
+                        '--set-uid=0',
+                        '--cap', 'mon', 'allow *',
+                        '--cap', 'osd', 'allow *',
+                        '--cap', 'mds', 'allow',
+                        keyring,
+                        run.Raw('&&'),
+                        'sudo',
+                        'chmod',
+                        '0644',
+                        keyring,
+                        ],
+                    )
+                rems.run(
+                    args=[
+                        'sudo',
+                        'sh',
+                        '-c',
+                        run.Raw("'"),
+                        "echo",
+                        '[client.rest{id}]'.format(id=id_),
+                        run.Raw('>>'),
+                        "/etc/ceph/ceph.conf",
+                        run.Raw("'")
+                        ]
+                    )
+                rems.run(
+                    args=[
+                        'sudo',
+                        'sh',
+                        '-c',
+                        run.Raw("'"),
+                        'echo',
+                        'restapi',
+                        'keyring',
+                        '=',
+                        '/etc/ceph/ceph.client.rest{id}.keyring'.format(id=id_),
+                        run.Raw('>>'),
+                        '/etc/ceph/ceph.conf',
+                        run.Raw("'"),
+                        ]
+                    )
+                rems.run(
+                    args=[
+                        'sudo',
+                        'ceph',
+                        'auth',
+                        'import',
+                        '-i',
+                        '/etc/ceph/ceph.client.rest{id}.keyring'.format(id=id_),
+                    ]
+                )
+    with contextutil.nested(
+            lambda: run_rest_api_daemon(ctx=ctx, api_clients=api_clients),):
+        yield
+
diff --git a/src/ceph/qa/tasks/restart.py b/src/ceph/qa/tasks/restart.py
new file mode 100644
index 0000000..697345a
--- /dev/null
+++ b/src/ceph/qa/tasks/restart.py
@@ -0,0 +1,163 @@
+"""
+Daemon restart
+"""
+import logging
+import pipes
+
+from teuthology import misc as teuthology
+from teuthology.orchestra import run as tor
+
+from teuthology.orchestra import run
+log = logging.getLogger(__name__)
+
+def restart_daemon(ctx, config, role, id_, *args):
+    """
+    Wait for <role>.<id_> to exit, then restart it; args are key/value pairs
+    """
+    log.info('Restarting {r}.{i} daemon...'.format(r=role, i=id_))
+    daemon = ctx.daemons.get_daemon(role, id_)
+    log.debug('Waiting for exit of {r}.{i} daemon...'.format(r=role, i=id_))
+    try:
+        daemon.wait_for_exit()
+    except tor.CommandFailedError as err:
+        log.debug('Command Failed: {e}'.format(e=err))
+    if not args:
+        log.debug('Doing restart of {r}.{i} daemon...'.format(r=role, i=id_))
+        daemon.restart()
+        return
+    opts = ['--{k}={v}'.format(k=k, v=v) for k, v in zip(args[0::2], args[1::2])]
+    log.debug('Doing restart of {r}.{i} daemon with args: {a}...'.format(r=role, i=id_, a=opts))
+    daemon.restart_with_args(opts)
+
+def get_tests(ctx, config, role, remote, testdir):
+    """
+    Fetch qa/workunits at the configured ref into srcdir, run make if there
+    is a Makefile, and return (srcdir, sorted executable file paths).
+    """
+    srcdir = '{tdir}/restart.{role}'.format(tdir=testdir, role=role)
+
+    refspec = 'HEAD'
+    for key in ('tag', 'sha1', 'branch'):  # later keys win: branch first
+        if config.get(key) is not None:
+            refspec = config[key]
+    log.info('Pulling restart qa/workunits from ref %s', refspec)
+
+    remote.run(
+        logger=log.getChild(role),
+        args=[
+            'mkdir', '--', srcdir,
+            run.Raw('&&'),
+            'git',
+            'archive',
+            '--remote=git://git.ceph.com/ceph.git',
+            '%s:qa/workunits' % refspec,
+            run.Raw('|'),
+            'tar',
+            '-C', srcdir,
+            '-x',
+            '-f-',
+            run.Raw('&&'),
+            'cd', '--', srcdir,
+            run.Raw('&&'),
+            'if', 'test', '-e', 'Makefile', run.Raw(';'), 'then', 'make', run.Raw(';'), 'fi',
+            run.Raw('&&'),
+            'find', '-executable', '-type', 'f', '-printf', r'%P\0',
+            run.Raw('>{tdir}/restarts.list'.format(tdir=testdir)),
+            ],
+        )
+    listing = teuthology.get_file(remote, '{tdir}/restarts.list'.format(tdir=testdir))
+    # entries are NUL-terminated, so drop the empty string after the last NUL
+    restarts = sorted(name for name in listing.split('\0') if name)
+    return (srcdir, restarts)
+
+def task(ctx, config):
+    """
+    Execute commands and allow daemon restart with config options.
+    Each process executed can output to stdout restart commands of the form:
+        restart <role> <id> <conf_key1> <conf_value1> <conf_key2> <conf_value2>
+    This will restart the daemon <role>.<id> with the specified config values once
+    by modifying the conf file with those values, and then replacing the old conf file
+    once the daemon is restarted.
+    This task does not kill a running daemon, it assumes the daemon will abort on an
+    assert specified in the config.
+
+        tasks:
+        - install:
+        - ceph:
+        - restart:
+            exec:
+              client.0:
+                - test_backtraces.py
+
+    """
+    assert isinstance(config, dict), "restart task got invalid config"
+    assert 'exec' in config, "config requires exec key with <role>: <command> entries"
+
+    testdir = teuthology.get_testdir(ctx)
+    # (role, remote, srcdir) per role whose tests were fetched, for cleanup
+    fetched = []
+    try:
+        for role, commands in config['exec'].iteritems():
+            log.info('restart for role {r}'.format(r=role))
+            (remote,) = ctx.cluster.only(role).remotes.iterkeys()
+            srcdir, restarts = get_tests(ctx, config, role, remote, testdir)
+            fetched.append((role, remote, srcdir))
+            log.info('Running command on role %s host %s', role, remote.name)
+            spec = '{spec}'.format(spec=commands[0])
+            log.info('Restarts list: %s', restarts)
+            log.info('Spec is %s', spec)
+            to_run = [w for w in restarts if w == commands or w.find(spec) != -1]
+            log.info('To run: %s', to_run)
+            for c in to_run:
+                log.info('Running restart script %s...', c)
+                args = [run.Raw('TESTDIR="{tdir}"'.format(tdir=testdir))]
+                env = config.get('env')
+                if env is not None:
+                    for var, val in env.iteritems():
+                        quoted_val = pipes.quote(val)
+                        env_arg = '{var}={val}'.format(var=var, val=quoted_val)
+                        args.append(run.Raw(env_arg))
+                args.extend([
+                            'adjust-ulimits',
+                            'ceph-coverage',
+                            '{tdir}/archive/coverage'.format(tdir=testdir),
+                            '{srcdir}/{c}'.format(
+                                srcdir=srcdir,
+                                c=c,
+                                ),
+                            ])
+                proc = remote.run(
+                    args=args,
+                    stdout=tor.PIPE,
+                    stdin=tor.PIPE,
+                    stderr=log,
+                    wait=False,
+                    )
+                log.info('waiting for a command from script')
+                while True:
+                    l = proc.stdout.readline()
+                    if not l:
+                        break
+                    log.debug('script command: {c}'.format(c=l))
+                    cmd = l.strip().split(' ')
+                    if cmd[0] == "done":
+                        break
+                    assert cmd[0] == 'restart', "script sent invalid command request to restart task"
+                    # cmd should be: restart <role> <id> <conf_key1> <conf_value1> <conf_key2> <conf_value2>
+                    # or to clear, just: restart <role> <id>
+                    restart_daemon(ctx, config, cmd[1], cmd[2], *cmd[3:])
+                    proc.stdin.writelines(['restarted\n'])
+                    proc.stdin.flush()
+                try:
+                    proc.wait()
+                except tor.CommandFailedError:
+                    raise Exception('restart task got non-zero exit status from script: {s}'.format(s=c))
+    finally:
+        for role, remote, srcdir in fetched:
+            log.info('Finishing restart on %s...', role)
+            remote.run(
+                logger=log.getChild(role),
+                args=[
+                    'rm', '-rf', '--', '{tdir}/restarts.list'.format(tdir=testdir), srcdir,
+                    ],
+                )
diff --git a/src/ceph/qa/tasks/rgw.py b/src/ceph/qa/tasks/rgw.py
new file mode 100644
index 0000000..cec0b64
--- /dev/null
+++ b/src/ceph/qa/tasks/rgw.py
@@ -0,0 +1,241 @@
+"""
+rgw routines
+"""
+import argparse
+import contextlib
+import json
+import logging
+import os
+import errno
+import util.rgw as rgw_utils
+
+from teuthology.orchestra import run
+from teuthology import misc as teuthology
+from teuthology import contextutil
+from teuthology.orchestra.run import CommandFailedError
+from util.rgw import rgwadmin, wait_for_radosgw
+from util.rados import (rados, create_ec_pool,
+                                        create_replicated_pool,
+                                        create_cache_pool)
+
+log = logging.getLogger(__name__)
+
+@contextlib.contextmanager
+def start_rgw(ctx, config, clients):
+    """
+    Start radosgw for each role in clients, wait until every endpoint accepts connections, then yield; on exit stop the daemons and remove their opslog sockets.
+    """
+    log.info('Starting rgw...')
+    testdir = teuthology.get_testdir(ctx)
+    for client in clients:
+        (remote,) = ctx.cluster.only(client).remotes.iterkeys()
+        cluster_name, daemon_type, client_id = teuthology.split_role(client)
+        client_with_id = daemon_type + '.' + client_id
+        client_with_cluster = cluster_name + '.' + client_with_id
+
+        client_config = config.get(client)
+        if client_config is None:
+            client_config = {}
+        log.info("rgw %s config is %s", client, client_config)
+        cmd_prefix = [
+            'sudo',
+            'adjust-ulimits',
+            'ceph-coverage',
+            '{tdir}/archive/coverage'.format(tdir=testdir),
+            'daemon-helper',
+            'term',
+            ]
+
+        rgw_cmd = ['radosgw']
+
+        log.info("Using %s as radosgw frontend", ctx.rgw.frontend)
+
+        host, port = ctx.rgw.role_endpoints[client]
+        rgw_cmd.extend([
+            '--rgw-frontends',
+            '{frontend} port={port}'.format(frontend=ctx.rgw.frontend, port=port),
+            '-n', client_with_id,
+            '--cluster', cluster_name,
+            '-k', '/etc/ceph/{client_with_cluster}.keyring'.format(client_with_cluster=client_with_cluster),
+            '--log-file',
+            '/var/log/ceph/rgw.{client_with_cluster}.log'.format(client_with_cluster=client_with_cluster),
+            '--rgw_ops_log_socket_path',
+            '{tdir}/rgw.opslog.{client_with_cluster}.sock'.format(tdir=testdir,
+                                                     client_with_cluster=client_with_cluster),
+            '--foreground',
+            run.Raw('|'),
+            'sudo',
+            'tee',
+            '/var/log/ceph/rgw.{client_with_cluster}.stdout'.format(
+                                                       client_with_cluster=client_with_cluster),
+            run.Raw('2>&1'),
+            ])
+
+        if client_config.get('valgrind'):
+            cmd_prefix = teuthology.get_valgrind_args(
+                testdir,
+                client_with_cluster,
+                cmd_prefix,
+                client_config.get('valgrind')
+                )
+
+        run_cmd = list(cmd_prefix)
+        run_cmd.extend(rgw_cmd)
+
+        ctx.daemons.add_daemon(
+            remote, 'rgw', client_with_id,
+            cluster=cluster_name,
+            args=run_cmd,
+            logger=log.getChild(client),
+            stdin=run.PIPE,
+            wait=False,
+            )
+
+    # XXX: add_daemon() doesn't let us wait until radosgw finishes startup. Poll exactly the roles started above (clients), not whatever keys config holds.
+    for client in clients:
+        host, port = ctx.rgw.role_endpoints[client]
+        endpoint = 'http://{host}:{port}/'.format(host=host, port=port)
+        log.info('Polling {client} until it starts accepting connections on {endpoint}'.format(client=client, endpoint=endpoint))
+        wait_for_radosgw(endpoint)
+
+    try:
+        yield
+    finally:
+        for client in clients:
+            cluster_name, daemon_type, client_id = teuthology.split_role(client)
+            client_with_id = daemon_type + '.' + client_id
+            client_with_cluster = cluster_name + '.' + client_with_id
+            ctx.daemons.get_daemon('rgw', client_with_id, cluster_name).stop()
+            ctx.cluster.only(client).run(
+                args=[
+                    'rm',
+                    '-f',
+                    '{tdir}/rgw.opslog.{client}.sock'.format(tdir=testdir,
+                                                             client=client_with_cluster),
+                    ],
+                )
+
+def assign_ports(ctx, config):
+    """
+    Map each role named in config to (host part of remote.name after '@', port); ports count up from 7280.
+    """
+    next_port = 7280
+    endpoints = {}
+    for remote, host_roles in ctx.cluster.remotes.iteritems():
+        for role in host_roles:
+            if role not in config:
+                continue
+            endpoints[role] = (remote.name.split('@')[1], next_port)
+            next_port += 1
+    return endpoints
+
+@contextlib.contextmanager
+def create_pools(ctx, clients):
+    """
+    Create the '.rgw.buckets' data pool through each client's remote, then yield.
+    The pool is erasure coded when ctx.rgw.ec_data_pool is set and replicated
+    otherwise; ctx.rgw.cache_pools additionally adds a '.rgw.buckets.cache' pool.
+    """
+    log.info('Creating data pools')
+    pool = '.rgw.buckets'
+    for client in clients:
+        log.debug("Obtaining remote for client {}".format(client))
+        (remote,) = ctx.cluster.only(client).remotes.iterkeys()
+        cluster_name, _, _ = teuthology.split_role(client)
+        if not ctx.rgw.ec_data_pool:
+            create_replicated_pool(remote, pool, 64, cluster_name, 'rgw')
+        else:
+            create_ec_pool(remote, pool, client, 64, ctx.rgw.erasure_code_profile, cluster_name, 'rgw')
+        if ctx.rgw.cache_pools:
+            create_cache_pool(remote, pool, pool + '.cache', 64, 64*1024*1024, cluster_name)
+    log.debug('Pools created')
+    yield
+
+@contextlib.contextmanager
+def configure_compression(ctx, clients, compression):
+    """ apply the given compression type to default-placement of the 'default' zone for each client, then yield """
+    log.info('Configuring compression type = %s', compression)
+    modify_cmd = [
+        'zone', 'placement', 'modify', '--rgw-zone', 'default',
+        '--placement-id', 'default-placement',
+        '--compression', compression,
+    ]
+    for client in clients:
+        # XXX: the 'default' zone and zonegroup aren't created until we run RGWRados::init_complete().
+        # issue a 'radosgw-admin user list' command to trigger this
+        rgwadmin(ctx, client, cmd=['user', 'list'], check_status=True)
+        rgwadmin(ctx, client, cmd=list(modify_cmd), check_status=True)
+    yield
+
+@contextlib.contextmanager
+def task(ctx, config):
+    """
+    For example, to run rgw on all clients::
+
+        tasks:
+        - ceph:
+        - rgw:
+
+    To only run on certain clients::
+
+        tasks:
+        - ceph:
+        - rgw: [client.0, client.3]
+
+    or
+
+        tasks:
+        - ceph:
+        - rgw:
+            client.0:
+            client.3:
+
+    To run radosgw through valgrind::
+
+        tasks:
+        - ceph:
+        - rgw:
+            client.0:
+              valgrind: [--tool=memcheck]
+            client.3:
+              valgrind: [--tool=memcheck]
+    """
+    if config is None:
+        config = dict(('client.{id}'.format(id=id_), None)
+                      for id_ in teuthology.all_roles_of_type(
+                          ctx.cluster, 'client'))
+    elif isinstance(config, list):
+        config = dict((name, None) for name in config)
+
+    clients = config.keys() # role list is taken before overrides are merged below; see http://tracker.ceph.com/issues/20417
+
+    overrides = ctx.config.get('overrides', {})
+    teuthology.deep_merge(config, overrides.get('rgw', {}))
+
+    role_endpoints = assign_ports(ctx, config)
+    ctx.rgw = argparse.Namespace()
+    ctx.rgw.role_endpoints = role_endpoints
+
+    ctx.rgw.ec_data_pool = bool(config.pop('ec-data-pool', False))  # option keys are popped out of config before it is stored on ctx.rgw
+    ctx.rgw.erasure_code_profile = config.pop('erasure_code_profile', {})
+    ctx.rgw.cache_pools = bool(config.pop('cache-pools', False))
+    ctx.rgw.frontend = config.pop('frontend', 'civetweb')
+    ctx.rgw.compression_type = config.pop('compression type', None)
+    ctx.rgw.config = config
+
+    log.debug("config is {}".format(config))
+    log.debug("client list is {}".format(clients))
+    subtasks = [
+        lambda: create_pools(ctx=ctx, clients=clients),
+    ]
+    if ctx.rgw.compression_type:
+        subtasks.extend([
+            lambda: configure_compression(ctx=ctx, clients=clients,
+                                          compression=ctx.rgw.compression_type),
+        ])
+    subtasks.extend([
+        lambda: start_rgw(ctx=ctx, config=config, clients=clients),
+    ])
+
+    with contextutil.nested(*subtasks):  # subtasks are entered in list order: pools, optional compression, then the daemons
+        yield
diff --git a/src/ceph/qa/tasks/rgw_logsocket.py b/src/ceph/qa/tasks/rgw_logsocket.py
new file mode 100644
index 0000000..6f49b00
--- /dev/null
+++ b/src/ceph/qa/tasks/rgw_logsocket.py
@@ -0,0 +1,161 @@
+"""
+rgw s3tests logging wrappers
+"""
+from cStringIO import StringIO
+from configobj import ConfigObj
+import contextlib
+import logging
+import s3tests
+
+from teuthology import misc as teuthology
+from teuthology import contextutil
+
+log = logging.getLogger(__name__)
+
+
+@contextlib.contextmanager
+def download(ctx, config):
+    """
+    Delegate to s3tests.download(ctx, config) and return whatever it returns
+    """
+    return s3tests.download(ctx, config)  # NOTE(review): no yield here; contextmanager needs a generator-iterator -- confirm what s3tests.download returns
+
+def _config_user(s3tests_conf, section, user):
+    """
+    Forward to s3tests._config_user(s3tests_conf, section, user) and return its result
+    """
+    return s3tests._config_user(s3tests_conf, section, user)
+
+@contextlib.contextmanager
+def create_users(ctx, config):
+    """
+    Delegate to s3tests.create_users(ctx, config) and return whatever it returns
+    """
+    return s3tests.create_users(ctx, config)  # NOTE(review): no yield here; contextmanager needs a generator-iterator -- confirm what s3tests.create_users returns
+
+@contextlib.contextmanager
+def configure(ctx, config):
+    """
+    Delegate to s3tests.configure(ctx, config) and return whatever it returns
+    """
+    return s3tests.configure(ctx, config)  # NOTE(review): no yield here; contextmanager needs a generator-iterator -- confirm what s3tests.configure returns
+
+@contextlib.contextmanager
+def run_tests(ctx, config):
+    """
+    Run one s3 test per client, then check that the rgw opslog socket has data.
+
+    For every client in config, 'extra_args' is overwritten so that only
+    test_bucket_list_return_data is requested from s3tests.run_tests(). The
+    client's opslog unix socket is then read with netcat and must yield more
+    than 100 bytes.
+
+    :param ctx: teuthology run context
+    :param config: dict mapping client role -> per-client config dict (or None)
+    :raises AssertionError: if config is not a dict or a socket returned too
+                            little data
+    """
+    assert isinstance(config, dict)
+    testdir = teuthology.get_testdir(ctx)
+    for client in config.keys():
+        if config[client] is None:
+            # a list-style task config leaves the per-client value as None
+            config[client] = {}
+        config[client]['extra_args'] = [
+            's3tests.functional.test_s3:test_bucket_list_return_data',
+        ]
+
+    # NOTE(review): the result of this call is discarded; confirm that
+    # s3tests.run_tests() runs the tests eagerly rather than lazily.
+    s3tests.run_tests(ctx, config)
+
+    for client in config.iterkeys():
+        # socket name must match --rgw_ops_log_socket_path set by the rgw task
+        cluster_name, daemon_type, client_id = teuthology.split_role(client)
+        client_with_cluster = '.'.join((cluster_name, daemon_type, client_id))
+        netcat_out = StringIO()  # fresh buffer so each client is checked alone
+        ctx.cluster.only(client).run(
+            args=[
+                'netcat',
+                '-w', '5',
+                '-U', '{tdir}/rgw.opslog.{client}.sock'.format(
+                    tdir=testdir, client=client_with_cluster),
+                ],
+            stdout=netcat_out,
+        )
+        out = netcat_out.getvalue()
+        assert len(out) > 100
+        log.info('Received %s', out)
+    yield
+
+
+@contextlib.contextmanager
+def task(ctx, config):
+    """
+    Run an s3-tests case against rgw and verify the opslog socket returns data.
+
+    Must restrict testing to a particular client::
+
+        tasks:
+        - ceph:
+        - rgw: [client.0]
+        - rgw-logsocket: [client.0]
+
+    The dictionary form is accepted as well::
+
+        tasks:
+        - ceph:
+        - rgw: [client.0]
+        - rgw-logsocket:
+            client.0:
+
+    Settings under overrides['rgw-logsocket'] are merged into every client
+    section. Any 'extra_args' given is replaced by run_tests().
+    """
+    assert config is None or isinstance(config, list) \
+        or isinstance(config, dict), \
+        "task rgw-logsocket only supports a list or dictionary for configuration"
+    all_clients = ['client.{id}'.format(id=id_)
+                   for id_ in teuthology.all_roles_of_type(ctx.cluster, 'client')]
+    if config is None:
+        config = all_clients
+    if isinstance(config, list):
+        config = dict.fromkeys(config)
+    clients = config.keys()
+
+    overrides = ctx.config.get('overrides', {})
+    # merge each client section, not the top level; None sections (list-style
+    # config) become dicts first so the merge result is not thrown away
+    for client in clients:
+        if config[client] is None:
+            config[client] = {}
+        teuthology.deep_merge(config[client], overrides.get('rgw-logsocket', {}))
+
+    log.debug('config is %s', config)
+
+    s3tests_conf = {}
+    # one ConfigObj per client; every one of them uses port 7280
+    for client in clients:
+        s3tests_conf[client] = ConfigObj(
+            indent_type='',
+            infile={
+                'DEFAULT': {'port': 7280, 'is_secure': 'no'},
+                'fixtures': {},
+                's3 main': {},
+                's3 alt': {},
+                }
+            )
+
+    with contextutil.nested(
+        lambda: download(ctx=ctx, config=config),
+        lambda: create_users(ctx=ctx, config=dict(
+                clients=clients,
+                s3tests_conf=s3tests_conf,
+                )),
+        lambda: configure(ctx=ctx, config=dict(
+                clients=config,
+                s3tests_conf=s3tests_conf,
+                )),
+        lambda: run_tests(ctx=ctx, config=config),
+        ):
+        yield
diff --git a/src/ceph/qa/tasks/rgw_multi b/src/ceph/qa/tasks/rgw_multi
new file mode 120000
index 0000000..abfc703
--- /dev/null
+++ b/src/ceph/qa/tasks/rgw_multi
@@ -0,0 +1 @@
+../../src/test/rgw/rgw_multi
+\ No newline at end of file
diff --git a/src/ceph/qa/tasks/rgw_multisite.py b/src/ceph/qa/tasks/rgw_multisite.py
new file mode 100644
index 0000000..74c1f3f
--- /dev/null
+++ b/src/ceph/qa/tasks/rgw_multisite.py
@@ -0,0 +1,427 @@
+"""
+rgw multisite configuration routines
+"""
+import argparse
+import contextlib
+import logging
+import random
+import string
+from copy import deepcopy
+from util.rgw import rgwadmin, wait_for_radosgw
+from util.rados import create_ec_pool, create_replicated_pool
+from rgw_multi import multisite
+from rgw_multi.zone_rados import RadosZone as RadosZone
+
+from teuthology.orchestra import run
+from teuthology import misc
+from teuthology.exceptions import ConfigError
+from teuthology.task import Task
+
+log = logging.getLogger(__name__)
+
+class RGWMultisite(Task):
+    """
+    Performs rgw multisite configuration to match the given realm definition.
+
+        - rgw-multisite:
+            realm:
+              name: test-realm
+              is_default: true
+
+    List one or more zonegroup definitions. These are provided as json
+    input to `radosgw-admin zonegroup set`, with the exception of these keys:
+
+    * 'is_master' is passed on the command line as --master
+    * 'is_default' is passed on the command line as --default
+    * 'endpoints' given as client names are replaced with actual endpoints
+
+            zonegroups:
+              - name: test-zonegroup
+                api_name: test-api
+                is_master: true
+                is_default: true
+                endpoints: [c1.client.0]
+
+    List each of the zones to be created in this zonegroup.
+
+                zones:
+                  - name: test-zone1
+                    is_master: true
+                    is_default: true
+                    endpoints: [c1.client.0]
+                  - name: test-zone2
+                    is_default: true
+                    endpoints: [c2.client.0]
+
+    A complete example:
+
+        tasks:
+        - install:
+        - ceph: {cluster: c1}
+        - ceph: {cluster: c2}
+        - rgw:
+            c1.client.0:
+            c2.client.0:
+        - rgw-multisite:
+            realm:
+              name: test-realm
+              is_default: true
+            zonegroups:
+              - name: test-zonegroup
+                is_master: true
+                is_default: true
+                zones:
+                  - name: test-zone1
+                    is_master: true
+                    is_default: true
+                    endpoints: [c1.client.0]
+                  - name: test-zone2
+                    is_default: true
+                    endpoints: [c2.client.0]
+
+    """
+    def __init__(self, ctx, config):  # adds no state of its own; defers to Task
+        super(RGWMultisite, self).__init__(ctx, config)
+
+    def setup(self):  # creates realm, zonegroups and zones, then publishes them on self.ctx.rgw_multisite
+        super(RGWMultisite, self).setup()
+
+        overrides = self.ctx.config.get('overrides', {})
+        misc.deep_merge(self.config, overrides.get('rgw-multisite', {}))
+
+        if not self.ctx.rgw:
+            raise ConfigError('rgw-multisite must run after the rgw task')
+        role_endpoints = self.ctx.rgw.role_endpoints
+
+        # construct Clusters and Gateways for each client in the rgw task
+        clusters, gateways = extract_clusters_and_gateways(self.ctx,
+                                                           role_endpoints)
+
+        # get the master zone and zonegroup configuration
+        mz, mzg = extract_master_zone_zonegroup(self.config['zonegroups'])
+        cluster1 = cluster_for_zone(clusters, mz)
+
+        # create the realm and period on the master zone's cluster
+        log.info('creating realm..')
+        realm = create_realm(cluster1, self.config['realm'])
+        period = realm.current_period
+
+        creds = gen_credentials()  # random keys shared by the admin user, every zone and realm pulls below
+
+        # create the master zonegroup and its master zone (from deep copies, so self.config is left intact for the loop below)
+        log.info('creating master zonegroup..')
+        master_zonegroup = create_zonegroup(cluster1, gateways, period,
+                                            deepcopy(mzg))
+        period.master_zonegroup = master_zonegroup
+
+        log.info('creating master zone..')
+        master_zone = create_zone(self.ctx, cluster1, gateways, creds,
+                                  master_zonegroup, deepcopy(mz))
+        master_zonegroup.master_zone = master_zone
+
+        period.update(master_zone, commit=True)
+        restart_zone_gateways(master_zone) # restart with --rgw-zone
+
+        # create the admin user on the master zone
+        log.info('creating admin user..')
+        user_args = ['--display-name', 'Realm Admin', '--system']
+        user_args += creds.credential_args()
+        admin_user = multisite.User('realm-admin')
+        admin_user.create(master_zone, user_args)
+
+        # process 'zonegroups'
+        for zg_config in self.config['zonegroups']:
+            zones_config = zg_config.pop('zones')  # note: this removes 'zones' from self.config
+
+            zonegroup = None
+            for zone_config in zones_config:
+                # get the cluster for this zone
+                cluster = cluster_for_zone(clusters, zone_config)
+
+                if cluster != cluster1: # already created on master cluster
+                    log.info('pulling realm configuration to %s', cluster.name)
+                    realm.pull(cluster, master_zone.gateways[0], creds)
+
+                # use the first zone's cluster to create the zonegroup
+                if not zonegroup:
+                    if zg_config['name'] == master_zonegroup.name:
+                        zonegroup = master_zonegroup
+                    else:
+                        log.info('creating zonegroup..')
+                        zonegroup = create_zonegroup(cluster, gateways,
+                                                     period, zg_config)
+
+                if zone_config['name'] == master_zone.name:
+                    # master zone was already created
+                    zone = master_zone
+                else:
+                    # create the zone and commit the period
+                    log.info('creating zone..')
+                    zone = create_zone(self.ctx, cluster, gateways, creds,
+                                       zonegroup, zone_config)
+                    period.update(zone, commit=True)
+
+                    restart_zone_gateways(zone) # restart with --rgw-zone
+
+        # attach configuration to the ctx for other tasks
+        self.ctx.rgw_multisite = argparse.Namespace()
+        self.ctx.rgw_multisite.clusters = clusters
+        self.ctx.rgw_multisite.gateways = gateways
+        self.ctx.rgw_multisite.realm = realm
+        self.ctx.rgw_multisite.admin_user = admin_user
+
+        log.info('rgw multisite configuration completed')
+
+    def end(self):  # removes the namespace that setup() attached to ctx
+        del self.ctx.rgw_multisite
+
+class Cluster(multisite.Cluster):
+    """ Issues 'radosgw-admin' commands with the rgwadmin() helper """
+    def __init__(self, ctx, name, client):
+        super(Cluster, self).__init__()
+        self.ctx = ctx
+        self.name = name
+        self.client = client
+
+    def admin(self, args = None, **kwargs):
+        """ run radosgw-admin via self.client; returns (output, retcode) and asserts retcode == 0 unless check_retcode=False """
+        # copy first so the '+=' below never extends the caller's own list
+        args = list(args or [])
+        args += ['--cluster', self.name, '--debug-rgw', '0']
+        if kwargs.pop('read_only', False):
+            args += ['--rgw-cache-enabled', 'false']
+        kwargs['decode'] = False
+        check_retcode = kwargs.pop('check_retcode', True)
+        r, s = rgwadmin(self.ctx, self.client, args, **kwargs)
+        if check_retcode:
+            assert r == 0
+        return s, r
+
+class Gateway(multisite.Gateway):
+    """ Controls a radosgw instance using its daemon """
+    def __init__(self, role, remote, daemon, *args, **kwargs):
+        super(Gateway, self).__init__(*args, **kwargs)
+        self.role = role
+        self.remote = remote
+        self.daemon = daemon
+
+    def set_zone(self, zone):
+        """ set the zone (once only) and add its args to the daemon's command line """
+        assert self.zone is None, 'zone can only be set once'
+        self.zone = zone
+        # daemon.restart_with_args() would be perfect for this, except that
+        # radosgw args likely include a pipe and redirect. zone arguments at
+        # the end won't actually apply to radosgw
+        args = self.daemon.command_kwargs.get('args', [])
+        try:
+            pipe = args.index(run.Raw('|'))
+        except ValueError:
+            pipe = len(args)  # no pipe present: the zone args go at the end
+        # insert zone args before the first |
+        args = args[0:pipe] + zone.zone_args() + args[pipe:]
+        self.daemon.command_kwargs['args'] = args
+
+    def start(self, args = None):
+        """ (re)start the daemon and wait for its endpoint; the args parameter is accepted but not used """
+        self.daemon.restart()
+        # wait until startup completes
+        wait_for_radosgw(self.endpoint())
+
+    def stop(self):
+        """ stop the daemon """
+        self.daemon.stop()
+
+def extract_clusters_and_gateways(ctx, role_endpoints):
+    """ build one Cluster per cluster name and one Gateway per radosgw role; returns the (clusters, gateways) dicts """
+    clusters, gateways = {}, {}
+    for role, endpoint in role_endpoints.iteritems():
+        host, port = endpoint
+        cluster_name, daemon_type, client_id = misc.split_role(role)
+        # reuse the Cluster already made for this name, else make one now
+        cluster = clusters.get(cluster_name)
+        if not cluster:
+            cluster = Cluster(ctx, cluster_name, role)
+            clusters[cluster_name] = cluster
+        # look the daemon up under the same id format that rgw.py uses
+        daemon = ctx.daemons.get_daemon('rgw', daemon_type + '.' + client_id, cluster_name)
+        if not daemon:
+            message = 'no daemon for role=%s cluster=%s type=rgw id=%s'
+            raise ConfigError(message % (role, cluster_name, client_id))
+        (remote,) = ctx.cluster.only(role).remotes.keys()
+        gateways[role] = Gateway(role, remote, daemon, host, port, cluster)
+    return clusters, gateways
+
+def create_realm(cluster, config):
+    """
+    create the realm named config['name'] on cluster and attach a first Period
+    """
+    new_realm = multisite.Realm(config['name'])
+    create_args = ['--default'] if config.get('is_default', False) else []
+    new_realm.create(cluster, create_args)
+    new_realm.current_period = multisite.Period(new_realm)
+    return new_realm
+
+def extract_user_credentials(config):
+    """ build multisite.Credentials from config['access_key'] and config['secret_key'] """
+    return multisite.Credentials(config['access_key'], config['secret_key'])
+
+def extract_master_zone(zonegroup_config):
+    """ return the one zone definition flagged 'is_master'; ConfigError if there is none or a second one """
+    found = None
+    for candidate in zonegroup_config['zones']:
+        if candidate.get('is_master', False):
+            if found:
+                # a second master: report both names
+                raise ConfigError('zones %s and %s cannot both set \'is_master\'' % \
+                                  (found['name'], candidate['name']))
+            # remember it, but keep scanning so duplicates are detected
+            found = candidate
+    if found:
+        return found
+    raise ConfigError('one zone must set \'is_master\' in zonegroup %s' % \
+                      zonegroup_config['name'])
+
+def extract_master_zone_zonegroup(zonegroups_config):
+    """ return the (master zone, master zonegroup) definitions """
+    chosen_zone = chosen_zonegroup = None
+    for candidate in zonegroups_config:
+        # called for every zonegroup, master or not, so that each one is
+        # checked for having exactly one master zone
+        candidate_master = extract_master_zone(candidate)
+        if candidate.get('is_master', False):
+            if chosen_zonegroup:
+                raise ConfigError('zonegroups %s and %s cannot both set \'is_master\'' % \
+                                  (chosen_zonegroup['name'], candidate['name']))
+            # keep scanning so a duplicate master zonegroup is detected
+            chosen_zonegroup = candidate
+            chosen_zone = candidate_master
+    if chosen_zonegroup:
+        return chosen_zone, chosen_zonegroup
+    # nothing was flagged as the master zonegroup
+    raise ConfigError('one zonegroup must set \'is_master\'')
+
+def extract_zone_cluster_name(zone_config):
+    """ return the cluster name shared by all of the zone's endpoint roles """
+    roles = zone_config.get('endpoints')
+    if not roles:
+        raise ConfigError('zone %s missing \'endpoints\' list' % \
+                          zone_config['name'])
+    common = None
+    for role in roles:
+        role_cluster, _, _ = misc.split_role(role)
+        if common and common != role_cluster:
+            raise ConfigError('all zone %s endpoints must be in the same cluster' % \
+                              zone_config['name'])
+        if not common:
+            common = role_cluster
+    return common
+
+def cluster_for_zone(clusters, zone_config):
+    """ return the clusters entry keyed by the zone's cluster name; ConfigError if there is none """
+    name = extract_zone_cluster_name(zone_config)
+    try:
+        return clusters[name]
+    except KeyError:
+        raise ConfigError('no cluster %s found' % name)
+
+def gen_access_key():  # 16 characters drawn from A-Z and 0-9 with random.choice
+    return ''.join(random.choice(string.ascii_uppercase + string.digits) for _ in range(16))
+
+def gen_secret():  # 32 characters drawn from A-Z, a-z and 0-9 with random.choice
+    return ''.join(random.choice(string.ascii_uppercase + string.ascii_lowercase + string.digits) for _ in range(32))
+
+def gen_credentials():  # multisite.Credentials built from a fresh access key and secret
+    return multisite.Credentials(gen_access_key(), gen_secret())
+
+def extract_gateway_endpoints(gateways, endpoints_config):
+    """ translate a list of role names into the endpoints of their gateways """
+    resolved = []
+    for role in endpoints_config:
+        try:
+            endpoint = gateways[role].endpoint()
+        except KeyError:
+            raise ConfigError('no radosgw endpoint found for role %s' % role)
+        resolved.append(endpoint)
+    return resolved
+
+def is_default_arg(config):  # pops 'is_default' out of config; ['--default'] if it was truthy, else []
+    return ['--default'] if config.pop('is_default', False) else []
+
+def is_master_arg(config):  # pops 'is_master' out of config; ['--master'] if it was truthy, else []
+    return ['--master'] if config.pop('is_master', False) else []
+
+def create_zonegroup(cluster, gateways, period, config):
+    """ create the zonegroup described by config via `zonegroup set` and append it to period.zonegroups """
+    config.pop('zones', None)  # 'zones' is not input to `zonegroup set`
+    roles = config.get('endpoints')
+    if roles:
+        # client names are swapped for the endpoints of their gateways
+        config['endpoints'] = extract_gateway_endpoints(gateways, roles)
+    new_zonegroup = multisite.ZoneGroup(config['name'], period)
+    # `zonegroup set` needs --default on command line, and 'is_master' in json
+    default_args = is_default_arg(config)
+    new_zonegroup.set(cluster, config, default_args)
+    period.zonegroups.append(new_zonegroup)
+    return new_zonegroup
+
+def create_zone(ctx, cluster, gateways, creds, zonegroup, config):
+    """ create a RadosZone from config, bind its gateways, create its pools and
+    register it with the zonegroup; returns the zone (ConfigError if config has no 'endpoints') """
+    zone = RadosZone(config['name'], zonegroup, cluster)
+
+    # collect Gateways for the zone's endpoints
+    endpoints = config.get('endpoints')
+    if not endpoints:
+        raise ConfigError('no \'endpoints\' for zone %s' % config['name'])
+    zone.gateways = [gateways[role] for role in endpoints]
+    for gateway in zone.gateways:
+        gateway.set_zone(zone)
+
+    # format the gateway endpoints
+    endpoints = [g.endpoint() for g in zone.gateways]
+
+    args = is_default_arg(config)
+    args += is_master_arg(config)
+    args += creds.credential_args()
+    if endpoints:
+        args += ['--endpoints', ','.join(endpoints)]
+    zone.create(cluster, args)
+    zonegroup.zones.append(zone)
+
+    create_zone_pools(ctx, zone)
+    if ctx.rgw.compression_type:
+        configure_zone_compression(zone, ctx.rgw.compression_type)
+
+    zonegroup.zones_by_type.setdefault(zone.tier_type(), []).append(zone)
+
+    if zone.is_read_only():
+        zonegroup.ro_zones.append(zone)
+    else:
+        zonegroup.rw_zones.append(zone)
+
+    return zone
+
+def create_zone_pools(ctx, zone):
+    """ Create the data_pool of every entry in zone.data['placement_pools'] via the zone's first gateway """
+    first_gateway = zone.gateways[0]
+    zone_cluster = zone.cluster
+    for placement in zone.data.get('placement_pools', []):
+        data_pool = placement['val']['data_pool']
+        if not ctx.rgw.ec_data_pool:
+            create_replicated_pool(first_gateway.remote, data_pool, 64, zone_cluster.name, 'rgw')
+            continue
+        create_ec_pool(first_gateway.remote, data_pool, zone.name, 64,
+                       ctx.rgw.erasure_code_profile, zone_cluster.name, 'rgw')
+
+def configure_zone_compression(zone, compression):
+    """ Apply the compression type to default-placement of the given zone """
+    modify_args = ['modify']
+    modify_args += ['--placement-id', 'default-placement']
+    modify_args += ['--compression', compression]
+    zone.json_command(zone.cluster, 'placement', modify_args)
+
+def restart_zone_gateways(zone):  # calls zone.stop() followed by zone.start()
+    zone.stop()
+    zone.start()
+
+task = RGWMultisite
diff --git a/src/ceph/qa/tasks/rgw_multisite_tests.py b/src/ceph/qa/tasks/rgw_multisite_tests.py
new file mode 100644
index 0000000..4e6e2b3
--- /dev/null
+++ b/src/ceph/qa/tasks/rgw_multisite_tests.py
@@ -0,0 +1,91 @@
+"""
+rgw multisite testing
+"""
+import logging
+import sys
+import nose.core
+import nose.config
+
+from teuthology.exceptions import ConfigError
+from teuthology.task import Task
+from teuthology import misc
+
+from rgw_multi import multisite, tests
+
+log = logging.getLogger(__name__)
+
+class RGWMultisiteTests(Task):
+    """
+    Runs the rgw_multi tests against a multisite configuration created by the
+    rgw-multisite task. Tests are run with nose; 'args' (a string or a list) is
+    appended to nose's argv and 'config' is passed as keywords to tests.Config.
+
+        - rgw-multisite-tests:
+            args:
+            - tasks.rgw_multi.tests:test_object_sync
+            config:
+              reconfigure_delay: 60
+
+    """
+    def __init__(self, ctx, config):
+        super(RGWMultisiteTests, self).__init__(ctx, config)
+
+    def setup(self):
+        super(RGWMultisiteTests, self).setup()
+        # fold any 'rgw-multisite-tests' overrides into this task's config
+        overrides = self.ctx.config.get('overrides', {})
+        misc.deep_merge(self.config, overrides.get('rgw-multisite-tests', {}))
+        # the realm under test comes from ctx.rgw_multisite; fail early without it
+        if not self.ctx.rgw_multisite:
+            raise ConfigError('rgw-multisite-tests must run after the rgw-multisite task')
+        realm = self.ctx.rgw_multisite.realm
+        master_zone = realm.meta_master_zone()
+
+        # create the test user on the metadata master zone, with generated keys
+        log.info('creating test user..')
+        user = multisite.User('rgw-multisite-test-user')
+        user.create(master_zone, ['--display-name', 'Multisite Test User',
+                                  '--gen-access-key', '--gen-secret'])
+        # hand the realm, the user and Config(**config) to the test module
+        config = self.config.get('config', {})
+        tests.init_multi(realm, user, tests.Config(**config))
+        tests.realm_meta_checkpoint(realm)
+
+    def begin(self):
+        # extra arguments for nose can be passed as a string or list
+        extra_args = self.config.get('args', [])
+        if not isinstance(extra_args, list):
+            extra_args = [extra_args]
+        argv = [__name__] + extra_args
+
+        log.info("running rgw multisite tests on '%s' with args=%r",
+                 tests.__name__, extra_args)
+
+        # run nose tests in the rgw_multi.tests module, logging nose's output
+        conf = nose.config.Config(stream=get_log_stream(), verbosity=2)
+        result = nose.run(defaultTest=tests.__name__, argv=argv, config=conf)
+        if not result:
+            raise RuntimeError('rgw multisite test failures')
+
+def get_log_stream():
+    """ return a log stream for nose output """
+    # XXX: this is a workaround for IOErrors when nose writes to stderr,
+    # copied from vstart_runner.py
+    class LogStream(object):
+        def __init__(self):
+            self.buffer = ""
+
+        def write(self, data):
+            self.buffer += data
+            if "\n" in self.buffer:
+                lines = self.buffer.split("\n")
+                for line in lines[:-1]:
+                    log.info(line)
+                self.buffer = lines[-1]
+
+        def flush(self):
+            pass
+
+    return LogStream()
+
+task = RGWMultisiteTests  # expose the task class under the module-level name 'task'
diff --git a/src/ceph/qa/tasks/s3a_hadoop.py b/src/ceph/qa/tasks/s3a_hadoop.py
new file mode 100644
index 0000000..c01fe1d
--- /dev/null
+++ b/src/ceph/qa/tasks/s3a_hadoop.py
@@ -0,0 +1,343 @@
+import contextlib
+import logging
+import time
+from teuthology import misc
+from teuthology.orchestra import run
+
+log = logging.getLogger(__name__)
+
+
+@contextlib.contextmanager
+def task(ctx, config):
+    """
+     Run Hadoop S3A tests using Ceph. Usage (every key is optional):
+      -tasks:
+         ceph-ansible:
+         s3a-hadoop:
+           maven-major: 'maven-3' (default)
+           maven-version: '3.3.9' (default)
+           hadoop-version: '2.7.3' (default)
+           bucket-name: 's3atest' (default)
+           access-key: 'anykey' (uses a default value)
+           secret-key: 'secretkey' ( uses a default value)
+    """
+    if config is None:
+        config = {}
+
+    assert isinstance(config, dict), \
+        "task only supports a dictionary for configuration"
+
+    overrides = ctx.config.get('overrides', {})
+    misc.deep_merge(config, overrides.get('s3a-hadoop', {}))
+    testdir = misc.get_testdir(ctx)
+    rgws = ctx.cluster.only(misc.is_type('rgw'))
+    # use the first rgw node to test s3a
+    rgw_node = rgws.remotes.keys()[0]
+    # get versions, bucket name and credentials (all optional, with defaults)
+    maven_major = config.get('maven-major', 'maven-3')
+    maven_version = config.get('maven-version', '3.3.9')
+    hadoop_ver = config.get('hadoop-version', '2.7.3')
+    bucket_name = config.get('bucket-name', 's3atest')
+    access_key = config.get('access-key', 'EGAQRD2ULOIFKFSKCT4F')
+    secret_key = config.get(
+        'secret-key',
+        'zi816w1vZKfaSM85Cl0BxXTwSLyN7zB4RbTswrGb')
+
+    # build the maven tarball URL and the hadoop branch/tag pair to check out
+    apache_maven = 'apache-maven-{maven_version}-bin.tar.gz'.format(
+        maven_version=maven_version)
+    maven_link = 'http://mirror.jax.hugeserver.com/apache/maven/' + \
+        '{maven_major}/{maven_version}/binaries/'.format(maven_major=maven_major, maven_version=maven_version) + apache_maven
+    hadoop_git = 'https://github.com/apache/hadoop'
+    hadoop_rel = 'hadoop-{ver} rel/release-{ver}'.format(ver=hadoop_ver)
+    install_prereq(rgw_node)
+    rgw_node.run(
+        args=[
+            'cd',
+            testdir,
+            run.Raw('&&'),
+            'wget',
+            maven_link,
+            run.Raw('&&'),
+            'tar',
+            '-xvf',
+            apache_maven,
+            run.Raw('&&'),
+            'git',
+            'clone',
+            run.Raw(hadoop_git),
+            run.Raw('&&'),
+            'cd',
+            'hadoop',
+            run.Raw('&&'),
+            'git',
+            'checkout',
+            '-b',
+            run.Raw(hadoop_rel)
+        ]
+    )
+    dnsmasq_name = 's3.ceph.com'
+    configure_s3a(rgw_node, dnsmasq_name, access_key, secret_key, bucket_name, testdir)
+    setup_dnsmasq(rgw_node, dnsmasq_name)
+    fix_rgw_config(rgw_node, dnsmasq_name)
+    setup_user_bucket(rgw_node, dnsmasq_name, access_key, secret_key, bucket_name, testdir)
+    if hadoop_ver.startswith('2.8'):
+        # test all ITtests but skip AWS test using public bucket landsat-pds
+        # which is not available from within this test
+        test_options = '-Dit.test=ITestS3A* -Dit.test=\!ITestS3AAWSCredentialsProvider* -Dparallel-tests -Dscale -Dfs.s3a.scale.test.huge.filesize=128M verify'
+    else:
+        test_options = 'test -Dtest=S3a*,TestS3A*'
+    try:
+        run_s3atest(rgw_node, maven_version, testdir, test_options)
+        yield  # the s3a tests have already finished by the time the task yields
+    finally:
+        log.info("Done s3a testing, Cleaning up")
+        for fil in ['apache*', 'hadoop*', 'venv*', 'create*']:
+            rgw_node.run(args=['rm', run.Raw('-rf'), run.Raw('{tdir}/{file}'.format(tdir=testdir, file=fil))])
+        # restart and let NM restore original config
+        rgw_node.run(args=['sudo', 'systemctl', 'stop', 'dnsmasq'])
+        rgw_node.run(args=['sudo', 'systemctl', 'restart', 'network.service'], check_status=False)
+        rgw_node.run(args=['sudo', 'systemctl', 'status', 'network.service'], check_status=False)
+
+
+def install_prereq(client):
+    """
+    Install pre requisites for RHEL and CentOS
+    TBD: Ubuntu
+    """
+    if client.os.name == 'rhel' or client.os.name == 'centos':
+        client.run(
+               args=[
+                    'sudo',
+                    'yum',
+                    'install',
+                    '-y',
+                    'protobuf-c.x86_64',
+                    'java',
+                    'java-1.8.0-openjdk-devel',
+                    'dnsmasq'
+                    ]
+                )
+
+
+def setup_dnsmasq(client, name):
+    """
+    Setup simple dnsmasq name eg: s3.ceph.com
+    Local RGW host can then be used with whatever name has been setup with.
+    """
+    resolv_conf = "nameserver 127.0.0.1\n"
+    dnsmasq_template = """address=/{name}/{ip_address}
+server=8.8.8.8
+server=8.8.4.4
+""".format(name=name, ip_address=client.ip_address)
+    dnsmasq_config_path = '/etc/dnsmasq.d/ceph'
+    # point resolv.conf to local dnsmasq
+    misc.sudo_write_file(
+        remote=client,
+        path='/etc/resolv.conf',
+        data=resolv_conf,
+    )
+    misc.sudo_write_file(
+        remote=client,
+        path=dnsmasq_config_path,
+        data=dnsmasq_template,
+    )
+    client.run(args=['cat', dnsmasq_config_path])
+    # restart dnsmasq
+    client.run(args=['sudo', 'systemctl', 'restart', 'dnsmasq'])
+    client.run(args=['sudo', 'systemctl', 'status', 'dnsmasq'])
+    time.sleep(5)
+    # verify dns name is set
+    client.run(args=['ping', '-c', '4', name])
+
+
+def fix_rgw_config(client, name):
+    """
+    Fix RGW config in ceph.conf, we need rgw dns name entry
+    and also modify the port to use :80 for s3a tests to work
+    """
+    rgw_dns_name = 'rgw dns name = {name}'.format(name=name)
+    ceph_conf_path = '/etc/ceph/ceph.conf'
+    # append rgw_dns_name
+    client.run(
+        args=[
+            'sudo',
+            'sed',
+            run.Raw('-i'),
+            run.Raw("'/client.rgw*/a {rgw_name}'".format(rgw_name=rgw_dns_name)),
+            ceph_conf_path
+
+        ]
+    )
+    # listen on port 80
+    client.run(
+        args=[
+            'sudo',
+            'sed',
+            run.Raw('-i'),
+            run.Raw('s/:8080/:80/'),
+            ceph_conf_path
+        ]
+    )
+    client.run(args=['cat', ceph_conf_path])
+    client.run(args=['sudo', 'systemctl', 'restart', 'ceph-radosgw.target'])
+    client.run(args=['sudo', 'systemctl', 'status', 'ceph-radosgw.target'])
+
+
+def setup_user_bucket(client, dns_name, access_key, secret_key, bucket_name, testdir):
+    """
+    Create the 's3a' rgw user with access_key and secret_key, then create
+    bucket_name on dns_name by running a boto script from a fresh virtualenv
+    """
+    client.run(
+        args=[
+            'sudo',
+            'radosgw-admin',
+            'user',
+            'create',
+            run.Raw('--uid'),
+            's3a',
+            run.Raw('--display-name=s3a cephtests'),
+            run.Raw('--access-key={access_key}'.format(access_key=access_key)),
+            run.Raw('--secret-key={secret_key}'.format(secret_key=secret_key)),
+            run.Raw('--email=s3a@ceph.com'),
+        ]
+    )
+    client.run(
+        args=[
+            'virtualenv',
+            '{testdir}/venv'.format(testdir=testdir),
+            run.Raw('&&'),
+            run.Raw('{testdir}/venv/bin/pip'.format(testdir=testdir)),
+            'install',
+            'boto'
+        ]
+    )
+    create_bucket = """
+#!/usr/bin/env python
+import boto
+import boto.s3.connection
+access_key = '{access_key}'
+secret_key = '{secret_key}'
+
+conn = boto.connect_s3(
+        aws_access_key_id = access_key,
+        aws_secret_access_key = secret_key,
+        host = '{dns_name}',
+        is_secure=False,
+        calling_format = boto.s3.connection.OrdinaryCallingFormat(),
+        )
+bucket = conn.create_bucket('{bucket_name}')
+for bucket in conn.get_all_buckets():
+        print bucket.name + "\t" + bucket.creation_date
+""".format(access_key=access_key, secret_key=secret_key, dns_name=dns_name, bucket_name=bucket_name)
+    py_bucket_file = '{testdir}/create_bucket.py'.format(testdir=testdir)
+    misc.sudo_write_file(
+        remote=client,
+        path=py_bucket_file,
+        data=create_bucket,
+        perms='0744',
+        )
+    client.run(
+        args=[
+            'cat',
+            '{testdir}/create_bucket.py'.format(testdir=testdir),
+        ]
+    )
+    client.run(
+        args=[
+            '{testdir}/venv/bin/python'.format(testdir=testdir),
+            '{testdir}/create_bucket.py'.format(testdir=testdir),
+        ]
+    )
+
+
+def run_s3atest(client, maven_version, testdir, test_options):
+    """
+    Finally run the s3a test
+    """
+    aws_testdir = '{testdir}/hadoop/hadoop-tools/hadoop-aws/'.format(testdir=testdir)
+    run_test = '{testdir}/apache-maven-{maven_version}/bin/mvn'.format(testdir=testdir, maven_version=maven_version)
+    client.run(
+        args=[
+            'cd',
+            run.Raw(aws_testdir),
+            run.Raw('&&'),
+            run.Raw(run_test),
+            run.Raw(test_options)
+        ]
+    )
+
+
+def configure_s3a(client, dns_name, access_key, secret_key, bucket_name, testdir):
+    """
+    Write hadoop-aws' test auth-keys.xml: the s3a endpoint (dns_name, ssl off),
+    the test bucket URLs and access_key/secret_key for s3, s3n and s3a.
+    """
+    config_template = """<configuration>
+<property>
+<name>fs.s3a.endpoint</name>
+<value>{name}</value>
+</property>
+
+<property>
+<name>fs.s3a.connection.ssl.enabled</name>
+<value>false</value>
+</property>
+
+<property>
+<name>test.fs.s3n.name</name>
+<value>s3n://{bucket_name}/</value>
+</property>
+
+<property>
+<name>test.fs.s3a.name</name>
+<value>s3a://{bucket_name}/</value>
+</property>
+
+<property>
+<name>test.fs.s3.name</name>
+<value>s3://{bucket_name}/</value>
+</property>
+
+<property>
+<name>fs.s3.awsAccessKeyId</name>
+<value>{access_key}</value>
+</property>
+
+<property>
+<name>fs.s3.awsSecretAccessKey</name>
+<value>{secret_key}</value>
+</property>
+
+<property>
+<name>fs.s3n.awsAccessKeyId</name>
+<value>{access_key}</value>
+</property>
+
+<property>
+<name>fs.s3n.awsSecretAccessKey</name>
+<value>{secret_key}</value>
+</property>
+
+<property>
+<name>fs.s3a.access.key</name>
+<description>AWS access key ID. Omit for Role-based authentication.</description>
+<value>{access_key}</value>
+</property>
+
+<property>
+<name>fs.s3a.secret.key</name>
+<description>AWS secret key. Omit for Role-based authentication.</description>
+<value>{secret_key}</value>
+</property>
+</configuration>
+""".format(name=dns_name, bucket_name=bucket_name, access_key=access_key, secret_key=secret_key)
+    config_path = testdir + '/hadoop/hadoop-tools/hadoop-aws/src/test/resources/auth-keys.xml'
+    misc.write_file(
+        remote=client,
+        path=config_path,
+        data=config_template,
+    )
+    # output for debug
+    client.run(args=['cat', config_path])
diff --git a/src/ceph/qa/tasks/s3readwrite.py b/src/ceph/qa/tasks/s3readwrite.py
new file mode 100644
index 0000000..9f1507e
--- /dev/null
+++ b/src/ceph/qa/tasks/s3readwrite.py
@@ -0,0 +1,346 @@
+"""
+Run rgw s3 readwrite tests
+"""
+from cStringIO import StringIO
+import base64
+import contextlib
+import logging
+import os
+import random
+import string
+import yaml
+
+from teuthology import misc as teuthology
+from teuthology import contextutil
+from teuthology.config import config as teuth_config
+from teuthology.orchestra import run
+from teuthology.orchestra.connection import split_user
+
+log = logging.getLogger(__name__)
+
+
+@contextlib.contextmanager
+def download(ctx, config):
+    """
+    Download the s3 tests from the git builder.
+    Remove downloaded s3 file upon exit.
+    
+    The context passed in should be identical to the context
+    passed in to the main task.
+    """
+    assert isinstance(config, dict)
+    log.info('Downloading s3-tests...')
+    testdir = teuthology.get_testdir(ctx)
+    for (client, cconf) in config.items():
+        branch = cconf.get('force-branch', None)
+        if not branch:
+            branch = cconf.get('branch', 'master')
+        sha1 = cconf.get('sha1')
+        ctx.cluster.only(client).run(
+            args=[
+                'git', 'clone',
+                '-b', branch,
+                teuth_config.ceph_git_base_url + 's3-tests.git',
+                '{tdir}/s3-tests'.format(tdir=testdir),
+                ],
+            )
+        if sha1 is not None:
+            ctx.cluster.only(client).run(
+                args=[
+                    'cd', '{tdir}/s3-tests'.format(tdir=testdir),
+                    run.Raw('&&'),
+                    'git', 'reset', '--hard', sha1,
+                    ],
+                )
+    try:
+        yield
+    finally:
+        log.info('Removing s3-tests...')
+        testdir = teuthology.get_testdir(ctx)
+        for client in config:
+            ctx.cluster.only(client).run(
+                args=[
+                    'rm',
+                    '-rf',
+                    '{tdir}/s3-tests'.format(tdir=testdir),
+                    ],
+                )
+
+
+def _config_user(s3tests_conf, section, user):
+    """
+    Configure users for this section by stashing away keys, ids, and
+    email addresses.
+    """
+    s3tests_conf[section].setdefault('user_id', user)
+    s3tests_conf[section].setdefault('email', '{user}+test@test.test'.format(user=user))
+    s3tests_conf[section].setdefault('display_name', 'Mr. {user}'.format(user=user))
+    s3tests_conf[section].setdefault('access_key', ''.join(random.choice(string.uppercase) for i in xrange(20)))
+    s3tests_conf[section].setdefault('secret_key', base64.b64encode(os.urandom(40)))
+
+@contextlib.contextmanager
+def create_users(ctx, config):
+    """
+    Fill in readwrite/s3 defaults, create the rgw users, remove them on exit.
+    """
+    assert isinstance(config, dict)
+    log.info('Creating rgw users...')
+    testdir = teuthology.get_testdir(ctx)
+    users = {'s3': 'foo'}
+    cached_client_user_names = dict()
+    for client in config['clients']:
+        cached_client_user_names[client] = dict()
+        s3tests_conf = config['s3tests_conf'][client]
+        s3tests_conf.setdefault('readwrite', {})
+        s3tests_conf['readwrite'].setdefault('bucket', 'rwtest-' + client + '-{random}-')
+        s3tests_conf['readwrite'].setdefault('readers', 10)
+        s3tests_conf['readwrite'].setdefault('writers', 3)
+        s3tests_conf['readwrite'].setdefault('duration', 300)
+        s3tests_conf['readwrite'].setdefault('files', {})
+        rwconf = s3tests_conf['readwrite']
+        rwconf['files'].setdefault('num', 10)
+        rwconf['files'].setdefault('size', 2000)
+        rwconf['files'].setdefault('stddev', 500)
+        for section, user in users.iteritems():
+            _config_user(s3tests_conf, section, '{user}.{client}'.format(user=user, client=client))
+            log.debug('creating user {user} on {client}'.format(user=s3tests_conf[section]['user_id'],
+                                                                client=client))
+
+            # stash the 'delete_user' flag along with user name for easier cleanup
+            delete_this_user = True
+            if 'delete_user' in s3tests_conf['s3']:
+                delete_this_user = s3tests_conf['s3']['delete_user']
+                log.debug('delete_user set to {flag} for {client}'.format(flag=delete_this_user, client=client))
+            cached_client_user_names[client][section+user] = (s3tests_conf[section]['user_id'], delete_this_user)
+
+            # skip actual user creation if the create_user flag is set to false for this client
+            if 'create_user' in s3tests_conf['s3'] and s3tests_conf['s3']['create_user'] == False:
+                log.debug('create_user set to False, skipping user creation for {client}'.format(client=client))
+                continue
+            else:
+                ctx.cluster.only(client).run(
+                    args=[
+                        'adjust-ulimits',
+                        'ceph-coverage',
+                        '{tdir}/archive/coverage'.format(tdir=testdir),
+                        'radosgw-admin',
+                        '-n', client,
+                        'user', 'create',
+                        '--uid', s3tests_conf[section]['user_id'],
+                        '--display-name', s3tests_conf[section]['display_name'],
+                        '--access-key', s3tests_conf[section]['access_key'],
+                        '--secret', s3tests_conf[section]['secret_key'],
+                        '--email', s3tests_conf[section]['email'],
+                    ],
+                )
+    try:
+        yield
+    finally:
+        for client in config['clients']:
+            for section, user in users.iteritems():
+                # use the uid cached during setup; user_id may have been set in the config
+                real_uid, delete_this_user  = cached_client_user_names[client][section+user]
+                if delete_this_user:
+                    ctx.cluster.only(client).run(
+                        args=[
+                            'adjust-ulimits',
+                            'ceph-coverage',
+                            '{tdir}/archive/coverage'.format(tdir=testdir),
+                            'radosgw-admin',
+                            '-n', client,
+                            'user', 'rm',
+                            '--uid', real_uid,
+                            '--purge-data',
+                            ],
+                        )
+                else:
+                    log.debug('skipping delete for user {uid} on {client}'.format(uid=real_uid, client=client))
+
+@contextlib.contextmanager
+def configure(ctx, config):
+    """
+    Set each client's s3 host/port/is_secure, run the s3-tests bootstrap
+    script and write the client's yaml config into the archive directory.
+    """
+    assert isinstance(config, dict)
+    log.info('Configuring s3-readwrite-tests...')
+    for client, properties in config['clients'].iteritems():
+        s3tests_conf = config['s3tests_conf'][client]
+        if properties is not None and 'rgw_server' in properties:
+            host = None
+            for target, roles in zip(ctx.config['targets'].iterkeys(), ctx.config['roles']):
+                log.info('roles: ' + str(roles))
+                log.info('target: ' + str(target))
+                if properties['rgw_server'] in roles:
+                    _, host = split_user(target)  # no break: the last matching target wins
+            assert host is not None, "Invalid client specified as the rgw_server"
+            s3tests_conf['s3']['host'] = host
+        else:
+            s3tests_conf['s3']['host'] = 'localhost'
+
+        def_conf = s3tests_conf['DEFAULT']
+        s3tests_conf['s3'].setdefault('port', def_conf['port'])
+        s3tests_conf['s3'].setdefault('is_secure', def_conf['is_secure'])
+
+        (remote,) = ctx.cluster.only(client).remotes.keys()
+        remote.run(
+            args=[
+                'cd',
+                '{tdir}/s3-tests'.format(tdir=teuthology.get_testdir(ctx)),
+                run.Raw('&&'),
+                './bootstrap',
+                ],
+            )
+        conf_fp = StringIO()
+        conf = dict(
+                        s3=s3tests_conf['s3'],
+                        readwrite=s3tests_conf['readwrite'],
+                    )
+        yaml.safe_dump(conf, conf_fp, default_flow_style=False)
+        teuthology.write_file(
+            remote=remote,
+            path='{tdir}/archive/s3readwrite.{client}.config.yaml'.format(tdir=teuthology.get_testdir(ctx), client=client),
+            data=conf_fp.getvalue(),
+            )
+    yield
+
+
+@contextlib.contextmanager
+def run_tests(ctx, config):
+    """
+    Run the s3readwrite tests after everything is set up.
+
+    :param ctx: Context passed to task
+    :param config: specific configuration information
+    """
+    assert isinstance(config, dict)
+    testdir = teuthology.get_testdir(ctx)
+    for client, client_config in config.iteritems():
+        (remote,) = ctx.cluster.only(client).remotes.keys()
+        conf = teuthology.get_file(remote, '{tdir}/archive/s3readwrite.{client}.config.yaml'.format(tdir=testdir, client=client))
+        args = [
+                '{tdir}/s3-tests/virtualenv/bin/s3tests-test-readwrite'.format(tdir=testdir),
+                ]
+        if client_config is not None and 'extra_args' in client_config:
+            args.extend(client_config['extra_args'])
+
+        ctx.cluster.only(client).run(
+            args=args,
+            stdin=conf,
+            )
+    yield
+
+
+@contextlib.contextmanager
+def task(ctx, config):
+    """
+    Run the s3tests-test-readwrite suite against rgw.
+
+    To run all tests on all clients::
+
+        tasks:
+        - ceph:
+        - rgw:
+        - s3readwrite:
+
+    To restrict testing to particular clients::
+
+        tasks:
+        - ceph:
+        - rgw: [client.0]
+        - s3readwrite: [client.0]
+
+    To run against a server on client.1::
+
+        tasks:
+        - ceph:
+        - rgw: [client.1]
+        - s3readwrite:
+            client.0:
+              rgw_server: client.1
+
+    To pass extra test arguments
+
+        tasks:
+        - ceph:
+        - rgw: [client.0]
+        - s3readwrite:
+            client.0:
+              readwrite:
+                bucket: mybucket
+                readers: 10
+                writers: 3
+                duration: 600
+                files:
+                  num: 10
+                  size: 2000
+                  stddev: 500
+            client.1:
+              ...
+
+    To override s3 configuration
+
+        tasks:
+        - ceph:
+        - rgw: [client.0]
+        - s3readwrite:
+            client.0:
+              s3:
+                user_id: myuserid
+                display_name: myname
+                email: my@email
+                access_key: myaccesskey
+                secret_key: mysecretkey
+
+    """
+    assert config is None or isinstance(config, list) \
+        or isinstance(config, dict), \
+        "task s3tests only supports a list or dictionary for configuration"
+    all_clients = ['client.{id}'.format(id=id_)
+                   for id_ in teuthology.all_roles_of_type(ctx.cluster, 'client')]
+    if config is None:
+        config = all_clients
+    if isinstance(config, list):
+        config = dict.fromkeys(config)
+    clients = config.keys()
+
+    overrides = ctx.config.get('overrides', {})
+    # merge each client section, not the top level.
+    for client in config.iterkeys():
+        if not config[client]:
+            config[client] = {}
+        teuthology.deep_merge(config[client], overrides.get('s3readwrite', {}))
+
+    log.debug('in s3readwrite, config is %s', config)
+
+    s3tests_conf = {}
+    for client in clients:
+        if config[client] is None:
+            config[client] = {}
+        config[client].setdefault('s3', {})
+        config[client].setdefault('readwrite', {})
+
+        s3tests_conf[client] = ({
+                'DEFAULT':
+                    {
+                    'port'      : 7280,
+                    'is_secure' : False,
+                    },
+                'readwrite' : config[client]['readwrite'],
+                's3'  : config[client]['s3'],
+                })
+
+    with contextutil.nested(
+        lambda: download(ctx=ctx, config=config),
+        lambda: create_users(ctx=ctx, config=dict(
+                clients=clients,
+                s3tests_conf=s3tests_conf,
+                )),
+        lambda: configure(ctx=ctx, config=dict(
+                clients=config,
+                s3tests_conf=s3tests_conf,
+                )),
+        lambda: run_tests(ctx=ctx, config=config),
+        ):
+        pass
+    yield
diff --git a/src/ceph/qa/tasks/s3roundtrip.py b/src/ceph/qa/tasks/s3roundtrip.py
new file mode 100644
index 0000000..620b9d4
--- /dev/null
+++ b/src/ceph/qa/tasks/s3roundtrip.py
@@ -0,0 +1,306 @@
+"""
+Run rgw roundtrip message tests
+"""
+from cStringIO import StringIO
+import base64
+import contextlib
+import logging
+import os
+import random
+import string
+import yaml
+
+from teuthology import misc as teuthology
+from teuthology import contextutil
+from teuthology.config import config as teuth_config
+from teuthology.orchestra import run
+from teuthology.orchestra.connection import split_user
+
+log = logging.getLogger(__name__)
+
+
+@contextlib.contextmanager
+def download(ctx, config):
+    """
+    Clone the s3-tests repository onto every configured client.
+
+    For each client the branch comes from ``force-branch`` when it is
+    set, otherwise from ``branch``, falling back to ``master``.  The
+    checkout is removed from every client again when the context exits.
+
+    :param ctx: Context, identical to the one passed to the main task
+    :param config: dict mapping each client role to its config dict
+    """
+    assert isinstance(config, dict)
+    log.info('Downloading s3-tests...')
+    testdir = teuthology.get_testdir(ctx)
+    checkout = '{tdir}/s3-tests'.format(tdir=testdir)
+    for role, role_conf in config.iteritems():
+        # force-branch wins over branch; an empty value counts as unset.
+        branch = role_conf.get('force-branch', None) or \
+            role_conf.get('branch', 'master')
+        clone_cmd = [
+            'git', 'clone',
+            '-b', branch,
+            teuth_config.ceph_git_base_url + 's3-tests.git',
+            checkout,
+        ]
+        ctx.cluster.only(role).run(args=clone_cmd)
+    try:
+        yield
+    finally:
+        log.info('Removing s3-tests...')
+        for role in config:
+            ctx.cluster.only(role).run(args=['rm', '-rf', checkout])
+
+def _config_user(s3tests_conf, section, user):
+    """
+    Fill in whichever identity and credential fields a section lacks.
+
+    Values already present in ``s3tests_conf[section]`` are kept.  The
+    defaults are derived from ``user``, plus a random 20-letter
+    uppercase access key and a base64-encoded 40-byte random secret.
+    """
+    target = s3tests_conf[section]
+    defaults = [
+        ('user_id', user),
+        ('email', '{user}+test@test.test'.format(user=user)),
+        ('display_name', 'Mr. {user}'.format(user=user)),
+        ('access_key',
+         ''.join(random.choice(string.uppercase) for _ in xrange(20))),
+        ('secret_key', base64.b64encode(os.urandom(40))),
+    ]
+    for key, value in defaults:
+        target.setdefault(key, value)
+
+@contextlib.contextmanager
+def create_users(ctx, config):
+    """
+    Create a default s3 user.
+
+    For every client this fills in the roundtrip defaults and the user
+    credentials in that client's s3tests_conf (in place), then creates
+    the user with radosgw-admin.  On exit each created user is removed
+    again together with its data.
+
+    :param ctx: Context passed to task
+    :param config: dict with ``clients`` (iterable of client roles) and
+                   ``s3tests_conf`` (per-client configuration dicts)
+    """
+    assert isinstance(config, dict)
+    log.info('Creating rgw users...')
+    testdir = teuthology.get_testdir(ctx)
+    # Maps the s3tests_conf section to the base name of its user.
+    users = {'s3': 'foo'}
+    for client in config['clients']:
+        s3tests_conf = config['s3tests_conf'][client]
+        s3tests_conf.setdefault('roundtrip', {})
+        rtconf = s3tests_conf['roundtrip']
+        rtconf.setdefault('bucket', 'rttest-' + client + '-{random}-')
+        rtconf.setdefault('readers', 10)
+        rtconf.setdefault('writers', 3)
+        rtconf.setdefault('duration', 300)
+        rtconf.setdefault('files', {})
+        rtconf['files'].setdefault('num', 10)
+        rtconf['files'].setdefault('size', 2000)
+        rtconf['files'].setdefault('stddev', 500)
+        for section, user in users.iteritems():
+            _config_user(s3tests_conf, section, '{user}.{client}'.format(user=user, client=client))
+            ctx.cluster.only(client).run(
+                args=[
+                    'adjust-ulimits',
+                    'ceph-coverage',
+                    '{tdir}/archive/coverage'.format(tdir=testdir),
+                    'radosgw-admin',
+                    '-n', client,
+                    'user', 'create',
+                    '--uid', s3tests_conf[section]['user_id'],
+                    '--display-name', s3tests_conf[section]['display_name'],
+                    '--access-key', s3tests_conf[section]['access_key'],
+                    '--secret', s3tests_conf[section]['secret_key'],
+                    '--email', s3tests_conf[section]['email'],
+                ],
+            )
+    try:
+        yield
+    finally:
+        for client in config['clients']:
+            s3tests_conf = config['s3tests_conf'][client]
+            for section in users:
+                # Remove the uid that was actually created: the task
+                # config may override user_id, in which case the
+                # default '<user>.<client>' name never existed.
+                uid = s3tests_conf[section]['user_id']
+                ctx.cluster.only(client).run(
+                    args=[
+                        'adjust-ulimits',
+                        'ceph-coverage',
+                        '{tdir}/archive/coverage'.format(tdir=testdir),
+                        'radosgw-admin',
+                        '-n', client,
+                        'user', 'rm',
+                        '--uid', uid,
+                        '--purge-data',
+                        ],
+                    )
+
+@contextlib.contextmanager
+def configure(ctx, config):
+    """
+    Prepare s3-tests on each client for the roundtrip run.
+
+    Resolves the rgw host for the client, runs the s3-tests bootstrap
+    script in the checkout, and writes the client's ``s3`` and
+    ``roundtrip`` sections as YAML into the archive directory.
+    """
+    assert isinstance(config, dict)
+    log.info('Configuring s3-roundtrip-tests...')
+    testdir = teuthology.get_testdir(ctx)
+    for client, properties in config['clients'].iteritems():
+        s3tests_conf = config['s3tests_conf'][client]
+        if properties is None or 'rgw_server' not in properties:
+            s3tests_conf['s3']['host'] = 'localhost'
+        else:
+            # Find the target whose role list contains the rgw_server role.
+            host = None
+            for target, roles in zip(ctx.config['targets'].iterkeys(), ctx.config['roles']):
+                log.info('roles: ' + str(roles))
+                log.info('target: ' + str(target))
+                if properties['rgw_server'] in roles:
+                    _, host = split_user(target)
+            assert host is not None, "Invalid client specified as the rgw_server"
+            s3tests_conf['s3']['host'] = host
+
+        # Connection settings not given explicitly come from DEFAULT.
+        fallback = s3tests_conf['DEFAULT']
+        for key in ('port', 'is_secure'):
+            s3tests_conf['s3'].setdefault(key, fallback[key])
+
+        (remote,) = ctx.cluster.only(client).remotes.keys()
+        bootstrap_cmd = [
+            'cd',
+            '{tdir}/s3-tests'.format(tdir=testdir),
+            run.Raw('&&'),
+            './bootstrap',
+        ]
+        remote.run(args=bootstrap_cmd)
+
+        yaml_buf = StringIO()
+        sections = {
+            's3': s3tests_conf['s3'],
+            'roundtrip': s3tests_conf['roundtrip'],
+        }
+        yaml.safe_dump(sections, yaml_buf, default_flow_style=False)
+        conf_path = '{tdir}/archive/s3roundtrip.{client}.config.yaml'.format(tdir=testdir, client=client)
+        teuthology.write_file(
+            remote=remote,
+            path=conf_path,
+            data=yaml_buf.getvalue(),
+            )
+    yield
+
+
+@contextlib.contextmanager
+def run_tests(ctx, config):
+    """
+    Execute the s3 roundtrip test on each client once setup is done.
+
+    The per-client YAML configuration written to the archive directory
+    is read back and fed to the test program on stdin.
+
+    :param ctx: Context passed to task
+    :param config: specific configuration information
+    """
+    assert isinstance(config, dict)
+    testdir = teuthology.get_testdir(ctx)
+    runner = '{tdir}/s3-tests/virtualenv/bin/s3tests-test-roundtrip'.format(tdir=testdir)
+    for client, client_config in config.iteritems():
+        (remote,) = ctx.cluster.only(client).remotes.keys()
+        conf_path = '{tdir}/archive/s3roundtrip.{client}.config.yaml'.format(tdir=testdir, client=client)
+        conf = teuthology.get_file(remote, conf_path)
+
+        args = [runner]
+        if client_config is not None and 'extra_args' in client_config:
+            args.extend(client_config['extra_args'])
+
+        ctx.cluster.only(client).run(args=args, stdin=conf)
+    yield
+
+
+@contextlib.contextmanager
+def task(ctx, config):
+    """
+    Run the s3tests-test-roundtrip suite against rgw.
+
+    To run all tests on all clients::
+
+        tasks:
+        - ceph:
+        - rgw:
+        - s3roundtrip:
+
+    To restrict testing to particular clients::
+
+        tasks:
+        - ceph:
+        - rgw: [client.0]
+        - s3roundtrip: [client.0]
+
+    To run against a server on client.1::
+
+        tasks:
+        - ceph:
+        - rgw: [client.1]
+        - s3roundtrip:
+            client.0:
+              rgw_server: client.1
+
+    To pass extra test arguments::
+
+        tasks:
+        - ceph:
+        - rgw: [client.0]
+        - s3roundtrip:
+            client.0:
+              roundtrip:
+                bucket: mybucket
+                readers: 10
+                writers: 3
+                duration: 600
+                files:
+                  num: 10
+                  size: 2000
+                  stddev: 500
+            client.1:
+              ...
+
+    To override s3 configuration::
+
+        tasks:
+        - ceph:
+        - rgw: [client.0]
+        - s3roundtrip:
+            client.0:
+              s3:
+                user_id: myuserid
+                display_name: myname
+                email: my@email
+                access_key: myaccesskey
+                secret_key: mysecretkey
+
+    :param ctx: Context passed to task
+    :param config: None (all clients), a list of client roles, or a
+                   dict mapping client roles to their configuration
+    """
+    assert config is None or isinstance(config, list) \
+        or isinstance(config, dict), \
+        "task s3roundtrip only supports a list or dictionary for configuration"
+    all_clients = ['client.{id}'.format(id=id_)
+                   for id_ in teuthology.all_roles_of_type(ctx.cluster, 'client')]
+    if config is None:
+        config = all_clients
+    if isinstance(config, list):
+        config = dict.fromkeys(config)
+    clients = config.keys()
+
+    s3tests_conf = {}
+    for client in clients:
+        if config[client] is None:
+            config[client] = {}
+        config[client].setdefault('s3', {})
+        config[client].setdefault('roundtrip', {})
+
+        # The 'roundtrip' and 's3' sections are shared with the client
+        # config, so defaults filled in later are visible in both.
+        s3tests_conf[client] = ({
+                'DEFAULT':
+                    {
+                    'port'      : 7280,
+                    'is_secure' : False,
+                    },
+                'roundtrip' : config[client]['roundtrip'],
+                's3'  : config[client]['s3'],
+                })
+
+    # The whole download/create/configure/run sequence, including its
+    # cleanup, completes inside this block before the task yields.
+    with contextutil.nested(
+        lambda: download(ctx=ctx, config=config),
+        lambda: create_users(ctx=ctx, config=dict(
+                clients=clients,
+                s3tests_conf=s3tests_conf,
+                )),
+        lambda: configure(ctx=ctx, config=dict(
+                clients=config,
+                s3tests_conf=s3tests_conf,
+                )),
+        lambda: run_tests(ctx=ctx, config=config),
+        ):
+        pass
+    yield
diff --git a/src/ceph/qa/tasks/s3tests.py b/src/ceph/qa/tasks/s3tests.py
new file mode 100644
index 0000000..ef5680d
--- /dev/null
+++ b/src/ceph/qa/tasks/s3tests.py
@@ -0,0 +1,386 @@
+"""
+Run a set of s3 tests on rgw.
+"""
+from cStringIO import StringIO
+from configobj import ConfigObj
+import base64
+import contextlib
+import logging
+import os
+import random
+import string
+
+from teuthology import misc as teuthology
+from teuthology import contextutil
+from teuthology.config import config as teuth_config
+from teuthology.orchestra import run
+from teuthology.orchestra.connection import split_user
+
+log = logging.getLogger(__name__)
+
+@contextlib.contextmanager
+def download(ctx, config):
+    """
+    Download the s3 tests from the git builder.
+    Remove downloaded s3 file upon exit.
+
+    The context passed in should be identical to the context
+    passed in to the main task.
+
+    Per client, the branch is ``force-branch`` if set, otherwise the
+    client's ``branch``, otherwise a default derived from the suite
+    branch (``suite_branch``, falling back to ``branch``, in
+    ctx.config): the suite branch itself for the older releases listed
+    in ``s3_branches``, or ``ceph-<suite branch>`` for anything else.
+    If ``sha1`` is given the checkout is hard-reset to it.
+
+    :param ctx: Context passed to task
+    :param config: dict mapping each client role to its config dict
+    :raises ValueError: if no branch can be determined for a client
+    """
+    assert isinstance(config, dict)
+    log.info('Downloading s3-tests...')
+    testdir = teuthology.get_testdir(ctx)
+    s3_branches = [ 'giant', 'firefly', 'firefly-original', 'hammer' ]
+    for (client, cconf) in config.items():
+        branch = cconf.get('force-branch', None)
+        if not branch:
+            ceph_branch = ctx.config.get('branch')
+            suite_branch = ctx.config.get('suite_branch', ceph_branch)
+            if suite_branch in s3_branches:
+                default_branch = suite_branch
+            elif suite_branch:
+                default_branch = 'ceph-' + suite_branch
+            else:
+                # No suite/ceph branch configured: leave the default
+                # unset so the ValueError below fires, rather than a
+                # TypeError from concatenating 'ceph-' with None.
+                default_branch = None
+            branch = cconf.get('branch', default_branch)
+        if not branch:
+            raise ValueError(
+                "Could not determine what branch to use for s3tests!")
+        else:
+            log.info("Using branch '%s' for s3tests", branch)
+        sha1 = cconf.get('sha1')
+        git_remote = cconf.get('git_remote', None) or teuth_config.ceph_git_base_url
+        ctx.cluster.only(client).run(
+            args=[
+                'git', 'clone',
+                '-b', branch,
+                git_remote + 's3-tests.git',
+                '{tdir}/s3-tests'.format(tdir=testdir),
+                ],
+            )
+        if sha1 is not None:
+            ctx.cluster.only(client).run(
+                args=[
+                    'cd', '{tdir}/s3-tests'.format(tdir=testdir),
+                    run.Raw('&&'),
+                    'git', 'reset', '--hard', sha1,
+                    ],
+                )
+    try:
+        yield
+    finally:
+        log.info('Removing s3-tests...')
+        for client in config:
+            ctx.cluster.only(client).run(
+                args=[
+                    'rm',
+                    '-rf',
+                    '{tdir}/s3-tests'.format(tdir=testdir),
+                    ],
+                )
+
+
+def _config_user(s3tests_conf, section, user):
+    """
+    Fill in whichever identity and credential fields a section lacks.
+
+    Values already present in ``s3tests_conf[section]`` are kept.  The
+    defaults are derived from ``user``, plus a random 20-letter
+    uppercase access key and a base64-encoded 40-byte random secret.
+    """
+    target = s3tests_conf[section]
+    defaults = [
+        ('user_id', user),
+        ('email', '{user}+test@test.test'.format(user=user)),
+        ('display_name', 'Mr. {user}'.format(user=user)),
+        ('access_key',
+         ''.join(random.choice(string.uppercase) for _ in xrange(20))),
+        ('secret_key', base64.b64encode(os.urandom(40))),
+    ]
+    for key, value in defaults:
+        target.setdefault(key, value)
+
+
+@contextlib.contextmanager
+def create_users(ctx, config):
+    """
+    Create the s3-tests users (main, alt and tenant) for each client.
+
+    Credentials are filled into the per-client s3tests_conf sections,
+    the users are created with radosgw-admin, and on exit they are
+    removed again together with their data.
+    """
+    assert isinstance(config, dict)
+    log.info('Creating rgw users...')
+    testdir = teuthology.get_testdir(ctx)
+    coverage_dir = '{tdir}/archive/coverage'.format(tdir=testdir)
+    users = {'s3 main': 'foo', 's3 alt': 'bar', 's3 tenant': 'testx$tenanteduser'}
+    for client in config['clients']:
+        s3tests_conf = config['s3tests_conf'][client]
+        s3tests_conf.setdefault('fixtures', {})
+        fixtures = s3tests_conf['fixtures']
+        fixtures.setdefault('bucket prefix', 'test-' + client + '-{random}-')
+        for section, user in users.iteritems():
+            _config_user(s3tests_conf, section, '{user}.{client}'.format(user=user, client=client))
+            creds = s3tests_conf[section]
+            log.debug('Creating user {user} on {host}'.format(user=creds['user_id'], host=client))
+            cluster_name, daemon_type, client_id = teuthology.split_role(client)
+            client_with_id = daemon_type + '.' + client_id
+            create_cmd = [
+                'adjust-ulimits',
+                'ceph-coverage',
+                coverage_dir,
+                'radosgw-admin',
+                '-n', client_with_id,
+                'user', 'create',
+                '--uid', creds['user_id'],
+                '--display-name', creds['display_name'],
+                '--access-key', creds['access_key'],
+                '--secret', creds['secret_key'],
+                '--email', creds['email'],
+                '--cluster', cluster_name,
+            ]
+            ctx.cluster.only(client).run(args=create_cmd)
+    try:
+        yield
+    finally:
+        for client in config['clients']:
+            for user in users.itervalues():
+                uid = '{user}.{client}'.format(user=user, client=client)
+                cluster_name, daemon_type, client_id = teuthology.split_role(client)
+                client_with_id = daemon_type + '.' + client_id
+                remove_cmd = [
+                    'adjust-ulimits',
+                    'ceph-coverage',
+                    coverage_dir,
+                    'radosgw-admin',
+                    '-n', client_with_id,
+                    'user', 'rm',
+                    '--uid', uid,
+                    '--purge-data',
+                    '--cluster', cluster_name,
+                ]
+                ctx.cluster.only(client).run(args=remove_cmd)
+
+
+@contextlib.contextmanager
+def configure(ctx, config):
+    """
+    Configure the s3-tests.  This includes the running of the
+    bootstrap code and the updating of local conf files.
+
+    For each client the rgw host is resolved (``rgw_server`` role, or
+    localhost), the optional ``slow_backend`` fixture is set, the
+    bootstrap script is run and the s3-tests conf file is written to
+    the archive directory.  A boto.cfg is then rendered from
+    boto.cfg.template for each client and removed again on exit.
+
+    :param ctx: Context passed to task
+    :param config: dict with ``clients`` (client role -> properties)
+                   and ``s3tests_conf`` (client role -> ConfigObj)
+    """
+    assert isinstance(config, dict)
+    log.info('Configuring s3-tests...')
+    testdir = teuthology.get_testdir(ctx)
+    for client, properties in config['clients'].iteritems():
+        s3tests_conf = config['s3tests_conf'][client]
+        if properties is not None and 'rgw_server' in properties:
+            host = None
+            for target, roles in zip(ctx.config['targets'].iterkeys(), ctx.config['roles']):
+                log.info('roles: ' + str(roles))
+                log.info('target: ' + str(target))
+                if properties['rgw_server'] in roles:
+                    _, host = split_user(target)
+            assert host is not None, "Invalid client specified as the rgw_server"
+            s3tests_conf['DEFAULT']['host'] = host
+        else:
+            s3tests_conf['DEFAULT']['host'] = 'localhost'
+
+        if properties is not None and 'slow_backend' in properties:
+            s3tests_conf['fixtures']['slow backend'] = properties['slow_backend']
+
+        (remote,) = ctx.cluster.only(client).remotes.keys()
+        remote.run(
+            args=[
+                'cd',
+                '{tdir}/s3-tests'.format(tdir=testdir),
+                run.Raw('&&'),
+                './bootstrap',
+                ],
+            )
+        conf_fp = StringIO()
+        s3tests_conf.write(conf_fp)
+        teuthology.write_file(
+            remote=remote,
+            path='{tdir}/archive/s3-tests.{client}.conf'.format(tdir=testdir, client=client),
+            data=conf_fp.getvalue(),
+            )
+
+    log.info('Configuring boto...')
+    boto_src = os.path.join(os.path.dirname(__file__), 'boto.cfg.template')
+    for client, properties in config['clients'].iteritems():
+        with open(boto_src, 'rb') as f:
+            (remote,) = ctx.cluster.only(client).remotes.keys()
+            # idle_timeout is a per-client option, so read it from the
+            # client's properties.  A value on the wrapper dict is
+            # still honoured as a fallback before the 30 default.
+            idle_timeout = (properties or {}).get(
+                'idle_timeout', config.get('idle_timeout', 30))
+            conf = f.read().format(
+                idle_timeout=idle_timeout
+                )
+            teuthology.write_file(
+                remote=remote,
+                path='{tdir}/boto.cfg'.format(tdir=testdir),
+                data=conf,
+                )
+
+    try:
+        yield
+
+    finally:
+        log.info('Cleaning up boto...')
+        for client, properties in config['clients'].iteritems():
+            (remote,) = ctx.cluster.only(client).remotes.keys()
+            remote.run(
+                args=[
+                    'rm',
+                    '{tdir}/boto.cfg'.format(tdir=testdir),
+                    ],
+                )
+
+@contextlib.contextmanager
+def run_tests(ctx, config):
+    """
+    Invoke nosetests from the s3-tests virtualenv on every client.
+
+    Tests tagged ``fails_on_rgw`` or ``lifecycle`` are excluded; any
+    per-client ``extra_args`` are appended to the command line.
+
+    :param ctx: Context passed to task
+    :param config: specific configuration information
+    """
+    assert isinstance(config, dict)
+    testdir = teuthology.get_testdir(ctx)
+    attrs = ["!fails_on_rgw", "!lifecycle"]
+    suite_dir = '{tdir}/s3-tests'.format(tdir=testdir)
+    for client, client_config in config.iteritems():
+        s3test_conf = 'S3TEST_CONF={tdir}/archive/s3-tests.{client}.conf'.format(tdir=testdir, client=client)
+        boto_conf = 'BOTO_CONFIG={tdir}/boto.cfg'.format(tdir=testdir)
+        nosetests = '{tdir}/s3-tests/virtualenv/bin/nosetests'.format(tdir=testdir)
+        args = [
+            s3test_conf,
+            boto_conf,
+            nosetests,
+            '-w', suite_dir,
+            '-v',
+            '-a', ','.join(attrs),
+        ]
+        if client_config is not None and 'extra_args' in client_config:
+            args.extend(client_config['extra_args'])
+
+        ctx.cluster.only(client).run(args=args, label="s3 tests against rgw")
+    yield
+
+@contextlib.contextmanager
+def scan_for_leaked_encryption_keys(ctx, config):
+    """
+    On exit, grep each client's radosgw log for the customer
+    encryption key used by s3tests, to verify that secrets are not
+    being written to the logs.
+
+    Clients can opt out with ``scan_for_encryption_keys: false``.
+
+    :param ctx: Context passed to task
+    :param config: specific configuration information
+    """
+    assert isinstance(config, dict)
+
+    try:
+        yield
+    finally:
+        # x-amz-server-side-encryption-customer-key
+        s3test_customer_key = 'pO3upElrwuEXSoFwCfnZPdSsmt/xWeFa0N9KgDijwVs='
+
+        log.debug('Scanning radosgw logs for leaked encryption keys...')
+        pending = []
+        for client, client_config in config.iteritems():
+            if client_config.get('scan_for_encryption_keys', True):
+                cluster_name, daemon_type, client_id = teuthology.split_role(client)
+                client_with_cluster = '.'.join((cluster_name, daemon_type, client_id))
+                log_path = '/var/log/ceph/rgw.{client}.log'.format(client=client_with_cluster)
+                (remote,) = ctx.cluster.only(client).remotes.keys()
+                # Start every grep without waiting; they are collected below.
+                grep = remote.run(
+                    args=[
+                        'grep',
+                        '--binary-files=text',
+                        s3test_customer_key,
+                        log_path,
+                    ],
+                    wait=False,
+                    check_status=False,
+                )
+                pending.append(grep)
+
+        for grep in pending:
+            grep.wait()
+            # 1 means no matches; any other status is treated as a leak.
+            if grep.returncode != 1:
+                log.error('radosgw log is leaking encryption keys!')
+                raise Exception('radosgw log is leaking encryption keys')
+
+@contextlib.contextmanager
+def task(ctx, config):
+    """
+    Run the s3-tests suite against rgw.
+
+    To run all tests on all clients::
+
+        tasks:
+        - ceph:
+        - rgw:
+        - s3tests:
+
+    To restrict testing to particular clients::
+
+        tasks:
+        - ceph:
+        - rgw: [client.0]
+        - s3tests: [client.0]
+
+    To run against a server on client.1 and increase the boto timeout to 10m::
+
+        tasks:
+        - ceph:
+        - rgw: [client.1]
+        - s3tests:
+            client.0:
+              rgw_server: client.1
+              idle_timeout: 600
+
+    To pass extra arguments to nose (e.g. to run a certain test)::
+
+        tasks:
+        - ceph:
+        - rgw: [client.0]
+        - s3tests:
+            client.0:
+              extra_args: ['test_s3:test_object_acl_grand_public_read']
+            client.1:
+              extra_args: ['--exclude', 'test_100_continue']
+    """
+    assert config is None or isinstance(config, list) \
+        or isinstance(config, dict), \
+        "task s3tests only supports a list or dictionary for configuration"
+    all_clients = ['client.{id}'.format(id=id_)
+                   for id_ in teuthology.all_roles_of_type(ctx.cluster, 'client')]
+    if config is None:
+        config = all_clients
+    if isinstance(config, list):
+        config = dict.fromkeys(config)
+    clients = config.keys()
+
+    # merge each client section, not the top level.
+    s3tests_overrides = ctx.config.get('overrides', {}).get('s3tests', {})
+    for client in config:
+        if not config[client]:
+            config[client] = {}
+        teuthology.deep_merge(config[client], s3tests_overrides)
+
+    log.debug('s3tests config is %s', config)
+
+    # Every client starts from the same skeleton; the sections are
+    # filled in later by create_users and configure.
+    s3tests_conf = dict(
+        (client, ConfigObj(
+            indent_type='',
+            infile={
+                'DEFAULT': {
+                    'port': 7280,
+                    'is_secure': 'no',
+                },
+                'fixtures': {},
+                's3 main': {},
+                's3 alt': {},
+                's3 tenant': {},
+            },
+        ))
+        for client in clients
+    )
+
+    user_config = dict(clients=clients, s3tests_conf=s3tests_conf)
+    configure_config = dict(clients=config, s3tests_conf=s3tests_conf)
+    with contextutil.nested(
+        lambda: download(ctx=ctx, config=config),
+        lambda: create_users(ctx=ctx, config=user_config),
+        lambda: configure(ctx=ctx, config=configure_config),
+        lambda: run_tests(ctx=ctx, config=config),
+        lambda: scan_for_leaked_encryption_keys(ctx=ctx, config=config),
+        ):
+        pass
+    yield
diff --git a/src/ceph/qa/tasks/samba.py b/src/ceph/qa/tasks/samba.py
new file mode 100644
index 0000000..8272e8b
--- /dev/null
+++ b/src/ceph/qa/tasks/samba.py
@@ -0,0 +1,245 @@
+"""
+Samba
+"""
+import contextlib
+import logging
+import sys
+import time
+
+from teuthology import misc as teuthology
+from teuthology.orchestra import run
+from teuthology.orchestra.daemon import DaemonGroup
+
+log = logging.getLogger(__name__)
+
+
+def get_sambas(ctx, roles):
+    """
+    Generate an (id, remote) pair for each of the given samba roles.
+
+    The id is the role name without its ``samba.`` prefix (``0`` for
+    ``samba.0``); the remote is the single remote carrying that role.
+
+    :param ctx: Context
+    :param roles: roles for this test (extracted from yaml files)
+    """
+    prefix = 'samba.'
+    for role in roles:
+        assert isinstance(role, basestring)
+        assert role.startswith(prefix)
+        samba_id = role[len(prefix):]
+        (remote,) = ctx.cluster.only(role).remotes.iterkeys()
+        yield (samba_id, remote)
+
+
+@contextlib.contextmanager
+def task(ctx, config):
+    """
+    Setup samba smbd with ceph vfs module.  This task assumes the samba
+    package has already been installed via the install task.
+
+    The config is optional and defaults to starting samba on all nodes.
+    If a config is given, it is expected to be a list of
+    samba nodes to start smbd servers on.
+
+    Example that starts smbd on all samba nodes::
+
+        tasks:
+        - install:
+        - install:
+            project: samba
+            extra_packages: ['samba']
+        - ceph:
+        - samba:
+        - interactive:
+
+    Example that starts smbd on just one of the samba nodes and cifs on the other::
+
+        tasks:
+        - samba: [samba.0]
+        - cifs: [samba.1]
+
+    An optional backend can be specified, and requires a path which smbd will
+    use as the backend storage location::
+
+        roles:
+            - [osd.0, osd.1, osd.2, mon.0, mon.1, mon.2, mds.a]
+            - [client.0, samba.0]
+
+        tasks:
+        - ceph:
+        - ceph-fuse: [client.0]
+        - samba:
+            samba.0:
+              cephfuse: "{testdir}/mnt.0"
+
+    This mounts ceph to {testdir}/mnt.0 using fuse, and starts smbd with
+    a UNC of //localhost/cephfuse.  Access through that UNC will be on
+    the ceph fuse mount point.
+
+    If no arguments are specified in the samba
+    role, the default behavior is to enable the ceph UNC //localhost/ceph
+    and use the ceph vfs module as the smbd backend.
+
+    On exit the smbd daemons are stopped and the samba configuration
+    and runtime files are removed from every samba server.
+
+    :param ctx: Context
+    :param config: Configuration
+    """
+    log.info("Setting up smbd with ceph vfs...")
+    assert config is None or isinstance(config, list) or isinstance(config, dict), \
+        "task samba got invalid config"
+
+    # Normalize the config to a dict of role name -> per-role settings,
+    # where None means "use the default ceph vfs backend".
+    if config is None:
+        config = dict(('samba.{id}'.format(id=id_), None)
+                  for id_ in teuthology.all_roles_of_type(ctx.cluster, 'samba'))
+    elif isinstance(config, list):
+        config = dict((name, None) for name in config)
+
+    samba_servers = list(get_sambas(ctx=ctx, roles=config.keys()))
+
+    testdir = teuthology.get_testdir(ctx)
+
+    if not hasattr(ctx, 'daemons'):
+        ctx.daemons = DaemonGroup()
+
+    for id_, remote in samba_servers:
+
+        rolestr = "samba.{id_}".format(id_=id_)
+
+        # Defaults: share named "ceph", served through the ceph vfs
+        # module with "/" as its path.
+        confextras = """vfs objects = ceph
+  ceph:config_file = /etc/ceph/ceph.conf"""
+
+        unc = "ceph"
+        backend = "/"
+
+        if config[rolestr] is not None:
+            # verify that there's just one parameter in role
+            if len(config[rolestr]) != 1:
+                log.error("samba config for role samba.{id_} must have only one parameter".format(id_=id_))
+                raise Exception('invalid config')
+            # The single key is the share name and its value the backend
+            # path template; no vfs module lines are added in this case.
+            confextras = ""
+            (unc, backendstr) = config[rolestr].items()[0]
+            backend = backendstr.format(testdir=testdir)
+
+        # on first samba role, set ownership and permissions of ceph root
+        # so that samba tests succeed
+        if config[rolestr] is None and id_ == samba_servers[0][0]:
+            remote.run(
+                    args=[
+                        'mkdir', '-p', '/tmp/cmnt', run.Raw('&&'),
+                        'sudo', 'ceph-fuse', '/tmp/cmnt', run.Raw('&&'),
+                        'sudo', 'chown', 'ubuntu:ubuntu', '/tmp/cmnt/', run.Raw('&&'),
+                        'sudo', 'chmod', '1777', '/tmp/cmnt/', run.Raw('&&'),
+                        'sudo', 'umount', '/tmp/cmnt/', run.Raw('&&'),
+                        'rm', '-rf', '/tmp/cmnt',
+                        ],
+                    )
+        else:
+            # NOTE(review): this branch also runs for non-first roles that
+            # use the default backend, i.e. with backend == "/" -- confirm
+            # that chown/chmod of "/" on the remote is intended.
+            remote.run(
+                    args=[
+                        'sudo', 'chown', 'ubuntu:ubuntu', backend, run.Raw('&&'),
+                        'sudo', 'chmod', '1777', backend,
+                        ],
+                    )
+
+        # Write an smb.conf exposing a single share for this role.
+        teuthology.sudo_write_file(remote, "/usr/local/samba/etc/smb.conf", """
+[global]
+  workgroup = WORKGROUP
+  netbios name = DOMAIN
+
+[{unc}]
+  path = {backend}
+  {extras}
+  writeable = yes
+  valid users = ubuntu
+""".format(extras=confextras, unc=unc, backend=backend))
+
+        # create ubuntu user
+        # (first smbpasswd -e; only if that fails, pipe "ubuntu" twice
+        # into smbpasswd -s -a as the password)
+        remote.run(
+            args=[
+                'sudo', '/usr/local/samba/bin/smbpasswd', '-e', 'ubuntu',
+                run.Raw('||'),
+                'printf', run.Raw('"ubuntu\nubuntu\n"'),
+                run.Raw('|'),
+                'sudo', '/usr/local/samba/bin/smbpasswd', '-s', '-a', 'ubuntu'
+            ])
+
+        smbd_cmd = [
+                'sudo',
+                'daemon-helper',
+                'term',
+                'nostdin',
+                '/usr/local/samba/sbin/smbd',
+                '-F',
+                ]
+        ctx.daemons.add_daemon(remote, 'smbd', id_,
+                               args=smbd_cmd,
+                               logger=log.getChild("smbd.{id_}".format(id_=id_)),
+                               stdin=run.PIPE,
+                               wait=False,
+                               )
+
+        # let smbd initialize, probably a better way...
+        # (this fixed sleep happens once per samba server)
+        seconds_to_sleep = 100
+        log.info('Sleeping for %s  seconds...' % seconds_to_sleep)
+        time.sleep(seconds_to_sleep)
+        log.info('Sleeping stopped...')
+
+    try:
+        yield
+    finally:
+        log.info('Stopping smbd processes...')
+        # Try to stop every daemon, remembering the last failure so it
+        # can be re-raised once all of them have been attempted.
+        exc_info = (None, None, None)
+        for d in ctx.daemons.iter_daemons_of_role('smbd'):
+            try:
+                d.stop()
+            except (run.CommandFailedError,
+                    run.CommandCrashedError,
+                    run.ConnectionLostError):
+                exc_info = sys.exc_info()
+                log.exception('Saw exception from %s.%s', d.role, d.id_)
+        # NOTE(review): re-raising here skips the file cleanup below.
+        if exc_info != (None, None, None):
+            raise exc_info[0], exc_info[1], exc_info[2]
+
+        for id_, remote in samba_servers:
+            remote.run(
+                args=[
+                    'sudo',
+                    'rm', '-rf',
+                    '/usr/local/samba/etc/smb.conf',
+                    '/usr/local/samba/private/*',
+                    '/usr/local/samba/var/run/',
+                    '/usr/local/samba/var/locks',
+                    '/usr/local/samba/var/lock',
+                    ],
+                )
+            # make sure daemons are gone
+            try:
+                # Keep sending SIGKILL until killall finds no smbd left.
+                remote.run(
+                    args=[
+                        'while',
+                        'sudo', 'killall', '-9', 'smbd',
+                        run.Raw(';'),
+                        'do', 'sleep', '1',
+                        run.Raw(';'),
+                        'done',
+                        ],
+                    )
+
+                # NOTE(review): `backend` is whatever the last iteration of
+                # the setup loop left behind, not this server's own backend,
+                # and is undefined if there were no samba servers.  The
+                # lsof/fuser calls are diagnostics only (check_status=False).
+                remote.run(
+                    args=[
+                        'sudo',
+                        'lsof',
+                        backend,
+                        ],
+                    check_status=False
+                    )
+                remote.run(
+                    args=[
+                        'sudo',
+                        'fuser',
+                        '-M',
+                        backend,
+                        ],
+                    check_status=False
+                    )
+            except Exception:
+                # Best-effort cleanup: log the failure and carry on.
+                log.exception("Saw exception")
+                pass
diff --git a/src/ceph/qa/tasks/scrub.py b/src/ceph/qa/tasks/scrub.py
new file mode 100644
index 0000000..9800d1e
--- /dev/null
+++ b/src/ceph/qa/tasks/scrub.py
@@ -0,0 +1,117 @@
+"""
+Scrub osds
+"""
+import contextlib
+import gevent
+import logging
+import random
+import time
+
+import ceph_manager
+from teuthology import misc as teuthology
+
+log = logging.getLogger(__name__)
+
+@contextlib.contextmanager
+def task(ctx, config):
+    """
+    Scrub a randomly chosen OSD at a fixed interval while the nested
+    tasks run.
+
+    Accepted configuration:
+
+    scrub:
+        frequency: <seconds between scrubs>
+        deep: <bool for deepness>
+
+    For instance:
+
+    tasks:
+    - ceph:
+    - scrub:
+        frequency: 30
+        deep: 0
+    """
+    config = {} if config is None else config
+    assert isinstance(config, dict), \
+        'scrub task only accepts a dict for configuration'
+
+    log.info('Beginning scrub...')
+
+    mon_role = teuthology.get_first_mon(ctx, config)
+    (mon_remote,) = ctx.cluster.only(mon_role).remotes.iterkeys()
+
+    cluster_manager = ceph_manager.CephManager(
+        mon_remote,
+        ctx=ctx,
+        logger=log.getChild('ceph_manager'),
+        )
+
+    # Hold off until the number of 'up' OSDs reaches the number of osd
+    # instances in the cluster.
+    expected_osds = teuthology.num_instances_of_type(ctx.cluster, 'osd')
+    while True:
+        if len(cluster_manager.get_osd_status()['up']) >= expected_osds:
+            break
+        time.sleep(10)
+
+    # Constructing the Scrubber already spawns the scrubbing greenlet.
+    scrubber = Scrubber(cluster_manager, config)
+    try:
+        yield
+    finally:
+        log.info('joining scrub')
+        scrubber.do_join()
+
+class Scrubber:
+    """
+    Background scrubber; the scrubbing greenlet is started during
+    initialization.
+    """
+    def __init__(self, manager, config):
+        """
+        Wait for a clean cluster, remember the up OSDs and spawn the
+        scrubbing greenlet.
+        """
+        self.ceph_manager = manager
+        self.ceph_manager.wait_for_clean()
+
+        self.osds = self.ceph_manager.get_osd_status()['up']
+
+        if config is None:
+            self.config = dict()
+        else:
+            self.config = config
+
+            def tmp(x):
+                """Local display"""
+                print x
+            self.log = tmp
+
+        self.stopping = False
+
+        log.info("spawning thread")
+
+        self.thread = gevent.spawn(self.do_scrub)
+
+    def do_join(self):
+        """Ask the scrubbing greenlet to stop and wait for it to finish"""
+        self.stopping = True
+        self.thread.get()
+
+    def do_scrub(self):
+        """Scrub a random up OSD every `frequency` seconds until stopped"""
+        frequency = self.config.get("frequency", 30)
+        # "deep" is read once, so the command is the same for every round
+        cmd = 'deep-scrub' if self.config.get("deep", 0) else 'scrub'
+
+        log.info("stopping %s" % self.stopping)
+
+        while not self.stopping:
+            target = str(random.choice(self.osds))
+
+            log.info('%sbing %s' % (cmd, target))
+            self.ceph_manager.raw_cluster_cmd('osd', cmd, target)
+
+            time.sleep(frequency)
diff --git a/src/ceph/qa/tasks/scrub_test.py b/src/ceph/qa/tasks/scrub_test.py
new file mode 100644
index 0000000..a545c9b
--- /dev/null
+++ b/src/ceph/qa/tasks/scrub_test.py
@@ -0,0 +1,412 @@
+"""Scrub testing"""
+from cStringIO import StringIO
+
+import contextlib
+import json
+import logging
+import os
+import time
+import tempfile
+
+import ceph_manager
+from teuthology import misc as teuthology
+
+log = logging.getLogger(__name__)
+
+
+def wait_for_victim_pg(manager):
+    """
+    Poll the PG stats until some PG holds data.
+
+    :returns: tuple of (pgid, acting set) of the first PG with
+              num_bytes > 0
+    """
+    while True:
+        for pg_stat in manager.get_pg_stats():
+            if pg_stat['stat_sum']['num_bytes'] > 0:
+                return pg_stat['pgid'], pg_stat['acting']
+        # nothing written yet; look again in a few seconds
+        time.sleep(3)
+
+
+def find_victim_object(ctx, pg, osd):
+    """
+    Pick an object of the PG on the given OSD that can be tampered with.
+
+    :param ctx: teuthology context, used to look up the remote of the OSD
+    :param pg: PG id whose '<pg>_head' directory is listed
+    :param osd: numeric OSD id
+    :returns: tuple of (osd remote, path of the object under the OSD's
+              'fuse' directory, object name)
+    :raises AssertionError: if the listing contains no candidate object
+    """
+    (osd_remote,) = ctx.cluster.only('osd.%d' % osd).remotes.iterkeys()
+    data_path = os.path.join(
+        '/var/lib/ceph/osd',
+        'ceph-{id}'.format(id=osd),
+        'fuse',
+        '{pg}_head'.format(pg=pg),
+        'all',
+        )
+
+    # fuzz time
+    with contextlib.closing(StringIO()) as ls_fp:
+        osd_remote.run(
+            args=['sudo', 'ls', data_path],
+            stdout=ls_fp,
+        )
+        ls_out = ls_fp.getvalue()
+
+    # find an object file we can mess with (and not the pg info object).
+    # Blank lines (such as the one following the trailing newline of the
+    # ls output) are skipped, and next() gets a default so that a listing
+    # without candidates trips the assert instead of raising StopIteration.
+    osdfilename = next((line for line in ls_out.split('\n')
+                        if line and not line.endswith('::::head#')), None)
+    assert osdfilename is not None, \
+        'no object found in {path}'.format(path=data_path)
+
+    # Get actual object name from osd stored filename
+    objname = osdfilename.split(':')[4]
+    return osd_remote, os.path.join(data_path, osdfilename), objname
+
+
+def corrupt_file(osd_remote, path):
+    """Overwrite the first byte of <path>/data with a \\0 byte."""
+    # dd copies a single byte from /dev/zero; conv=notrunc keeps the rest
+    # of the file in place
+    target = 'of=%s/data' % path
+    dd_cmd = ['sudo', 'dd',
+              'if=/dev/zero',
+              target,
+              'bs=1', 'count=1', 'conv=notrunc']
+    osd_remote.run(args=dd_cmd)
+
+
+def get_pgnum(pgid):
+    """Return the part of a '<pool>.<pgnum>' PG id after the first dot."""
+    pieces = pgid.split('.', 1)
+    assert len(pieces) == 2
+    return pieces[-1]
+
+
+def deep_scrub(manager, victim, pool):
+    """Deep-scrub the victim PG and require its state to be inconsistent."""
+    manager.do_pg_scrub(pool, get_pgnum(victim), 'deep-scrub')
+
+    pg_state = manager.get_single_pg_stats(victim)['state']
+    assert '+inconsistent' in pg_state
+
+
+def repair(manager, victim, pool):
+    """Repair the victim PG and require it to be no longer inconsistent."""
+    manager.do_pg_scrub(pool, get_pgnum(victim), 'repair')
+
+    pg_state = manager.get_single_pg_stats(victim)['state']
+    assert '+inconsistent' not in pg_state
+
+
+def test_repair_corrupted_obj(ctx, manager, pg, osd_remote, obj_path, pool):
+    """
+    Corrupt the first byte of the object's data on osd_remote, check that
+    a deep-scrub flags the PG as inconsistent, then check that a repair
+    clears the inconsistency.
+    """
+    corrupt_file(osd_remote, obj_path)
+    deep_scrub(manager, pg, pool)
+    repair(manager, pg, pool)
+
+
+def test_repair_bad_omap(ctx, manager, pg, osd, objname):
+    """
+    Tamper with the omap key, value and header of objname through the admin
+    socket of one OSD, verify that a deep-scrub reports the PG as
+    inconsistent, undo the changes and verify that a repair leaves the PG
+    consistent.
+    """
+    # Test deep-scrub with various omap modifications
+    # Modify omap on specific osd
+    log.info('fuzzing omap of %s' % objname)
+    manager.osd_admin_socket(osd, ['rmomapkey', 'rbd', objname, 'key'])
+    manager.osd_admin_socket(osd, ['setomapval', 'rbd', objname,
+                                   'badkey', 'badval'])
+    manager.osd_admin_socket(osd, ['setomapheader', 'rbd', objname, 'badhdr'])
+
+    deep_scrub(manager, pg, 'rbd')
+    # Please note that the repair here is erroneous: it rewrites the correct
+    # omap digest and data digest on the replicas with the corresponding
+    # digests from the primary osd, which is hosting the victim object (see
+    # find_victim_object()).
+    # So we need to either put this test at the end of this task, or undo
+    # the mess-up manually before the "repair()", which then just ensures
+    # that the cleanup is sane. Otherwise the succeeding tests will fail if
+    # they try to set "badkey" in the hope of getting an "inconsistent" pg
+    # with a deep-scrub.
+    manager.osd_admin_socket(osd, ['setomapheader', 'rbd', objname, 'hdr'])
+    manager.osd_admin_socket(osd, ['rmomapkey', 'rbd', objname, 'badkey'])
+    manager.osd_admin_socket(osd, ['setomapval', 'rbd', objname,
+                                   'key', 'val'])
+    repair(manager, pg, 'rbd')
+
+
+class MessUp:
+    """
+    Context managers that each damage one copy of an object and undo the
+    damage after the body of the "with" statement has finished.
+
+    Every manager yields a tuple of check names; in this file they are
+    consumed by test_list_inconsistent_obj(), which feeds them to
+    InconsistentObjChecker.run().
+
+    NOTE: none of the managers wraps its yield in try/finally, so the undo
+    step is skipped when the body of the "with" statement raises.
+    """
+    def __init__(self, manager, osd_remote, pool, osd_id,
+                 obj_name, obj_path, omap_key, omap_val):
+        """
+        :param manager: object providing osd_admin_socket()
+        :param osd_remote: remote on which the file commands are run
+        :param pool: pool name passed to the omap admin socket commands
+        :param osd_id: id of the OSD whose admin socket is used
+        :param obj_name: name of the object passed to the omap commands
+        :param obj_path: directory of the object; its payload is the
+                         'data' entry inside it
+        :param omap_key: omap key that rm_omap()/change_omap() act on
+        :param omap_val: value restored for omap_key afterwards
+        """
+        self.manager = manager
+        self.osd = osd_remote
+        self.pool = pool
+        self.osd_id = osd_id
+        self.obj = obj_name
+        self.path = obj_path
+        self.omap_key = omap_key
+        self.omap_val = omap_val
+
+    @contextlib.contextmanager
+    def _test_with_file(self, messup_cmd, *checks):
+        """
+        Back up <path>/data, run messup_cmd on the remote, yield the check
+        names, then restore the backup.
+
+        :param messup_cmd: command line as a single string; it is split on
+                           whitespace before being run
+        :param checks: names of the checks to yield to the caller
+        """
+        # the name is generated on the local host but used as a path in the
+        # commands run on the remote
+        temp = tempfile.mktemp()
+        backup_cmd = ['sudo', 'cp', os.path.join(self.path, 'data'), temp]
+        self.osd.run(args=backup_cmd)
+        self.osd.run(args=messup_cmd.split())
+        yield checks
+        # the mess-up may have removed the object directory (see remove());
+        # check_status=False presumably tolerates the failure of mkdir when
+        # the directory still exists
+        create_cmd = ['sudo', 'mkdir', self.path]
+        self.osd.run(args=create_cmd, check_status=False)
+        restore_cmd = ['sudo', 'cp', temp, os.path.join(self.path, 'data')]
+        self.osd.run(args=restore_cmd)
+
+    def remove(self):
+        """Run rmdir on the object directory; expects 'missing'"""
+        cmd = 'sudo rmdir {path}'.format(path=self.path)
+        return self._test_with_file(cmd, 'missing')
+
+    def append(self):
+        """Append one zero byte to the data; expects digest and size errors"""
+        cmd = 'sudo dd if=/dev/zero of={path}/data bs=1 count=1 ' \
+              'conv=notrunc oflag=append'.format(path=self.path)
+        return self._test_with_file(cmd,
+                                    'data_digest_mismatch',
+                                    'size_mismatch')
+
+    def truncate(self):
+        """Empty the data via dd from /dev/null; expects digest and size errors"""
+        cmd = 'sudo dd if=/dev/null of={path}/data'.format(path=self.path)
+        return self._test_with_file(cmd,
+                                    'data_digest_mismatch',
+                                    'size_mismatch')
+
+    def change_obj(self):
+        """Zero the first byte of the data; expects a data digest error"""
+        cmd = 'sudo dd if=/dev/zero of={path}/data bs=1 count=1 ' \
+              'conv=notrunc'.format(path=self.path)
+        return self._test_with_file(cmd,
+                                    'data_digest_mismatch')
+
+    @contextlib.contextmanager
+    def rm_omap(self):
+        """Remove omap_key through the admin socket, then set it again"""
+        cmd = ['rmomapkey', self.pool, self.obj, self.omap_key]
+        self.manager.osd_admin_socket(self.osd_id, cmd)
+        yield ('omap_digest_mismatch',)
+        cmd = ['setomapval', self.pool, self.obj,
+               self.omap_key, self.omap_val]
+        self.manager.osd_admin_socket(self.osd_id, cmd)
+
+    @contextlib.contextmanager
+    def add_omap(self):
+        """Add the omap entry 'badkey' through the admin socket, then remove it"""
+        cmd = ['setomapval', self.pool, self.obj, 'badkey', 'badval']
+        self.manager.osd_admin_socket(self.osd_id, cmd)
+        yield ('omap_digest_mismatch',)
+        cmd = ['rmomapkey', self.pool, self.obj, 'badkey']
+        self.manager.osd_admin_socket(self.osd_id, cmd)
+
+    @contextlib.contextmanager
+    def change_omap(self):
+        """Set omap_key to 'badval', then set it back to omap_val"""
+        cmd = ['setomapval', self.pool, self.obj, self.omap_key, 'badval']
+        self.manager.osd_admin_socket(self.osd_id, cmd)
+        yield ('omap_digest_mismatch',)
+        cmd = ['setomapval', self.pool, self.obj, self.omap_key, self.omap_val]
+        self.manager.osd_admin_socket(self.osd_id, cmd)
+
+
+class InconsistentObjChecker:
+    """
+    Check the returned inconsistents/inconsistent info
+
+    An instance is bound to the OSD whose copy was damaged, the acting set
+    of the PG and the object name. Each public check takes one entry of the
+    'inconsistents' list and asserts that the damaged OSD, and only that
+    OSD, stands out.
+    """
+
+    def __init__(self, osd, acting, obj_name):
+        """
+        :param osd: id of the OSD holding the damaged copy
+        :param acting: acting set of the PG; must contain osd
+        :param obj_name: expected name of the inconsistent object
+        """
+        self.osd = osd
+        self.acting = acting
+        self.obj = obj_name
+        assert self.osd in self.acting
+
+    def basic_checks(self, inc):
+        """Verify object name, snap and the number of reported shards"""
+        assert inc['object']['name'] == self.obj
+        assert inc['object']['snap'] == "head"
+        assert len(inc['shards']) == len(self.acting), \
+            "the number of returned shard does not match with the acting set"
+
+    def run(self, check, inc):
+        """Run the check method named `check` on `inc`"""
+        func = getattr(self, check)
+        func(inc)
+
+    def _check_errors(self, inc, err_name):
+        """
+        Assert that err_name is listed in the errors of the shard of
+        self.osd and in the errors of no other shard.
+        """
+        bad_found = False
+        good_found = False
+        for shard in inc['shards']:
+            log.info('shard = %r' % shard)
+            log.info('err = %s' % err_name)
+            assert 'osd' in shard
+            osd = shard['osd']
+            err = err_name in shard['errors']
+            if osd == self.osd:
+                assert bad_found is False, \
+                    "multiple entries found for the given OSD"
+                assert err is True, \
+                    "Didn't find '{err}' in errors".format(err=err_name)
+                bad_found = True
+            else:
+                assert osd in self.acting, "shard not in acting set"
+                # an undamaged shard must not carry the error, so hitting
+                # this means the error showed up where it was not expected
+                assert err is False, \
+                    "Unexpected '{err}' in errors".format(err=err_name)
+                good_found = True
+        assert bad_found is True, \
+            "Shard for osd.{osd} not found".format(osd=self.osd)
+        assert good_found is True, \
+            "No other acting shards found"
+
+    def _check_attrs(self, inc, attr_name):
+        """
+        Assert that all shards other than the one of self.osd agree on
+        attr_name and that the shard of self.osd differs from them.
+        """
+        bad_attr = None
+        good_attr = None
+        for shard in inc['shards']:
+            log.info('shard = %r' % shard)
+            log.info('attr = %s' % attr_name)
+            assert 'osd' in shard
+            osd = shard['osd']
+            # False (not None) marks a missing attribute, so that the
+            # "is not None" checks below still see the shard as visited
+            attr = shard.get(attr_name, False)
+            if osd == self.osd:
+                assert bad_attr is None, \
+                    "multiple entries found for the given OSD"
+                bad_attr = attr
+            else:
+                assert osd in self.acting, "shard not in acting set"
+                assert good_attr is None or good_attr == attr, \
+                    "multiple good attrs found"
+                good_attr = attr
+        assert bad_attr is not None, \
+            "bad {attr} not found".format(attr=attr_name)
+        assert good_attr is not None, \
+            "good {attr} not found".format(attr=attr_name)
+        assert good_attr != bad_attr, \
+            "bad attr is identical to the good ones: " \
+            "{0} == {1}".format(good_attr, bad_attr)
+
+    def data_digest_mismatch(self, inc):
+        """Check a 'data_digest_mismatch' error and the 'data_digest' attrs"""
+        assert 'data_digest_mismatch' in inc['errors']
+        self._check_attrs(inc, 'data_digest')
+
+    def missing(self, inc):
+        """Check that only the shard of self.osd is reported as 'missing'"""
+        assert 'missing' in inc['union_shard_errors']
+        self._check_errors(inc, 'missing')
+
+    def size_mismatch(self, inc):
+        """Check a 'size_mismatch' error and the 'size' attrs"""
+        assert 'size_mismatch' in inc['errors']
+        self._check_attrs(inc, 'size')
+
+    def omap_digest_mismatch(self, inc):
+        """Check an 'omap_digest_mismatch' error and the 'omap_digest' attrs"""
+        assert 'omap_digest_mismatch' in inc['errors']
+        self._check_attrs(inc, 'omap_digest')
+
+
+def test_list_inconsistent_obj(ctx, manager, osd_remote, pg, acting, osd_id,
+                               obj_name, obj_path):
+    """
+    Damage one copy of the object in several ways and, for each of them,
+    verify what "rados list-inconsistent-pg" and
+    "rados list-inconsistent-obj" report after a deep-scrub.
+    """
+    mon = manager.controller
+    pool = 'rbd'
+    omap_key = 'key'
+    omap_val = 'val'
+
+    def rados_json(command):
+        """Run a command line via mon.run() and decode its JSON output"""
+        with contextlib.closing(StringIO()) as out:
+            mon.run(args=command.split(), stdout=out)
+            return json.loads(out.getvalue())
+
+    manager.do_rados(mon, ['-p', pool, 'setomapval', obj_name,
+                           omap_key, omap_val])
+    # Update missing digests, requires "osd deep scrub update digest min age: 0"
+    manager.do_pg_scrub(pool, get_pgnum(pg), 'deep-scrub')
+
+    messup = MessUp(manager, osd_remote, pool, osd_id, obj_name, obj_path,
+                    omap_key, omap_val)
+    scenarios = [messup.rm_omap, messup.add_omap, messup.change_omap,
+                 messup.append, messup.truncate, messup.change_obj,
+                 messup.remove]
+    for scenario in scenarios:
+        with scenario() as checks:
+            deep_scrub(manager, pg, pool)
+
+            # exactly the victim PG must be listed as inconsistent
+            bad_pgs = rados_json('rados list-inconsistent-pg {pool} '
+                                 '--format=json'.format(pool=pool))
+            assert bad_pgs == [pg]
+
+            # and exactly one object within it
+            report = rados_json('rados list-inconsistent-obj {pg} '
+                                '--format=json'.format(pg=pg))
+            assert len(report['inconsistents']) == 1
+
+            checker = InconsistentObjChecker(osd_id, acting, obj_name)
+            inc_obj = report['inconsistents'][0]
+            log.info('inc = %r', inc_obj)
+            checker.basic_checks(inc_obj)
+            for check_name in checks:
+                checker.run(check_name, inc_obj)
+
+
+def task(ctx, config):
+    """
+    Test [deep] scrub
+
+    Writes some data, picks a PG holding data and an object in it on the
+    first OSD of its acting set, then runs the corruption/repair and
+    list-inconsistent tests of this module against that object.
+
+    tasks:
+    - chef:
+    - install:
+    - ceph:
+        log-whitelist:
+        - '!= data_digest'
+        - '!= omap_digest'
+        - '!= size'
+        - deep-scrub 0 missing, 1 inconsistent objects
+        - deep-scrub [0-9]+ errors
+        - repair 0 missing, 1 inconsistent objects
+        - repair [0-9]+ errors, [0-9]+ fixed
+        - shard [0-9]+ missing
+        - deep-scrub 1 missing, 1 inconsistent objects
+        - does not match object info size
+        - attr name mistmatch
+        - deep-scrub 1 missing, 0 inconsistent objects
+        - failed to pick suitable auth object
+      conf:
+        osd:
+          osd deep scrub update digest min age: 0
+    - scrub_test:
+    """
+    if config is None:
+        config = {}
+    assert isinstance(config, dict), \
+        'scrub_test task only accepts a dict for configuration'
+    first_mon = teuthology.get_first_mon(ctx, config)
+    (mon,) = ctx.cluster.only(first_mon).remotes.iterkeys()
+
+    num_osds = teuthology.num_instances_of_type(ctx.cluster, 'osd')
+    log.info('num_osds is %s' % num_osds)
+
+    manager = ceph_manager.CephManager(
+        mon,
+        ctx=ctx,
+        logger=log.getChild('ceph_manager'),
+        )
+
+    while len(manager.get_osd_status()['up']) < num_osds:
+        time.sleep(10)
+
+    # the object files are reached through the 'fuse' directory of each
+    # OSD (see find_victim_object()), so enable it everywhere
+    for i in range(num_osds):
+        manager.raw_cluster_cmd('tell', 'osd.%d' % i, 'injectargs',
+                                '--', '--osd-objectstore-fuse')
+    manager.flush_pg_stats(range(num_osds))
+    manager.wait_for_clean()
+
+    # write some data
+    p = manager.do_rados(mon, ['-p', 'rbd', 'bench', '--no-cleanup', '1',
+                               'write', '-b', '4096'])
+    log.info('err is %d' % p.exitstatus)
+
+    # wait for some PG to have data that we can mess with
+    pg, acting = wait_for_victim_pg(manager)
+    osd = acting[0]
+
+    osd_remote, obj_path, obj_name = find_victim_object(ctx, pg, osd)
+    # keep the result of each rados call, so that the status logged below
+    # is the one of that call and not the stale one of the bench run
+    p = manager.do_rados(mon, ['-p', 'rbd', 'setomapval', obj_name,
+                               'key', 'val'])
+    log.info('err is %d' % p.exitstatus)
+    p = manager.do_rados(mon, ['-p', 'rbd', 'setomapheader', obj_name,
+                               'hdr'])
+    log.info('err is %d' % p.exitstatus)
+
+    # Update missing digests, requires "osd deep scrub update digest min age: 0"
+    pgnum = get_pgnum(pg)
+    manager.do_pg_scrub('rbd', pgnum, 'deep-scrub')
+
+    log.info('messing with PG %s on osd %d' % (pg, osd))
+    test_repair_corrupted_obj(ctx, manager, pg, osd_remote, obj_path, 'rbd')
+    test_repair_bad_omap(ctx, manager, pg, osd, obj_name)
+    test_list_inconsistent_obj(ctx, manager, osd_remote, pg, acting, osd,
+                               obj_name, obj_path)
+    log.info('test successful!')
+
+    # shut down fuse mount
+    for i in range(num_osds):
+        manager.raw_cluster_cmd('tell', 'osd.%d' % i, 'injectargs',
+                                '--', '--no-osd-objectstore-fuse')
+    time.sleep(5)
+    log.info('done')
diff --git a/src/ceph/qa/tasks/swift.py b/src/ceph/qa/tasks/swift.py
new file mode 100644
index 0000000..28f75dd
--- /dev/null
+++ b/src/ceph/qa/tasks/swift.py
@@ -0,0 +1,263 @@
+"""
+Test Swift API
+"""
+from cStringIO import StringIO
+from configobj import ConfigObj
+import base64
+import contextlib
+import logging
+import os
+
+from teuthology import misc as teuthology
+from teuthology import contextutil
+from teuthology.config import config as teuth_config
+from teuthology.orchestra import run
+from teuthology.orchestra.connection import split_user
+
+log = logging.getLogger(__name__)
+
+
+@contextlib.contextmanager
+def download(ctx, config):
+    """
+    Clone the swift repository into the test directory of every client and
+    delete the checkout again on exit.
+
+    :param config: list of client roles
+    """
+    testdir = teuthology.get_testdir(ctx)
+    assert isinstance(config, list)
+    log.info('Downloading swift...')
+    for client in config:
+        clone_cmd = [
+            'git', 'clone',
+            teuth_config.ceph_git_base_url + 'swift.git',
+            '{tdir}/swift'.format(tdir=testdir),
+            ]
+        ctx.cluster.only(client).run(args=clone_cmd)
+    try:
+        yield
+    finally:
+        log.info('Removing swift...')
+        testdir = teuthology.get_testdir(ctx)
+        for client in config:
+            cleanup_cmd = ['rm', '-rf', '{tdir}/swift'.format(tdir=testdir)]
+            ctx.cluster.only(client).run(args=cleanup_cmd)
+
+def _config_user(testswift_conf, account, user, suffix):
+    """
+    Fill in the func_test settings of one swift user, leaving any value
+    that is already present untouched.
+
+    :param testswift_conf: configuration holding a 'func_test' section
+    :param account: Swift account
+    :param user: User name
+    :param suffix: suffix appended to every setting name
+    """
+    func_test = testswift_conf['func_test']
+
+    def key(name):
+        """Setting name carrying the user's suffix"""
+        return '{name}{s}'.format(name=name, s=suffix)
+
+    email = '{account}+test@test.test'.format(account=account)
+    display_name = 'Mr. {account} {user}'.format(account=account, user=user)
+    # a fresh random password is generated on every call, but it is only
+    # stored when none is configured yet
+    password = base64.b64encode(os.urandom(40))
+
+    func_test.setdefault(key('account'), account)
+    func_test.setdefault(key('username'), user)
+    func_test.setdefault(key('email'), email)
+    func_test.setdefault(key('display_name'), display_name)
+    func_test.setdefault(key('password'), password)
+
+@contextlib.contextmanager
+def create_users(ctx, config):
+    """
+    Create the rgw users 'foo' and 'bar' for every client via
+    radosgw-admin, and remove them (purging their data) on exit.
+
+    :param config: dict with 'clients' (the client roles) and
+                   'testswift_conf' (per-client swift test configuration)
+    """
+    assert isinstance(config, dict)
+    log.info('Creating rgw users...')
+    testdir = teuthology.get_testdir(ctx)
+    users = {'': 'foo', '2': 'bar'}
+    for client in config['clients']:
+        cluster_name, daemon_type, client_id = teuthology.split_role(client)
+        testswift_conf = config['testswift_conf'][client]
+        for suffix, user in users.iteritems():
+            rgw_account = '{user}.{client}'.format(user=user, client=client)
+            _config_user(testswift_conf, rgw_account, user, suffix)
+
+            # read the values back: settings present before the call above
+            # take precedence over the generated ones
+            func_test = testswift_conf['func_test']
+            subuser = '{account}:{user}'.format(
+                account=func_test['account{s}'.format(s=suffix)],
+                user=user)
+            create_cmd = [
+                'adjust-ulimits',
+                'ceph-coverage',
+                '{tdir}/archive/coverage'.format(tdir=testdir),
+                'radosgw-admin',
+                '-n', client,
+                '--cluster', cluster_name,
+                'user', 'create',
+                '--subuser', subuser,
+                '--display-name',
+                func_test['display_name{s}'.format(s=suffix)],
+                '--secret', func_test['password{s}'.format(s=suffix)],
+                '--email', func_test['email{s}'.format(s=suffix)],
+                '--key-type', 'swift',
+                '--access', 'full',
+            ]
+            ctx.cluster.only(client).run(args=create_cmd)
+    try:
+        yield
+    finally:
+        for client in config['clients']:
+            for user in users.itervalues():
+                uid = '{user}.{client}'.format(user=user, client=client)
+                cluster_name, daemon_type, client_id = \
+                    teuthology.split_role(client)
+                remove_cmd = [
+                    'adjust-ulimits',
+                    'ceph-coverage',
+                    '{tdir}/archive/coverage'.format(tdir=testdir),
+                    'radosgw-admin',
+                    '-n', client,
+                    '--cluster', cluster_name,
+                    'user', 'rm',
+                    '--uid', uid,
+                    '--purge-data',
+                    ]
+                ctx.cluster.only(client).run(args=remove_cmd)
+
+@contextlib.contextmanager
+def configure(ctx, config):
+    """
+    Configure rgw and Swift
+
+    For every client: choose the auth host, run ./bootstrap in the swift
+    checkout and write the client's test configuration to
+    <testdir>/archive/testswift.<client>.conf on the remote.
+
+    :param config: dict with 'clients' (client role -> properties or None)
+                   and 'testswift_conf' (per-client configuration)
+    """
+    assert isinstance(config, dict)
+    log.info('Configuring testswift...')
+    testdir = teuthology.get_testdir(ctx)
+    for client, properties in config['clients'].iteritems():
+        log.info('client={c}'.format(c=client))
+        log.info('config={c}'.format(c=config))
+        testswift_conf = config['testswift_conf'][client]
+        if properties is not None and 'rgw_server' in properties:
+            host = None
+            # NOTE(review): pairs the i-th target with the i-th role list,
+            # which presumably relies on the iteration order of the targets
+            # dict matching the order of ctx.config['roles'] -- confirm
+            for target, roles in zip(ctx.config['targets'].iterkeys(), ctx.config['roles']):
+                log.info('roles: ' + str(roles))
+                log.info('target: ' + str(target))
+                # there is no break: the last target whose roles contain
+                # the rgw_server wins
+                if properties['rgw_server'] in roles:
+                    _, host = split_user(target)
+            assert host is not None, "Invalid client specified as the rgw_server"
+            testswift_conf['func_test']['auth_host'] = host
+        else:
+            testswift_conf['func_test']['auth_host'] = 'localhost'
+
+        log.info(client)
+        (remote,) = ctx.cluster.only(client).remotes.keys()
+        remote.run(
+            args=[
+                'cd',
+                '{tdir}/swift'.format(tdir=testdir),
+                run.Raw('&&'),
+                './bootstrap',
+                ],
+            )
+        # serialize the configuration in memory, then ship it to the remote
+        conf_fp = StringIO()
+        testswift_conf.write(conf_fp)
+        teuthology.write_file(
+            remote=remote,
+            path='{tdir}/archive/testswift.{client}.conf'.format(tdir=testdir, client=client),
+            data=conf_fp.getvalue(),
+            )
+    # nothing is undone on exit
+    yield
+
+
+@contextlib.contextmanager
+def run_tests(ctx, config):
+    """
+    Run the Swift functional tests with nosetests on every client.
+
+    :param config: dict mapping a client role to its settings (or None);
+                   'extra_args' of the settings are appended to the
+                   nosetests command line
+    """
+    assert isinstance(config, dict)
+    testdir = teuthology.get_testdir(ctx)
+    for client, client_config in config.iteritems():
+        conf_file = '{tdir}/archive/testswift.{client}.conf'.format(
+            tdir=testdir, client=client)
+        nose_cmd = [
+            'SWIFT_TEST_CONFIG_FILE=' + conf_file,
+            '{tdir}/swift/virtualenv/bin/nosetests'.format(tdir=testdir),
+            '-w',
+            '{tdir}/swift/test/functional'.format(tdir=testdir),
+            '-v',
+            '-a', '!fails_on_rgw',
+            ]
+        has_extra = client_config is not None and 'extra_args' in client_config
+        if has_extra:
+            nose_cmd.extend(client_config['extra_args'])
+
+        ctx.cluster.only(client).run(args=nose_cmd)
+    yield
+
+@contextlib.contextmanager
+def task(ctx, config):
+    """
+    Run the testswift suite against rgw.
+
+    All tests on all clients::
+
+        tasks:
+        - ceph:
+        - rgw:
+        - testswift:
+
+    Only on particular clients::
+
+        tasks:
+        - ceph:
+        - rgw: [client.0]
+        - testswift: [client.0]
+
+    Against a server on client.1::
+
+        tasks:
+        - ceph:
+        - rgw: [client.1]
+        - testswift:
+            client.0:
+              rgw_server: client.1
+
+    With extra arguments for nose (e.g. to run a certain test)::
+
+        tasks:
+        - ceph:
+        - rgw: [client.0]
+        - testswift:
+            client.0:
+              extra_args: ['test.functional.tests:TestFileUTF8', '-m', 'testCopy']
+            client.1:
+              extra_args: ['--exclude', 'TestFile']
+    """
+    assert config is None or isinstance(config, (list, dict)), \
+        "task testswift only supports a list or dictionary for configuration"
+    all_clients = ['client.{id}'.format(id=id_)
+                   for id_ in teuthology.all_roles_of_type(ctx.cluster, 'client')]
+    # normalize the configuration to a dict of client role -> settings
+    if config is None:
+        config = all_clients
+    if isinstance(config, list):
+        config = dict.fromkeys(config)
+    clients = config.keys()
+
+    log.info('clients={c}'.format(c=clients))
+
+    def default_conf():
+        """Fresh swift test configuration with the default auth settings"""
+        return ConfigObj(
+                indent_type='',
+                infile={
+                    'func_test':
+                        {
+                        'auth_port'      : 7280,
+                        'auth_ssl' : 'no',
+                        'auth_prefix' : '/auth/',
+                        },
+                    }
+                )
+
+    testswift_conf = dict((client, default_conf()) for client in clients)
+
+    user_config = dict(clients=clients, testswift_conf=testswift_conf)
+    swift_config = dict(clients=config, testswift_conf=testswift_conf)
+    # the body of the "with" is empty: all the work is done by the nested
+    # managers themselves
+    with contextutil.nested(
+        lambda: download(ctx=ctx, config=clients),
+        lambda: create_users(ctx=ctx, config=user_config),
+        lambda: configure(ctx=ctx, config=swift_config),
+        lambda: run_tests(ctx=ctx, config=config),
+        ):
+        pass
+    yield
diff --git a/src/ceph/qa/tasks/systemd.py b/src/ceph/qa/tasks/systemd.py
new file mode 100644
index 0000000..50471db
--- /dev/null
+++ b/src/ceph/qa/tasks/systemd.py
@@ -0,0 +1,142 @@
+"""
+Systemd test
+"""
+import contextlib
+import logging
+import re
+import time
+
+from cStringIO import StringIO
+from teuthology.orchestra import run
+from teuthology.misc import reconnect, get_first_mon, wait_until_healthy
+
+log = logging.getLogger(__name__)
+
+
+def _stop_check_start(remote, service, stopped_msg, failed_msg,
+                      log_status=False):
+    """
+    Stop one systemd unit, log whether it became inactive, start it again.
+
+    :param remote: remote on which systemctl is run
+    :param service: name of the systemd unit
+    :param stopped_msg: message logged when the unit is inactive
+    :param failed_msg: message logged otherwise
+    :param log_status: also log the output of 'systemctl status'
+    """
+    remote.run(args=['sudo', 'systemctl', 'status', service])
+    remote.run(args=['sudo', 'systemctl', 'stop', service])
+    time.sleep(4)  # immediate check will result in deactivating state
+    r = remote.run(args=['sudo', 'systemctl', 'status', service],
+                   stdout=StringIO(), check_status=False)
+    status = r.stdout.getvalue()
+    if log_status:
+        log.info(status)
+    # membership test: str.find() returns -1 (truthy) when nothing is found
+    if 'Active: inactive' in status:
+        log.info(stopped_msg)
+    else:
+        log.info(failed_msg)
+    remote.run(args=['sudo', 'systemctl', 'start', service])
+    time.sleep(4)
+
+
+@contextlib.contextmanager
+def task(ctx, config):
+    """
+      - tasks:
+          ceph-deploy:
+          systemd:
+
+    Test ceph systemd services can start, stop and restart and
+    check for any failed services and report back errors
+
+    On every remote ceph.target is stopped and started, followed by the
+    individual osd/mon/mgr/mds units found there. The outcome of these
+    checks is only logged. Finally all nodes are rebooted and the task
+    waits for the cluster to become healthy.
+    """
+    for remote, roles in ctx.cluster.remotes.iteritems():
+        remote.run(args=['sudo', 'ps', '-eaf', run.Raw('|'),
+                         'grep', 'ceph'])
+        r = remote.run(args=['sudo', 'systemctl', 'list-units', run.Raw('|'),
+                             'grep', 'ceph'], stdout=StringIO(),
+                       check_status=False)
+        units = r.stdout.getvalue()
+        log.info(units)
+        if 'failed' in units:
+            log.info("Ceph services in failed state")
+
+        # test overall service stop and start using ceph.target
+        # ceph.target tests are meant for ceph systemd tests
+        # and not actual process testing using 'ps'
+        log.info("Stopping all Ceph services")
+        remote.run(args=['sudo', 'systemctl', 'stop', 'ceph.target'])
+        r = remote.run(args=['sudo', 'systemctl', 'status', 'ceph.target'],
+                       stdout=StringIO(), check_status=False)
+        target_status = r.stdout.getvalue()
+        log.info(target_status)
+        log.info("Checking process status")
+        remote.run(args=['sudo', 'ps', '-eaf', run.Raw('|'),
+                         'grep', 'ceph'], stdout=StringIO())
+        # the unit state is part of the systemctl output, not of the ps
+        # listing, so check the status captured above
+        if 'Active: inactive' in target_status:
+            log.info("Sucessfully stopped all ceph services")
+        else:
+            log.info("Failed to stop ceph services")
+
+        log.info("Starting all Ceph services")
+        remote.run(args=['sudo', 'systemctl', 'start', 'ceph.target'])
+        r = remote.run(args=['sudo', 'systemctl', 'status', 'ceph.target'],
+                       stdout=StringIO())
+        target_status = r.stdout.getvalue()
+        log.info(target_status)
+        if 'Active: active' in target_status:
+            log.info("Sucessfully started all Ceph services")
+        else:
+            log.info("Failed to start Ceph services")
+        r = remote.run(args=['sudo', 'ps', '-eaf', run.Raw('|'),
+                             'grep', 'ceph'], stdout=StringIO())
+        ps_out = r.stdout.getvalue()
+        log.info(ps_out)
+        time.sleep(4)
+
+        # test individual services start stop
+        name = remote.shortname
+        mon_name = 'ceph-mon@' + name + '.service'
+        mds_name = 'ceph-mds@' + name + '.service'
+        mgr_name = 'ceph-mgr@' + name + '.service'
+        mon_role_name = 'mon.' + name
+        mds_role_name = 'mds.' + name
+        mgr_role_name = 'mgr.' + name
+        # the id of one running osd is taken from the process listing
+        m_osd = re.search(r'--id (\d+) --setuser ceph', ps_out)
+        if m_osd:
+            _stop_check_start(
+                remote,
+                'ceph-osd@{m}.service'.format(m=m_osd.group(1)),
+                "Sucessfully stopped single osd ceph service",
+                "Failed to stop ceph osd services",
+                log_status=True)
+        if mon_role_name in roles:
+            _stop_check_start(
+                remote, mon_name,
+                "Sucessfully stopped single mon ceph service",
+                "Failed to stop ceph mon service")
+        if mgr_role_name in roles:
+            _stop_check_start(
+                remote, mgr_name,
+                "Sucessfully stopped single ceph mgr service",
+                "Failed to stop ceph mgr service")
+        if mds_role_name in roles:
+            _stop_check_start(
+                remote, mds_name,
+                "Sucessfully stopped single ceph mds service",
+                "Failed to stop ceph mds service")
+
+    # reboot all nodes and verify the systemd units restart
+    # workunit that runs would fail if any of the systemd unit doesnt start
+    ctx.cluster.run(args='sudo reboot', wait=False, check_status=False)
+    # avoid immediate reconnect
+    time.sleep(120)
+    reconnect(ctx, 480)  # reconnect all nodes
+    # for debug info
+    ctx.cluster.run(args=['sudo', 'ps', '-eaf', run.Raw('|'),
+                          'grep', 'ceph'])
+    # wait for HEALTH_OK
+    mon = get_first_mon(ctx, config)
+    (mon_remote,) = ctx.cluster.only(mon).remotes.iterkeys()
+    wait_until_healthy(ctx, mon_remote, use_sudo=True)
+    yield
diff --git a/src/ceph/qa/tasks/tests/__init__.py b/src/ceph/qa/tasks/tests/__init__.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/src/ceph/qa/tasks/tests/__init__.py
diff --git a/src/ceph/qa/tasks/tests/test_buildpackages.py b/src/ceph/qa/tasks/tests/test_buildpackages.py
new file mode 100644
index 0000000..fed5aa0
--- /dev/null
+++ b/src/ceph/qa/tasks/tests/test_buildpackages.py
@@ -0,0 +1,170 @@
+# py.test -v -s tests/test_buildpackages.py
+
+from mock import patch, Mock
+
+from .. import buildpackages
+from teuthology import packaging
+
+def test_get_tag_branch_sha1():  # exercises buildpackages.get_tag_branch_sha1
+    gitbuilder = packaging.GitbuilderProject(
+        'ceph',
+        {
+            'os_type': 'centos',
+            'os_version': '7.0',
+        })
+    (tag, branch, sha1) = buildpackages.get_tag_branch_sha1(gitbuilder)
+    assert tag is None
+    assert branch is None
+    assert sha1 is not None
+    # a sha1 given in the config comes back unchanged
+    gitbuilder = packaging.GitbuilderProject(
+        'ceph',
+        {
+            'os_type': 'centos',
+            'os_version': '7.0',
+            'sha1': 'asha1',
+        })
+    (tag, branch, sha1) = buildpackages.get_tag_branch_sha1(gitbuilder)
+    assert tag is None
+    assert branch is None
+    assert sha1 == 'asha1'
+    # NOTE(review): Mock below is the class, not an instance -- confirm intent
+    remote = Mock
+    remote.arch = 'x86_64'
+    remote.os = Mock
+    remote.os.name = 'ubuntu'
+    remote.os.version = '14.04'
+    remote.os.codename = 'trusty'
+    remote.system_type = 'deb'
+    ctx = Mock
+    ctx.cluster = Mock
+    ctx.cluster.remotes = {remote: ['client.0']}
+    # patched check_output reports a tag ref: tag and sha1 are taken from it
+    expected_tag = 'v0.94.1'
+    expected_sha1 = 'expectedsha1'
+    def check_output(cmd, shell):
+        assert shell == True
+        return expected_sha1 + " refs/tags/" + expected_tag
+    with patch.multiple(
+            buildpackages,
+            check_output=check_output,
+    ):
+        gitbuilder = packaging.GitbuilderProject(
+            'ceph',
+            {
+                'os_type': 'centos',
+                'os_version': '7.0',
+                'sha1': 'asha1',
+                'all': {
+                    'tag': tag,
+                },
+            },
+            ctx = ctx,
+            remote = remote)
+        (tag, branch, sha1) = buildpackages.get_tag_branch_sha1(gitbuilder)
+        assert tag == expected_tag
+        assert branch is None
+        assert sha1 == expected_sha1
+    # patched check_output reports a branch ref: branch and sha1 come from it
+    expected_branch = 'hammer'
+    expected_sha1 = 'otherexpectedsha1'
+    def check_output(cmd, shell):
+        assert shell == True
+        return expected_sha1 + " refs/heads/" + expected_branch
+    with patch.multiple(
+            buildpackages,
+            check_output=check_output,
+    ):
+        gitbuilder = packaging.GitbuilderProject(
+            'ceph',
+            {
+                'os_type': 'centos',
+                'os_version': '7.0',
+                'sha1': 'asha1',
+                'all': {
+                    'branch': branch,
+                },
+            },
+            ctx = ctx,
+            remote = remote)
+        (tag, branch, sha1) = buildpackages.get_tag_branch_sha1(gitbuilder)
+        assert tag is None
+        assert branch == expected_branch
+        assert sha1 == expected_sha1
+
+def test_lookup_configs():  # exercises buildpackages.lookup_configs
+    expected_system_type = 'deb'
+    def make_remote():  # builds a fake ubuntu trusty x86_64 remote
+        remote = Mock()
+        remote.arch = 'x86_64'
+        remote.os = Mock()
+        remote.os.name = 'ubuntu'
+        remote.os.version = '14.04'
+        remote.os.codename = 'trusty'
+        remote.system_type = expected_system_type
+        return remote
+    ctx = Mock()
+    class cluster:  # minimal stand-in used as ctx.cluster
+        remote1 = make_remote()
+        remote2 = make_remote()
+        remotes = {
+            remote1: ['client.0'],
+            remote2: ['mon.a','osd.0'],
+        }
+        def only(self, role):  # maps a role name to the remote holding it
+            result = Mock()
+            if role in ('client.0',):
+                result.remotes = { cluster.remote1: None }
+            elif role in ('osd.0', 'mon.a'):
+                result.remotes = { cluster.remote2: None }
+            else:
+                result.remotes = None
+            return result
+    ctx.cluster = cluster()
+    ctx.config = {
+        'roles': [ ['client.0'], ['mon.a','osd.0'] ],
+    }
+
+    # nothing -> nothing
+    assert buildpackages.lookup_configs(ctx, {}) == []
+    assert buildpackages.lookup_configs(ctx, {1:[1,2,3]}) == []
+    assert buildpackages.lookup_configs(ctx, [[1,2,3]]) == []
+    assert buildpackages.lookup_configs(ctx, None) == []
+
+    #
+    # the overrides apply to the install task and to those
+    # install.upgrade roles that have no tag, branch or sha1
+    #
+    config = {
+        'overrides': {
+            'install': {
+                'ceph': {
+                    'sha1': 'overridesha1',
+                    'tag': 'overridetag',
+                    'branch': 'overridebranch',
+                },
+            },
+        },
+        'tasks': [
+            {
+                'install': {
+                    'sha1': 'installsha1',
+                },
+            },
+            {
+                'install.upgrade': {
+                    'osd.0': {
+                    },
+                    'client.0': {
+                        'sha1': 'client0sha1',
+                    },
+                },
+            }
+        ],
+    }
+    ctx.config = config
+    expected_configs = [{'branch': 'overridebranch', 'sha1': 'overridesha1', 'tag': 'overridetag'},
+                        {'project': 'ceph', 'branch': 'overridebranch', 'sha1': 'overridesha1', 'tag': 'overridetag'},
+                        {'project': 'ceph', 'sha1': 'client0sha1'}]
+
+    assert buildpackages.lookup_configs(ctx, config) == expected_configs
diff --git a/src/ceph/qa/tasks/tests/test_devstack.py b/src/ceph/qa/tasks/tests/test_devstack.py
new file mode 100644
index 0000000..117b307
--- /dev/null
+++ b/src/ceph/qa/tasks/tests/test_devstack.py
@@ -0,0 +1,48 @@
+from textwrap import dedent
+
+from .. import devstack
+
+
+class TestDevstack(object):  # unit tests for devstack.parse_os_table
+    def test_parse_os_table(self):
+        table_str = dedent("""
+            +---------------------+--------------------------------------+
+            |       Property      |                Value                 |
+            +---------------------+--------------------------------------+
+            |     attachments     |                  []                  |
+            |  availability_zone  |                 nova                 |
+            |       bootable      |                false                 |
+            |      created_at     |      2014-02-21T17:14:47.548361      |
+            | display_description |                 None                 |
+            |     display_name    |                 NAME                 |
+            |          id         | ffdbd1bb-60dc-4d95-acfe-88774c09ad3e |
+            |       metadata      |                  {}                  |
+            |         size        |                  1                   |
+            |     snapshot_id     |                 None                 |
+            |     source_volid    |                 None                 |
+            |        status       |               creating               |
+            |     volume_type     |                 None                 |
+            +---------------------+--------------------------------------+
+            """).strip()
+        expected = {
+            'Property': 'Value',
+            'attachments': '[]',
+            'availability_zone': 'nova',
+            'bootable': 'false',
+            'created_at': '2014-02-21T17:14:47.548361',
+            'display_description': 'None',
+            'display_name': 'NAME',
+            'id': 'ffdbd1bb-60dc-4d95-acfe-88774c09ad3e',
+            'metadata': '{}',
+            'size': '1',
+            'snapshot_id': 'None',
+            'source_volid': 'None',
+            'status': 'creating',
+            'volume_type': 'None'}
+        # the header row is expected as a pair too ('Property': 'Value')
+        vol_info = devstack.parse_os_table(table_str)
+        assert vol_info == expected
+
+
+
+
diff --git a/src/ceph/qa/tasks/tests/test_radosgw_admin.py b/src/ceph/qa/tasks/tests/test_radosgw_admin.py
new file mode 100644
index 0000000..59f3578
--- /dev/null
+++ b/src/ceph/qa/tasks/tests/test_radosgw_admin.py
@@ -0,0 +1,31 @@
+from mock import Mock
+
+from .. import radosgw_admin
+
+acl_with_version = """<?xml version="1.0" encoding="UTF-8"?><AccessControlPolicy xmlns="http://s3.amazonaws.com/doc/2006-03-01/"><Owner><ID>foo</ID><DisplayName>Foo</DisplayName></Owner><AccessControlList><Grant><Grantee xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:type="CanonicalUser"><ID>foo</ID><DisplayName>Foo</DisplayName></Grantee><Permission>FULL_CONTROL</Permission></Grant></AccessControlList></AccessControlPolicy>
+"""  # noqa
+
+
+acl_without_version = """<AccessControlPolicy xmlns="http://s3.amazonaws.com/doc/2006-03-01/"><Owner><ID>foo</ID><DisplayName>Foo</DisplayName></Owner><AccessControlList><Grant><Grantee xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:type="CanonicalUser"><ID>foo</ID><DisplayName>Foo</DisplayName></Grantee><Permission>FULL_CONTROL</Permission></Grant></AccessControlList></AccessControlPolicy>
+"""  # noqa
+
+
+class TestGetAcl(object):
+    """Unit tests for radosgw_admin.get_acl, fed by a mocked key."""
+    def setup(self):
+        self.key = Mock()
+
+    def test_removes_xml_version(self):
+        self.key.get_xml_acl = Mock(return_value=acl_with_version)
+        acl = radosgw_admin.get_acl(self.key)
+        assert acl.startswith('<AccessControlPolicy')
+
+    def test_xml_version_is_already_removed(self):
+        self.key.get_xml_acl = Mock(return_value=acl_without_version)
+        acl = radosgw_admin.get_acl(self.key)
+        assert acl.startswith('<AccessControlPolicy')
+
+    def test_newline_gets_trimmed(self):
+        self.key.get_xml_acl = Mock(return_value=acl_without_version)
+        acl = radosgw_admin.get_acl(self.key)
+        assert not acl.endswith('\n')
diff --git a/src/ceph/qa/tasks/teuthology_integration.py b/src/ceph/qa/tasks/teuthology_integration.py
new file mode 100644
index 0000000..b5a2278
--- /dev/null
+++ b/src/ceph/qa/tasks/teuthology_integration.py
@@ -0,0 +1,19 @@
+import logging
+from teuthology import misc
+from teuthology.task import Task
+
+log = logging.getLogger(__name__)
+
+
+class TeuthologyIntegration(Task):
+    """Task whose begin() installs tox and runs tox environments via misc.sh."""
+    def begin(self):
+        misc.sh("""
+        set -x
+        pip install tox
+        tox
+        # tox -e py27-integration
+        tox -e openstack-integration
+        """)
+
+task = TeuthologyIntegration
diff --git a/src/ceph/qa/tasks/tgt.py b/src/ceph/qa/tasks/tgt.py
new file mode 100644
index 0000000..c2b322e
--- /dev/null
+++ b/src/ceph/qa/tasks/tgt.py
@@ -0,0 +1,177 @@
+"""
+Task to handle tgt
+
+Assumptions made:
+    The ceph-extras tgt package may need to get installed.
+    The open-iscsi package needs to get installed.
+"""
+import logging
+import contextlib
+
+from teuthology import misc as teuthology
+from teuthology import contextutil
+
+log = logging.getLogger(__name__)
+
+
+@contextlib.contextmanager
+def start_tgt_remotes(ctx, start_tgtd):
+    """Create the 'iscsi-image' rbd image (ctx.config 'image_size', default
+    10240) and an iscsi target on each client in start_tgtd; undo on exit.
+    """
+    remotes = ctx.cluster.only(teuthology.is_type('client')).remotes
+    tgtd_list = []
+    for rem, roles in remotes.iteritems():
+        for _id in roles:
+            if _id in start_tgtd:
+                if rem not in tgtd_list:  # set up each remote only once
+                    tgtd_list.append(rem)
+                    size = ctx.config.get('image_size', 10240)
+                    rem.run(
+                        args=[
+                            'rbd',
+                            'create',
+                            'iscsi-image',
+                            '--size',
+                            str(size),
+                        ])
+                    rem.run(
+                        args=[
+                            'sudo',
+                            'tgtadm',
+                            '--lld',
+                            'iscsi',
+                            '--mode',
+                            'target',
+                            '--op',
+                            'new',
+                            '--tid',
+                            '1',
+                            '--targetname',
+                            'rbd',
+                        ])
+                    rem.run(
+                        args=[
+                            'sudo',
+                            'tgtadm',
+                            '--lld',
+                            'iscsi',
+                            '--mode',
+                            'logicalunit',
+                            '--op',
+                            'new',
+                            '--tid',
+                            '1',
+                            '--lun',
+                            '1',
+                            '--backing-store',
+                            'iscsi-image',
+                            '--bstype',
+                            'rbd',
+                        ])
+                    rem.run(
+                        args=[
+                            'sudo',
+                            'tgtadm',
+                            '--lld',
+                            'iscsi',
+                            '--op',
+                            'bind',
+                            '--mode',
+                            'target',
+                            '--tid',
+                            '1',
+                            '-I',
+                            'ALL',
+                        ])
+    try:
+        yield
+
+    finally:
+        for rem in tgtd_list:
+            rem.run(
+                args=[
+                    'sudo',
+                    'tgtadm',
+                    '--lld',
+                    'iscsi',
+                    '--mode',
+                    'target',
+                    '--op',
+                    'delete',
+                    '--force',
+                    '--tid',
+                    '1',
+                ])
+            rem.run(
+                args=[
+                    'rbd',
+                    'snap',
+                    'purge',
+                    'iscsi-image',
+                ])
+            rem.run(
+                args=[
+                    'sudo',
+                    'rbd',
+                    'rm',
+                    'iscsi-image',
+                ])
+
+
+@contextlib.contextmanager
+def task(ctx, config):
+    """
+    Start up tgt.
+
+    To start on all clients::
+
+        tasks:
+        - ceph:
+        - tgt:
+
+    To start on certain clients::
+
+        tasks:
+        - ceph:
+        - tgt: [client.0, client.3]
+
+    or
+
+        tasks:
+        - ceph:
+        - tgt:
+            client.0:
+            client.3:
+
+    An image size can also be specified::
+
+        tasks:
+        - ceph:
+        - tgt:
+            image_size: 20480
+
+    The general flow of things here is:
+        1. Find clients on which tgt is supposed to run (start_tgtd)
+        2. Remotely start up tgt daemon
+    On cleanup:
+        3. Stop tgt daemon
+
+    The iscsi administration is handled by the iscsi task.
+    """
+    if isinstance(config, dict):  # the list form has no items() to filter
+        config = {key: val for key, val in config.items()
+                  if key.startswith('client')}
+    # a dict config now holds only 'client*' keys (image_size is dropped here)
+    start_tgtd = []
+    remotes = ctx.cluster.only(teuthology.is_type('client')).remotes
+    log.info(remotes)
+    if not config:
+        start_tgtd = ['client.{id}'.format(id=id_)
+            for id_ in teuthology.all_roles_of_type(ctx.cluster, 'client')]
+    else:
+        start_tgtd = config
+    log.info(start_tgtd)
+    with contextutil.nested(
+            lambda: start_tgt_remotes(ctx=ctx, start_tgtd=start_tgtd),):
+        yield
diff --git a/src/ceph/qa/tasks/thrash_pool_snaps.py b/src/ceph/qa/tasks/thrash_pool_snaps.py
new file mode 100644
index 0000000..c71c9ce
--- /dev/null
+++ b/src/ceph/qa/tasks/thrash_pool_snaps.py
@@ -0,0 +1,61 @@
+"""
+Thrash pool snaps -- randomly create and remove pool snapshots.
+"""
+import contextlib
+import logging
+import gevent
+import time
+import random
+
+
+log = logging.getLogger(__name__)
+
+@contextlib.contextmanager
+def task(ctx, config):
+    """Thrash snap creation and removal on the listed pools.
+
+    Example:
+
+    thrash_pool_snaps:
+      pools: [.rgw.buckets, .rgw.buckets.index]
+      max_snaps: 10
+      min_snaps: 5
+      period: 10
+    """
+    stopping = False  # set on exit; do_thrash reads it through the closure
+    def do_thrash():
+        pools = config.get('pools', [])
+        max_snaps = config.get('max_snaps', 10)
+        min_snaps = config.get('min_snaps', 5)
+        period = config.get('period', 30)
+        snaps = []
+        manager = ctx.managers['ceph']
+        def remove_snap():
+            assert len(snaps) > 0
+            snap = random.choice(snaps)
+            log.info("Removing snap %s" % (snap,))
+            for pool in pools:
+                manager.remove_pool_snap(pool, str(snap))
+            snaps.remove(snap)
+        def add_snap(snap):
+            log.info("Adding snap %s" % (snap,))
+            for pool in pools:
+                manager.add_pool_snap(pool, str(snap))
+            snaps.append(snap)
+        index = 0
+        while not stopping:
+            index += 1
+            time.sleep(period)
+            if len(snaps) <= min_snaps:
+                add_snap(index)
+            elif len(snaps) >= max_snaps:
+                remove_snap()
+            else:
+                random.choice([lambda: add_snap(index), remove_snap])()
+        log.info("Stopping")
+    thread = gevent.spawn(do_thrash)
+    try:
+        yield
+    finally:  # also runs when the with-body raises, so the greenlet stops
+        stopping = True
+        thread.join()
diff --git a/src/ceph/qa/tasks/thrashosds-health.yaml b/src/ceph/qa/tasks/thrashosds-health.yaml
new file mode 100644
index 0000000..9defe69
--- /dev/null
+++ b/src/ceph/qa/tasks/thrashosds-health.yaml
@@ -0,0 +1,14 @@
+overrides:
+  ceph:
+    log-whitelist:
+      - overall HEALTH_
+      - \(OSDMAP_FLAGS\)
+      - \(OSD_
+      - \(PG_
+      - \(POOL_
+      - \(CACHE_POOL_
+      - \(SMALLER_PGP_NUM\)
+      - \(OBJECT_
+      - \(REQUEST_SLOW\)
+      - \(TOO_FEW_PGS\)
+      - \(MON_DOWN\)
diff --git a/src/ceph/qa/tasks/thrashosds.py b/src/ceph/qa/tasks/thrashosds.py
new file mode 100644
index 0000000..420b735
--- /dev/null
+++ b/src/ceph/qa/tasks/thrashosds.py
@@ -0,0 +1,204 @@
+"""
+Thrash -- Simulate random osd failures.
+"""
+import contextlib
+import logging
+import ceph_manager
+from teuthology import misc as teuthology
+
+
+log = logging.getLogger(__name__)
+
+@contextlib.contextmanager
+def task(ctx, config):
+    """
+    "Thrash" the OSDs by randomly marking them out/down (and then back
+    in) until the task is ended. This loops, and every op_delay
+    seconds it randomly chooses to add or remove an OSD (even odds)
+    unless there are fewer than min_out OSDs out of the cluster, or
+    more than min_in OSDs in the cluster.
+
+    All commands are run on mon0 and it stops when __exit__ is called.
+
+    The config is optional, and is a dict containing some or all of:
+
+    cluster: (default 'ceph') the name of the cluster to thrash
+
+    min_in: (default 4) the minimum number of OSDs to keep in the
+       cluster
+
+    min_out: (default 0) the minimum number of OSDs to keep out of the
+       cluster
+
+    op_delay: (5) the length of time to sleep between changing an
+       OSD's status
+
+    min_dead: (0) minimum number of osds to leave down/dead.
+
+    max_dead: (0) maximum number of osds to leave down/dead before waiting
+       for clean.  This should probably be num_replicas - 1.
+
+    clean_interval: (60) the approximate length of time to loop before
+       waiting until the cluster goes clean. (In reality this is used
+       to probabilistically choose when to wait, and the method used
+       makes it closer to -- but not identical to -- the half-life.)
+
+    scrub_interval: (-1) the approximate length of time to loop before
+       waiting until a scrub is performed while cleaning. (In reality
+       this is used to probabilistically choose when to wait, and it
+       only applies to the cases where cleaning is being performed).
+       -1 is used to indicate that no scrubbing will be done.
+
+    chance_down: (0.4) the probability that the thrasher will mark an
+       OSD down rather than marking it out. (The thrasher will not
+       consider that OSD out of the cluster, since presently an OSD
+       wrongly marked down will mark itself back up again.) This value
+       can be either an integer (eg, 75) or a float probability (eg
+       0.75).
+
+    chance_test_min_size: (0) chance to run test_pool_min_size,
+       which:
+       - kills all but one osd
+       - waits
+       - kills that osd
+       - revives all other osds
+       - verifies that the osds fully recover
+
+    timeout: (360) the number of seconds to wait for the cluster
+       to become clean after each cluster change. If this doesn't
+       happen within the timeout, an exception will be raised.
+
+    revive_timeout: (150) number of seconds to wait for an osd asok to
+       appear after attempting to revive the osd
+
+    thrash_primary_affinity: (true) randomly adjust primary-affinity
+
+    chance_pgnum_grow: (0) chance to increase a pool's size
+    chance_pgpnum_fix: (0) chance to adjust pgpnum to pg for a pool
+    pool_grow_by: (10) amount to increase pgnum by
+    max_pgs_per_pool_osd: (1200) don't expand pools past this size per osd
+
+    pause_short: (3) duration of short pause
+    pause_long: (80) duration of long pause
+    pause_check_after: (50) assert osd down after this long
+    chance_inject_pause_short: (1) chance of injecting short stall
+    chance_inject_pause_long: (0) chance of injecting long stall
+
+    clean_wait: (0) duration to wait before resuming thrashing once clean
+
+    sighup_delay: (0.1) duration to delay between sending signal.SIGHUP to a
+                  random live osd
+
+    powercycle: (false) whether to power cycle the node instead
+        of just the osd process. Note that this assumes that a single
+        osd is the only important process on the node.
+
+    bdev_inject_crash: (0) seconds to delay while inducing a synthetic crash.
+        the delay lets the BlockDevice "accept" more aio operations but blocks
+        any flush, and then eventually crashes (losing some or all ios).  If 0,
+        no bdev failure injection is enabled.
+
+    bdev_inject_crash_probability: (.5) probability of doing a bdev failure
+        injection crash vs a normal OSD kill.
+
+    chance_test_backfill_full: (0) chance to simulate full disks stopping
+        backfill
+
+    chance_test_map_discontinuity: (0) chance to test map discontinuity
+    map_discontinuity_sleep_time: (40) time to wait for map trims
+
+    ceph_objectstore_tool: (true) whether to export/import a pg while an osd is down
+    chance_move_pg: (1.0) chance of moving a pg if more than 1 osd is down (default 100%)
+
+    optrack_toggle_delay: (2.0) duration to delay between toggling op tracker
+                  enablement to all osds
+
+    dump_ops_enable: (true) continuously dump ops on all live osds
+
+    noscrub_toggle_delay: (2.0) duration to delay between toggling noscrub
+
+    disable_objectstore_tool_tests: (false) disable ceph_objectstore_tool based
+                                    tests
+
+    chance_thrash_cluster_full: .05
+
+    chance_thrash_pg_upmap: 1.0
+    chance_thrash_pg_upmap_items: 1.0
+
+    example:
+
+    tasks:
+    - ceph:
+    - thrashosds:
+        cluster: ceph
+        chance_down: 10
+        op_delay: 3
+        min_in: 1
+        timeout: 600
+    - interactive:
+    """
+    if config is None:
+        config = {}
+    assert isinstance(config, dict), \
+        'thrashosds task only accepts a dict for configuration'
+    # add default value for sighup_delay
+    config['sighup_delay'] = config.get('sighup_delay', 0.1)
+    # add default value for optrack_toggle_delay
+    config['optrack_toggle_delay'] = config.get('optrack_toggle_delay', 2.0)
+    # add default value for dump_ops_enable
+    config['dump_ops_enable'] = config.get('dump_ops_enable', "true")
+    # add default value for noscrub_toggle_delay
+    config['noscrub_toggle_delay'] = config.get('noscrub_toggle_delay', 2.0)
+    # add default value for random_eio
+    config['random_eio'] = config.get('random_eio', 0.0)
+
+    log.info("config is {config}".format(config=str(config)))
+
+    overrides = ctx.config.get('overrides', {})
+    log.info("overrides is {overrides}".format(overrides=str(overrides)))
+    teuthology.deep_merge(config, overrides.get('thrashosds', {}))
+    cluster = config.get('cluster', 'ceph')
+
+    log.info("config is {config}".format(config=str(config)))
+
+    if config.get('powercycle'):
+        # only when powercycle is enabled; 'powercycle: false' skips all this
+        # sync everyone first to avoid collateral damage to / etc.
+        log.info('Doing preliminary sync to avoid collateral damage...')
+        ctx.cluster.run(args=['sync'])
+
+        if 'ipmi_user' in ctx.teuthology_config:
+            for remote in ctx.cluster.remotes.keys():
+                log.debug('checking console status of %s' % remote.shortname)
+                if not remote.console.check_status():
+                    log.warning('Failed to get console status for %s',
+                                remote.shortname)
+
+            # check that all osd remotes have a valid console
+            osds = ctx.cluster.only(teuthology.is_type('osd', cluster))
+            for remote in osds.remotes.keys():
+                if not remote.console.has_ipmi_credentials:
+                    raise Exception(
+                        'IPMI console required for powercycling, '
+                        'but not available on osd role: {r}'.format(
+                            r=remote.name))
+
+    cluster_manager = ctx.managers[cluster]
+    for f in ['powercycle', 'bdev_inject_crash']:
+        if config.get(f):
+            cluster_manager.config[f] = config.get(f)
+
+    log.info('Beginning thrashosds...')
+    thrash_proc = ceph_manager.Thrasher(
+        cluster_manager,
+        config,
+        logger=log.getChild('thrasher')
+        )
+    try:
+        yield
+    finally:
+        log.info('joining thrashosds')
+        thrash_proc.do_join()
+        cluster_manager.wait_for_all_osds_up()
+        cluster_manager.flush_all_pg_stats()
+        cluster_manager.wait_for_recovery(config.get('timeout', 360))
diff --git a/src/ceph/qa/tasks/userdata_setup.yaml b/src/ceph/qa/tasks/userdata_setup.yaml
new file mode 100644
index 0000000..d39695b
--- /dev/null
+++ b/src/ceph/qa/tasks/userdata_setup.yaml
@@ -0,0 +1,25 @@
+#cloud-config-archive
+
+- type: text/cloud-config
+  content: |
+    output:
+      all: '| tee -a /var/log/cloud-init-output.log'
+
+# allow passwordless access for debugging
+- |
+  #!/bin/bash
+  exec passwd -d ubuntu
+
+- |
+  #!/bin/bash
+
+  # mount a NFS share for storing logs
+  apt-get update
+  apt-get -y install nfs-common
+  mkdir /mnt/log
+  # 10.0.2.2 is the host
+  mount -v -t nfs -o proto=tcp 10.0.2.2:{mnt_dir} /mnt/log
+
+  # mount the iso image that has the test script
+  mkdir /mnt/cdrom
+  mount -t auto /dev/cdrom /mnt/cdrom
diff --git a/src/ceph/qa/tasks/userdata_teardown.yaml b/src/ceph/qa/tasks/userdata_teardown.yaml
new file mode 100644
index 0000000..7f3d64f
--- /dev/null
+++ b/src/ceph/qa/tasks/userdata_teardown.yaml
@@ -0,0 +1,11 @@
+- |
+  #!/bin/bash
+  cp /var/log/cloud-init-output.log /mnt/log
+
+- |
+  #!/bin/bash
+  umount /mnt/log
+
+- |
+  #!/bin/bash
+  shutdown -h -P now
diff --git a/src/ceph/qa/tasks/util/__init__.py b/src/ceph/qa/tasks/util/__init__.py
new file mode 100644
index 0000000..5b8575e
--- /dev/null
+++ b/src/ceph/qa/tasks/util/__init__.py
@@ -0,0 +1,26 @@
+from teuthology import misc
+
+def get_remote(ctx, cluster, service_type, service_id):
+    """
+    Look up the Remote on which one particular role runs.
+
+    :param ctx: context whose ``cluster`` is searched
+    :param cluster: name of the cluster the service is part of
+    :param service_type: e.g. 'mds', 'osd', 'client'
+    :param service_id: third part of a role, e.g. '0' in 'ceph.client.0'
+    :return: the Remote instance on which the requested role is placed
+    :raises KeyError: when the lookup does not yield exactly one remote
+    """
+    wanted = (cluster, service_type, str(service_id))
+    def _matches(role):
+        return misc.split_role(role) == wanted
+    try:
+        (remote,) = ctx.cluster.only(_matches).remotes.keys()
+    except ValueError:
+        message = "Service {0}.{1}.{2} not found".format(
+            cluster, service_type, service_id)
+        raise KeyError(message)
+    return remote
+
+def get_remote_for_role(ctx, role):  # role is split by misc.split_role
+    return get_remote(ctx, *misc.split_role(role))
diff --git a/src/ceph/qa/tasks/util/rados.py b/src/ceph/qa/tasks/util/rados.py
new file mode 100644
index 0000000..a83f9e1
--- /dev/null
+++ b/src/ceph/qa/tasks/util/rados.py
@@ -0,0 +1,87 @@
+import logging
+
+from teuthology import misc as teuthology
+
+log = logging.getLogger(__name__)
+
+def rados(ctx, remote, cmd, wait=True, check_status=False):
+    """
+    Run ``rados <cmd>`` on remote under adjust-ulimits and ceph-coverage.
+
+    :param cmd: list of arguments appended after 'rados'
+    :returns: the exit status when wait is true, otherwise whatever
+              remote.run() returned
+    """
+    coverage_dir = '{tdir}/archive/coverage'.format(
+        tdir=teuthology.get_testdir(ctx))
+    log.info("rados %s" % ' '.join(cmd))
+    args = ['adjust-ulimits', 'ceph-coverage', coverage_dir, 'rados']
+    args.extend(cmd)
+    proc = remote.run(
+        args=args, check_status=check_status, wait=wait)
+    if not wait:
+        # not waited for: hand back what remote.run() returned
+        return proc
+    return proc.exitstatus
+
+def create_ec_pool(remote, name, profile_name, pgnum, profile=None, cluster_name="ceph", application=None):
+    """Set the erasure code profile, create the erasure pool, optionally enable an application."""
+    cluster_args = ['--cluster', cluster_name]
+    remote.run(args=['sudo', 'ceph'] +
+               cmd_erasure_code_profile(profile_name, profile or {}) + cluster_args)
+    remote.run(args=['sudo', 'ceph', 'osd', 'pool', 'create', name,
+                     str(pgnum), str(pgnum), 'erasure', profile_name] + cluster_args)
+    if application:
+        remote.run(args=[
+            'sudo', 'ceph', 'osd', 'pool', 'application', 'enable', name, application
+        ] + cluster_args, check_status=False) # may fail as EINVAL when run in jewel upgrade test
+
+def create_replicated_pool(remote, name, pgnum, cluster_name="ceph", application=None):
+    """Run 'ceph osd pool create' for name, then optionally 'application enable'."""
+    cluster_args = ['--cluster', cluster_name]
+    pool_cmd = ['sudo', 'ceph', 'osd', 'pool']
+    remote.run(args=pool_cmd + ['create', name, str(pgnum), str(pgnum)] + cluster_args)
+    if application:
+        enable = ['application', 'enable', name, application]
+        remote.run(args=pool_cmd + enable + cluster_args, check_status=False)
+
+def create_cache_pool(remote, base_name, cache_name, pgnum, size, cluster_name="ceph"):
+    """Create cache_name and attach it to base_name via 'osd tier add-cache'."""
+    cluster_args = ['--cluster', cluster_name]
+    ceph_osd = ['sudo', 'ceph', 'osd']
+    remote.run(args=ceph_osd + ['pool', 'create', cache_name, str(pgnum)] + cluster_args)
+    remote.run(args=ceph_osd + [
+        'tier', 'add-cache', base_name, cache_name, str(size),
+    ] + cluster_args)
+
+def cmd_erasure_code_profile(profile_name, profile):
+    """
+    Return the shell command to run to create the erasure code profile
+    described by the profile parameter.
+
+    :param profile_name: a string matching [A-Za-z0-9-_.]+
+    :param profile: a map whose semantic depends on the erasure code plugin
+    :returns: a shell command as an array suitable for Remote.run
+
+    If profile is empty ({} or None), it is replaced with
+
+      { 'k': '2', 'm': '1', 'crush-failure-domain': 'osd'}
+
+    for backward compatibility. In previous versions of teuthology,
+    these values were hardcoded as function arguments and some yaml
+    files were designed with these implicit values. The teuthology
+    code should not know anything about the erasure code profile
+    content or semantic. The valid values and parameters are outside
+    its scope.
+    """
+
+    if not profile:
+        profile = {
+            'k': '2',
+            'm': '1',
+            'crush-failure-domain': 'osd'
+        }
+    return [
+        'osd', 'erasure-code-profile', 'set',
+        profile_name
+        ] + [ str(key) + '=' + str(value) for key, value in profile.items() ]
diff --git a/src/ceph/qa/tasks/util/rgw.py b/src/ceph/qa/tasks/util/rgw.py
new file mode 100644
index 0000000..ab76b50
--- /dev/null
+++ b/src/ceph/qa/tasks/util/rgw.py
@@ -0,0 +1,81 @@
+from cStringIO import StringIO
+import logging
+import json
+import requests
+
+from requests.packages.urllib3 import PoolManager
+from requests.packages.urllib3.util import Retry
+from urlparse import urlparse
+
+from teuthology.orchestra.connection import split_user
+from teuthology import misc as teuthology
+
+log = logging.getLogger(__name__)
+
+def rgwadmin(ctx, client, cmd, stdin=None, check_status=False,
+             format='json', decode=True, log_level=logging.DEBUG):
+    """
+    Run ``radosgw-admin`` on the remote that hosts ``client``.
+
+    :param ctx: teuthology context; used for the test directory and to
+                find the remote holding the ``client`` role
+    :param client: role string, split with ``teuthology.split_role``
+    :param cmd: list of extra arguments appended to the radosgw-admin
+                command line
+    :param stdin: file-like object handed to ``remote.run`` as stdin; when
+                  omitted (None) a fresh empty StringIO is used
+    :param check_status: forwarded to ``remote.run``
+    :param format: value of the ``--format`` option
+    :param decode: when False, return the raw stdout without JSON decoding
+    :param log_level: level used to log the command line and its result
+    :returns: ``(exitstatus, out)`` when ``decode`` is False. Otherwise
+              ``(exitstatus, j)`` where ``j`` is the decoded JSON, or the
+              raw output if it is not valid JSON, or None if the command
+              failed or printed nothing.
+    """
+    # A StringIO() default would be created once at import time and shared
+    # by every call; build a private one per call instead.
+    if stdin is None:
+        stdin = StringIO()
+    log.info('rgwadmin: {client} : {cmd}'.format(client=client,cmd=cmd))
+    testdir = teuthology.get_testdir(ctx)
+    cluster_name, daemon_type, client_id = teuthology.split_role(client)
+    client_with_id = daemon_type + '.' + client_id
+    pre = [
+        'adjust-ulimits',
+        'ceph-coverage',
+        '{tdir}/archive/coverage'.format(tdir=testdir),
+        'radosgw-admin',
+        '--log-to-stderr',
+        '--format', format,
+        '-n',  client_with_id,
+        '--cluster', cluster_name,
+        ]
+    pre.extend(cmd)
+    log.log(log_level, 'rgwadmin: cmd=%s' % pre)
+    # Exactly one remote is expected to carry this role; the tuple
+    # unpacking fails loudly otherwise.
+    (remote,) = ctx.cluster.only(client).remotes.iterkeys()
+    proc = remote.run(
+        args=pre,
+        check_status=check_status,
+        stdout=StringIO(),
+        stderr=StringIO(),
+        stdin=stdin,
+        )
+    r = proc.exitstatus
+    out = proc.stdout.getvalue()
+    if not decode:
+        return (r, out)
+    j = None
+    if not r and out != '':
+        try:
+            j = json.loads(out)
+            log.log(log_level, ' json result: %s' % j)
+        except ValueError:
+            # Not JSON: hand the raw text back to the caller.
+            j = out
+            log.log(log_level, ' raw result: %s' % j)
+    return (r, j)
+
+def get_user_summary(out, user):
+    """
+    Return the entry of ``out['summary']`` whose ``'user'`` equals ``user``.
+
+    When several entries match, the last one is returned.
+
+    :raises AssertionError: if no (non-empty) entry matches
+    """
+    matching = [entry for entry in out['summary'] if entry.get('user') == user]
+    found = matching[-1] if matching else None
+
+    if found:
+        return found
+
+    raise AssertionError('No summary info found for user: %s' % user)
+
+def get_user_successful_ops(out, user):
+    """
+    Return ``total.successful_ops`` from the summary entry of ``user``.
+
+    An empty ``out['summary']`` yields 0; otherwise the lookup is delegated
+    to get_user_summary(), which raises AssertionError for an unknown user.
+    """
+    if len(out['summary']) > 0:
+        user_summary = get_user_summary(out, user)
+        return user_summary['total']['successful_ops']
+    return 0
+
+def wait_for_radosgw(url):
+    """
+    Block until ``url`` accepts a connection.
+
+    add_daemon() returns before radosgw has finished starting, so callers
+    use this to avoid racing with later tasks that need radosgw to be up
+    and listening.
+    """
+    # Connection attempts are retried (up to 8 connect retries with a
+    # backoff factor of 1); the response to the GET itself is discarded.
+    retry_policy = Retry(connect=8, backoff_factor=1)
+    pool = PoolManager(retries=retry_policy)
+    pool.request('GET', url)
diff --git a/src/ceph/qa/tasks/util/test/__init__.py b/src/ceph/qa/tasks/util/test/__init__.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/src/ceph/qa/tasks/util/test/__init__.py
diff --git a/src/ceph/qa/tasks/util/test/test_rados.py b/src/ceph/qa/tasks/util/test/test_rados.py
new file mode 100644
index 0000000..ee1cfa6
--- /dev/null
+++ b/src/ceph/qa/tasks/util/test/test_rados.py
@@ -0,0 +1,40 @@
+#
+#  The MIT License
+#
+# Copyright (C) 2014 Cloudwatt <libre.licensing@cloudwatt.com>
+#
+# Author: Loic Dachary <loic@dachary.org>
+#
+#  Permission is hereby granted, free of charge, to any person
+#  obtaining a copy of this software and associated documentation
+#  files (the "Software"), to deal in the Software without
+#  restriction, including without limitation the rights to use,
+#  copy, modify, merge, publish, distribute, sublicense, and/or sell
+#  copies of the Software, and to permit persons to whom the
+#  Software is furnished to do so, subject to the following
+#  conditions:
+#
+#  The above copyright notice and this permission notice shall be
+#  included in all copies or substantial portions of the Software.
+#
+#  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+#  EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+#  OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+#  NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+#  HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+#  WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+#  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+#  OTHER DEALINGS IN THE SOFTWARE.
+#
+from .. import rados
+
+class TestRados(object):
+    """Tests for the helpers of the sibling ``rados`` module."""
+
+    def test_cmd_erasure_code_profile(self):
+        """
+        An empty profile falls back to k=2, an explicit profile is passed
+        through, and the profile name is part of the command either way.
+        """
+        name = 'NAME'
+        cases = (
+            ({}, 'k=2'),
+            ({ 'k': '88' }, 'k=88'),
+        )
+        for profile, expected_arg in cases:
+            cmd = rados.cmd_erasure_code_profile(name, profile)
+            assert expected_arg in cmd
+            assert name in cmd
diff --git a/src/ceph/qa/tasks/vstart_runner.py b/src/ceph/qa/tasks/vstart_runner.py
new file mode 100644
index 0000000..842e80d
--- /dev/null
+++ b/src/ceph/qa/tasks/vstart_runner.py
@@ -0,0 +1,1079 @@
+"""
+vstart_runner: override Filesystem and Mount interfaces to run a CephFSTestCase against a vstart
+ceph instance instead of a packaged/installed cluster.  Use this to turn around test cases
+quickly during development.
+
+Simple usage (assuming teuthology and ceph checked out in ~/git):
+
+    # Activate the teuthology virtualenv
+    source ~/git/teuthology/virtualenv/bin/activate
+    # Go into your ceph build directory
+    cd ~/git/ceph/build
+    # Invoke a test using this script
+    python ~/git/ceph/qa/tasks/vstart_runner.py --create tasks.cephfs.test_data_scan
+
+Alternative usage:
+
+    # Alternatively, if you use different paths, specify them as follows:
+    LD_LIBRARY_PATH=`pwd`/lib PYTHONPATH=~/git/teuthology:~/git/ceph/qa:`pwd`/../src/pybind:`pwd`/lib/cython_modules/lib.2 python ~/git/ceph/qa/tasks/vstart_runner.py
+
+    # If you wish to drop to a python shell on failures, use --interactive:
+    python ~/git/ceph/qa/tasks/vstart_runner.py --interactive
+
+    # If you wish to run a named test case, pass it as an argument:
+    python ~/git/ceph/qa/tasks/vstart_runner.py tasks.cephfs.test_data_scan
+
+"""
+
+from StringIO import StringIO
+from collections import defaultdict
+import getpass
+import signal
+import tempfile
+import threading
+import datetime
+import shutil
+import re
+import os
+import time
+import json
+import sys
+import errno
+from unittest import suite, loader
+import unittest
+import platform
+from teuthology.orchestra.run import Raw, quote
+from teuthology.orchestra.daemon import DaemonGroup
+from teuthology.config import config as teuth_config
+
+import logging
+
+# Module-level logger used throughout this file.
+log = logging.getLogger(__name__)
+
+# Also write this logger's messages to ./vstart_runner.log (relative to the
+# current working directory), with millisecond-resolution timestamps.
+handler = logging.FileHandler("./vstart_runner.log")
+formatter = logging.Formatter(
+    fmt=u'%(asctime)s.%(msecs)03d %(levelname)s:%(name)s:%(message)s',
+    datefmt='%Y-%m-%dT%H:%M:%S')
+handler.setFormatter(formatter)
+log.addHandler(handler)
+log.setLevel(logging.INFO)
+
+
+def respawn_in_path(lib_path, python_paths):
+    """
+    Make sure ``lib_path`` is on the dynamic library search path and that
+    ``python_paths`` are importable.
+
+    The library path variable (DYLD_LIBRARY_PATH on Darwin, otherwise
+    LD_LIBRARY_PATH) is updated in ``os.environ`` when ``lib_path`` is not
+    already one of its entries, and the interpreter is then re-executed
+    with the original ``sys.argv``; in that case this function does not
+    return. The interpreter binary comes from the ``PYTHON`` environment
+    variable, defaulting to ``python``.
+
+    When no respawn is needed, each entry of ``python_paths`` is inserted
+    at the front of ``sys.path``.
+
+    :param lib_path: directory holding the shared libraries
+    :param python_paths: iterable of directories to prepend to sys.path
+    """
+    execv_cmd = ['python']
+    if platform.system() == "Darwin":
+        lib_path_var = "DYLD_LIBRARY_PATH"
+    else:
+        lib_path_var = "LD_LIBRARY_PATH"
+
+    py_binary = os.environ.get("PYTHON", "python")
+
+    current = os.environ.get(lib_path_var)
+    if current is None:
+        os.environ[lib_path_var] = lib_path
+        os.execvp(py_binary, execv_cmd + sys.argv)
+    elif lib_path not in current.split(':'):
+        # Compare against the individual entries: a plain substring test
+        # would treat "/x/lib/" as present when the variable only holds
+        # e.g. "/x/lib/cython_modules" and wrongly skip the respawn.
+        os.environ[lib_path_var] = current + ':' + lib_path
+        os.execvp(py_binary, execv_cmd + sys.argv)
+
+    for p in python_paths:
+        sys.path.insert(0, p)
+
+
+# Let's use some sensible defaults
+# When the current directory looks like a cmake build directory (it has both
+# CMakeCache.txt and bin/), guess the library and python paths and hand them
+# to respawn_in_path(), which may re-exec the interpreter.
+if os.path.exists("./CMakeCache.txt") and os.path.exists("./bin"):
+
+    # A list of candidate paths for each package we need
+    guesses = [
+        ["~/git/teuthology", "~/scm/teuthology", "~/teuthology"],
+        ["lib/cython_modules/lib.2"],
+        ["../src/pybind"],
+    ]
+
+    python_paths = []
+
+    # Up one level so that "tasks.foo.bar" imports work
+    python_paths.append(os.path.abspath(
+        os.path.join(os.path.dirname(os.path.realpath(__file__)), "..")
+    ))
+
+    # Every candidate that exists is added, not only the first hit of each
+    # group.
+    for package_guesses in guesses:
+        for g in package_guesses:
+            g_exp = os.path.abspath(os.path.expanduser(g))
+            if os.path.exists(g_exp):
+                python_paths.append(g_exp)
+
+    # Shared libraries are expected in lib/ under the current directory.
+    ld_path = os.path.join(os.getcwd(), "lib/")
+    print "Using guessed paths {0} {1}".format(ld_path, python_paths)
+    respawn_in_path(ld_path, python_paths)
+
+
+try:
+    from teuthology.exceptions import CommandFailedError
+    from tasks.ceph_manager import CephManager
+    from tasks.cephfs.fuse_mount import FuseMount
+    from tasks.cephfs.filesystem import Filesystem, MDSCluster, CephCluster
+    from mgr.mgr_test_case import MgrCluster
+    from teuthology.contextutil import MaxWhileTries
+    from teuthology.task import interactive
+except ImportError:
+    sys.stderr.write("***\nError importing packages, have you activated your teuthology virtualenv "
+                     "and set PYTHONPATH to point to teuthology and ceph-qa-suite?\n***\n\n")
+    raise
+
+# Must import after teuthology because of gevent monkey patching
+import subprocess
+
+# BIN_PREFIX is prepended to the names of the ceph binaries this module runs;
+# SRC_PREFIX points at the source tree. Both are relative to the current
+# working directory and depend on the kind of build detected there.
+if os.path.exists("./CMakeCache.txt"):
+    # Running in build dir of a cmake build
+    BIN_PREFIX = "./bin/"
+    SRC_PREFIX = "../src"
+else:
+    # Running in src/ of an autotools build
+    BIN_PREFIX = "./"
+    SRC_PREFIX = "./"
+
+
+class LocalRemoteProcess(object):
+    """
+    Wrapper around a ``subprocess.Popen`` object that collects the child's
+    output into file-like objects and records its exit status.
+
+    ``exitstatus`` and ``returncode`` both stay None until the child has
+    been reaped by wait() or by reading ``finished``.
+    """
+    def __init__(self, args, subproc, check_status, stdout, stderr):
+        """
+        :param args: the command that was run, kept for log and error messages
+        :param subproc: the Popen object of the running child
+        :param check_status: whether wait() raises on a non-zero exit status
+        :param stdout: file-like object receiving the child's stdout, or
+                       None to create a StringIO
+        :param stderr: same as ``stdout``, for the child's stderr
+        """
+        self.args = args
+        self.subproc = subproc
+        if stdout is None:
+            self.stdout = StringIO()
+        else:
+            self.stdout = stdout
+
+        if stderr is None:
+            self.stderr = StringIO()
+        else:
+            self.stderr = stderr
+
+        self.check_status = check_status
+        self.exitstatus = self.returncode = None
+
+    def wait(self):
+        """
+        Wait for the child to exit and collect its output.
+
+        :raises CommandFailedError: if ``check_status`` is set and the exit
+                                    status is non-zero
+        """
+        if self.finished:
+            # Avoid calling communicate() on a dead process because it'll
+            # give you stick about std* already being closed
+            #
+            # Honour check_status here as well, so that a child which has
+            # already exited behaves like one we had to wait for below.
+            if self.check_status and self.exitstatus != 0:
+                raise CommandFailedError(self.args, self.exitstatus)
+            else:
+                return
+
+        out, err = self.subproc.communicate()
+        self.stdout.write(out)
+        self.stderr.write(err)
+
+        self.exitstatus = self.returncode = self.subproc.returncode
+
+        if self.exitstatus != 0:
+            # Echo the output of failed commands to our own stderr
+            sys.stderr.write(out)
+            sys.stderr.write(err)
+
+        if self.check_status and self.exitstatus != 0:
+            raise CommandFailedError(self.args, self.exitstatus)
+
+    @property
+    def finished(self):
+        """
+        True once the child has exited.
+
+        The first time the exit is noticed, the remaining output is
+        collected and the exit status recorded.
+        """
+        if self.exitstatus is not None:
+            return True
+
+        if self.subproc.poll() is not None:
+            out, err = self.subproc.communicate()
+            self.stdout.write(out)
+            self.stderr.write(err)
+            self.exitstatus = self.returncode = self.subproc.returncode
+            return True
+        else:
+            return False
+
+    def kill(self):
+        """Send SIGKILL (through safe_kill) to the child if it is still running."""
+        log.info("kill ")
+        if self.subproc.pid and not self.finished:
+            log.info("kill: killing pid {0} ({1})".format(
+                self.subproc.pid, self.args))
+            safe_kill(self.subproc.pid)
+        else:
+            log.info("kill: already terminated ({0})".format(self.args))
+
+    @property
+    def stdin(self):
+        """
+        Return a stand-in for the process stdin whose close() kills the
+        process.
+        """
+        class FakeStdIn(object):
+            def __init__(self, mount_daemon):
+                self.mount_daemon = mount_daemon
+
+            def close(self):
+                self.mount_daemon.kill()
+
+        return FakeStdIn(self)
+
+
+class LocalRemote(object):
+    """
+    Amusingly named class to present the teuthology RemoteProcess interface when we are really
+    running things locally for vstart
+
+    Run this inside your src/ dir!
+    """
+
+    def __init__(self):
+        self.name = "local"
+        self.hostname = "localhost"
+        self.user = getpass.getuser()
+
+    def get_file(self, path, sudo, dest_dir):
+        """
+        Copy ``path`` to a newly created temporary file and return the
+        temporary file's name. ``sudo`` and ``dest_dir`` are ignored.
+        """
+        # Close the handle straight away: only the name is needed, and
+        # leaving it open would leak a file descriptor per call.
+        tmp = tempfile.NamedTemporaryFile(delete=False)
+        tmp.close()
+        shutil.copy(path, tmp.name)
+        return tmp.name
+
+    def put_file(self, src, dst, sudo=False):
+        """Copy ``src`` to ``dst``; ``sudo`` is ignored."""
+        shutil.copy(src, dst)
+
+    def run(self, args, check_status=True, wait=True,
+            stdout=None, stderr=None, cwd=None, stdin=None,
+            logger=None, label=None, env=None):
+        """
+        Run a command locally and return a LocalRemoteProcess for it.
+
+        :param args: list of strings and/or ``Raw`` items; any "sudo"
+                     element is dropped
+        :param check_status: raise on non-zero exit when waiting
+        :param wait: wait for the command to finish before returning
+        :param stdout: file-like object receiving stdout (optional)
+        :param stderr: file-like object receiving stderr (optional)
+        :param cwd: working directory for the command
+        :param stdin: string written to the command's stdin (optional)
+        :param logger: accepted for interface compatibility, unused
+        :param label: accepted for interface compatibility, unused
+        :param env: environment passed to ``subprocess.Popen``
+        :raises RuntimeError: on a non-string argument (when no ``Raw`` is
+                              present) or a non-string ``stdin``
+        """
+        log.info("run args={0}".format(args))
+
+        # We don't need no stinkin' sudo
+        args = [a for a in args if a != "sudo"]
+
+        # We have to use shell=True if any run.Raw was present, e.g. &&
+        shell = any(isinstance(a, Raw) for a in args)
+
+        if shell:
+            # Strip the teuthology wrapper commands: 'adjust-ulimits' alone,
+            # 'ceph-coverage' and 'timeout' together with their one argument.
+            filtered = []
+            i = 0
+            while i < len(args):
+                if args[i] == 'adjust-ulimits':
+                    i += 1
+                elif args[i] == 'ceph-coverage':
+                    i += 2
+                elif args[i] == 'timeout':
+                    i += 2
+                else:
+                    filtered.append(args[i])
+                    i += 1
+
+            args = quote(filtered)
+            log.info("Running {0}".format(args))
+
+            # env is passed here too, so that it is honoured for shell
+            # commands exactly as it is for plain argument lists below.
+            subproc = subprocess.Popen(args,
+                                       stdout=subprocess.PIPE,
+                                       stderr=subprocess.PIPE,
+                                       stdin=subprocess.PIPE,
+                                       cwd=cwd,
+                                       env=env,
+                                       shell=True)
+        else:
+            log.info("Running {0}".format(args))
+
+            for arg in args:
+                if not isinstance(arg, basestring):
+                    raise RuntimeError("Oops, can't handle arg {0} type {1}".format(
+                        arg, arg.__class__
+                    ))
+
+            subproc = subprocess.Popen(args,
+                                       stdout=subprocess.PIPE,
+                                       stderr=subprocess.PIPE,
+                                       stdin=subprocess.PIPE,
+                                       cwd=cwd,
+                                       env=env)
+
+        if stdin:
+            if not isinstance(stdin, basestring):
+                raise RuntimeError("Can't handle non-string stdins on a vstart cluster")
+
+            # Hack: writing to stdin is not deadlock-safe, but it "always" works
+            # as long as the input buffer is "small"
+            subproc.stdin.write(stdin)
+
+        proc = LocalRemoteProcess(
+            args, subproc, check_status,
+            stdout, stderr
+        )
+
+        if wait:
+            proc.wait()
+
+        return proc
+
+
+class LocalDaemon(object):
+    """
+    Handle on a locally running ``ceph-<daemon_type> -i <daemon_id>``
+    process, located by scanning the output of ``ps``.
+    """
+    def __init__(self, daemon_type, daemon_id):
+        self.daemon_type = daemon_type
+        self.daemon_id = daemon_id
+        self.controller = LocalRemote()
+        self.proc = None
+
+    @property
+    def remote(self):
+        return LocalRemote()
+
+    def running(self):
+        """Return True if a matching process is currently listed by ps."""
+        return self._get_pid() is not None
+
+    def _get_pid(self):
+        """
+        Return PID as an integer or None if not found
+        """
+        ps_txt = self.controller.run(
+            args=["ps", "ww", "-u"+str(os.getuid())]
+        ).stdout.getvalue().strip()
+        # The first line of the ps output is the column header
+        lines = ps_txt.split("\n")[1:]
+
+        # The ID must be followed by whitespace or the end of the line,
+        # otherwise daemon "1" would also match the command line of "10".
+        pattern = re.compile(r"ceph-{0} -i {1}(?:\s|$)".format(
+            re.escape(str(self.daemon_type)), re.escape(str(self.daemon_id))
+        ))
+
+        for line in lines:
+            if pattern.search(line):
+                log.info("Found ps line for daemon: {0}".format(line))
+                return int(line.split()[0])
+        log.info("No match for {0} {1}: {2}".format(
+            self.daemon_type, self.daemon_id, ps_txt
+            ))
+        return None
+
+    def wait(self, timeout):
+        """
+        Poll once a second until the daemon is no longer listed by ps.
+
+        :raises MaxWhileTries: after more than ``timeout`` polls
+        """
+        waited = 0
+        while self._get_pid() is not None:
+            if waited > timeout:
+                raise MaxWhileTries("Timed out waiting for daemon {0}.{1}".format(self.daemon_type, self.daemon_id))
+            time.sleep(1)
+            waited += 1
+
+    def stop(self, timeout=300):
+        """
+        SIGKILL the daemon and wait until it has disappeared from ps.
+
+        If a process with a different PID shows up under the same command
+        line while waiting, it is killed as well.
+
+        :raises MaxWhileTries: if the daemon is still listed after
+                               ``timeout`` polls
+        """
+        # Look the PID up once: asking running() first and the PID second
+        # leaves a window in which the daemon can exit and pid become None.
+        pid = self._get_pid()
+        if pid is None:
+            log.error('tried to stop a non-running daemon')
+            return
+
+        log.info("Killing PID {0} for {1}.{2}".format(pid, self.daemon_type, self.daemon_id))
+        # safe_kill tolerates the process having exited since the ps call
+        safe_kill(pid)
+
+        waited = 0
+        while pid is not None:
+            new_pid = self._get_pid()
+            if new_pid is not None and new_pid != pid:
+                log.info("Killing new PID {0}".format(new_pid))
+                pid = new_pid
+                safe_kill(pid)
+
+            if new_pid is None:
+                break
+            else:
+                if waited > timeout:
+                    raise MaxWhileTries(
+                        "Timed out waiting for daemon {0}.{1}".format(
+                            self.daemon_type, self.daemon_id))
+                time.sleep(1)
+                waited += 1
+
+        self.wait(timeout=timeout)
+
+    def restart(self):
+        """Stop the daemon if it is running, then launch it again."""
+        if self._get_pid() is not None:
+            self.stop()
+
+        self.proc = self.controller.run([os.path.join(BIN_PREFIX, "./ceph-{0}".format(self.daemon_type)), "-i", self.daemon_id])
+
+
+def safe_kill(pid):
+    """
+    SIGKILL ``pid``, treating "no such process" as success.
+
+    os.kill raises when the target has already gone away; that one case
+    (ESRCH) is swallowed here, any other OSError is propagated.
+    """
+    try:
+        result = os.kill(pid, signal.SIGKILL)
+    except OSError as err:
+        if err.errno != errno.ESRCH:
+            raise
+        # Raced with process termination
+        return None
+    return result
+
+
+class LocalFuseMount(FuseMount):
+    """
+    FuseMount variant that runs ceph-fuse from BIN_PREFIX on the local
+    machine through a LocalRemote.
+    """
+    def __init__(self, test_dir, client_id):
+        # NOTE(review): the meaning of the leading None is defined by
+        # FuseMount.__init__, which is not visible here -- confirm.
+        super(LocalFuseMount, self).__init__(None, test_dir, client_id, LocalRemote())
+
+    @property
+    def config_path(self):
+        """Path of the ceph.conf, relative to the current directory."""
+        return "./ceph.conf"
+
+    def get_keyring_path(self):
+        """Return the absolute path of this client's keyring file."""
+        # This is going to end up in a config file, so use an absolute path
+        # to avoid assumptions about daemons' pwd
+        return os.path.abspath("./client.{0}.keyring".format(self.client_id))
+
+    def run_shell(self, args, wait=True):
+        """Run ``args`` with the mountpoint as working directory."""
+        # FIXME maybe should add a pwd arg to teuthology.orchestra so that
+        # the "cd foo && bar" shenanigans isn't needed to begin with and
+        # then we wouldn't have to special case this
+        return self.client_remote.run(
+            args, wait=wait, cwd=self.mountpoint
+        )
+
+    @property
+    def _prefix(self):
+        return BIN_PREFIX
+
+    def _asok_path(self):
+        """Return the admin socket path of the ceph-fuse process."""
+        # In teuthology, the asok is named after the PID of the ceph-fuse process, because it's
+        # run foreground.  When running it daemonized however, the asok is named after
+        # the PID of the launching process, not the long running ceph-fuse process.  Therefore
+        # we need to give an exact path here as the logic for checking /proc/ for which
+        # asok is alive does not work.
+        path = "./out/client.{0}.{1}.asok".format(self.client_id, self.fuse_daemon.subproc.pid)
+        log.info("I think my launching pid was {0}".format(self.fuse_daemon.subproc.pid))
+        return path
+
+    def umount(self):
+        """Unmount through the parent class, but only if currently mounted."""
+        if self.is_mounted():
+            super(LocalFuseMount, self).umount()
+
+    def mount(self, mount_path=None, mount_fs_name=None):
+        """
+        Create the mountpoint directory, start ceph-fuse on it and wait for
+        the new fuse connection to show up in /sys/fs/fuse/connections.
+
+        :param mount_path: if given, passed as ``--client_mountpoint``
+        :param mount_fs_name: if given, passed as ``--client_mds_namespace``
+        :raises RuntimeError: if no new connection appears within about 30
+                              seconds, or if the number of new connections
+                              is not exactly one
+        """
+        self.client_remote.run(
+            args=[
+                'mkdir',
+                '--',
+                self.mountpoint,
+            ],
+        )
+
+        def list_connections():
+            # Return the entries of /sys/fs/fuse/connections as a list of
+            # ints; an ls failure is treated as "no connections".
+            self.client_remote.run(
+                args=["mount", "-t", "fusectl", "/sys/fs/fuse/connections", "/sys/fs/fuse/connections"],
+                check_status=False
+            )
+            p = self.client_remote.run(
+                args=["ls", "/sys/fs/fuse/connections"],
+                check_status=False
+            )
+            if p.exitstatus != 0:
+                log.warn("ls conns failed with {0}, assuming none".format(p.exitstatus))
+                return []
+
+            ls_str = p.stdout.getvalue().strip()
+            if ls_str:
+                return [int(n) for n in ls_str.split("\n")]
+            else:
+                return []
+
+        # Before starting ceph-fuse process, note the contents of
+        # /sys/fs/fuse/connections
+        pre_mount_conns = list_connections()
+        log.info("Pre-mount connections: {0}".format(pre_mount_conns))
+
+        prefix = [os.path.join(BIN_PREFIX, "ceph-fuse")]
+        if os.getuid() != 0:
+            prefix += ["--client-die-on-failed-remount=false"]
+
+        if mount_path is not None:
+            prefix += ["--client_mountpoint={0}".format(mount_path)]
+
+        if mount_fs_name is not None:
+            prefix += ["--client_mds_namespace={0}".format(mount_fs_name)]
+
+        # wait=False: ceph-fuse is started with -f and we do not block on it
+        self.fuse_daemon = self.client_remote.run(args=
+                                            prefix + [
+                                                "-f",
+                                                "--name",
+                                                "client.{0}".format(self.client_id),
+                                                self.mountpoint
+                                            ], wait=False)
+
+        log.info("Mounting client.{0} with pid {1}".format(self.client_id, self.fuse_daemon.subproc.pid))
+
+        # Wait for the connection reference to appear in /sys
+        waited = 0
+        post_mount_conns = list_connections()
+        while len(post_mount_conns) <= len(pre_mount_conns):
+            if self.fuse_daemon.finished:
+                # Did mount fail?  Raise the CommandFailedError instead of
+                # hitting the "failed to populate /sys/" timeout
+                self.fuse_daemon.wait()
+            time.sleep(1)
+            waited += 1
+            if waited > 30:
+                raise RuntimeError("Fuse mount failed to populate /sys/ after {0} seconds".format(
+                    waited
+                ))
+            post_mount_conns = list_connections()
+
+        log.info("Post-mount connections: {0}".format(post_mount_conns))
+
+        # Record our fuse connection number so that we can use it when
+        # forcing an unmount
+        new_conns = list(set(post_mount_conns) - set(pre_mount_conns))
+        if len(new_conns) == 0:
+            raise RuntimeError("New fuse connection directory not found ({0})".format(new_conns))
+        elif len(new_conns) > 1:
+            raise RuntimeError("Unexpectedly numerous fuse connections {0}".format(new_conns))
+        else:
+            self._fuse_conn = new_conns[0]
+
+    def _run_python(self, pyscript):
+        """
+        Override this to remove the daemon-helper prefix that is used otherwise
+        to make the process killable.
+        """
+        return self.client_remote.run(args=[
+            'python', '-c', pyscript
+        ], wait=False)
+
+
+class LocalCephManager(CephManager):
+    """
+    CephManager variant that runs the ``ceph`` command line tool from
+    BIN_PREFIX through a LocalRemote.
+    """
+    def __init__(self):
+        # Deliberately skip parent init, only inheriting from it to get
+        # util methods like osd_dump that sit on top of raw_cluster_cmd
+        self.controller = LocalRemote()
+
+        # A minority of CephManager fns actually bother locking for when
+        # certain teuthology tests want to run tasks in parallel
+        self.lock = threading.RLock()
+
+        self.log = lambda x: log.info(x)
+
+    def _local_ceph_argv(self, *args):
+        """Return the argv for running the local ``ceph`` binary with ``args``."""
+        return [os.path.join(BIN_PREFIX, "ceph")] + list(args)
+
+    def find_remote(self, daemon_type, daemon_id):
+        """
+        daemon_type like 'mds', 'osd'
+        daemon_id like 'a', '0'
+
+        Every daemon is local, so a LocalRemote is returned regardless.
+        """
+        return LocalRemote()
+
+    def run_ceph_w(self):
+        """Start ``ceph -w`` without waiting for it and return the process."""
+        return self.controller.run(
+            self._local_ceph_argv("-w"), wait=False, stdout=StringIO())
+
+    def raw_cluster_cmd(self, *args):
+        """
+        args like ["osd", "dump"]
+        return stdout string
+        """
+        completed = self.controller.run(self._local_ceph_argv(*args))
+        return completed.stdout.getvalue()
+
+    def raw_cluster_cmd_result(self, *args):
+        """
+        like raw_cluster_cmd but don't check status, just return rc
+        """
+        completed = self.controller.run(
+            self._local_ceph_argv(*args), check_status=False)
+        return completed.exitstatus
+
+    def admin_socket(self, daemon_type, daemon_id, command, check_status=True):
+        """
+        Run ``ceph daemon <daemon_type>.<daemon_id>`` followed by the
+        ``command`` list and return the resulting process object.
+        """
+        target = "{0}.{1}".format(daemon_type, daemon_id)
+        argv = self._local_ceph_argv("daemon", target) + command
+        return self.controller.run(args=argv, check_status=check_status)
+
+    def get_mds_status(self, mds):
+        """
+        Return the first entry of the mds dump whose name is ``mds``, or
+        None if there is no such entry.
+        """
+        status = self.get_mds_status_all()
+        return next(
+            (info for info in status['info'].itervalues() if info['name'] == mds),
+            None)
+
+    def get_mds_status_by_rank(self, rank):
+        """
+        Return the first entry of the mds dump whose rank is ``rank``, or
+        None if there is no such entry.
+        """
+        status = self.get_mds_status_all()
+        return next(
+            (info for info in status['info'].itervalues() if info['rank'] == rank),
+            None)
+
+    def get_mds_status_all(self):
+        """
+        Run cluster command to extract all the mds status.
+        """
+        dump = self.raw_cluster_cmd('mds', 'dump', '--format=json')
+        # The first line of the output is dropped before decoding
+        payload = ' '.join(dump.splitlines()[1:])
+        return json.loads(payload)
+
+
+class LocalCephCluster(CephCluster):
+    """
+    CephCluster variant for a vstart cluster: commands run through
+    LocalCephManager / LocalRemote and configuration overrides are appended
+    to the ./ceph.conf in the current directory.
+    """
+    def __init__(self, ctx):
+        # Deliberately skip calling parent constructor
+        self._ctx = ctx
+        self.mon_manager = LocalCephManager()
+        # Overrides to write out, as {section: {key: value}}
+        self._conf = defaultdict(dict)
+
+    @property
+    def admin_remote(self):
+        return LocalRemote()
+
+    def get_config(self, key, service_type=None):
+        """
+        Return the value of config option ``key`` as reported by a daemon of
+        ``service_type`` ('mon' when omitted).
+
+        :raises KeyError: for a service type other than mon, mds or osd
+        """
+        if service_type is None:
+            service_type = 'mon'
+
+        # FIXME hardcoded vstart service IDs
+        service_id = {
+            'mon': 'a',
+            'mds': 'a',
+            'osd': '0'
+        }[service_type]
+
+        # json_asok is not defined in this class; presumably inherited
+        # from CephCluster -- TODO confirm.
+        return self.json_asok(['config', 'get', key], service_type, service_id)[key]
+
+    def _write_conf(self):
+        """
+        Rewrite ./ceph.conf so that it ends with the overrides in
+        ``self._conf``.
+
+        Everything from a previous "#LOCAL_TEST" banner onwards is dropped,
+        then the banner and one section per subsystem are appended. An
+        existing uncommented "key =" line found in the first section of
+        the same name gets a "#" inserted in front of the key.
+        """
+        # In teuthology, we have the honour of writing the entire ceph.conf, but
+        # in vstart land it has mostly already been written and we need to carefully
+        # append to it.
+        conf_path = "./ceph.conf"
+        banner = "\n#LOCAL_TEST\n"
+        # Context managers make sure the file is closed before it is
+        # reopened for writing below.
+        with open(conf_path) as conf_file:
+            existing_str = conf_file.read()
+
+        if banner in existing_str:
+            existing_str = existing_str[0:existing_str.find(banner)]
+
+        existing_str += banner
+
+        for subsys, kvs in self._conf.items():
+            existing_str += "\n[{0}]\n".format(subsys)
+            for key, val in kvs.items():
+                # Comment out existing instance if it exists
+                log.info("Searching for existing instance {0}/{1}".format(
+                    key, subsys
+                ))
+                existing_section = re.search("^\[{0}\]$([\n]|[^\[])+".format(
+                    subsys
+                ), existing_str, re.MULTILINE)
+
+                if existing_section:
+                    section_str = existing_str[existing_section.start():existing_section.end()]
+                    existing_val = re.search("^\s*[^#]({0}) =".format(key), section_str, re.MULTILINE)
+                    if existing_val:
+                        # Offset of the key within the whole file text
+                        start = existing_section.start() + existing_val.start(1)
+                        log.info("Found string to replace at {0}".format(
+                            start
+                        ))
+                        existing_str = existing_str[0:start] + "#" + existing_str[start:]
+
+                existing_str += "{0} = {1}\n".format(key, val)
+
+        with open(conf_path, "w") as conf_file:
+            conf_file.write(existing_str)
+
+    def set_ceph_conf(self, subsys, key, value):
+        """Record ``key = value`` for section ``subsys`` and rewrite ceph.conf."""
+        self._conf[subsys][key] = value
+        self._write_conf()
+
+    def clear_ceph_conf(self, subsys, key):
+        """
+        Forget the override of ``key`` in section ``subsys`` and rewrite
+        ceph.conf.
+
+        :raises KeyError: if no such override was set
+        """
+        del self._conf[subsys][key]
+        self._write_conf()
+
+
+class LocalMDSCluster(LocalCephCluster, MDSCluster):
+    """MDSCluster whose MDS daemons are local LocalDaemon instances."""
+    def __init__(self, ctx):
+        super(LocalMDSCluster, self).__init__(ctx)
+
+        # One LocalDaemon per MDS id registered on the context
+        known_ids = ctx.daemons.daemons['mds'].keys()
+        self.mds_ids = known_ids
+        self.mds_daemons = {
+            mds_id: LocalDaemon("mds", mds_id) for mds_id in known_ids
+        }
+
+    def clear_firewall(self):
+        # FIXME: unimplemented
+        pass
+
+    def newfs(self, name='cephfs', create=True):
+        """Return a LocalFilesystem built on this cluster's context."""
+        return LocalFilesystem(self._ctx, name=name, create=create)
+
+
+class LocalMgrCluster(LocalCephCluster, MgrCluster):
+    """MgrCluster whose mgr daemons are local LocalDaemon instances."""
+    def __init__(self, ctx):
+        super(LocalMgrCluster, self).__init__(ctx)
+
+        # One LocalDaemon per mgr id registered on the context
+        known_ids = ctx.daemons.daemons['mgr'].keys()
+        self.mgr_ids = known_ids
+        self.mgr_daemons = {
+            mgr_id: LocalDaemon("mgr", mgr_id) for mgr_id in known_ids
+        }
+
+
+class LocalFilesystem(Filesystem, LocalMDSCluster):
+    """
+    Filesystem variant for a vstart cluster: the MDS ids are read from the
+    ceph.conf in the current directory and everything runs locally.
+    """
+    def __init__(self, ctx, fscid=None, name='cephfs', create=False):
+        """
+        :param ctx: context object; the first filesystem created is stored
+                    on it as ``ctx.filesystem``
+        :param fscid: id of an existing filesystem to look up; only
+                      allowed when ``name`` is None
+        :param name: only tested against None in this constructor
+        :param create: create the filesystem unless legacy_configured()
+                       reports that one exists
+        :raises RuntimeError: if ceph.conf lists no MDS, or if both
+                              ``name`` and ``fscid`` are given
+        """
+        # Deliberately skip calling parent constructor
+        self._ctx = ctx
+
+        self.id = None
+        # NOTE(review): the ``name`` argument is not stored here; confirm
+        # that Filesystem.create() supplies the name that is wanted.
+        self.name = None
+        self.metadata_pool_name = None
+        self.metadata_overlay = False
+        self.data_pool_name = None
+        self.data_pools = None
+
+        # Hack: cheeky inspection of ceph.conf to see what MDSs exist
+        self.mds_ids = set()
+        # Context manager so that the file handle is closed again
+        with open("ceph.conf") as conf_file:
+            for line in conf_file:
+                match = re.match("^\[mds\.(.+)\]$", line)
+                if match:
+                    self.mds_ids.add(match.group(1))
+
+        if not self.mds_ids:
+            raise RuntimeError("No MDSs found in ceph.conf!")
+
+        self.mds_ids = list(self.mds_ids)
+
+        log.info("Discovered MDS IDs: {0}".format(self.mds_ids))
+
+        self.mon_manager = LocalCephManager()
+
+        self.mds_daemons = dict([(id_, LocalDaemon("mds", id_)) for id_ in self.mds_ids])
+
+        self.client_remote = LocalRemote()
+
+        self._conf = defaultdict(dict)
+
+        if name is not None:
+            if fscid is not None:
+                raise RuntimeError("cannot specify fscid when creating fs")
+            if create and not self.legacy_configured():
+                self.create()
+        else:
+            if fscid is not None:
+                self.id = fscid
+                self.getinfo(refresh=True)
+
+        # Stash a reference to the first created filesystem on ctx, so
+        # that if someone drops to the interactive shell they can easily
+        # poke our methods.
+        if not hasattr(self._ctx, "filesystem"):
+            self._ctx.filesystem = self
+
+    @property
+    def _prefix(self):
+        return BIN_PREFIX
+
+    def set_clients_block(self, blocked, mds_id=None):
+        """Not supported on a vstart cluster."""
+        raise NotImplementedError()
+
+    def get_pgs_per_fs_pool(self):
+        """Return three times the ``mon_pg_warn_min_per_osd`` setting."""
+        # FIXME: assuming there are 3 OSDs
+        return 3 * int(self.get_config('mon_pg_warn_min_per_osd'))
+
+
+class InteractiveFailureResult(unittest.TextTestResult):
+    """
+    Test result that, after recording a failure or an error, logs it and
+    drops into the teuthology interactive task.
+    """
+    def _log_and_go_interactive(self, template, test, err):
+        # Log the traceback, then the one-line notice, then go interactive
+        log.error(self._exc_info_to_string(err, test))
+        log.error(template.format(
+            self.getDescription(test)
+        ))
+        interactive.task(ctx=None, config=None)
+
+    def addFailure(self, test, err):
+        super(InteractiveFailureResult, self).addFailure(test, err)
+        self._log_and_go_interactive(
+            "Failure in test '{0}', going interactive", test, err)
+
+    def addError(self, test, err):
+        super(InteractiveFailureResult, self).addError(test, err)
+        self._log_and_go_interactive(
+            "Error in test '{0}', going interactive", test, err)
+
+
+def enumerate_methods(s):
+    log.info("e: {0}".format(s))
+    for t in s._tests:
+        if isinstance(t, suite.BaseTestSuite):
+            for sub in enumerate_methods(t):
+                yield sub
+        else:
+            yield s, t
+
+
+def load_tests(modules, loader):
+    if modules:
+        log.info("Executing modules: {0}".format(modules))
+        module_suites = []
+        for mod_name in modules:
+            # Test names like cephfs.test_auto_repair
+            module_suites.append(loader.loadTestsFromName(mod_name))
+        log.info("Loaded: {0}".format(list(module_suites)))
+        return suite.TestSuite(module_suites)
+    else:
+        log.info("Executing all cephfs tests")
+        return loader.discover(
+            os.path.join(os.path.dirname(os.path.abspath(__file__)), "cephfs")
+        )
+
+
+def scan_tests(modules):
+    overall_suite = load_tests(modules, loader.TestLoader())
+
+    max_required_mds = 0
+    max_required_clients = 0
+    max_required_mgr = 0
+
+    for suite, case in enumerate_methods(overall_suite):
+        max_required_mds = max(max_required_mds,
+                               getattr(case, "MDSS_REQUIRED", 0))
+        max_required_clients = max(max_required_clients,
+                               getattr(case, "CLIENTS_REQUIRED", 0))
+        max_required_mgr = max(max_required_mgr,
+                               getattr(case, "MGRS_REQUIRED", 0))
+
+    return max_required_mds, max_required_clients, max_required_mgr
+
+
+class LocalCluster(object):
+    def __init__(self, rolename="placeholder"):
+        self.remotes = {
+            LocalRemote(): [rolename]
+        }
+
+    def only(self, requested):
+        return self.__class__(rolename=requested)
+
+
+class LocalContext(object):
+    def __init__(self):
+        self.config = {}
+        self.teuthology_config = teuth_config
+        self.cluster = LocalCluster()
+        self.daemons = DaemonGroup()
+
+        # Shove some LocalDaemons into the ctx.daemons DaemonGroup instance so that any
+        # tests that want to look these up via ctx can do so.
+        # Inspect ceph.conf to see what roles exist
+        for conf_line in open("ceph.conf").readlines():
+            for svc_type in ["mon", "osd", "mds", "mgr"]:
+                if svc_type not in self.daemons.daemons:
+                    self.daemons.daemons[svc_type] = {}
+                match = re.match("^\[{0}\.(.+)\]$".format(svc_type), conf_line)
+                if match:
+                    svc_id = match.group(1)
+                    self.daemons.daemons[svc_type][svc_id] = LocalDaemon(svc_type, svc_id)
+
+    def __del__(self):
+        shutil.rmtree(self.teuthology_config['test_path'])
+
+
+def exec_test():
+    # Parse arguments
+    interactive_on_error = False
+    create_cluster = False
+
+    args = sys.argv[1:]
+    flags = [a for a in args if a.startswith("-")]
+    modules = [a for a in args if not a.startswith("-")]
+    for f in flags:
+        if f == "--interactive":
+            interactive_on_error = True
+        elif f == "--create":
+            create_cluster = True
+        else:
+            log.error("Unknown option '{0}'".format(f))
+            sys.exit(-1)
+
+    # Help developers by stopping up-front if their tree isn't built enough for all the
+    # tools that the tests might want to use (add more here if needed)
+    require_binaries = ["ceph-dencoder", "cephfs-journal-tool", "cephfs-data-scan",
+                        "cephfs-table-tool", "ceph-fuse", "rados"]
+    missing_binaries = [b for b in require_binaries if not os.path.exists(os.path.join(BIN_PREFIX, b))]
+    if missing_binaries:
+        log.error("Some ceph binaries missing, please build them: {0}".format(" ".join(missing_binaries)))
+        sys.exit(-1)
+
+    max_required_mds, max_required_clients, max_required_mgr = scan_tests(modules)
+
+    remote = LocalRemote()
+
+    # Tolerate no MDSs or clients running at start
+    ps_txt = remote.run(
+        args=["ps", "-u"+str(os.getuid())]
+    ).stdout.getvalue().strip()
+    lines = ps_txt.split("\n")[1:]
+    for line in lines:
+        if 'ceph-fuse' in line or 'ceph-mds' in line:
+            pid = int(line.split()[0])
+            log.warn("Killing stray process {0}".format(line))
+            os.kill(pid, signal.SIGKILL)
+
+    # Fire up the Ceph cluster if the user requested it
+    if create_cluster:
+        log.info("Creating cluster with {0} MDS daemons".format(
+            max_required_mds))
+        remote.run([os.path.join(SRC_PREFIX, "stop.sh")], check_status=False)
+        remote.run(["rm", "-rf", "./out"])
+        remote.run(["rm", "-rf", "./dev"])
+        vstart_env = os.environ.copy()
+        vstart_env["FS"] = "0"
+        vstart_env["MDS"] = max_required_mds.__str__()
+        vstart_env["OSD"] = "1"
+        vstart_env["MGR"] = max(max_required_mgr, 1).__str__()
+
+        remote.run([os.path.join(SRC_PREFIX, "vstart.sh"), "-n", "-d", "--nolockdep"],
+                   env=vstart_env)
+
+        # Wait for OSD to come up so that subsequent injectargs etc will
+        # definitely succeed
+        LocalCephCluster(LocalContext()).mon_manager.wait_for_all_osds_up(timeout=30)
+
+    # List of client mounts, sufficient to run the selected tests
+    clients = [i.__str__() for i in range(0, max_required_clients)]
+
+    test_dir = tempfile.mkdtemp()
+    teuth_config['test_path'] = test_dir
+
+    # Construct Mount classes
+    mounts = []
+    for client_id in clients:
+        # Populate client keyring (it sucks to use client.admin for test clients
+        # because it's awkward to find the logs later)
+        client_name = "client.{0}".format(client_id)
+
+        if client_name not in open("./keyring").read():
+            p = remote.run(args=[os.path.join(BIN_PREFIX, "ceph"), "auth", "get-or-create", client_name,
+                                 "osd", "allow rw",
+                                 "mds", "allow",
+                                 "mon", "allow r"])
+
+            open("./keyring", "a").write(p.stdout.getvalue())
+
+        mount = LocalFuseMount(test_dir, client_id)
+        mounts.append(mount)
+        if mount.is_mounted():
+            log.warn("unmounting {0}".format(mount.mountpoint))
+            mount.umount_wait()
+        else:
+            if os.path.exists(mount.mountpoint):
+                os.rmdir(mount.mountpoint)
+
+    ctx = LocalContext()
+    ceph_cluster = LocalCephCluster(ctx)
+    mds_cluster = LocalMDSCluster(ctx)
+    mgr_cluster = LocalMgrCluster(ctx)
+
+    from tasks.cephfs_test_runner import DecoratingLoader
+
+    class LogStream(object):
+        def __init__(self):
+            self.buffer = ""
+
+        def write(self, data):
+            self.buffer += data
+            if "\n" in self.buffer:
+                lines = self.buffer.split("\n")
+                for line in lines[:-1]:
+                    pass
+                    # sys.stderr.write(line + "\n")
+                    log.info(line)
+                self.buffer = lines[-1]
+
+        def flush(self):
+            pass
+
+    decorating_loader = DecoratingLoader({
+        "ctx": ctx,
+        "mounts": mounts,
+        "ceph_cluster": ceph_cluster,
+        "mds_cluster": mds_cluster,
+        "mgr_cluster": mgr_cluster,
+    })
+
+    # For the benefit of polling tests like test_full -- in teuthology land we set this
+    # in a .yaml, here it's just a hardcoded thing for the developer's pleasure.
+    remote.run(args=[os.path.join(BIN_PREFIX, "ceph"), "tell", "osd.*", "injectargs", "--osd-mon-report-interval-max", "5"])
+    ceph_cluster.set_ceph_conf("osd", "osd_mon_report_interval_max", "5")
+
+    # Vstart defaults to two segments, which very easily gets a "behind on trimming" health warning
+    # from normal IO latency.  Increase it for running teests.
+    ceph_cluster.set_ceph_conf("mds", "mds log max segments", "10")
+
+    # Make sure the filesystem created in tests has uid/gid that will let us talk to
+    # it after mounting it (without having to  go root).  Set in 'global' not just 'mds'
+    # so that cephfs-data-scan will pick it up too.
+    ceph_cluster.set_ceph_conf("global", "mds root ino uid", "%s" % os.getuid())
+    ceph_cluster.set_ceph_conf("global", "mds root ino gid", "%s" % os.getgid())
+
+    # Monkeypatch get_package_version to avoid having to work out what kind of distro we're on
+    def _get_package_version(remote, pkg_name):
+        # Used in cephfs tests to find fuse version.  Your development workstation *does* have >=2.9, right?
+        return "2.9"
+
+    import teuthology.packaging
+    teuthology.packaging.get_package_version = _get_package_version
+
+    overall_suite = load_tests(modules, decorating_loader)
+
+    # Filter out tests that don't lend themselves to interactive running,
+    victims = []
+    for case, method in enumerate_methods(overall_suite):
+        fn = getattr(method, method._testMethodName)
+
+        drop_test = False
+
+        if hasattr(fn, 'is_for_teuthology') and getattr(fn, 'is_for_teuthology') is True:
+            drop_test = True
+            log.warn("Dropping test because long running: ".format(method.id()))
+
+        if getattr(fn, "needs_trimming", False) is True:
+            drop_test = (os.getuid() != 0)
+            log.warn("Dropping test because client trim unavailable: ".format(method.id()))
+
+        if drop_test:
+            # Don't drop the test if it was explicitly requested in arguments
+            is_named = False
+            for named in modules:
+                if named.endswith(method.id()):
+                    is_named = True
+                    break
+
+            if not is_named:
+                victims.append((case, method))
+
+    log.info("Disabling {0} tests because of is_for_teuthology or needs_trimming".format(len(victims)))
+    for s, method in victims:
+        s._tests.remove(method)
+
+    if interactive_on_error:
+        result_class = InteractiveFailureResult
+    else:
+        result_class = unittest.TextTestResult
+    fail_on_skip = False
+
+    class LoggingResult(result_class):
+        def startTest(self, test):
+            log.info("Starting test: {0}".format(self.getDescription(test)))
+            test.started_at = datetime.datetime.utcnow()
+            return super(LoggingResult, self).startTest(test)
+
+        def stopTest(self, test):
+            log.info("Stopped test: {0} in {1}s".format(
+                self.getDescription(test),
+                (datetime.datetime.utcnow() - test.started_at).total_seconds()
+            ))
+
+        def addSkip(self, test, reason):
+            if fail_on_skip:
+                # Don't just call addFailure because that requires a traceback
+                self.failures.append((test, reason))
+            else:
+                super(LoggingResult, self).addSkip(test, reason)
+
+    # Execute!
+    result = unittest.TextTestRunner(
+        stream=LogStream(),
+        resultclass=LoggingResult,
+        verbosity=2,
+        failfast=True).run(overall_suite)
+
+    if not result.wasSuccessful():
+        result.printErrors()  # duplicate output at end for convenience
+
+        bad_tests = []
+        for test, error in result.errors:
+            bad_tests.append(str(test))
+        for test, failure in result.failures:
+            bad_tests.append(str(test))
+
+        sys.exit(-1)
+    else:
+        sys.exit(0)
+
+
+# Run the test driver when this file is executed as a script.
+if __name__ == "__main__":
+    exec_test()
diff --git a/src/ceph/qa/tasks/watch_notify_same_primary.py b/src/ceph/qa/tasks/watch_notify_same_primary.py
new file mode 100644
index 0000000..8f6d33b
--- /dev/null
+++ b/src/ceph/qa/tasks/watch_notify_same_primary.py
@@ -0,0 +1,134 @@
+
+"""
+watch_notify_same_primary task
+"""
+from cStringIO import StringIO
+import contextlib
+import logging
+
+from teuthology.orchestra import run
+from teuthology.contextutil import safe_while
+
+log = logging.getLogger(__name__)
+
+
+@contextlib.contextmanager
+def task(ctx, config):
+    """
+    Run watch_notify_same_primary
+
+    The config should be as follows:
+
+    watch_notify_same_primary:
+        clients: [client list]
+
+    The client list should contain 1 client
+
+    The test requires 3 osds.
+
+    example:
+
+    tasks:
+    - ceph:
+    - watch_notify_same_primary:
+        clients: [client.0]
+    - interactive:
+    """
+    log.info('Beginning watch_notify_same_primary...')
+    assert isinstance(config, dict), \
+        "please list clients to run on"
+
+    clients = config.get('clients', ['client.0'])
+    assert len(clients) == 1
+    role = clients[0]
+    assert isinstance(role, basestring)
+    PREFIX = 'client.'
+    assert role.startswith(PREFIX)
+    (remote,) = ctx.cluster.only(role).remotes.iterkeys()
+    manager = ctx.managers['ceph']
+    manager.raw_cluster_cmd('osd', 'set', 'noout')
+
+    pool = manager.create_pool_with_unique_name()
+    def obj(n): return "foo-{num}".format(num=n)
+    def start_watch(n):
+        remote.run(
+            args = [
+                "rados",
+                "-p", pool,
+                "put",
+                obj(n),
+                "/etc/resolv.conf"],
+            logger=log.getChild('watch.{id}'.format(id=n)))
+        proc = remote.run(
+            args = [
+                "rados",
+                "-p", pool,
+                "watch",
+                obj(n)],
+            stdin=run.PIPE,
+            stdout=StringIO(),
+            stderr=StringIO(),
+            wait=False)
+        return proc
+
+    num = 20
+
+    watches = [start_watch(i) for i in range(num)]
+
+    # wait for them all to register
+    for i in range(num):
+        with safe_while() as proceed:
+            while proceed():
+                proc = remote.run(
+                    args = [
+                        "rados",
+                        "-p", pool,
+                        "listwatchers",
+                        obj(i)],
+                    stdout=StringIO())
+                lines = proc.stdout.getvalue()
+                num_watchers = lines.count('watcher=')
+                log.info('i see %d watchers for %s', num_watchers, obj(i))
+                if num_watchers >= 1:
+                    break
+
+    def notify(n, msg):
+        remote.run(
+            args = [
+                "rados",
+                "-p", pool,
+                "notify",
+                obj(n),
+                msg],
+            logger=log.getChild('notify.{id}'.format(id=n)))
+
+    [notify(n, 'notify1') for n in range(len(watches))]
+
+    manager.kill_osd(0)
+    manager.mark_down_osd(0)
+
+    [notify(n, 'notify2') for n in range(len(watches))]
+
+    try:
+        yield
+    finally:
+        log.info('joining watch_notify_stress')
+        for watch in watches:
+            watch.stdin.write("\n")
+
+        run.wait(watches)
+
+        for watch in watches:
+            lines = watch.stdout.getvalue().split("\n")
+            got1 = False
+            got2 = False
+            for l in lines:
+                if 'notify1' in l:
+                    got1 = True
+                if 'notify2' in l:
+                    got2 = True
+            log.info(lines)
+            assert got1 and got2
+
+        manager.revive_osd(0)
+        manager.remove_pool(pool)
diff --git a/src/ceph/qa/tasks/watch_notify_stress.py b/src/ceph/qa/tasks/watch_notify_stress.py
new file mode 100644
index 0000000..6db313f
--- /dev/null
+++ b/src/ceph/qa/tasks/watch_notify_stress.py
@@ -0,0 +1,69 @@
+"""
+test_stress_watch task
+"""
+import contextlib
+import logging
+import proc_thrasher
+
+from teuthology.orchestra import run
+
+log = logging.getLogger(__name__)
+
+
+@contextlib.contextmanager
+def task(ctx, config):
+    """
+    Run test_stress_watch
+
+    The config should be as follows:
+
+    test_stress_watch:
+        clients: [client list]
+
+    example:
+
+    tasks:
+    - ceph:
+    - test_stress_watch:
+        clients: [client.0]
+    - interactive:
+    """
+    log.info('Beginning test_stress_watch...')
+    assert isinstance(config, dict), \
+        "please list clients to run on"
+    testwatch = {}
+
+    remotes = []
+
+    for role in config.get('clients', ['client.0']):
+        assert isinstance(role, basestring)
+        PREFIX = 'client.'
+        assert role.startswith(PREFIX)
+        id_ = role[len(PREFIX):]
+        (remote,) = ctx.cluster.only(role).remotes.iterkeys()
+        remotes.append(remote)
+
+        args =['CEPH_CLIENT_ID={id_}'.format(id_=id_),
+               'CEPH_ARGS="{flags}"'.format(flags=config.get('flags', '')),
+               'daemon-helper',
+               'kill',
+               'multi_stress_watch foo foo'
+               ]
+
+        log.info("args are %s" % (args,))
+
+        proc = proc_thrasher.ProcThrasher({}, remote,
+            args=[run.Raw(i) for i in args],
+            logger=log.getChild('testwatch.{id}'.format(id=id_)),
+            stdin=run.PIPE,
+            wait=False
+            )
+        proc.start()
+        testwatch[id_] = proc
+
+    try:
+        yield
+    finally:
+        log.info('joining watch_notify_stress')
+        for i in testwatch.itervalues():
+            i.join()
diff --git a/src/ceph/qa/tasks/workunit.py b/src/ceph/qa/tasks/workunit.py
new file mode 100644
index 0000000..f69b396
--- /dev/null
+++ b/src/ceph/qa/tasks/workunit.py
@@ -0,0 +1,486 @@
+"""
+Workunit task -- Run ceph on sets of specific clients
+"""
+import logging
+import pipes
+import os
+import re
+
+from copy import deepcopy
+from util import get_remote_for_role
+
+from teuthology import misc
+from teuthology.config import config as teuth_config
+from teuthology.orchestra.run import CommandFailedError
+from teuthology.parallel import parallel
+from teuthology.orchestra import run
+
+log = logging.getLogger(__name__)
+
+
+class Refspec:
+    def __init__(self, refspec):
+        self.refspec = refspec
+
+    def __str__(self):
+        return self.refspec
+
+    def _clone(self, git_url, clonedir, opts=None):
+        if opts is None:
+            opts = []
+        return (['rm', '-rf', clonedir] +
+                [run.Raw('&&')] +
+                ['git', 'clone'] + opts +
+                [git_url, clonedir])
+
+    def _cd(self, clonedir):
+        return ['cd', clonedir]
+
+    def _checkout(self):
+        return ['git', 'checkout', self.refspec]
+
+    def clone(self, git_url, clonedir):
+        return (self._clone(git_url, clonedir) +
+                [run.Raw('&&')] +
+                self._cd(clonedir) +
+                [run.Raw('&&')] +
+                self._checkout())
+
+
+class Branch(Refspec):
+    def __init__(self, tag):
+        Refspec.__init__(self, tag)
+
+    def clone(self, git_url, clonedir):
+        opts = ['--depth', '1',
+                '--branch', self.refspec]
+        return (self._clone(git_url, clonedir, opts) +
+                [run.Raw('&&')] +
+                self._cd(clonedir))
+
+
+class Head(Refspec):
+    def __init__(self):
+        Refspec.__init__(self, 'HEAD')
+
+    def clone(self, git_url, clonedir):
+        opts = ['--depth', '1']
+        return (self._clone(git_url, clonedir, opts) +
+                [run.Raw('&&')] +
+                self._cd(clonedir))
+
+
+def task(ctx, config):
+    """
+    Run ceph on all workunits found under the specified path.
+
+    For example::
+
+        tasks:
+        - ceph:
+        - ceph-fuse: [client.0]
+        - workunit:
+            clients:
+              client.0: [direct_io, xattrs.sh]
+              client.1: [snaps]
+            branch: foo
+
+    You can also run a list of workunits on all clients:
+        tasks:
+        - ceph:
+        - ceph-fuse:
+        - workunit:
+            tag: v0.47
+            clients:
+              all: [direct_io, xattrs.sh, snaps]
+
+    If you have an "all" section it will run all the workunits
+    on each client simultaneously, AFTER running any workunits specified
+    for individual clients. (This prevents unintended simultaneous runs.)
+
+    To customize tests, you can specify environment variables as a dict. You
+    can also specify a time limit for each work unit (defaults to 3h):
+
+        tasks:
+        - ceph:
+        - ceph-fuse:
+        - workunit:
+            sha1: 9b28948635b17165d17c1cf83d4a870bd138ddf6
+            clients:
+              all: [snaps]
+            env:
+              FOO: bar
+              BAZ: quux
+            timeout: 3h
+
+    This task supports roles that include a ceph cluster, e.g.::
+
+        tasks:
+        - ceph:
+        - workunit:
+            clients:
+              backup.client.0: [foo]
+              client.1: [bar] # cluster is implicitly 'ceph'
+
+    You can also specify an alternative top-level dir to 'qa/workunits', like
+    'qa/standalone', with::
+
+        tasks:
+        - install:
+        - workunit:
+            basedir: qa/standalone
+            clients:
+              client.0:
+                - test-ceph-helpers.sh
+
+    :param ctx: Context
+    :param config: Configuration
+    """
+    assert isinstance(config, dict)
+    assert isinstance(config.get('clients'), dict), \
+        'configuration must contain a dictionary of clients'
+
+    # mimic the behavior of the "install" task, where the "overrides" are
+    # actually the defaults of that task. in other words, if none of "sha1",
+    # "tag", or "branch" is specified by a "workunit" tasks, we will update
+    # it with the information in the "workunit" sub-task nested in "overrides".
+    overrides = deepcopy(ctx.config.get('overrides', {}).get('workunit', {}))
+    refspecs = {'branch': Branch, 'tag': Refspec, 'sha1': Refspec}
+    if any(map(lambda i: i in config, refspecs.iterkeys())):
+        for i in refspecs.iterkeys():
+            overrides.pop(i, None)
+    misc.deep_merge(config, overrides)
+
+    for spec, cls in refspecs.iteritems():
+        refspec = config.get(spec)
+        if refspec:
+            refspec = cls(refspec)
+            break
+    if refspec is None:
+        refspec = Head()
+
+    timeout = config.get('timeout', '3h')
+
+    log.info('Pulling workunits from ref %s', refspec)
+
+    created_mountpoint = {}
+
+    if config.get('env') is not None:
+        assert isinstance(config['env'], dict), 'env must be a dictionary'
+    clients = config['clients']
+
+    # Create scratch dirs for any non-all workunits
+    log.info('Making a separate scratch dir for every client...')
+    for role in clients.iterkeys():
+        assert isinstance(role, basestring)
+        if role == "all":
+            continue
+
+        assert 'client' in role
+        created_mnt_dir = _make_scratch_dir(ctx, role, config.get('subdir'))
+        created_mountpoint[role] = created_mnt_dir
+
+    # Execute any non-all workunits
+    with parallel() as p:
+        for role, tests in clients.iteritems():
+            if role != "all":
+                p.spawn(_run_tests, ctx, refspec, role, tests,
+                        config.get('env'),
+                        basedir=config.get('basedir','qa/workunits'),
+                        timeout=timeout)
+
+    # Clean up dirs from any non-all workunits
+    for role, created in created_mountpoint.items():
+        _delete_dir(ctx, role, created)
+
+    # Execute any 'all' workunits
+    if 'all' in clients:
+        all_tasks = clients["all"]
+        _spawn_on_all_clients(ctx, refspec, all_tasks, config.get('env'),
+                              config.get('basedir', 'qa/workunits'),
+                              config.get('subdir'), timeout=timeout)
+
+
+def _client_mountpoint(ctx, cluster, id_):
+    """
+    Returns the path to the expected mountpoint for workunits running
+    on some kind of filesystem.
+    """
+    # for compatibility with tasks like ceph-fuse that aren't cluster-aware yet,
+    # only include the cluster name in the dir if the cluster is not 'ceph'
+    if cluster == 'ceph':
+        dir_ = 'mnt.{0}'.format(id_)
+    else:
+        dir_ = 'mnt.{0}.{1}'.format(cluster, id_)
+    return os.path.join(misc.get_testdir(ctx), dir_)
+
+
+def _delete_dir(ctx, role, created_mountpoint):
+    """
+    Delete file used by this role, and delete the directory that this
+    role appeared in.
+
+    :param ctx: Context
+    :param role: "role.#" where # is used for the role id.
+    """
+    cluster, _, id_ = misc.split_role(role)
+    remote = get_remote_for_role(ctx, role)
+    mnt = _client_mountpoint(ctx, cluster, id_)
+    client = os.path.join(mnt, 'client.{id}'.format(id=id_))
+
+    # Remove the directory inside the mount where the workunit ran
+    remote.run(
+        args=[
+            'sudo',
+            'rm',
+            '-rf',
+            '--',
+            client,
+        ],
+    )
+    log.info("Deleted dir {dir}".format(dir=client))
+
+    # If the mount was an artificially created dir, delete that too
+    if created_mountpoint:
+        remote.run(
+            args=[
+                'rmdir',
+                '--',
+                mnt,
+            ],
+        )
+        log.info("Deleted artificial mount point {dir}".format(dir=client))
+
+
+def _make_scratch_dir(ctx, role, subdir):
+    """
+    Make scratch directories for this role.  This also makes the mount
+    point if that directory does not exist.
+
+    :param ctx: Context
+    :param role: "role.#" where # is used for the role id.
+    :param subdir: use this subdir (False if not used)
+    """
+    created_mountpoint = False
+    cluster, _, id_ = misc.split_role(role)
+    remote = get_remote_for_role(ctx, role)
+    dir_owner = remote.user
+    mnt = _client_mountpoint(ctx, cluster, id_)
+    # if neither kclient nor ceph-fuse are required for a workunit,
+    # mnt may not exist. Stat and create the directory if it doesn't.
+    try:
+        remote.run(
+            args=[
+                'stat',
+                '--',
+                mnt,
+            ],
+        )
+        log.info('Did not need to create dir {dir}'.format(dir=mnt))
+    except CommandFailedError:
+        remote.run(
+            args=[
+                'mkdir',
+                '--',
+                mnt,
+            ],
+        )
+        log.info('Created dir {dir}'.format(dir=mnt))
+        created_mountpoint = True
+
+    if not subdir:
+        subdir = 'client.{id}'.format(id=id_)
+
+    if created_mountpoint:
+        remote.run(
+            args=[
+                'cd',
+                '--',
+                mnt,
+                run.Raw('&&'),
+                'mkdir',
+                '--',
+                subdir,
+            ],
+        )
+    else:
+        remote.run(
+            args=[
+                # cd first so this will fail if the mount point does
+                # not exist; pure install -d will silently do the
+                # wrong thing
+                'cd',
+                '--',
+                mnt,
+                run.Raw('&&'),
+                'sudo',
+                'install',
+                '-d',
+                '-m', '0755',
+                '--owner={user}'.format(user=dir_owner),
+                '--',
+                subdir,
+            ],
+        )
+
+    return created_mountpoint
+
+
+def _spawn_on_all_clients(ctx, refspec, tests, env, basedir, subdir, timeout=None):
+    """
+    Make a scratch directory for each client in the cluster, and then for each
+    test spawn _run_tests() for each role.
+
+    See run_tests() for parameter documentation.
+    """
+    is_client = misc.is_type('client')
+    client_remotes = {}
+    created_mountpoint = {}
+    for remote, roles_for_host in ctx.cluster.remotes.items():
+        for role in roles_for_host:
+            if is_client(role):
+                client_remotes[role] = remote
+                created_mountpoint[role] = _make_scratch_dir(ctx, role, subdir)
+
+    for unit in tests:
+        with parallel() as p:
+            for role, remote in client_remotes.items():
+                p.spawn(_run_tests, ctx, refspec, role, [unit], env,
+                        basedir,
+                        subdir,
+                        timeout=timeout)
+
+    # cleanup the generated client directories
+    for role, _ in client_remotes.items():
+        _delete_dir(ctx, role, created_mountpoint[role])
+
+
+def _run_tests(ctx, refspec, role, tests, env, basedir,
+               subdir=None, timeout=None):
+    """
+    Run the individual test. Create a scratch directory and then extract the
+    workunits from git. Make the executables, and then run the tests.
+    Clean up (remove files created) after the tests are finished.
+
+    :param ctx:     Context
+    :param refspec: branch, sha1, or version tag used to identify this
+                    build
+    :param tests:   specific tests specified.
+    :param env:     environment set in yaml file.  Could be None.
+    :param subdir:  subdirectory set in yaml file.  Could be None
+    :param timeout: If present, use the 'timeout' command on the remote host
+                    to limit execution time. Must be specified by a number
+                    followed by 's' for seconds, 'm' for minutes, 'h' for
+                    hours, or 'd' for days. If '0' or anything that evaluates
+                    to False is passed, the 'timeout' command is not used.
+    """
+    testdir = misc.get_testdir(ctx)
+    assert isinstance(role, basestring)
+    cluster, type_, id_ = misc.split_role(role)
+    assert type_ == 'client'
+    remote = get_remote_for_role(ctx, role)
+    mnt = _client_mountpoint(ctx, cluster, id_)
+    # subdir so we can remove and recreate this a lot without sudo
+    if subdir is None:
+        scratch_tmp = os.path.join(mnt, 'client.{id}'.format(id=id_), 'tmp')
+    else:
+        scratch_tmp = os.path.join(mnt, subdir)
+    clonedir = '{tdir}/clone.{role}'.format(tdir=testdir, role=role)
+    srcdir = '{cdir}/{basedir}'.format(cdir=clonedir,
+                                       basedir=basedir)
+
+    git_url = teuth_config.get_ceph_qa_suite_git_url()
+    # if we are running an upgrade test, and ceph-ci does not have branches like
+    # `jewel`, so should use ceph.git as an alternative.
+    try:
+        remote.run(logger=log.getChild(role),
+                   args=refspec.clone(git_url, clonedir))
+    except CommandFailedError:
+        if git_url.endswith('/ceph-ci.git'):
+            alt_git_url = git_url.replace('/ceph-ci.git', '/ceph.git')
+        elif git_url.endswith('/ceph-ci'):
+            alt_git_url = re.sub(r'/ceph-ci$', '/ceph.git', git_url)
+        else:
+            raise
+        log.info(
+            "failed to check out '%s' from %s; will also try in %s",
+            refspec,
+            git_url,
+            alt_git_url,
+        )
+        remote.run(logger=log.getChild(role),
+                   args=refspec.clone(alt_git_url, clonedir))
+    remote.run(
+        logger=log.getChild(role),
+        args=[
+            'cd', '--', srcdir,
+            run.Raw('&&'),
+            'if', 'test', '-e', 'Makefile', run.Raw(';'), 'then', 'make', run.Raw(';'), 'fi',
+            run.Raw('&&'),
+            'find', '-executable', '-type', 'f', '-printf', r'%P\0'.format(srcdir=srcdir),
+            run.Raw('>{tdir}/workunits.list.{role}'.format(tdir=testdir, role=role)),
+        ],
+    )
+
+    workunits_file = '{tdir}/workunits.list.{role}'.format(tdir=testdir, role=role)
+    workunits = sorted(misc.get_file(remote, workunits_file).split('\0'))
+    assert workunits
+
+    try:
+        assert isinstance(tests, list)
+        for spec in tests:
+            log.info('Running workunits matching %s on %s...', spec, role)
+            prefix = '{spec}/'.format(spec=spec)
+            to_run = [w for w in workunits if w == spec or w.startswith(prefix)]
+            if not to_run:
+                raise RuntimeError('Spec did not match any workunits: {spec!r}'.format(spec=spec))
+            for workunit in to_run:
+                log.info('Running workunit %s...', workunit)
+                args = [
+                    'mkdir', '-p', '--', scratch_tmp,
+                    run.Raw('&&'),
+                    'cd', '--', scratch_tmp,
+                    run.Raw('&&'),
+                    run.Raw('CEPH_CLI_TEST_DUP_COMMAND=1'),
+                    run.Raw('CEPH_REF={ref}'.format(ref=refspec)),
+                    run.Raw('TESTDIR="{tdir}"'.format(tdir=testdir)),
+                    run.Raw('CEPH_ARGS="--cluster {0}"'.format(cluster)),
+                    run.Raw('CEPH_ID="{id}"'.format(id=id_)),
+                    run.Raw('PATH=$PATH:/usr/sbin'),
+                    run.Raw('CEPH_BASE={dir}'.format(dir=clonedir)),
+                    run.Raw('CEPH_ROOT={dir}'.format(dir=clonedir)),
+                ]
+                if env is not None:
+                    for var, val in env.iteritems():
+                        quoted_val = pipes.quote(val)
+                        env_arg = '{var}={val}'.format(var=var, val=quoted_val)
+                        args.append(run.Raw(env_arg))
+                args.extend([
+                    'adjust-ulimits',
+                    'ceph-coverage',
+                    '{tdir}/archive/coverage'.format(tdir=testdir)])
+                if timeout and timeout != '0':
+                    args.extend(['timeout', timeout])
+                args.extend([
+                    '{srcdir}/{workunit}'.format(
+                        srcdir=srcdir,
+                        workunit=workunit,
+                    ),
+                ])
+                remote.run(
+                    logger=log.getChild(role),
+                    args=args,
+                    label="workunit test {workunit}".format(workunit=workunit)
+                )
+                remote.run(
+                    logger=log.getChild(role),
+                    args=['sudo', 'rm', '-rf', '--', scratch_tmp],
+                )
+    finally:
+        log.info('Stopping %s on %s...', tests, role)
+        remote.run(
+            logger=log.getChild(role),
+            args=[
+                'rm', '-rf', '--', workunits_file, clonedir,
+            ],
+        )
author	Qiaowei Ren <qiaowei.ren@intel.com>	2018-01-04 13:43:33 +0800
committer	Qiaowei Ren <qiaowei.ren@intel.com>	2018-01-05 11:59:39 +0800
commit	812ff6ca9fcd3e629e49d4328905f33eee8ca3f5 (patch)
tree	04ece7b4da00d9d2f98093774594f4057ae561d4 /src/ceph/qa/tasks
parent	15280273faafb77777eab341909a3f495cf248d9 (diff)