diff options
Diffstat (limited to 'src/ceph/qa/tasks/cephfs')
33 files changed, 10708 insertions, 0 deletions
diff --git a/src/ceph/qa/tasks/cephfs/__init__.py b/src/ceph/qa/tasks/cephfs/__init__.py new file mode 100644 index 0000000..e69de29 --- /dev/null +++ b/src/ceph/qa/tasks/cephfs/__init__.py diff --git a/src/ceph/qa/tasks/cephfs/cephfs_test_case.py b/src/ceph/qa/tasks/cephfs/cephfs_test_case.py new file mode 100644 index 0000000..801d0d3 --- /dev/null +++ b/src/ceph/qa/tasks/cephfs/cephfs_test_case.py @@ -0,0 +1,315 @@ +import json +import logging +from unittest import case +from tasks.ceph_test_case import CephTestCase +import os +import re +from StringIO import StringIO + +from tasks.cephfs.fuse_mount import FuseMount + +from teuthology.orchestra import run +from teuthology.orchestra.run import CommandFailedError + + +log = logging.getLogger(__name__) + + +def for_teuthology(f): + """ + Decorator that adds an "is_for_teuthology" attribute to the wrapped function + """ + f.is_for_teuthology = True + return f + + +def needs_trimming(f): + """ + Mark fn as requiring a client capable of trimming its cache (i.e. for ceph-fuse + this means it needs to be able to run as root, currently) + """ + f.needs_trimming = True + return f + + +class CephFSTestCase(CephTestCase): + """ + Test case for Ceph FS, requires caller to populate Filesystem and Mounts, + into the fs, mount_a, mount_b class attributes (setting mount_b is optional) + + Handles resetting the cluster under test between tests. + """ + + # FIXME weird explicit naming + mount_a = None + mount_b = None + recovery_mount = None + + # Declarative test requirements: subclasses should override these to indicate + # their special needs. If not met, tests will be skipped. + CLIENTS_REQUIRED = 1 + MDSS_REQUIRED = 1 + REQUIRE_KCLIENT_REMOTE = False + REQUIRE_ONE_CLIENT_REMOTE = False + REQUIRE_MEMSTORE = False + + # Whether to create the default filesystem during setUp + REQUIRE_FILESYSTEM = True + + # requires REQUIRE_FILESYSTEM = True + REQUIRE_RECOVERY_FILESYSTEM = False + + LOAD_SETTINGS = [] + + def setUp(self): + super(CephFSTestCase, self).setUp() + + if len(self.mds_cluster.mds_ids) < self.MDSS_REQUIRED: + raise case.SkipTest("Only have {0} MDSs, require {1}".format( + len(self.mds_cluster.mds_ids), self.MDSS_REQUIRED + )) + + if len(self.mounts) < self.CLIENTS_REQUIRED: + raise case.SkipTest("Only have {0} clients, require {1}".format( + len(self.mounts), self.CLIENTS_REQUIRED + )) + + if self.REQUIRE_KCLIENT_REMOTE: + if not isinstance(self.mounts[0], FuseMount) or not isinstance(self.mounts[1], FuseMount): + # kclient kill() power cycles nodes, so requires clients to each be on + # their own node + if self.mounts[0].client_remote.hostname == self.mounts[1].client_remote.hostname: + raise case.SkipTest("kclient clients must be on separate nodes") + + if self.REQUIRE_ONE_CLIENT_REMOTE: + if self.mounts[0].client_remote.hostname in self.mds_cluster.get_mds_hostnames(): + raise case.SkipTest("Require first client to be on separate server from MDSs") + + if self.REQUIRE_MEMSTORE: + objectstore = self.mds_cluster.get_config("osd_objectstore", "osd") + if objectstore != "memstore": + # You certainly *could* run this on a real OSD, but you don't want to sit + # here for hours waiting for the test to fill up a 1TB drive! 
+ raise case.SkipTest("Require `memstore` OSD backend to simulate full drives") + + # Create friendly mount_a, mount_b attrs + for i in range(0, self.CLIENTS_REQUIRED): + setattr(self, "mount_{0}".format(chr(ord('a') + i)), self.mounts[i]) + + self.mds_cluster.clear_firewall() + + # Unmount all clients, we are about to blow away the filesystem + for mount in self.mounts: + if mount.is_mounted(): + mount.umount_wait(force=True) + + # To avoid any issues with e.g. unlink bugs, we destroy and recreate + # the filesystem rather than just doing a rm -rf of files + self.mds_cluster.mds_stop() + self.mds_cluster.mds_fail() + self.mds_cluster.delete_all_filesystems() + self.fs = None # is now invalid! + self.recovery_fs = None + + # In case the previous filesystem had filled up the RADOS cluster, wait for that + # flag to pass. + osd_mon_report_interval_max = int(self.mds_cluster.get_config("osd_mon_report_interval_max", service_type='osd')) + self.wait_until_true(lambda: not self.mds_cluster.is_full(), + timeout=osd_mon_report_interval_max * 5) + + # In case anything is in the OSD blacklist list, clear it out. This is to avoid + # the OSD map changing in the background (due to blacklist expiry) while tests run. + try: + self.mds_cluster.mon_manager.raw_cluster_cmd("osd", "blacklist", "clear") + except CommandFailedError: + # Fallback for older Ceph cluster + blacklist = json.loads(self.mds_cluster.mon_manager.raw_cluster_cmd("osd", + "dump", "--format=json-pretty"))['blacklist'] + log.info("Removing {0} blacklist entries".format(len(blacklist))) + for addr, blacklisted_at in blacklist.items(): + self.mds_cluster.mon_manager.raw_cluster_cmd("osd", "blacklist", "rm", addr) + + client_mount_ids = [m.client_id for m in self.mounts] + # In case the test changes the IDs of clients, stash them so that we can + # reset in tearDown + self._original_client_ids = client_mount_ids + log.info(client_mount_ids) + + # In case there were any extra auth identities around from a previous + # test, delete them + for entry in self.auth_list(): + ent_type, ent_id = entry['entity'].split(".") + if ent_type == "client" and ent_id not in client_mount_ids and ent_id != "admin": + self.mds_cluster.mon_manager.raw_cluster_cmd("auth", "del", entry['entity']) + + if self.REQUIRE_FILESYSTEM: + self.fs = self.mds_cluster.newfs(create=True) + self.fs.mds_restart() + + # In case some test messed with auth caps, reset them + for client_id in client_mount_ids: + self.mds_cluster.mon_manager.raw_cluster_cmd_result( + 'auth', 'caps', "client.{0}".format(client_id), + 'mds', 'allow', + 'mon', 'allow r', + 'osd', 'allow rw pool={0}'.format(self.fs.get_data_pool_name())) + + # wait for mds restart to complete... 
+ self.fs.wait_for_daemons() + + # Mount the requested number of clients + for i in range(0, self.CLIENTS_REQUIRED): + self.mounts[i].mount() + self.mounts[i].wait_until_mounted() + + if self.REQUIRE_RECOVERY_FILESYSTEM: + if not self.REQUIRE_FILESYSTEM: + raise case.SkipTest("Recovery filesystem requires a primary filesystem as well") + self.fs.mon_manager.raw_cluster_cmd('fs', 'flag', 'set', + 'enable_multiple', 'true', + '--yes-i-really-mean-it') + self.recovery_fs = self.mds_cluster.newfs(name="recovery_fs", create=False) + self.recovery_fs.set_metadata_overlay(True) + self.recovery_fs.set_data_pool_name(self.fs.get_data_pool_name()) + self.recovery_fs.create() + self.recovery_fs.getinfo(refresh=True) + self.recovery_fs.mds_restart() + self.recovery_fs.wait_for_daemons() + + # Load an config settings of interest + for setting in self.LOAD_SETTINGS: + setattr(self, setting, float(self.fs.mds_asok( + ['config', 'get', setting], self.mds_cluster.mds_ids[0] + )[setting])) + + self.configs_set = set() + + def tearDown(self): + super(CephFSTestCase, self).tearDown() + + self.mds_cluster.clear_firewall() + for m in self.mounts: + m.teardown() + + for i, m in enumerate(self.mounts): + m.client_id = self._original_client_ids[i] + + for subsys, key in self.configs_set: + self.mds_cluster.clear_ceph_conf(subsys, key) + + def set_conf(self, subsys, key, value): + self.configs_set.add((subsys, key)) + self.mds_cluster.set_ceph_conf(subsys, key, value) + + def auth_list(self): + """ + Convenience wrapper on "ceph auth ls" + """ + return json.loads(self.mds_cluster.mon_manager.raw_cluster_cmd( + "auth", "ls", "--format=json-pretty" + ))['auth_dump'] + + def assert_session_count(self, expected, ls_data=None, mds_id=None): + if ls_data is None: + ls_data = self.fs.mds_asok(['session', 'ls'], mds_id=mds_id) + + alive_count = len([s for s in ls_data if s['state'] != 'killing']) + + self.assertEqual(expected, alive_count, "Expected {0} sessions, found {1}".format( + expected, alive_count + )) + + def assert_session_state(self, client_id, expected_state): + self.assertEqual( + self._session_by_id( + self.fs.mds_asok(['session', 'ls'])).get(client_id, {'state': None})['state'], + expected_state) + + def get_session_data(self, client_id): + return self._session_by_id(client_id) + + def _session_list(self): + ls_data = self.fs.mds_asok(['session', 'ls']) + ls_data = [s for s in ls_data if s['state'] not in ['stale', 'closed']] + return ls_data + + def get_session(self, client_id, session_ls=None): + if session_ls is None: + session_ls = self.fs.mds_asok(['session', 'ls']) + + return self._session_by_id(session_ls)[client_id] + + def _session_by_id(self, session_ls): + return dict([(s['id'], s) for s in session_ls]) + + def wait_for_daemon_start(self, daemon_ids=None): + """ + Wait until all the daemons appear in the FSMap, either assigned + MDS ranks or in the list of standbys + """ + def get_daemon_names(): + return [info['name'] for info in self.mds_cluster.status().get_all()] + + if daemon_ids is None: + daemon_ids = self.mds_cluster.mds_ids + + try: + self.wait_until_true( + lambda: set(daemon_ids) & set(get_daemon_names()) == set(daemon_ids), + timeout=30 + ) + except RuntimeError: + log.warn("Timeout waiting for daemons {0}, while we have {1}".format( + daemon_ids, get_daemon_names() + )) + raise + + def assert_mds_crash(self, daemon_id): + """ + Assert that the a particular MDS daemon crashes (block until + it does) + """ + try: + self.mds_cluster.mds_daemons[daemon_id].proc.wait() + except 
CommandFailedError as e: + log.info("MDS '{0}' crashed with status {1} as expected".format(daemon_id, e.exitstatus)) + self.mds_cluster.mds_daemons[daemon_id].proc = None + + # Go remove the coredump from the crash, otherwise teuthology.internal.coredump will + # catch it later and treat it as a failure. + p = self.mds_cluster.mds_daemons[daemon_id].remote.run(args=[ + "sudo", "sysctl", "-n", "kernel.core_pattern"], stdout=StringIO()) + core_pattern = p.stdout.getvalue().strip() + if os.path.dirname(core_pattern): # Non-default core_pattern with a directory in it + # We have seen a core_pattern that looks like it's from teuthology's coredump + # task, so proceed to clear out the core file + log.info("Clearing core from pattern: {0}".format(core_pattern)) + + # Determine the PID of the crashed MDS by inspecting the MDSMap, it had + # to talk to the mons to get assigned a rank to reach the point of crashing + addr = self.mds_cluster.mon_manager.get_mds_status(daemon_id)['addr'] + pid_str = addr.split("/")[1] + log.info("Determined crasher PID was {0}".format(pid_str)) + + # Substitute PID into core_pattern to get a glob + core_glob = core_pattern.replace("%p", pid_str) + core_glob = re.sub("%[a-z]", "*", core_glob) # Match all for all other % tokens + + # Verify that we see the expected single coredump matching the expected pattern + ls_proc = self.mds_cluster.mds_daemons[daemon_id].remote.run(args=[ + "sudo", "ls", run.Raw(core_glob) + ], stdout=StringIO()) + cores = [f for f in ls_proc.stdout.getvalue().strip().split("\n") if f] + log.info("Enumerated cores: {0}".format(cores)) + self.assertEqual(len(cores), 1) + + log.info("Found core file {0}, deleting it".format(cores[0])) + + self.mds_cluster.mds_daemons[daemon_id].remote.run(args=[ + "sudo", "rm", "-f", cores[0] + ]) + else: + log.info("No core_pattern directory set, nothing to clear (internal.coredump not enabled?)") + + else: + raise AssertionError("MDS daemon '{0}' did not crash as expected".format(daemon_id)) diff --git a/src/ceph/qa/tasks/cephfs/filesystem.py b/src/ceph/qa/tasks/cephfs/filesystem.py new file mode 100644 index 0000000..9638fd5 --- /dev/null +++ b/src/ceph/qa/tasks/cephfs/filesystem.py @@ -0,0 +1,1213 @@ + +from StringIO import StringIO +import json +import logging +from gevent import Greenlet +import os +import time +import datetime +import re +import errno +import random + +from teuthology.exceptions import CommandFailedError +from teuthology import misc +from teuthology.nuke import clear_firewall +from teuthology.parallel import parallel +from tasks.ceph_manager import write_conf +from tasks import ceph_manager + + +log = logging.getLogger(__name__) + + +DAEMON_WAIT_TIMEOUT = 120 +ROOT_INO = 1 + + +class ObjectNotFound(Exception): + def __init__(self, object_name): + self._object_name = object_name + + def __str__(self): + return "Object not found: '{0}'".format(self._object_name) + +class FSStatus(object): + """ + Operations on a snapshot of the FSMap. + """ + def __init__(self, mon_manager): + self.mon = mon_manager + self.map = json.loads(self.mon.raw_cluster_cmd("fs", "dump", "--format=json")) + + def __str__(self): + return json.dumps(self.map, indent = 2, sort_keys = True) + + # Expose the fsmap for manual inspection. + def __getitem__(self, key): + """ + Get a field from the fsmap. + """ + return self.map[key] + + def get_filesystems(self): + """ + Iterator for all filesystems. 
+ """ + for fs in self.map['filesystems']: + yield fs + + def get_all(self): + """ + Iterator for all the mds_info components in the FSMap. + """ + for info in self.get_standbys(): + yield info + for fs in self.map['filesystems']: + for info in fs['mdsmap']['info'].values(): + yield info + + def get_standbys(self): + """ + Iterator for all standbys. + """ + for info in self.map['standbys']: + yield info + + def get_fsmap(self, fscid): + """ + Get the fsmap for the given FSCID. + """ + for fs in self.map['filesystems']: + if fscid is None or fs['id'] == fscid: + return fs + raise RuntimeError("FSCID {0} not in map".format(fscid)) + + def get_fsmap_byname(self, name): + """ + Get the fsmap for the given file system name. + """ + for fs in self.map['filesystems']: + if name is None or fs['mdsmap']['fs_name'] == name: + return fs + raise RuntimeError("FS {0} not in map".format(name)) + + def get_replays(self, fscid): + """ + Get the standby:replay MDS for the given FSCID. + """ + fs = self.get_fsmap(fscid) + for info in fs['mdsmap']['info'].values(): + if info['state'] == 'up:standby-replay': + yield info + + def get_ranks(self, fscid): + """ + Get the ranks for the given FSCID. + """ + fs = self.get_fsmap(fscid) + for info in fs['mdsmap']['info'].values(): + if info['rank'] >= 0: + yield info + + def get_rank(self, fscid, rank): + """ + Get the rank for the given FSCID. + """ + for info in self.get_ranks(fscid): + if info['rank'] == rank: + return info + raise RuntimeError("FSCID {0} has no rank {1}".format(fscid, rank)) + + def get_mds(self, name): + """ + Get the info for the given MDS name. + """ + for info in self.get_all(): + if info['name'] == name: + return info + return None + + def get_mds_addr(self, name): + """ + Return the instance addr as a string, like "10.214.133.138:6807\/10825" + """ + info = self.get_mds(name) + if info: + return info['addr'] + else: + log.warn(json.dumps(list(self.get_all()), indent=2)) # dump for debugging + raise RuntimeError("MDS id '{0}' not found in map".format(name)) + +class CephCluster(object): + @property + def admin_remote(self): + first_mon = misc.get_first_mon(self._ctx, None) + (result,) = self._ctx.cluster.only(first_mon).remotes.iterkeys() + return result + + def __init__(self, ctx): + self._ctx = ctx + self.mon_manager = ceph_manager.CephManager(self.admin_remote, ctx=ctx, logger=log.getChild('ceph_manager')) + + def get_config(self, key, service_type=None): + """ + Get config from mon by default, or a specific service if caller asks for it + """ + if service_type is None: + service_type = 'mon' + + service_id = sorted(misc.all_roles_of_type(self._ctx.cluster, service_type))[0] + return self.json_asok(['config', 'get', key], service_type, service_id)[key] + + def set_ceph_conf(self, subsys, key, value): + if subsys not in self._ctx.ceph['ceph'].conf: + self._ctx.ceph['ceph'].conf[subsys] = {} + self._ctx.ceph['ceph'].conf[subsys][key] = value + write_conf(self._ctx) # XXX because we don't have the ceph task's config object, if they + # used a different config path this won't work. 
+ + def clear_ceph_conf(self, subsys, key): + del self._ctx.ceph['ceph'].conf[subsys][key] + write_conf(self._ctx) + + def json_asok(self, command, service_type, service_id): + proc = self.mon_manager.admin_socket(service_type, service_id, command) + response_data = proc.stdout.getvalue() + log.info("_json_asok output: {0}".format(response_data)) + if response_data.strip(): + return json.loads(response_data) + else: + return None + + +class MDSCluster(CephCluster): + """ + Collective operations on all the MDS daemons in the Ceph cluster. These + daemons may be in use by various Filesystems. + + For the benefit of pre-multi-filesystem tests, this class is also + a parent of Filesystem. The correct way to use MDSCluster going forward is + as a separate instance outside of your (multiple) Filesystem instances. + """ + def __init__(self, ctx): + super(MDSCluster, self).__init__(ctx) + + self.mds_ids = list(misc.all_roles_of_type(ctx.cluster, 'mds')) + + if len(self.mds_ids) == 0: + raise RuntimeError("This task requires at least one MDS") + + if hasattr(self._ctx, "daemons"): + # Presence of 'daemons' attribute implies ceph task rather than ceph_deploy task + self.mds_daemons = dict([(mds_id, self._ctx.daemons.get_daemon('mds', mds_id)) for mds_id in self.mds_ids]) + + def _one_or_all(self, mds_id, cb, in_parallel=True): + """ + Call a callback for a single named MDS, or for all. + + Note that the parallelism here isn't for performance, it's to avoid being overly kind + to the cluster by waiting a graceful ssh-latency of time between doing things, and to + avoid being overly kind by executing them in a particular order. However, some actions + don't cope with being done in parallel, so it's optional (`in_parallel`) + + :param mds_id: MDS daemon name, or None + :param cb: Callback taking single argument of MDS daemon name + :param in_parallel: whether to invoke callbacks concurrently (else one after the other) + """ + if mds_id is None: + if in_parallel: + with parallel() as p: + for mds_id in self.mds_ids: + p.spawn(cb, mds_id) + else: + for mds_id in self.mds_ids: + cb(mds_id) + else: + cb(mds_id) + + def get_config(self, key, service_type=None): + """ + get_config specialization of service_type="mds" + """ + if service_type != "mds": + return super(MDSCluster, self).get_config(key, service_type) + + # Some tests stop MDS daemons, don't send commands to a dead one: + service_id = random.sample(filter(lambda i: self.mds_daemons[i].running(), self.mds_daemons), 1)[0] + return self.json_asok(['config', 'get', key], service_type, service_id)[key] + + def mds_stop(self, mds_id=None): + """ + Stop the MDS daemon process(se). If it held a rank, that rank + will eventually go laggy. + """ + self._one_or_all(mds_id, lambda id_: self.mds_daemons[id_].stop()) + + def mds_fail(self, mds_id=None): + """ + Inform MDSMonitor of the death of the daemon process(es). If it held + a rank, that rank will be relinquished. + """ + self._one_or_all(mds_id, lambda id_: self.mon_manager.raw_cluster_cmd("mds", "fail", id_)) + + def mds_restart(self, mds_id=None): + self._one_or_all(mds_id, lambda id_: self.mds_daemons[id_].restart()) + + def mds_fail_restart(self, mds_id=None): + """ + Variation on restart that includes marking MDSs as failed, so that doing this + operation followed by waiting for healthy daemon states guarantees that they + have gone down and come up, rather than potentially seeing the healthy states + that existed before the restart. 
+ """ + def _fail_restart(id_): + self.mds_daemons[id_].stop() + self.mon_manager.raw_cluster_cmd("mds", "fail", id_) + self.mds_daemons[id_].restart() + + self._one_or_all(mds_id, _fail_restart) + + def newfs(self, name='cephfs', create=True): + return Filesystem(self._ctx, name=name, create=create) + + def status(self): + return FSStatus(self.mon_manager) + + def delete_all_filesystems(self): + """ + Remove all filesystems that exist, and any pools in use by them. + """ + pools = json.loads(self.mon_manager.raw_cluster_cmd("osd", "dump", "--format=json-pretty"))['pools'] + pool_id_name = {} + for pool in pools: + pool_id_name[pool['pool']] = pool['pool_name'] + + # mark cluster down for each fs to prevent churn during deletion + status = self.status() + for fs in status.get_filesystems(): + self.mon_manager.raw_cluster_cmd("fs", "set", fs['mdsmap']['fs_name'], "cluster_down", "true") + + # get a new copy as actives may have since changed + status = self.status() + for fs in status.get_filesystems(): + mdsmap = fs['mdsmap'] + metadata_pool = pool_id_name[mdsmap['metadata_pool']] + + for gid in mdsmap['up'].values(): + self.mon_manager.raw_cluster_cmd('mds', 'fail', gid.__str__()) + + self.mon_manager.raw_cluster_cmd('fs', 'rm', mdsmap['fs_name'], '--yes-i-really-mean-it') + self.mon_manager.raw_cluster_cmd('osd', 'pool', 'delete', + metadata_pool, metadata_pool, + '--yes-i-really-really-mean-it') + for data_pool in mdsmap['data_pools']: + data_pool = pool_id_name[data_pool] + try: + self.mon_manager.raw_cluster_cmd('osd', 'pool', 'delete', + data_pool, data_pool, + '--yes-i-really-really-mean-it') + except CommandFailedError as e: + if e.exitstatus == 16: # EBUSY, this data pool is used + pass # by two metadata pools, let the 2nd + else: # pass delete it + raise + + def get_standby_daemons(self): + return set([s['name'] for s in self.status().get_standbys()]) + + def get_mds_hostnames(self): + result = set() + for mds_id in self.mds_ids: + mds_remote = self.mon_manager.find_remote('mds', mds_id) + result.add(mds_remote.hostname) + + return list(result) + + def set_clients_block(self, blocked, mds_id=None): + """ + Block (using iptables) client communications to this MDS. Be careful: if + other services are running on this MDS, or other MDSs try to talk to this + MDS, their communications may also be blocked as collatoral damage. 
+ + :param mds_id: Optional ID of MDS to block, default to all + :return: + """ + da_flag = "-A" if blocked else "-D" + + def set_block(_mds_id): + remote = self.mon_manager.find_remote('mds', _mds_id) + status = self.status() + + addr = status.get_mds_addr(_mds_id) + ip_str, port_str, inst_str = re.match("(.+):(.+)/(.+)", addr).groups() + + remote.run( + args=["sudo", "iptables", da_flag, "OUTPUT", "-p", "tcp", "--sport", port_str, "-j", "REJECT", "-m", + "comment", "--comment", "teuthology"]) + remote.run( + args=["sudo", "iptables", da_flag, "INPUT", "-p", "tcp", "--dport", port_str, "-j", "REJECT", "-m", + "comment", "--comment", "teuthology"]) + + self._one_or_all(mds_id, set_block, in_parallel=False) + + def clear_firewall(self): + clear_firewall(self._ctx) + + def get_mds_info(self, mds_id): + return FSStatus(self.mon_manager).get_mds(mds_id) + + def is_full(self): + flags = json.loads(self.mon_manager.raw_cluster_cmd("osd", "dump", "--format=json-pretty"))['flags'] + return 'full' in flags + + def is_pool_full(self, pool_name): + pools = json.loads(self.mon_manager.raw_cluster_cmd("osd", "dump", "--format=json-pretty"))['pools'] + for pool in pools: + if pool['pool_name'] == pool_name: + return 'full' in pool['flags_names'].split(",") + + raise RuntimeError("Pool not found '{0}'".format(pool_name)) + +class Filesystem(MDSCluster): + """ + This object is for driving a CephFS filesystem. The MDS daemons driven by + MDSCluster may be shared with other Filesystems. + """ + def __init__(self, ctx, fscid=None, name=None, create=False, + ec_profile=None): + super(Filesystem, self).__init__(ctx) + + self.name = name + self.ec_profile = ec_profile + self.id = None + self.metadata_pool_name = None + self.metadata_overlay = False + self.data_pool_name = None + self.data_pools = None + + client_list = list(misc.all_roles_of_type(self._ctx.cluster, 'client')) + self.client_id = client_list[0] + self.client_remote = list(misc.get_clients(ctx=ctx, roles=["client.{0}".format(self.client_id)]))[0][1] + + if name is not None: + if fscid is not None: + raise RuntimeError("cannot specify fscid when creating fs") + if create and not self.legacy_configured(): + self.create() + else: + if fscid is not None: + self.id = fscid + self.getinfo(refresh = True) + + # Stash a reference to the first created filesystem on ctx, so + # that if someone drops to the interactive shell they can easily + # poke our methods. 
+ if not hasattr(self._ctx, "filesystem"): + self._ctx.filesystem = self + + def getinfo(self, refresh = False): + status = self.status() + if self.id is not None: + fsmap = status.get_fsmap(self.id) + elif self.name is not None: + fsmap = status.get_fsmap_byname(self.name) + else: + fss = [fs for fs in status.get_filesystems()] + if len(fss) == 1: + fsmap = fss[0] + elif len(fss) == 0: + raise RuntimeError("no file system available") + else: + raise RuntimeError("more than one file system available") + self.id = fsmap['id'] + self.name = fsmap['mdsmap']['fs_name'] + self.get_pool_names(status = status, refresh = refresh) + return status + + def set_metadata_overlay(self, overlay): + if self.id is not None: + raise RuntimeError("cannot specify fscid when configuring overlay") + self.metadata_overlay = overlay + + def deactivate(self, rank): + if rank < 0: + raise RuntimeError("invalid rank") + elif rank == 0: + raise RuntimeError("cannot deactivate rank 0") + self.mon_manager.raw_cluster_cmd("mds", "deactivate", "%d:%d" % (self.id, rank)) + + def set_max_mds(self, max_mds): + self.mon_manager.raw_cluster_cmd("fs", "set", self.name, "max_mds", "%d" % max_mds) + + def set_allow_dirfrags(self, yes): + self.mon_manager.raw_cluster_cmd("fs", "set", self.name, "allow_dirfrags", str(yes).lower(), '--yes-i-really-mean-it') + + def get_pgs_per_fs_pool(self): + """ + Calculate how many PGs to use when creating a pool, in order to avoid raising any + health warnings about mon_pg_warn_min_per_osd + + :return: an integer number of PGs + """ + pg_warn_min_per_osd = int(self.get_config('mon_pg_warn_min_per_osd')) + osd_count = len(list(misc.all_roles_of_type(self._ctx.cluster, 'osd'))) + return pg_warn_min_per_osd * osd_count + + def create(self): + if self.name is None: + self.name = "cephfs" + if self.metadata_pool_name is None: + self.metadata_pool_name = "{0}_metadata".format(self.name) + if self.data_pool_name is None: + data_pool_name = "{0}_data".format(self.name) + else: + data_pool_name = self.data_pool_name + + log.info("Creating filesystem '{0}'".format(self.name)) + + pgs_per_fs_pool = self.get_pgs_per_fs_pool() + + self.mon_manager.raw_cluster_cmd('osd', 'pool', 'create', + self.metadata_pool_name, pgs_per_fs_pool.__str__()) + if self.metadata_overlay: + self.mon_manager.raw_cluster_cmd('fs', 'new', + self.name, self.metadata_pool_name, data_pool_name, + '--allow-dangerous-metadata-overlay') + else: + if self.ec_profile: + log.info("EC profile is %s", self.ec_profile) + cmd = ['osd', 'erasure-code-profile', 'set', data_pool_name] + cmd.extend(self.ec_profile) + self.mon_manager.raw_cluster_cmd(*cmd) + self.mon_manager.raw_cluster_cmd( + 'osd', 'pool', 'create', + data_pool_name, pgs_per_fs_pool.__str__(), 'erasure', + data_pool_name) + self.mon_manager.raw_cluster_cmd( + 'osd', 'pool', 'set', + data_pool_name, 'allow_ec_overwrites', 'true') + else: + self.mon_manager.raw_cluster_cmd( + 'osd', 'pool', 'create', + data_pool_name, pgs_per_fs_pool.__str__()) + self.mon_manager.raw_cluster_cmd('fs', 'new', + self.name, self.metadata_pool_name, data_pool_name) + self.check_pool_application(self.metadata_pool_name) + self.check_pool_application(data_pool_name) + # Turn off spurious standby count warnings from modifying max_mds in tests. 
+ try: + self.mon_manager.raw_cluster_cmd('fs', 'set', self.name, 'standby_count_wanted', '0') + except CommandFailedError as e: + if e.exitstatus == 22: + # standby_count_wanted not available prior to luminous (upgrade tests would fail otherwise) + pass + else: + raise + + self.getinfo(refresh = True) + + + def check_pool_application(self, pool_name): + osd_map = self.mon_manager.get_osd_dump_json() + for pool in osd_map['pools']: + if pool['pool_name'] == pool_name: + if "application_metadata" in pool: + if not "cephfs" in pool['application_metadata']: + raise RuntimeError("Pool %p does not name cephfs as application!".\ + format(pool_name)) + + + def __del__(self): + if getattr(self._ctx, "filesystem", None) == self: + delattr(self._ctx, "filesystem") + + def exists(self): + """ + Whether a filesystem exists in the mon's filesystem list + """ + fs_list = json.loads(self.mon_manager.raw_cluster_cmd('fs', 'ls', '--format=json-pretty')) + return self.name in [fs['name'] for fs in fs_list] + + def legacy_configured(self): + """ + Check if a legacy (i.e. pre "fs new") filesystem configuration is present. If this is + the case, the caller should avoid using Filesystem.create + """ + try: + out_text = self.mon_manager.raw_cluster_cmd('--format=json-pretty', 'osd', 'lspools') + pools = json.loads(out_text) + metadata_pool_exists = 'metadata' in [p['poolname'] for p in pools] + if metadata_pool_exists: + self.metadata_pool_name = 'metadata' + except CommandFailedError as e: + # For use in upgrade tests, Ceph cuttlefish and earlier don't support + # structured output (--format) from the CLI. + if e.exitstatus == 22: + metadata_pool_exists = True + else: + raise + + return metadata_pool_exists + + def _df(self): + return json.loads(self.mon_manager.raw_cluster_cmd("df", "--format=json-pretty")) + + def get_mds_map(self): + return self.status().get_fsmap(self.id)['mdsmap'] + + def add_data_pool(self, name): + self.mon_manager.raw_cluster_cmd('osd', 'pool', 'create', name, self.get_pgs_per_fs_pool().__str__()) + self.mon_manager.raw_cluster_cmd('fs', 'add_data_pool', self.name, name) + self.get_pool_names(refresh = True) + for poolid, fs_name in self.data_pools.items(): + if name == fs_name: + return poolid + raise RuntimeError("could not get just created pool '{0}'".format(name)) + + def get_pool_names(self, refresh = False, status = None): + if refresh or self.metadata_pool_name is None or self.data_pools is None: + if status is None: + status = self.status() + fsmap = status.get_fsmap(self.id) + + osd_map = self.mon_manager.get_osd_dump_json() + id_to_name = {} + for p in osd_map['pools']: + id_to_name[p['pool']] = p['pool_name'] + + self.metadata_pool_name = id_to_name[fsmap['mdsmap']['metadata_pool']] + self.data_pools = {} + for data_pool in fsmap['mdsmap']['data_pools']: + self.data_pools[data_pool] = id_to_name[data_pool] + + def get_data_pool_name(self, refresh = False): + if refresh or self.data_pools is None: + self.get_pool_names(refresh = True) + assert(len(self.data_pools) == 1) + return self.data_pools.values()[0] + + def get_data_pool_id(self, refresh = False): + """ + Don't call this if you have multiple data pools + :return: integer + """ + if refresh or self.data_pools is None: + self.get_pool_names(refresh = True) + assert(len(self.data_pools) == 1) + return self.data_pools.keys()[0] + + def get_data_pool_names(self, refresh = False): + if refresh or self.data_pools is None: + self.get_pool_names(refresh = True) + return self.data_pools.values() + + def 
get_metadata_pool_name(self): + return self.metadata_pool_name + + def set_data_pool_name(self, name): + if self.id is not None: + raise RuntimeError("can't set filesystem name if its fscid is set") + self.data_pool_name = name + + def get_namespace_id(self): + return self.id + + def get_pool_df(self, pool_name): + """ + Return a dict like: + {u'bytes_used': 0, u'max_avail': 83848701, u'objects': 0, u'kb_used': 0} + """ + for pool_df in self._df()['pools']: + if pool_df['name'] == pool_name: + return pool_df['stats'] + + raise RuntimeError("Pool name '{0}' not found".format(pool_name)) + + def get_usage(self): + return self._df()['stats']['total_used_bytes'] + + def are_daemons_healthy(self): + """ + Return true if all daemons are in one of active, standby, standby-replay, and + at least max_mds daemons are in 'active'. + + Unlike most of Filesystem, this function is tolerant of new-style `fs` + commands being missing, because we are part of the ceph installation + process during upgrade suites, so must fall back to old style commands + when we get an EINVAL on a new style command. + + :return: + """ + + active_count = 0 + try: + mds_map = self.get_mds_map() + except CommandFailedError as cfe: + # Old version, fall back to non-multi-fs commands + if cfe.exitstatus == errno.EINVAL: + mds_map = json.loads( + self.mon_manager.raw_cluster_cmd('mds', 'dump', '--format=json')) + else: + raise + + log.info("are_daemons_healthy: mds map: {0}".format(mds_map)) + + for mds_id, mds_status in mds_map['info'].items(): + if mds_status['state'] not in ["up:active", "up:standby", "up:standby-replay"]: + log.warning("Unhealthy mds state {0}:{1}".format(mds_id, mds_status['state'])) + return False + elif mds_status['state'] == 'up:active': + active_count += 1 + + log.info("are_daemons_healthy: {0}/{1}".format( + active_count, mds_map['max_mds'] + )) + + if active_count >= mds_map['max_mds']: + # The MDSMap says these guys are active, but let's check they really are + for mds_id, mds_status in mds_map['info'].items(): + if mds_status['state'] == 'up:active': + try: + daemon_status = self.mds_asok(["status"], mds_id=mds_status['name']) + except CommandFailedError as cfe: + if cfe.exitstatus == errno.EINVAL: + # Old version, can't do this check + continue + else: + # MDS not even running + return False + + if daemon_status['state'] != 'up:active': + # MDS hasn't taken the latest map yet + return False + + return True + else: + return False + + def get_daemon_names(self, state=None): + """ + Return MDS daemon names of those daemons in the given state + :param state: + :return: + """ + status = self.get_mds_map() + result = [] + for mds_status in sorted(status['info'].values(), lambda a, b: cmp(a['rank'], b['rank'])): + if mds_status['state'] == state or state is None: + result.append(mds_status['name']) + + return result + + def get_active_names(self): + """ + Return MDS daemon names of those daemons holding ranks + in state up:active + + :return: list of strings like ['a', 'b'], sorted by rank + """ + return self.get_daemon_names("up:active") + + def get_all_mds_rank(self): + status = self.get_mds_map() + result = [] + for mds_status in sorted(status['info'].values(), lambda a, b: cmp(a['rank'], b['rank'])): + if mds_status['rank'] != -1 and mds_status['state'] != 'up:standby-replay': + result.append(mds_status['rank']) + + return result + + def get_rank_names(self): + """ + Return MDS daemon names of those daemons holding a rank, + sorted by rank. This includes e.g. 
up:replay/reconnect + as well as active, but does not include standby or + standby-replay. + """ + status = self.get_mds_map() + result = [] + for mds_status in sorted(status['info'].values(), lambda a, b: cmp(a['rank'], b['rank'])): + if mds_status['rank'] != -1 and mds_status['state'] != 'up:standby-replay': + result.append(mds_status['name']) + + return result + + def wait_for_daemons(self, timeout=None): + """ + Wait until all daemons are healthy + :return: + """ + + if timeout is None: + timeout = DAEMON_WAIT_TIMEOUT + + elapsed = 0 + while True: + if self.are_daemons_healthy(): + return + else: + time.sleep(1) + elapsed += 1 + + if elapsed > timeout: + raise RuntimeError("Timed out waiting for MDS daemons to become healthy") + + def get_lone_mds_id(self): + """ + Get a single MDS ID: the only one if there is only one + configured, else the only one currently holding a rank, + else raise an error. + """ + if len(self.mds_ids) != 1: + alive = self.get_rank_names() + if len(alive) == 1: + return alive[0] + else: + raise ValueError("Explicit MDS argument required when multiple MDSs in use") + else: + return self.mds_ids[0] + + def recreate(self): + log.info("Creating new filesystem") + self.delete_all_filesystems() + self.id = None + self.create() + + def put_metadata_object_raw(self, object_id, infile): + """ + Save an object to the metadata pool + """ + temp_bin_path = infile + self.client_remote.run(args=[ + 'sudo', os.path.join(self._prefix, 'rados'), '-p', self.metadata_pool_name, 'put', object_id, temp_bin_path + ]) + + def get_metadata_object_raw(self, object_id): + """ + Retrieve an object from the metadata pool and store it in a file. + """ + temp_bin_path = '/tmp/' + object_id + '.bin' + + self.client_remote.run(args=[ + 'sudo', os.path.join(self._prefix, 'rados'), '-p', self.metadata_pool_name, 'get', object_id, temp_bin_path + ]) + + return temp_bin_path + + def get_metadata_object(self, object_type, object_id): + """ + Retrieve an object from the metadata pool, pass it through + ceph-dencoder to dump it to JSON, and return the decoded object. + """ + temp_bin_path = '/tmp/out.bin' + + self.client_remote.run(args=[ + 'sudo', os.path.join(self._prefix, 'rados'), '-p', self.metadata_pool_name, 'get', object_id, temp_bin_path + ]) + + stdout = StringIO() + self.client_remote.run(args=[ + 'sudo', os.path.join(self._prefix, 'ceph-dencoder'), 'type', object_type, 'import', temp_bin_path, 'decode', 'dump_json' + ], stdout=stdout) + dump_json = stdout.getvalue().strip() + try: + dump = json.loads(dump_json) + except (TypeError, ValueError): + log.error("Failed to decode JSON: '{0}'".format(dump_json)) + raise + + return dump + + def get_journal_version(self): + """ + Read the JournalPointer and Journal::Header objects to learn the version of + encoding in use. 
+ """ + journal_pointer_object = '400.00000000' + journal_pointer_dump = self.get_metadata_object("JournalPointer", journal_pointer_object) + journal_ino = journal_pointer_dump['journal_pointer']['front'] + + journal_header_object = "{0:x}.00000000".format(journal_ino) + journal_header_dump = self.get_metadata_object('Journaler::Header', journal_header_object) + + version = journal_header_dump['journal_header']['stream_format'] + log.info("Read journal version {0}".format(version)) + + return version + + def mds_asok(self, command, mds_id=None): + if mds_id is None: + mds_id = self.get_lone_mds_id() + + return self.json_asok(command, 'mds', mds_id) + + def read_cache(self, path, depth=None): + cmd = ["dump", "tree", path] + if depth is not None: + cmd.append(depth.__str__()) + result = self.mds_asok(cmd) + if len(result) == 0: + raise RuntimeError("Path not found in cache: {0}".format(path)) + + return result + + def wait_for_state(self, goal_state, reject=None, timeout=None, mds_id=None, rank=None): + """ + Block until the MDS reaches a particular state, or a failure condition + is met. + + When there are multiple MDSs, succeed when exaclty one MDS is in the + goal state, or fail when any MDS is in the reject state. + + :param goal_state: Return once the MDS is in this state + :param reject: Fail if the MDS enters this state before the goal state + :param timeout: Fail if this many seconds pass before reaching goal + :return: number of seconds waited, rounded down to integer + """ + + started_at = time.time() + while True: + status = self.status() + if rank is not None: + mds_info = status.get_rank(self.id, rank) + current_state = mds_info['state'] if mds_info else None + log.info("Looked up MDS state for mds.{0}: {1}".format(rank, current_state)) + elif mds_id is not None: + # mds_info is None if no daemon with this ID exists in the map + mds_info = status.get_mds(mds_id) + current_state = mds_info['state'] if mds_info else None + log.info("Looked up MDS state for {0}: {1}".format(mds_id, current_state)) + else: + # In general, look for a single MDS + states = [m['state'] for m in status.get_ranks(self.id)] + if [s for s in states if s == goal_state] == [goal_state]: + current_state = goal_state + elif reject in states: + current_state = reject + else: + current_state = None + log.info("mapped states {0} to {1}".format(states, current_state)) + + elapsed = time.time() - started_at + if current_state == goal_state: + log.info("reached state '{0}' in {1}s".format(current_state, elapsed)) + return elapsed + elif reject is not None and current_state == reject: + raise RuntimeError("MDS in reject state {0}".format(current_state)) + elif timeout is not None and elapsed > timeout: + log.error("MDS status at timeout: {0}".format(status.get_fsmap(self.id))) + raise RuntimeError( + "Reached timeout after {0} seconds waiting for state {1}, while in state {2}".format( + elapsed, goal_state, current_state + )) + else: + time.sleep(1) + + def _read_data_xattr(self, ino_no, xattr_name, type, pool): + mds_id = self.mds_ids[0] + remote = self.mds_daemons[mds_id].remote + if pool is None: + pool = self.get_data_pool_name() + + obj_name = "{0:x}.00000000".format(ino_no) + + args = [ + os.path.join(self._prefix, "rados"), "-p", pool, "getxattr", obj_name, xattr_name + ] + try: + proc = remote.run( + args=args, + stdout=StringIO()) + except CommandFailedError as e: + log.error(e.__str__()) + raise ObjectNotFound(obj_name) + + data = proc.stdout.getvalue() + + p = remote.run( + 
args=[os.path.join(self._prefix, "ceph-dencoder"), "type", type, "import", "-", "decode", "dump_json"], + stdout=StringIO(), + stdin=data + ) + + return json.loads(p.stdout.getvalue().strip()) + + def _write_data_xattr(self, ino_no, xattr_name, data, pool=None): + """ + Write to an xattr of the 0th data object of an inode. Will + succeed whether the object and/or xattr already exist or not. + + :param ino_no: integer inode number + :param xattr_name: string name of the xattr + :param data: byte array data to write to the xattr + :param pool: name of data pool or None to use primary data pool + :return: None + """ + remote = self.mds_daemons[self.mds_ids[0]].remote + if pool is None: + pool = self.get_data_pool_name() + + obj_name = "{0:x}.00000000".format(ino_no) + args = [ + os.path.join(self._prefix, "rados"), "-p", pool, "setxattr", + obj_name, xattr_name, data + ] + remote.run( + args=args, + stdout=StringIO()) + + def read_backtrace(self, ino_no, pool=None): + """ + Read the backtrace from the data pool, return a dict in the format + given by inode_backtrace_t::dump, which is something like: + + :: + + rados -p cephfs_data getxattr 10000000002.00000000 parent > out.bin + ceph-dencoder type inode_backtrace_t import out.bin decode dump_json + + { "ino": 1099511627778, + "ancestors": [ + { "dirino": 1, + "dname": "blah", + "version": 11}], + "pool": 1, + "old_pools": []} + + :param pool: name of pool to read backtrace from. If omitted, FS must have only + one data pool and that will be used. + """ + return self._read_data_xattr(ino_no, "parent", "inode_backtrace_t", pool) + + def read_layout(self, ino_no, pool=None): + """ + Read 'layout' xattr of an inode and parse the result, returning a dict like: + :: + { + "stripe_unit": 4194304, + "stripe_count": 1, + "object_size": 4194304, + "pool_id": 1, + "pool_ns": "", + } + + :param pool: name of pool to read backtrace from. If omitted, FS must have only + one data pool and that will be used. + """ + return self._read_data_xattr(ino_no, "layout", "file_layout_t", pool) + + def _enumerate_data_objects(self, ino, size): + """ + Get the list of expected data objects for a range, and the list of objects + that really exist. 
+ + :return a tuple of two lists of strings (expected, actual) + """ + stripe_size = 1024 * 1024 * 4 + + size = max(stripe_size, size) + + want_objects = [ + "{0:x}.{1:08x}".format(ino, n) + for n in range(0, ((size - 1) / stripe_size) + 1) + ] + + exist_objects = self.rados(["ls"], pool=self.get_data_pool_name()).split("\n") + + return want_objects, exist_objects + + def data_objects_present(self, ino, size): + """ + Check that *all* the expected data objects for an inode are present in the data pool + """ + + want_objects, exist_objects = self._enumerate_data_objects(ino, size) + missing = set(want_objects) - set(exist_objects) + + if missing: + log.info("Objects missing (ino {0}, size {1}): {2}".format( + ino, size, missing + )) + return False + else: + log.info("All objects for ino {0} size {1} found".format(ino, size)) + return True + + def data_objects_absent(self, ino, size): + want_objects, exist_objects = self._enumerate_data_objects(ino, size) + present = set(want_objects) & set(exist_objects) + + if present: + log.info("Objects not absent (ino {0}, size {1}): {2}".format( + ino, size, present + )) + return False + else: + log.info("All objects for ino {0} size {1} are absent".format(ino, size)) + return True + + def dirfrag_exists(self, ino, frag): + try: + self.rados(["stat", "{0:x}.{1:08x}".format(ino, frag)]) + except CommandFailedError as e: + return False + else: + return True + + def rados(self, args, pool=None, namespace=None, stdin_data=None): + """ + Call into the `rados` CLI from an MDS + """ + + if pool is None: + pool = self.get_metadata_pool_name() + + # Doesn't matter which MDS we use to run rados commands, they all + # have access to the pools + mds_id = self.mds_ids[0] + remote = self.mds_daemons[mds_id].remote + + # NB we could alternatively use librados pybindings for this, but it's a one-liner + # using the `rados` CLI + args = ([os.path.join(self._prefix, "rados"), "-p", pool] + + (["--namespace", namespace] if namespace else []) + + args) + p = remote.run( + args=args, + stdin=stdin_data, + stdout=StringIO()) + return p.stdout.getvalue().strip() + + def list_dirfrag(self, dir_ino): + """ + Read the named object and return the list of omap keys + + :return a list of 0 or more strings + """ + + dirfrag_obj_name = "{0:x}.00000000".format(dir_ino) + + try: + key_list_str = self.rados(["listomapkeys", dirfrag_obj_name]) + except CommandFailedError as e: + log.error(e.__str__()) + raise ObjectNotFound(dirfrag_obj_name) + + return key_list_str.split("\n") if key_list_str else [] + + def erase_metadata_objects(self, prefix): + """ + For all objects in the metadata pool matching the prefix, + erase them. + + This O(N) with the number of objects in the pool, so only suitable + for use on toy test filesystems. + """ + all_objects = self.rados(["ls"]).split("\n") + matching_objects = [o for o in all_objects if o.startswith(prefix)] + for o in matching_objects: + self.rados(["rm", o]) + + def erase_mds_objects(self, rank): + """ + Erase all the per-MDS objects for a particular rank. This includes + inotable, sessiontable, journal + """ + + def obj_prefix(multiplier): + """ + MDS object naming conventions like rank 1's + journal is at 201.*** + """ + return "%x." 
% (multiplier * 0x100 + rank) + + # MDS_INO_LOG_OFFSET + self.erase_metadata_objects(obj_prefix(2)) + # MDS_INO_LOG_BACKUP_OFFSET + self.erase_metadata_objects(obj_prefix(3)) + # MDS_INO_LOG_POINTER_OFFSET + self.erase_metadata_objects(obj_prefix(4)) + # MDSTables & SessionMap + self.erase_metadata_objects("mds{rank:d}_".format(rank=rank)) + + @property + def _prefix(self): + """ + Override this to set a different + """ + return "" + + def _run_tool(self, tool, args, rank=None, quiet=False): + # Tests frequently have [client] configuration that jacks up + # the objecter log level (unlikely to be interesting here) + # and does not set the mds log level (very interesting here) + if quiet: + base_args = [os.path.join(self._prefix, tool), '--debug-mds=1', '--debug-objecter=1'] + else: + base_args = [os.path.join(self._prefix, tool), '--debug-mds=4', '--debug-objecter=1'] + + if rank is not None: + base_args.extend(["--rank", "%d" % rank]) + + t1 = datetime.datetime.now() + r = self.tool_remote.run( + args=base_args + args, + stdout=StringIO()).stdout.getvalue().strip() + duration = datetime.datetime.now() - t1 + log.info("Ran {0} in time {1}, result:\n{2}".format( + base_args + args, duration, r + )) + return r + + @property + def tool_remote(self): + """ + An arbitrary remote to use when invoking recovery tools. Use an MDS host because + it'll definitely have keys with perms to access cephfs metadata pool. This is public + so that tests can use this remote to go get locally written output files from the tools. + """ + mds_id = self.mds_ids[0] + return self.mds_daemons[mds_id].remote + + def journal_tool(self, args, rank=None, quiet=False): + """ + Invoke cephfs-journal-tool with the passed arguments, and return its stdout + """ + return self._run_tool("cephfs-journal-tool", args, rank, quiet) + + def table_tool(self, args, quiet=False): + """ + Invoke cephfs-table-tool with the passed arguments, and return its stdout + """ + return self._run_tool("cephfs-table-tool", args, None, quiet) + + def data_scan(self, args, quiet=False, worker_count=1): + """ + Invoke cephfs-data-scan with the passed arguments, and return its stdout + + :param worker_count: if greater than 1, multiple workers will be run + in parallel and the return value will be None + """ + + workers = [] + + for n in range(0, worker_count): + if worker_count > 1: + # data-scan args first token is a command, followed by args to it. + # insert worker arguments after the command. 
+ cmd = args[0] + worker_args = [cmd] + ["--worker_n", n.__str__(), "--worker_m", worker_count.__str__()] + args[1:] + else: + worker_args = args + + workers.append(Greenlet.spawn(lambda wargs=worker_args: + self._run_tool("cephfs-data-scan", wargs, None, quiet))) + + for w in workers: + w.get() + + if worker_count == 1: + return workers[0].value + else: + return None diff --git a/src/ceph/qa/tasks/cephfs/fuse_mount.py b/src/ceph/qa/tasks/cephfs/fuse_mount.py new file mode 100644 index 0000000..8d8410c --- /dev/null +++ b/src/ceph/qa/tasks/cephfs/fuse_mount.py @@ -0,0 +1,428 @@ + +from StringIO import StringIO +import json +import time +import logging +from textwrap import dedent + +from teuthology import misc +from teuthology.contextutil import MaxWhileTries +from teuthology.orchestra import run +from teuthology.orchestra.run import CommandFailedError +from .mount import CephFSMount + +log = logging.getLogger(__name__) + + +class FuseMount(CephFSMount): + def __init__(self, client_config, test_dir, client_id, client_remote): + super(FuseMount, self).__init__(test_dir, client_id, client_remote) + + self.client_config = client_config if client_config else {} + self.fuse_daemon = None + self._fuse_conn = None + + def mount(self, mount_path=None, mount_fs_name=None): + try: + return self._mount(mount_path, mount_fs_name) + except RuntimeError: + # Catch exceptions by the mount() logic (i.e. not remote command + # failures) and ensure the mount is not left half-up. + # Otherwise we might leave a zombie mount point that causes + # anyone traversing cephtest/ to get hung up on. + log.warn("Trying to clean up after failed mount") + self.umount_wait(force=True) + raise + + def _mount(self, mount_path, mount_fs_name): + log.info("Client client.%s config is %s" % (self.client_id, self.client_config)) + + daemon_signal = 'kill' + if self.client_config.get('coverage') or self.client_config.get('valgrind') is not None: + daemon_signal = 'term' + + log.info('Mounting ceph-fuse client.{id} at {remote} {mnt}...'.format( + id=self.client_id, remote=self.client_remote, mnt=self.mountpoint)) + + self.client_remote.run( + args=[ + 'mkdir', + '--', + self.mountpoint, + ], + ) + + run_cmd = [ + 'sudo', + 'adjust-ulimits', + 'ceph-coverage', + '{tdir}/archive/coverage'.format(tdir=self.test_dir), + 'daemon-helper', + daemon_signal, + ] + + fuse_cmd = ['ceph-fuse', "-f"] + + if mount_path is not None: + fuse_cmd += ["--client_mountpoint={0}".format(mount_path)] + + if mount_fs_name is not None: + fuse_cmd += ["--client_mds_namespace={0}".format(mount_fs_name)] + + fuse_cmd += [ + '--name', 'client.{id}'.format(id=self.client_id), + # TODO ceph-fuse doesn't understand dash dash '--', + self.mountpoint, + ] + + if self.client_config.get('valgrind') is not None: + run_cmd = misc.get_valgrind_args( + self.test_dir, + 'client.{id}'.format(id=self.client_id), + run_cmd, + self.client_config.get('valgrind'), + ) + + run_cmd.extend(fuse_cmd) + + def list_connections(): + self.client_remote.run( + args=["sudo", "mount", "-t", "fusectl", "/sys/fs/fuse/connections", "/sys/fs/fuse/connections"], + check_status=False + ) + p = self.client_remote.run( + args=["ls", "/sys/fs/fuse/connections"], + stdout=StringIO(), + check_status=False + ) + if p.exitstatus != 0: + return [] + + ls_str = p.stdout.getvalue().strip() + if ls_str: + return [int(n) for n in ls_str.split("\n")] + else: + return [] + + # Before starting ceph-fuse process, note the contents of + # /sys/fs/fuse/connections + pre_mount_conns = list_connections() + 
log.info("Pre-mount connections: {0}".format(pre_mount_conns)) + + proc = self.client_remote.run( + args=run_cmd, + logger=log.getChild('ceph-fuse.{id}'.format(id=self.client_id)), + stdin=run.PIPE, + wait=False, + ) + self.fuse_daemon = proc + + # Wait for the connection reference to appear in /sys + mount_wait = self.client_config.get('mount_wait', 0) + if mount_wait > 0: + log.info("Fuse mount waits {0} seconds before checking /sys/".format(mount_wait)) + time.sleep(mount_wait) + timeout = int(self.client_config.get('mount_timeout', 30)) + waited = 0 + + post_mount_conns = list_connections() + while len(post_mount_conns) <= len(pre_mount_conns): + if self.fuse_daemon.finished: + # Did mount fail? Raise the CommandFailedError instead of + # hitting the "failed to populate /sys/" timeout + self.fuse_daemon.wait() + time.sleep(1) + waited += 1 + if waited > timeout: + raise RuntimeError("Fuse mount failed to populate /sys/ after {0} seconds".format( + waited + )) + else: + post_mount_conns = list_connections() + + log.info("Post-mount connections: {0}".format(post_mount_conns)) + + # Record our fuse connection number so that we can use it when + # forcing an unmount + new_conns = list(set(post_mount_conns) - set(pre_mount_conns)) + if len(new_conns) == 0: + raise RuntimeError("New fuse connection directory not found ({0})".format(new_conns)) + elif len(new_conns) > 1: + raise RuntimeError("Unexpectedly numerous fuse connections {0}".format(new_conns)) + else: + self._fuse_conn = new_conns[0] + + def is_mounted(self): + proc = self.client_remote.run( + args=[ + 'stat', + '--file-system', + '--printf=%T\n', + '--', + self.mountpoint, + ], + stdout=StringIO(), + stderr=StringIO(), + wait=False + ) + try: + proc.wait() + except CommandFailedError: + if ("endpoint is not connected" in proc.stderr.getvalue() + or "Software caused connection abort" in proc.stderr.getvalue()): + # This happens is fuse is killed without unmount + log.warn("Found stale moutn point at {0}".format(self.mountpoint)) + return True + else: + # This happens if the mount directory doesn't exist + log.info('mount point does not exist: %s', self.mountpoint) + return False + + fstype = proc.stdout.getvalue().rstrip('\n') + if fstype == 'fuseblk': + log.info('ceph-fuse is mounted on %s', self.mountpoint) + return True + else: + log.debug('ceph-fuse not mounted, got fs type {fstype!r}'.format( + fstype=fstype)) + return False + + def wait_until_mounted(self): + """ + Check to make sure that fuse is mounted on mountpoint. If not, + sleep for 5 seconds and check again. + """ + + while not self.is_mounted(): + # Even if it's not mounted, it should at least + # be running: catch simple failures where it has terminated. + assert not self.fuse_daemon.poll() + + time.sleep(5) + + # Now that we're mounted, set permissions so that the rest of the test will have + # unrestricted access to the filesystem mount. 
+ self.client_remote.run( + args=['sudo', 'chmod', '1777', self.mountpoint]) + + def _mountpoint_exists(self): + return self.client_remote.run(args=["ls", "-d", self.mountpoint], check_status=False).exitstatus == 0 + + def umount(self): + try: + log.info('Running fusermount -u on {name}...'.format(name=self.client_remote.name)) + self.client_remote.run( + args=[ + 'sudo', + 'fusermount', + '-u', + self.mountpoint, + ], + ) + except run.CommandFailedError: + log.info('Failed to unmount ceph-fuse on {name}, aborting...'.format(name=self.client_remote.name)) + + self.client_remote.run(args=[ + 'sudo', + run.Raw('PATH=/usr/sbin:$PATH'), + 'lsof', + run.Raw(';'), + 'ps', + 'auxf', + ]) + + # abort the fuse mount, killing all hung processes + if self._fuse_conn: + self.run_python(dedent(""" + import os + path = "/sys/fs/fuse/connections/{0}/abort" + if os.path.exists(path): + open(path, "w").write("1") + """).format(self._fuse_conn)) + self._fuse_conn = None + + stderr = StringIO() + try: + # make sure its unmounted + self.client_remote.run( + args=[ + 'sudo', + 'umount', + '-l', + '-f', + self.mountpoint, + ], + stderr=stderr + ) + except CommandFailedError: + if self.is_mounted(): + raise + + assert not self.is_mounted() + self._fuse_conn = None + + def umount_wait(self, force=False, require_clean=False): + """ + :param force: Complete cleanly even if the MDS is offline + """ + if force: + assert not require_clean # mutually exclusive + + # When we expect to be forcing, kill the ceph-fuse process directly. + # This should avoid hitting the more aggressive fallback killing + # in umount() which can affect other mounts too. + self.fuse_daemon.stdin.close() + + # However, we will still hit the aggressive wait if there is an ongoing + # mount -o remount (especially if the remount is stuck because MDSs + # are unavailable) + + self.umount() + + try: + if self.fuse_daemon: + # Permit a timeout, so that we do not block forever + run.wait([self.fuse_daemon], 900) + except MaxWhileTries: + log.error("process failed to terminate after unmount. This probably" + "indicates a bug within ceph-fuse.") + raise + except CommandFailedError: + if require_clean: + raise + + self.cleanup() + + def cleanup(self): + """ + Remove the mount point. + + Prerequisite: the client is not mounted. + """ + stderr = StringIO() + try: + self.client_remote.run( + args=[ + 'rmdir', + '--', + self.mountpoint, + ], + stderr=stderr + ) + except CommandFailedError: + if "No such file or directory" in stderr.getvalue(): + pass + else: + raise + + def kill(self): + """ + Terminate the client without removing the mount point. + """ + self.fuse_daemon.stdin.close() + try: + self.fuse_daemon.wait() + except CommandFailedError: + pass + + def kill_cleanup(self): + """ + Follow up ``kill`` to get to a clean unmounted state. + """ + self.umount() + self.cleanup() + + def teardown(self): + """ + Whatever the state of the mount, get it gone. 
+ """ + super(FuseMount, self).teardown() + + self.umount() + + if self.fuse_daemon and not self.fuse_daemon.finished: + self.fuse_daemon.stdin.close() + try: + self.fuse_daemon.wait() + except CommandFailedError: + pass + + # Indiscriminate, unlike the touchier cleanup() + self.client_remote.run( + args=[ + 'rm', + '-rf', + self.mountpoint, + ], + ) + + def _asok_path(self): + return "/var/run/ceph/ceph-client.{0}.*.asok".format(self.client_id) + + @property + def _prefix(self): + return "" + + def admin_socket(self, args): + pyscript = """ +import glob +import re +import os +import subprocess + +def find_socket(client_name): + asok_path = "{asok_path}" + files = glob.glob(asok_path) + + # Given a non-glob path, it better be there + if "*" not in asok_path: + assert(len(files) == 1) + return files[0] + + for f in files: + pid = re.match(".*\.(\d+)\.asok$", f).group(1) + if os.path.exists("/proc/{{0}}".format(pid)): + return f + raise RuntimeError("Client socket {{0}} not found".format(client_name)) + +print find_socket("{client_name}") +""".format( + asok_path=self._asok_path(), + client_name="client.{0}".format(self.client_id)) + + # Find the admin socket + p = self.client_remote.run(args=[ + 'python', '-c', pyscript + ], stdout=StringIO()) + asok_path = p.stdout.getvalue().strip() + log.info("Found client admin socket at {0}".format(asok_path)) + + # Query client ID from admin socket + p = self.client_remote.run( + args=['sudo', self._prefix + 'ceph', '--admin-daemon', asok_path] + args, + stdout=StringIO()) + return json.loads(p.stdout.getvalue()) + + def get_global_id(self): + """ + Look up the CephFS client ID for this mount + """ + + return self.admin_socket(['mds_sessions'])['id'] + + def get_osd_epoch(self): + """ + Return 2-tuple of osd_epoch, osd_epoch_barrier + """ + status = self.admin_socket(['status']) + return status['osd_epoch'], status['osd_epoch_barrier'] + + def get_dentry_count(self): + """ + Return 2-tuple of dentry_count, dentry_pinned_count + """ + status = self.admin_socket(['status']) + return status['dentry_count'], status['dentry_pinned_count'] + + def set_cache_size(self, size): + return self.admin_socket(['config', 'set', 'client_cache_size', str(size)]) diff --git a/src/ceph/qa/tasks/cephfs/kernel_mount.py b/src/ceph/qa/tasks/cephfs/kernel_mount.py new file mode 100644 index 0000000..bfa1ac6 --- /dev/null +++ b/src/ceph/qa/tasks/cephfs/kernel_mount.py @@ -0,0 +1,267 @@ +from StringIO import StringIO +import json +import logging +from textwrap import dedent +from teuthology.orchestra.run import CommandFailedError +from teuthology import misc + +from teuthology.orchestra import remote as orchestra_remote +from teuthology.orchestra import run +from teuthology.contextutil import MaxWhileTries +from .mount import CephFSMount + +log = logging.getLogger(__name__) + + +UMOUNT_TIMEOUT = 300 + + +class KernelMount(CephFSMount): + def __init__(self, mons, test_dir, client_id, client_remote, + ipmi_user, ipmi_password, ipmi_domain): + super(KernelMount, self).__init__(test_dir, client_id, client_remote) + self.mons = mons + + self.mounted = False + self.ipmi_user = ipmi_user + self.ipmi_password = ipmi_password + self.ipmi_domain = ipmi_domain + + def write_secret_file(self, remote, role, keyring, filename): + """ + Stash the keyring in the filename specified. 
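+        (Uses ``ceph-authtool --print-key`` to extract the secret, as shown below.)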
+        """
+        remote.run(
+            args=[
+                'adjust-ulimits',
+                'ceph-coverage',
+                '{tdir}/archive/coverage'.format(tdir=self.test_dir),
+                'ceph-authtool',
+                '--name={role}'.format(role=role),
+                '--print-key',
+                keyring,
+                run.Raw('>'),
+                filename,
+            ],
+        )
+
+    def mount(self, mount_path=None, mount_fs_name=None):
+        log.info('Mounting kclient client.{id} at {remote} {mnt}...'.format(
+            id=self.client_id, remote=self.client_remote, mnt=self.mountpoint))
+
+        keyring = self.get_keyring_path()
+        secret = '{tdir}/ceph.data/client.{id}.secret'.format(tdir=self.test_dir, id=self.client_id)
+        self.write_secret_file(self.client_remote, 'client.{id}'.format(id=self.client_id),
+                               keyring, secret)
+
+        self.client_remote.run(
+            args=[
+                'mkdir',
+                '--',
+                self.mountpoint,
+            ],
+        )
+
+        if mount_path is None:
+            mount_path = "/"
+
+        opts = 'name={id},secretfile={secret},norequire_active_mds'.format(id=self.client_id,
+                                                                           secret=secret)
+
+        if mount_fs_name is not None:
+            opts += ",mds_namespace={0}".format(mount_fs_name)
+
+        self.client_remote.run(
+            args=[
+                'sudo',
+                'adjust-ulimits',
+                'ceph-coverage',
+                '{tdir}/archive/coverage'.format(tdir=self.test_dir),
+                '/sbin/mount.ceph',
+                '{mons}:{mount_path}'.format(mons=','.join(self.mons), mount_path=mount_path),
+                self.mountpoint,
+                '-v',
+                '-o',
+                opts
+            ],
+        )
+
+        self.client_remote.run(
+            args=['sudo', 'chmod', '1777', self.mountpoint])
+
+        self.mounted = True
+
+    def umount(self, force=False):
+        log.debug('Unmounting client client.{id}...'.format(id=self.client_id))
+
+        cmd = ['sudo', 'umount', self.mountpoint]
+        if force:
+            cmd.append('-f')
+
+        try:
+            self.client_remote.run(args=cmd)
+        except Exception as e:
+            self.client_remote.run(args=[
+                'sudo',
+                run.Raw('PATH=/usr/sbin:$PATH'),
+                'lsof',
+                run.Raw(';'),
+                'ps', 'auxf',
+            ])
+            raise e
+
+        rproc = self.client_remote.run(
+            args=[
+                'rmdir',
+                '--',
+                self.mountpoint,
+            ],
+            wait=False
+        )
+        run.wait([rproc], UMOUNT_TIMEOUT)
+        self.mounted = False
+
+    def cleanup(self):
+        pass
+
+    def umount_wait(self, force=False, require_clean=False):
+        """
+        Unlike the fuse client, the kernel client's umount is immediate
+        """
+        if not self.is_mounted():
+            return
+
+        try:
+            self.umount(force)
+        except (CommandFailedError, MaxWhileTries):
+            if not force:
+                raise
+
+            self.kill()
+            self.kill_cleanup()
+
+        self.mounted = False
+
+    def is_mounted(self):
+        return self.mounted
+
+    def wait_until_mounted(self):
+        """
+        Unlike the fuse client, the kernel client is up and running as soon
+        as the initial mount() function returns.
+        """
+        assert self.mounted
+
+    def teardown(self):
+        super(KernelMount, self).teardown()
+        if self.mounted:
+            self.umount()
+
+    def kill(self):
+        """
+        The Ceph kernel client doesn't have a mechanism to kill itself (doing
+        that inside the kernel would be weird anyway), so we reboot the whole
+        node to get the same effect.
+
+        We use IPMI to reboot, because we don't want the client to send any
+        releases of capabilities.
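+
+        (``kill_cleanup`` below powers the node back on and removes the mount
+        point once the node has rebooted.)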
+ """ + + con = orchestra_remote.getRemoteConsole(self.client_remote.hostname, + self.ipmi_user, + self.ipmi_password, + self.ipmi_domain) + con.power_off() + + self.mounted = False + + def kill_cleanup(self): + assert not self.mounted + + con = orchestra_remote.getRemoteConsole(self.client_remote.hostname, + self.ipmi_user, + self.ipmi_password, + self.ipmi_domain) + con.power_on() + + # Wait for node to come back up after reboot + misc.reconnect(None, 300, [self.client_remote]) + + # Remove mount directory + self.client_remote.run( + args=[ + 'rmdir', + '--', + self.mountpoint, + ], + ) + + def _find_debug_dir(self): + """ + Find the debugfs folder for this mount + """ + pyscript = dedent(""" + import glob + import os + import json + + def get_id_to_dir(): + result = {} + for dir in glob.glob("/sys/kernel/debug/ceph/*"): + mds_sessions_lines = open(os.path.join(dir, "mds_sessions")).readlines() + client_id = mds_sessions_lines[1].split()[1].strip('"') + + result[client_id] = dir + return result + + print json.dumps(get_id_to_dir()) + """) + + p = self.client_remote.run(args=[ + 'sudo', 'python', '-c', pyscript + ], stdout=StringIO()) + client_id_to_dir = json.loads(p.stdout.getvalue()) + + try: + return client_id_to_dir[self.client_id] + except KeyError: + log.error("Client id '{0}' debug dir not found (clients seen were: {1})".format( + self.client_id, ",".join(client_id_to_dir.keys()) + )) + raise + + def _read_debug_file(self, filename): + debug_dir = self._find_debug_dir() + + pyscript = dedent(""" + import os + + print open(os.path.join("{debug_dir}", "{filename}")).read() + """).format(debug_dir=debug_dir, filename=filename) + + p = self.client_remote.run(args=[ + 'sudo', 'python', '-c', pyscript + ], stdout=StringIO()) + return p.stdout.getvalue() + + def get_global_id(self): + """ + Look up the CephFS client ID for this mount, using debugfs. 
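+        (Reads the client id from the first line of the debugfs ``mds_sessions`` file.)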
+        """
+
+        assert self.mounted
+
+        mds_sessions = self._read_debug_file("mds_sessions")
+        lines = mds_sessions.split("\n")
+        return int(lines[0].split()[1])
+
+    def get_osd_epoch(self):
+        """
+        Return 2-tuple of osd_epoch, osd_epoch_barrier
+        """
+        osd_map = self._read_debug_file("osdmap")
+        lines = osd_map.split("\n")
+        first_line_tokens = lines[0].split()
+        epoch, barrier = int(first_line_tokens[1]), int(first_line_tokens[3])
+
+        return epoch, barrier
diff --git a/src/ceph/qa/tasks/cephfs/mount.py b/src/ceph/qa/tasks/cephfs/mount.py
new file mode 100644
index 0000000..4f96e6c
--- /dev/null
+++ b/src/ceph/qa/tasks/cephfs/mount.py
@@ -0,0 +1,627 @@
+from contextlib import contextmanager
+import json
+import logging
+import datetime
+import time
+from textwrap import dedent
+import os
+from StringIO import StringIO
+from teuthology.orchestra import run
+from teuthology.orchestra.run import CommandFailedError, ConnectionLostError
+
+log = logging.getLogger(__name__)
+
+
+class CephFSMount(object):
+    def __init__(self, test_dir, client_id, client_remote):
+        """
+        :param test_dir: Global teuthology test dir
+        :param client_id: Client ID, the 'foo' in client.foo
+        :param client_remote: Remote instance for the host where client will run
+        """
+
+        self.test_dir = test_dir
+        self.client_id = client_id
+        self.client_remote = client_remote
+        self.mountpoint_dir_name = 'mnt.{id}'.format(id=self.client_id)
+
+        self.test_files = ['a', 'b', 'c']
+
+        self.background_procs = []
+
+    @property
+    def mountpoint(self):
+        return os.path.join(
+            self.test_dir, '{dir_name}'.format(dir_name=self.mountpoint_dir_name))
+
+    def is_mounted(self):
+        raise NotImplementedError()
+
+    def mount(self, mount_path=None, mount_fs_name=None):
+        raise NotImplementedError()
+
+    def umount(self):
+        raise NotImplementedError()
+
+    def umount_wait(self, force=False, require_clean=False):
+        """
+
+        :param force: Expect that the mount will not shutdown cleanly: kill
+                      it hard.
+        :param require_clean: Wait for the Ceph client associated with the
+                              mount (e.g. ceph-fuse) to terminate, and
+                              raise if it doesn't do so cleanly.
+        :return:
+        """
+        raise NotImplementedError()
+
+    def kill_cleanup(self):
+        raise NotImplementedError()
+
+    def kill(self):
+        raise NotImplementedError()
+
+    def cleanup(self):
+        raise NotImplementedError()
+
+    def wait_until_mounted(self):
+        raise NotImplementedError()
+
+    def get_keyring_path(self):
+        return '/etc/ceph/ceph.client.{id}.keyring'.format(id=self.client_id)
+
+    @property
+    def config_path(self):
+        """
+        Path to ceph.conf: override this if you're not a normal systemwide ceph install
+        :return: string
+        """
+        return "/etc/ceph/ceph.conf"
+
+    @contextmanager
+    def mounted(self):
+        """
+        A context manager: mount from an initially unmounted state, yield,
+        then unmount and clean up.
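+
+        A minimal usage sketch (illustrative only):
+
+            with mount.mounted():
+                mount.create_files()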
+ """ + self.mount() + self.wait_until_mounted() + try: + yield + finally: + self.umount_wait() + + def create_files(self): + assert(self.is_mounted()) + + for suffix in self.test_files: + log.info("Creating file {0}".format(suffix)) + self.client_remote.run(args=[ + 'sudo', 'touch', os.path.join(self.mountpoint, suffix) + ]) + + def check_files(self): + assert(self.is_mounted()) + + for suffix in self.test_files: + log.info("Checking file {0}".format(suffix)) + r = self.client_remote.run(args=[ + 'sudo', 'ls', os.path.join(self.mountpoint, suffix) + ], check_status=False) + if r.exitstatus != 0: + raise RuntimeError("Expected file {0} not found".format(suffix)) + + def create_destroy(self): + assert(self.is_mounted()) + + filename = "{0} {1}".format(datetime.datetime.now(), self.client_id) + log.debug("Creating test file {0}".format(filename)) + self.client_remote.run(args=[ + 'sudo', 'touch', os.path.join(self.mountpoint, filename) + ]) + log.debug("Deleting test file {0}".format(filename)) + self.client_remote.run(args=[ + 'sudo', 'rm', '-f', os.path.join(self.mountpoint, filename) + ]) + + def _run_python(self, pyscript): + return self.client_remote.run(args=[ + 'sudo', 'adjust-ulimits', 'daemon-helper', 'kill', 'python', '-c', pyscript + ], wait=False, stdin=run.PIPE, stdout=StringIO()) + + def run_python(self, pyscript): + p = self._run_python(pyscript) + p.wait() + return p.stdout.getvalue().strip() + + def run_shell(self, args, wait=True): + args = ["cd", self.mountpoint, run.Raw('&&'), "sudo"] + args + return self.client_remote.run(args=args, stdout=StringIO(), + stderr=StringIO(), wait=wait) + + def open_no_data(self, basename): + """ + A pure metadata operation + """ + assert(self.is_mounted()) + + path = os.path.join(self.mountpoint, basename) + + p = self._run_python(dedent( + """ + f = open("{path}", 'w') + """.format(path=path) + )) + p.wait() + + def open_background(self, basename="background_file"): + """ + Open a file for writing, then block such that the client + will hold a capability. + + Don't return until the remote process has got as far as opening + the file, then return the RemoteProcess instance. + """ + assert(self.is_mounted()) + + path = os.path.join(self.mountpoint, basename) + + pyscript = dedent(""" + import time + + f = open("{path}", 'w') + f.write('content') + f.flush() + f.write('content2') + while True: + time.sleep(1) + """).format(path=path) + + rproc = self._run_python(pyscript) + self.background_procs.append(rproc) + + # This wait would not be sufficient if the file had already + # existed, but it's simple and in practice users of open_background + # are not using it on existing files. 
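+        # (A file that already existed would be "visible" before the open even
+        # took effect, so wait_for_visible() would return immediately and
+        # prove nothing about the new open.)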
+        self.wait_for_visible(basename)
+
+        return rproc
+
+    def wait_for_visible(self, basename="background_file", timeout=30):
+        i = 0
+        while i < timeout:
+            r = self.client_remote.run(args=[
+                'sudo', 'ls', os.path.join(self.mountpoint, basename)
+            ], check_status=False)
+            if r.exitstatus == 0:
+                log.debug("File {0} became visible from {1} after {2}s".format(
+                    basename, self.client_id, i))
+                return
+            else:
+                time.sleep(1)
+                i += 1
+
+        raise RuntimeError("Timed out after {0}s waiting for {1} to become visible from {2}".format(
+            i, basename, self.client_id))
+
+    def lock_background(self, basename="background_file", do_flock=True):
+        """
+        Open and lock two files for writing; hold the locks in a background process
+        """
+        assert(self.is_mounted())
+
+        path = os.path.join(self.mountpoint, basename)
+
+        script_builder = """
+            import time
+            import fcntl
+            import struct"""
+        if do_flock:
+            script_builder += """
+            f1 = open("{path}-1", 'w')
+            fcntl.flock(f1, fcntl.LOCK_EX | fcntl.LOCK_NB)"""
+        script_builder += """
+            f2 = open("{path}-2", 'w')
+            lockdata = struct.pack('hhllhh', fcntl.F_WRLCK, 0, 0, 0, 0, 0)
+            fcntl.fcntl(f2, fcntl.F_SETLK, lockdata)
+            while True:
+                time.sleep(1)
+            """
+
+        pyscript = dedent(script_builder).format(path=path)
+
+        log.info("lock_background file {0}".format(basename))
+        rproc = self._run_python(pyscript)
+        self.background_procs.append(rproc)
+        return rproc
+
+    def lock_and_release(self, basename="background_file"):
+        assert(self.is_mounted())
+
+        path = os.path.join(self.mountpoint, basename)
+
+        script = """
+            import time
+            import fcntl
+            import struct
+            f1 = open("{path}-1", 'w')
+            fcntl.flock(f1, fcntl.LOCK_EX)
+            f2 = open("{path}-2", 'w')
+            lockdata = struct.pack('hhllhh', fcntl.F_WRLCK, 0, 0, 0, 0, 0)
+            fcntl.fcntl(f2, fcntl.F_SETLK, lockdata)
+            """
+        pyscript = dedent(script).format(path=path)
+
+        log.info("lock_and_release file {0}".format(basename))
+        return self._run_python(pyscript)
+
+    def check_filelock(self, basename="background_file", do_flock=True):
+        assert(self.is_mounted())
+
+        path = os.path.join(self.mountpoint, basename)
+
+        script_builder = """
+            import fcntl
+            import errno
+            import struct"""
+        if do_flock:
+            script_builder += """
+            f1 = open("{path}-1", 'r')
+            try:
+                fcntl.flock(f1, fcntl.LOCK_EX | fcntl.LOCK_NB)
+            except IOError, e:
+                if e.errno == errno.EAGAIN:
+                    pass
+            else:
+                raise RuntimeError("flock on file {path}-1 not found")"""
+        script_builder += """
+            f2 = open("{path}-2", 'r')
+            try:
+                lockdata = struct.pack('hhllhh', fcntl.F_WRLCK, 0, 0, 0, 0, 0)
+                fcntl.fcntl(f2, fcntl.F_SETLK, lockdata)
+            except IOError, e:
+                if e.errno == errno.EAGAIN:
+                    pass
+            else:
+                raise RuntimeError("posix lock on file {path}-2 not found")
+            """
+        pyscript = dedent(script_builder).format(path=path)
+
+        log.info("check lock on file {0}".format(basename))
+        self.client_remote.run(args=[
+            'sudo', 'python', '-c', pyscript
+        ])
+
+    def write_background(self, basename="background_file", loop=False):
+        """
+        Open a file for writing and write to it once, completing as soon as
+        possible; if ``loop`` is True, keep writing every second until killed.
+        :param basename:
+        :return:
+        """
+        assert(self.is_mounted())
+
+        path = os.path.join(self.mountpoint, basename)
+
+        pyscript = dedent("""
+            import os
+            import time
+
+            fd = os.open("{path}", os.O_RDWR | os.O_CREAT, 0644)
+            try:
+                while True:
+                    os.write(fd, 'content')
+                    time.sleep(1)
+                    if not {loop}:
+                        break
+            except IOError, e:
+                pass
+            os.close(fd)
+            """).format(path=path, loop=str(loop))
+
+        rproc = self._run_python(pyscript)
+        self.background_procs.append(rproc)
+        return rproc
+
+    def write_n_mb(self, 
filename, n_mb, seek=0, wait=True): + """ + Write the requested number of megabytes to a file + """ + assert(self.is_mounted()) + + return self.run_shell(["dd", "if=/dev/urandom", "of={0}".format(filename), + "bs=1M", "conv=fdatasync", + "count={0}".format(n_mb), + "seek={0}".format(seek) + ], wait=wait) + + def write_test_pattern(self, filename, size): + log.info("Writing {0} bytes to {1}".format(size, filename)) + return self.run_python(dedent(""" + import zlib + path = "{path}" + f = open(path, 'w') + for i in range(0, {size}): + val = zlib.crc32("%s" % i) & 7 + f.write(chr(val)) + f.close() + """.format( + path=os.path.join(self.mountpoint, filename), + size=size + ))) + + def validate_test_pattern(self, filename, size): + log.info("Validating {0} bytes from {1}".format(size, filename)) + return self.run_python(dedent(""" + import zlib + path = "{path}" + f = open(path, 'r') + bytes = f.read() + f.close() + if len(bytes) != {size}: + raise RuntimeError("Bad length {{0}} vs. expected {{1}}".format( + len(bytes), {size} + )) + for i, b in enumerate(bytes): + val = zlib.crc32("%s" % i) & 7 + if b != chr(val): + raise RuntimeError("Bad data at offset {{0}}".format(i)) + """.format( + path=os.path.join(self.mountpoint, filename), + size=size + ))) + + def open_n_background(self, fs_path, count): + """ + Open N files for writing, hold them open in a background process + + :param fs_path: Path relative to CephFS root, e.g. "foo/bar" + :return: a RemoteProcess + """ + assert(self.is_mounted()) + + abs_path = os.path.join(self.mountpoint, fs_path) + + pyscript = dedent(""" + import sys + import time + import os + + n = {count} + abs_path = "{abs_path}" + + if not os.path.exists(os.path.dirname(abs_path)): + os.makedirs(os.path.dirname(abs_path)) + + handles = [] + for i in range(0, n): + fname = "{{0}}_{{1}}".format(abs_path, i) + handles.append(open(fname, 'w')) + + while True: + time.sleep(1) + """).format(abs_path=abs_path, count=count) + + rproc = self._run_python(pyscript) + self.background_procs.append(rproc) + return rproc + + def create_n_files(self, fs_path, count, sync=False): + assert(self.is_mounted()) + + abs_path = os.path.join(self.mountpoint, fs_path) + + pyscript = dedent(""" + import sys + import time + import os + + n = {count} + abs_path = "{abs_path}" + + if not os.path.exists(os.path.dirname(abs_path)): + os.makedirs(os.path.dirname(abs_path)) + + for i in range(0, n): + fname = "{{0}}_{{1}}".format(abs_path, i) + h = open(fname, 'w') + h.write('content') + if {sync}: + h.flush() + os.fsync(h.fileno()) + h.close() + """).format(abs_path=abs_path, count=count, sync=str(sync)) + + self.run_python(pyscript) + + def teardown(self): + for p in self.background_procs: + log.info("Terminating background process") + self._kill_background(p) + + self.background_procs = [] + + def _kill_background(self, p): + if p.stdin: + p.stdin.close() + try: + p.wait() + except (CommandFailedError, ConnectionLostError): + pass + + def kill_background(self, p): + """ + For a process that was returned by one of the _background member functions, + kill it hard. 
+ """ + self._kill_background(p) + self.background_procs.remove(p) + + def get_global_id(self): + raise NotImplementedError() + + def get_osd_epoch(self): + raise NotImplementedError() + + def stat(self, fs_path, wait=True): + """ + stat a file, and return the result as a dictionary like this: + { + "st_ctime": 1414161137.0, + "st_mtime": 1414161137.0, + "st_nlink": 33, + "st_gid": 0, + "st_dev": 16777218, + "st_size": 1190, + "st_ino": 2, + "st_uid": 0, + "st_mode": 16877, + "st_atime": 1431520593.0 + } + + Raises exception on absent file. + """ + abs_path = os.path.join(self.mountpoint, fs_path) + + pyscript = dedent(""" + import os + import stat + import json + import sys + + try: + s = os.stat("{path}") + except OSError as e: + sys.exit(e.errno) + + attrs = ["st_mode", "st_ino", "st_dev", "st_nlink", "st_uid", "st_gid", "st_size", "st_atime", "st_mtime", "st_ctime"] + print json.dumps( + dict([(a, getattr(s, a)) for a in attrs]), + indent=2) + """).format(path=abs_path) + proc = self._run_python(pyscript) + if wait: + proc.wait() + return json.loads(proc.stdout.getvalue().strip()) + else: + return proc + + def touch(self, fs_path): + """ + Create a dentry if it doesn't already exist. This python + implementation exists because the usual command line tool doesn't + pass through error codes like EIO. + + :param fs_path: + :return: + """ + abs_path = os.path.join(self.mountpoint, fs_path) + pyscript = dedent(""" + import sys + import errno + + try: + f = open("{path}", "w") + f.close() + except IOError as e: + sys.exit(errno.EIO) + """).format(path=abs_path) + proc = self._run_python(pyscript) + proc.wait() + + def path_to_ino(self, fs_path, follow_symlinks=True): + abs_path = os.path.join(self.mountpoint, fs_path) + + if follow_symlinks: + pyscript = dedent(""" + import os + import stat + + print os.stat("{path}").st_ino + """).format(path=abs_path) + else: + pyscript = dedent(""" + import os + import stat + + print os.lstat("{path}").st_ino + """).format(path=abs_path) + + proc = self._run_python(pyscript) + proc.wait() + return int(proc.stdout.getvalue().strip()) + + def path_to_nlink(self, fs_path): + abs_path = os.path.join(self.mountpoint, fs_path) + + pyscript = dedent(""" + import os + import stat + + print os.stat("{path}").st_nlink + """).format(path=abs_path) + + proc = self._run_python(pyscript) + proc.wait() + return int(proc.stdout.getvalue().strip()) + + def ls(self, path=None): + """ + Wrap ls: return a list of strings + """ + cmd = ["ls"] + if path: + cmd.append(path) + + ls_text = self.run_shell(cmd).stdout.getvalue().strip() + + if ls_text: + return ls_text.split("\n") + else: + # Special case because otherwise split on empty string + # gives you [''] instead of [] + return [] + + def setfattr(self, path, key, val): + """ + Wrap setfattr. + + :param path: relative to mount point + :param key: xattr name + :param val: xattr value + :return: None + """ + self.run_shell(["setfattr", "-n", key, "-v", val, path]) + + def getfattr(self, path, attr): + """ + Wrap getfattr: return the values of a named xattr on one file, or + None if the attribute is not found. 
+
+        :return: a string
+        """
+        p = self.run_shell(["getfattr", "--only-values", "-n", attr, path], wait=False)
+        try:
+            p.wait()
+        except CommandFailedError as e:
+            if e.exitstatus == 1 and "No such attribute" in p.stderr.getvalue():
+                return None
+            else:
+                raise
+
+        return p.stdout.getvalue()
+
+    def df(self):
+        """
+        Wrap df: return a dict of usage fields in bytes
+        """
+
+        p = self.run_shell(["df", "-B1", "."])
+        lines = p.stdout.getvalue().strip().split("\n")
+        fs, total, used, avail = lines[1].split()[:4]
+        log.warn(lines)
+
+        return {
+            "total": int(total),
+            "used": int(used),
+            "available": int(avail)
+        }
diff --git a/src/ceph/qa/tasks/cephfs/test_auto_repair.py b/src/ceph/qa/tasks/cephfs/test_auto_repair.py
new file mode 100644
index 0000000..c0aa2e4
--- /dev/null
+++ b/src/ceph/qa/tasks/cephfs/test_auto_repair.py
@@ -0,0 +1,90 @@
+
+"""
+Exercise the MDS's auto repair functions
+"""
+
+import logging
+import time
+
+from teuthology.orchestra.run import CommandFailedError
+from tasks.cephfs.cephfs_test_case import CephFSTestCase
+
+
+log = logging.getLogger(__name__)
+
+
+# Arbitrary timeouts for operations involving restarting
+# an MDS or waiting for it to come up
+MDS_RESTART_GRACE = 60
+
+
+class TestMDSAutoRepair(CephFSTestCase):
+    def test_backtrace_repair(self):
+        """
+        MDS should verify/fix backtrace on fetch dirfrag
+        """
+
+        self.mount_a.run_shell(["mkdir", "testdir1"])
+        self.mount_a.run_shell(["touch", "testdir1/testfile"])
+        dir_objname = "{:x}.00000000".format(self.mount_a.path_to_ino("testdir1"))
+
+        # drop inode caps
+        self.mount_a.umount_wait()
+
+        # flush journal entries to dirfrag objects, and expire journal
+        self.fs.mds_asok(['flush', 'journal'])
+
+        # Restart the MDS to drop the metadata cache (because we expired the journal,
+        # nothing gets replayed into cache on restart)
+        self.fs.mds_stop()
+        self.fs.mds_fail_restart()
+        self.fs.wait_for_daemons()
+
+        # remove testdir1's backtrace
+        self.fs.rados(["rmxattr", dir_objname, "parent"])
+
+        # readdir (fetch dirfrag) should fix testdir1's backtrace
+        self.mount_a.mount()
+        self.mount_a.wait_until_mounted()
+        self.mount_a.run_shell(["ls", "testdir1"])
+
+        # flush journal entries to dirfrag objects
+        self.fs.mds_asok(['flush', 'journal'])
+
+        # check that the backtrace exists again
+        self.fs.rados(["getxattr", dir_objname, "parent"])
+
+    def test_mds_readonly(self):
+        """
+        Test that the MDS behaves correctly when it is read-only
+        """
+        # operation should succeed when MDS is not readonly
+        self.mount_a.run_shell(["touch", "test_file1"])
+        writer = self.mount_a.write_background(loop=True)
+
+        time.sleep(10)
+        self.assertFalse(writer.finished)
+
+        # force MDS to read-only mode
+        self.fs.mds_asok(['force_readonly'])
+        time.sleep(10)
+
+        # touching test file should fail
+        try:
+            self.mount_a.run_shell(["touch", "test_file1"])
+        except CommandFailedError:
+            pass
+        else:
+            self.fail("touch should have failed while the MDS is read-only")
+
+        # background writer also should fail
+        self.assertTrue(writer.finished)
+
+        # The MDS should report its readonly health state to the mon
+        self.wait_for_health("MDS_READ_ONLY", timeout=30)
+
+        # restart mds to make it writable
+        self.fs.mds_fail_restart()
+        self.fs.wait_for_daemons()
+
+        self.wait_for_health_clear(timeout=30)
diff --git a/src/ceph/qa/tasks/cephfs/test_backtrace.py b/src/ceph/qa/tasks/cephfs/test_backtrace.py
new file mode 100644
index 0000000..af246a1
--- /dev/null
+++ b/src/ceph/qa/tasks/cephfs/test_backtrace.py
@@ -0,0 +1,78 @@
+
+from tasks.cephfs.cephfs_test_case import CephFSTestCase
+
+
+class 
TestBacktrace(CephFSTestCase): + def test_backtrace(self): + """ + That the 'parent' and 'layout' xattrs on the head objects of files + are updated correctly. + """ + + old_data_pool_name = self.fs.get_data_pool_name() + old_pool_id = self.fs.get_data_pool_id() + + # Create a file for subsequent checks + self.mount_a.run_shell(["mkdir", "parent_a"]) + self.mount_a.run_shell(["touch", "parent_a/alpha"]) + file_ino = self.mount_a.path_to_ino("parent_a/alpha") + + # That backtrace and layout are written after initial flush + self.fs.mds_asok(["flush", "journal"]) + backtrace = self.fs.read_backtrace(file_ino) + self.assertEqual(['alpha', 'parent_a'], [a['dname'] for a in backtrace['ancestors']]) + layout = self.fs.read_layout(file_ino) + self.assertDictEqual(layout, { + "stripe_unit": 4194304, + "stripe_count": 1, + "object_size": 4194304, + "pool_id": old_pool_id, + "pool_ns": "", + }) + self.assertEqual(backtrace['pool'], old_pool_id) + + # That backtrace is written after parentage changes + self.mount_a.run_shell(["mkdir", "parent_b"]) + self.mount_a.run_shell(["mv", "parent_a/alpha", "parent_b/alpha"]) + + self.fs.mds_asok(["flush", "journal"]) + backtrace = self.fs.read_backtrace(file_ino) + self.assertEqual(['alpha', 'parent_b'], [a['dname'] for a in backtrace['ancestors']]) + + # Create a new data pool + new_pool_name = "data_new" + new_pool_id = self.fs.add_data_pool(new_pool_name) + + # That an object which has switched pools gets its backtrace updated + self.mount_a.setfattr("./parent_b/alpha", + "ceph.file.layout.pool", new_pool_name) + self.fs.mds_asok(["flush", "journal"]) + backtrace_old_pool = self.fs.read_backtrace(file_ino, pool=old_data_pool_name) + self.assertEqual(backtrace_old_pool['pool'], new_pool_id) + backtrace_new_pool = self.fs.read_backtrace(file_ino, pool=new_pool_name) + self.assertEqual(backtrace_new_pool['pool'], new_pool_id) + new_pool_layout = self.fs.read_layout(file_ino, pool=new_pool_name) + self.assertEqual(new_pool_layout['pool_id'], new_pool_id) + self.assertEqual(new_pool_layout['pool_ns'], '') + + # That subsequent linkage changes are only written to new pool backtrace + self.mount_a.run_shell(["mkdir", "parent_c"]) + self.mount_a.run_shell(["mv", "parent_b/alpha", "parent_c/alpha"]) + self.fs.mds_asok(["flush", "journal"]) + backtrace_old_pool = self.fs.read_backtrace(file_ino, pool=old_data_pool_name) + self.assertEqual(['alpha', 'parent_b'], [a['dname'] for a in backtrace_old_pool['ancestors']]) + backtrace_new_pool = self.fs.read_backtrace(file_ino, pool=new_pool_name) + self.assertEqual(['alpha', 'parent_c'], [a['dname'] for a in backtrace_new_pool['ancestors']]) + + # That layout is written to new pool after change to other field in layout + self.mount_a.setfattr("./parent_c/alpha", + "ceph.file.layout.object_size", "8388608") + + self.fs.mds_asok(["flush", "journal"]) + new_pool_layout = self.fs.read_layout(file_ino, pool=new_pool_name) + self.assertEqual(new_pool_layout['object_size'], 8388608) + + # ...but not to the old pool: the old pool's backtrace points to the new pool, and that's enough, + # we don't update the layout in all the old pools whenever it changes + old_pool_layout = self.fs.read_layout(file_ino, pool=old_data_pool_name) + self.assertEqual(old_pool_layout['object_size'], 4194304) diff --git a/src/ceph/qa/tasks/cephfs/test_cap_flush.py b/src/ceph/qa/tasks/cephfs/test_cap_flush.py new file mode 100644 index 0000000..1cd102f --- /dev/null +++ b/src/ceph/qa/tasks/cephfs/test_cap_flush.py @@ -0,0 +1,64 @@ + +import os +import time 
+from textwrap import dedent
+from unittest import SkipTest
+from tasks.cephfs.fuse_mount import FuseMount
+from tasks.cephfs.cephfs_test_case import CephFSTestCase, for_teuthology
+
+class TestCapFlush(CephFSTestCase):
+    @for_teuthology
+    def test_replay_create(self):
+        """
+        MDS starts to handle client caps when it enters clientreplay stage.
+        When handling a client cap in clientreplay stage, it's possible that
+        the corresponding inode does not exist because the client request
+        which creates the inode hasn't been replayed.
+        """
+
+        if not isinstance(self.mount_a, FuseMount):
+            raise SkipTest("Require FUSE client to inject client release failure")
+
+        dir_path = os.path.join(self.mount_a.mountpoint, "testdir")
+        py_script = dedent("""
+            import os
+            os.mkdir("{0}")
+            fd = os.open("{0}", os.O_RDONLY)
+            os.fchmod(fd, 0777)
+            os.fsync(fd)
+            """).format(dir_path)
+        self.mount_a.run_python(py_script)
+
+        self.fs.mds_asok(["flush", "journal"])
+
+        # client will only get unsafe replies (journaling is paused)
+        self.fs.mds_asok(["config", "set", "mds_log_pause", "1"])
+
+        file_name = "testfile"
+        file_path = dir_path + "/" + file_name
+
+        # Create a file and modify its mode. ceph-fuse will mark Ax cap dirty
+        py_script = dedent("""
+            import os
+            os.chdir("{0}")
+            os.setgid(65534)
+            os.setuid(65534)
+            fd = os.open("{1}", os.O_CREAT | os.O_RDWR, 0644)
+            os.fchmod(fd, 0640)
+            """).format(dir_path, file_name)
+        self.mount_a.run_python(py_script)
+
+        # Modify file mode by a different user. ceph-fuse will send a setattr request
+        self.mount_a.run_shell(["chmod", "600", file_path], wait=False)
+
+        time.sleep(10)
+
+        # Restart mds. Client will re-send the unsafe request and cap flush
+        self.fs.mds_stop()
+        self.fs.mds_fail_restart()
+        self.fs.wait_for_daemons()
+
+        mode = self.mount_a.run_shell(['stat', '-c', '%a', file_path]).stdout.getvalue().strip()
+        # If the cap flush gets dropped, the mode will be 0644.
+        # (The Ax cap stays in the dirty state, which prevents the setattr reply
+        # from updating the file mode)
+        self.assertEqual(mode, "600")
diff --git a/src/ceph/qa/tasks/cephfs/test_client_limits.py b/src/ceph/qa/tasks/cephfs/test_client_limits.py
new file mode 100644
index 0000000..cb5e3a4
--- /dev/null
+++ b/src/ceph/qa/tasks/cephfs/test_client_limits.py
@@ -0,0 +1,239 @@
+
+"""
+Exercise the MDS's behaviour when clients and the MDCache reach or
+exceed the limits of how many caps/inodes they should hold.
+"""
+
+import logging
+from textwrap import dedent
+from unittest import SkipTest
+from teuthology.orchestra.run import CommandFailedError
+from tasks.cephfs.cephfs_test_case import CephFSTestCase, needs_trimming
+from tasks.cephfs.fuse_mount import FuseMount
+import os
+
+
+log = logging.getLogger(__name__)
+
+
+# Arbitrary timeouts for operations involving restarting
+# an MDS or waiting for it to come up
+MDS_RESTART_GRACE = 60
+
+# Hardcoded values from Server::recall_client_state
+CAP_RECALL_RATIO = 0.8
+CAP_RECALL_MIN = 100
+
+
+class TestClientLimits(CephFSTestCase):
+    REQUIRE_KCLIENT_REMOTE = True
+    CLIENTS_REQUIRED = 2
+
+    def _test_client_pin(self, use_subdir, open_files):
+        """
+        When a client pins an inode in its cache, for example because the file is held open,
+        it should reject requests from the MDS to trim these caps. The MDS should complain
+        to the user that it is unable to enforce its cache size limits because of this
+        objectionable client.
+ + :param use_subdir: whether to put test files in a subdir or use root + """ + + cache_size = open_files/2 + + self.set_conf('mds', 'mds cache size', cache_size) + self.fs.mds_fail_restart() + self.fs.wait_for_daemons() + + mds_min_caps_per_client = int(self.fs.get_config("mds_min_caps_per_client")) + self.assertTrue(open_files >= mds_min_caps_per_client) + mds_max_ratio_caps_per_client = float(self.fs.get_config("mds_max_ratio_caps_per_client")) + + mount_a_client_id = self.mount_a.get_global_id() + path = "subdir/mount_a" if use_subdir else "mount_a" + open_proc = self.mount_a.open_n_background(path, open_files) + + # Client should now hold: + # `open_files` caps for the open files + # 1 cap for root + # 1 cap for subdir + self.wait_until_equal(lambda: self.get_session(mount_a_client_id)['num_caps'], + open_files + (2 if use_subdir else 1), + timeout=600, + reject_fn=lambda x: x > open_files + 2) + + # MDS should not be happy about that, as the client is failing to comply + # with the SESSION_RECALL messages it is being sent + mds_recall_state_timeout = float(self.fs.get_config("mds_recall_state_timeout")) + self.wait_for_health("MDS_CLIENT_RECALL", mds_recall_state_timeout+10) + + # We can also test that the MDS health warning for oversized + # cache is functioning as intended. + self.wait_for_health("MDS_CACHE_OVERSIZED", + mds_recall_state_timeout + 10) + + # When the client closes the files, it should retain only as many caps as allowed + # under the SESSION_RECALL policy + log.info("Terminating process holding files open") + open_proc.stdin.close() + try: + open_proc.wait() + except CommandFailedError: + # We killed it, so it raises an error + pass + + # The remaining caps should comply with the numbers sent from MDS in SESSION_RECALL message, + # which depend on the caps outstanding, cache size and overall ratio + recall_expected_value = int((1.0-mds_max_ratio_caps_per_client)*(open_files+2)) + def expected_caps(): + num_caps = self.get_session(mount_a_client_id)['num_caps'] + if num_caps < mds_min_caps_per_client: + raise RuntimeError("client caps fell below min!") + elif num_caps == mds_min_caps_per_client: + return True + elif recall_expected_value*.95 <= num_caps <= recall_expected_value*1.05: + return True + else: + return False + + self.wait_until_true(expected_caps, timeout=60) + + @needs_trimming + def test_client_pin_root(self): + self._test_client_pin(False, 400) + + @needs_trimming + def test_client_pin(self): + self._test_client_pin(True, 800) + + @needs_trimming + def test_client_pin_mincaps(self): + self._test_client_pin(True, 200) + + def test_client_release_bug(self): + """ + When a client has a bug (which we will simulate) preventing it from releasing caps, + the MDS should notice that releases are not being sent promptly, and generate a health + metric to that effect. + """ + + # The debug hook to inject the failure only exists in the fuse client + if not isinstance(self.mount_a, FuseMount): + raise SkipTest("Require FUSE client to inject client release failure") + + self.set_conf('client.{0}'.format(self.mount_a.client_id), 'client inject release failure', 'true') + self.mount_a.teardown() + self.mount_a.mount() + self.mount_a.wait_until_mounted() + mount_a_client_id = self.mount_a.get_global_id() + + # Client A creates a file. 
It will hold the write caps on the file, and later (simulated bug) fail
+        # to comply with the MDS's request to release that cap
+        self.mount_a.run_shell(["touch", "file1"])
+
+        # Client B tries to stat the file that client A created
+        rproc = self.mount_b.write_background("file1")
+
+        # After mds_revoke_cap_timeout, we should see a health warning (extra lag from
+        # MDS beacon period)
+        mds_revoke_cap_timeout = float(self.fs.get_config("mds_revoke_cap_timeout"))
+        self.wait_for_health("MDS_CLIENT_LATE_RELEASE", mds_revoke_cap_timeout + 10)
+
+        # Client B should still be stuck
+        self.assertFalse(rproc.finished)
+
+        # Kill client A
+        self.mount_a.kill()
+        self.mount_a.kill_cleanup()
+
+        # Client B should complete
+        self.fs.mds_asok(['session', 'evict', "%s" % mount_a_client_id])
+        rproc.wait()
+
+    def test_client_oldest_tid(self):
+        """
+        When a client does not advance its oldest tid, the MDS should notice that
+        and generate health warnings.
+        """
+
+        # num of requests client issues
+        max_requests = 1000
+
+        # The debug hook to inject the failure only exists in the fuse client
+        if not isinstance(self.mount_a, FuseMount):
+            raise SkipTest("Require FUSE client to inject client release failure")
+
+        self.set_conf('client', 'client inject fixed oldest tid', 'true')
+        self.mount_a.teardown()
+        self.mount_a.mount()
+        self.mount_a.wait_until_mounted()
+
+        self.fs.mds_asok(['config', 'set', 'mds_max_completed_requests', '{0}'.format(max_requests)])
+
+        # Create lots of files
+        self.mount_a.create_n_files("testdir/file1", max_requests + 100)
+
+        # Create a few files synchronously. This makes sure previous requests are completed
+        self.mount_a.create_n_files("testdir/file2", 5, True)
+
+        # Wait for the health warnings. Assume the MDS can handle at least 10 requests per second
+        self.wait_for_health("MDS_CLIENT_OLDEST_TID", max_requests / 10)
+
+    def _test_client_cache_size(self, mount_subdir):
+        """
+        Check that the client invalidates its kernel dcache according to its cache size config
+        """
+
+        # The debug hook to inject the failure only exists in the fuse client
+        if not isinstance(self.mount_a, FuseMount):
+            raise SkipTest("Require FUSE client to inject client release failure")
+
+        if mount_subdir:
+            # fuse assigns a fixed inode number (1) to the root inode. But when mounting
+            # into a subdir, the actual inode number of the root is not 1. This mismatch
+            # confuses fuse_lowlevel_notify_inval_entry() when invalidating dentries
+            # in the root directory.
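+            # The steps below reproduce that setup: create and mount the subdir,
+            # then confirm that FUSE still reports inode 1 for its root.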
+ self.mount_a.run_shell(["mkdir", "subdir"]) + self.mount_a.umount_wait() + self.set_conf('client', 'client mountpoint', '/subdir') + self.mount_a.mount() + self.mount_a.wait_until_mounted() + root_ino = self.mount_a.path_to_ino(".") + self.assertEqual(root_ino, 1); + + dir_path = os.path.join(self.mount_a.mountpoint, "testdir") + + mkdir_script = dedent(""" + import os + os.mkdir("{path}") + for n in range(0, {num_dirs}): + os.mkdir("{path}/dir{{0}}".format(n)) + """) + + num_dirs = 1000 + self.mount_a.run_python(mkdir_script.format(path=dir_path, num_dirs=num_dirs)) + self.mount_a.run_shell(["sync"]) + + dentry_count, dentry_pinned_count = self.mount_a.get_dentry_count() + self.assertGreaterEqual(dentry_count, num_dirs) + self.assertGreaterEqual(dentry_pinned_count, num_dirs) + + cache_size = num_dirs / 10 + self.mount_a.set_cache_size(cache_size) + + def trimmed(): + dentry_count, dentry_pinned_count = self.mount_a.get_dentry_count() + log.info("waiting, dentry_count, dentry_pinned_count: {0}, {1}".format( + dentry_count, dentry_pinned_count + )) + if dentry_count > cache_size or dentry_pinned_count > cache_size: + return False + + return True + + self.wait_until_true(trimmed, 30) + + @needs_trimming + def test_client_cache_size(self): + self._test_client_cache_size(False) + self._test_client_cache_size(True) diff --git a/src/ceph/qa/tasks/cephfs/test_client_recovery.py b/src/ceph/qa/tasks/cephfs/test_client_recovery.py new file mode 100644 index 0000000..fd58c14 --- /dev/null +++ b/src/ceph/qa/tasks/cephfs/test_client_recovery.py @@ -0,0 +1,474 @@ + +""" +Teuthology task for exercising CephFS client recovery +""" + +import logging +from textwrap import dedent +import time +import distutils.version as version +import re +import os + +from teuthology.orchestra.run import CommandFailedError, ConnectionLostError +from tasks.cephfs.cephfs_test_case import CephFSTestCase +from teuthology.packaging import get_package_version + + +log = logging.getLogger(__name__) + + +# Arbitrary timeouts for operations involving restarting +# an MDS or waiting for it to come up +MDS_RESTART_GRACE = 60 + + +class TestClientNetworkRecovery(CephFSTestCase): + REQUIRE_KCLIENT_REMOTE = True + REQUIRE_ONE_CLIENT_REMOTE = True + CLIENTS_REQUIRED = 2 + + LOAD_SETTINGS = ["mds_session_timeout", "mds_reconnect_timeout", "ms_max_backoff"] + + # Environment references + mds_session_timeout = None + mds_reconnect_timeout = None + ms_max_backoff = None + + def test_network_death(self): + """ + Simulate software freeze or temporary network failure. + + Check that the client blocks I/O during failure, and completes + I/O after failure. 
+ """ + + # We only need one client + self.mount_b.umount_wait() + + # Initially our one client session should be visible + client_id = self.mount_a.get_global_id() + ls_data = self._session_list() + self.assert_session_count(1, ls_data) + self.assertEqual(ls_data[0]['id'], client_id) + self.assert_session_state(client_id, "open") + + # ...and capable of doing I/O without blocking + self.mount_a.create_files() + + # ...but if we turn off the network + self.fs.set_clients_block(True) + + # ...and try and start an I/O + write_blocked = self.mount_a.write_background() + + # ...then it should block + self.assertFalse(write_blocked.finished) + self.assert_session_state(client_id, "open") + time.sleep(self.mds_session_timeout * 1.5) # Long enough for MDS to consider session stale + self.assertFalse(write_blocked.finished) + self.assert_session_state(client_id, "stale") + + # ...until we re-enable I/O + self.fs.set_clients_block(False) + + # ...when it should complete promptly + a = time.time() + self.wait_until_true(lambda: write_blocked.finished, self.ms_max_backoff * 2) + write_blocked.wait() # Already know we're finished, wait() to raise exception on errors + recovery_time = time.time() - a + log.info("recovery time: {0}".format(recovery_time)) + self.assert_session_state(client_id, "open") + + +class TestClientRecovery(CephFSTestCase): + REQUIRE_KCLIENT_REMOTE = True + CLIENTS_REQUIRED = 2 + + LOAD_SETTINGS = ["mds_session_timeout", "mds_reconnect_timeout", "ms_max_backoff"] + + # Environment references + mds_session_timeout = None + mds_reconnect_timeout = None + ms_max_backoff = None + + def test_basic(self): + # Check that two clients come up healthy and see each others' files + # ===================================================== + self.mount_a.create_files() + self.mount_a.check_files() + self.mount_a.umount_wait() + + self.mount_b.check_files() + + self.mount_a.mount() + self.mount_a.wait_until_mounted() + + # Check that the admin socket interface is correctly reporting + # two sessions + # ===================================================== + ls_data = self._session_list() + self.assert_session_count(2, ls_data) + + self.assertSetEqual( + set([l['id'] for l in ls_data]), + {self.mount_a.get_global_id(), self.mount_b.get_global_id()} + ) + + def test_restart(self): + # Check that after an MDS restart both clients reconnect and continue + # to handle I/O + # ===================================================== + self.fs.mds_fail_restart() + self.fs.wait_for_state('up:active', timeout=MDS_RESTART_GRACE) + + self.mount_a.create_destroy() + self.mount_b.create_destroy() + + def _session_num_caps(self, client_id): + ls_data = self.fs.mds_asok(['session', 'ls']) + return int(self._session_by_id(ls_data).get(client_id, {'num_caps': None})['num_caps']) + + def test_reconnect_timeout(self): + # Reconnect timeout + # ================= + # Check that if I stop an MDS and a client goes away, the MDS waits + # for the reconnect period + self.fs.mds_stop() + self.fs.mds_fail() + + mount_a_client_id = self.mount_a.get_global_id() + self.mount_a.umount_wait(force=True) + + self.fs.mds_restart() + + self.fs.wait_for_state('up:reconnect', reject='up:active', timeout=MDS_RESTART_GRACE) + # Check that the MDS locally reports its state correctly + status = self.fs.mds_asok(['status']) + self.assertIn("reconnect_status", status) + + ls_data = self._session_list() + self.assert_session_count(2, ls_data) + + # The session for the dead client should have the 'reconnect' flag set + 
self.assertTrue(self.get_session(mount_a_client_id)['reconnecting']) + + # Wait for the reconnect state to clear, this should take the + # reconnect timeout period. + in_reconnect_for = self.fs.wait_for_state('up:active', timeout=self.mds_reconnect_timeout * 2) + # Check that the period we waited to enter active is within a factor + # of two of the reconnect timeout. + self.assertGreater(in_reconnect_for, self.mds_reconnect_timeout / 2, + "Should have been in reconnect phase for {0} but only took {1}".format( + self.mds_reconnect_timeout, in_reconnect_for + )) + + self.assert_session_count(1) + + # Check that the client that timed out during reconnect can + # mount again and do I/O + self.mount_a.mount() + self.mount_a.wait_until_mounted() + self.mount_a.create_destroy() + + self.assert_session_count(2) + + def test_reconnect_eviction(self): + # Eviction during reconnect + # ========================= + mount_a_client_id = self.mount_a.get_global_id() + + self.fs.mds_stop() + self.fs.mds_fail() + + # The mount goes away while the MDS is offline + self.mount_a.kill() + + self.fs.mds_restart() + + # Enter reconnect phase + self.fs.wait_for_state('up:reconnect', reject='up:active', timeout=MDS_RESTART_GRACE) + self.assert_session_count(2) + + # Evict the stuck client + self.fs.mds_asok(['session', 'evict', "%s" % mount_a_client_id]) + self.assert_session_count(1) + + # Observe that we proceed to active phase without waiting full reconnect timeout + evict_til_active = self.fs.wait_for_state('up:active', timeout=MDS_RESTART_GRACE) + # Once we evict the troublemaker, the reconnect phase should complete + # in well under the reconnect timeout. + self.assertLess(evict_til_active, self.mds_reconnect_timeout * 0.5, + "reconnect did not complete soon enough after eviction, took {0}".format( + evict_til_active + )) + + # We killed earlier so must clean up before trying to use again + self.mount_a.kill_cleanup() + + # Bring the client back + self.mount_a.mount() + self.mount_a.wait_until_mounted() + self.mount_a.create_destroy() + + def test_stale_caps(self): + # Capability release from stale session + # ===================================== + cap_holder = self.mount_a.open_background() + + # Wait for the file to be visible from another client, indicating + # that mount_a has completed its network ops + self.mount_b.wait_for_visible() + + # Simulate client death + self.mount_a.kill() + + try: + # Now, after mds_session_timeout seconds, the waiter should + # complete their operation when the MDS marks the holder's + # session stale. 
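+            # (We time the write below; the assertion further down expects it
+            # to take roughly mds_session_timeout, within a factor of two.)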
+ cap_waiter = self.mount_b.write_background() + a = time.time() + cap_waiter.wait() + b = time.time() + + # Should have succeeded + self.assertEqual(cap_waiter.exitstatus, 0) + + cap_waited = b - a + log.info("cap_waiter waited {0}s".format(cap_waited)) + self.assertTrue(self.mds_session_timeout / 2.0 <= cap_waited <= self.mds_session_timeout * 2.0, + "Capability handover took {0}, expected approx {1}".format( + cap_waited, self.mds_session_timeout + )) + + cap_holder.stdin.close() + try: + cap_holder.wait() + except (CommandFailedError, ConnectionLostError): + # We killed it (and possibly its node), so it raises an error + pass + finally: + # teardown() doesn't quite handle this case cleanly, so help it out + self.mount_a.kill_cleanup() + + self.mount_a.mount() + self.mount_a.wait_until_mounted() + + def test_evicted_caps(self): + # Eviction while holding a capability + # =================================== + + # Take out a write capability on a file on client A, + # and then immediately kill it. + cap_holder = self.mount_a.open_background() + mount_a_client_id = self.mount_a.get_global_id() + + # Wait for the file to be visible from another client, indicating + # that mount_a has completed its network ops + self.mount_b.wait_for_visible() + + # Simulate client death + self.mount_a.kill() + + try: + # The waiter should get stuck waiting for the capability + # held on the MDS by the now-dead client A + cap_waiter = self.mount_b.write_background() + time.sleep(5) + self.assertFalse(cap_waiter.finished) + + self.fs.mds_asok(['session', 'evict', "%s" % mount_a_client_id]) + # Now, because I evicted the old holder of the capability, it should + # immediately get handed over to the waiter + a = time.time() + cap_waiter.wait() + b = time.time() + cap_waited = b - a + log.info("cap_waiter waited {0}s".format(cap_waited)) + # This is the check that it happened 'now' rather than waiting + # for the session timeout + self.assertLess(cap_waited, self.mds_session_timeout / 2.0, + "Capability handover took {0}, expected less than {1}".format( + cap_waited, self.mds_session_timeout / 2.0 + )) + + cap_holder.stdin.close() + try: + cap_holder.wait() + except (CommandFailedError, ConnectionLostError): + # We killed it (and possibly its node), so it raises an error + pass + finally: + self.mount_a.kill_cleanup() + + self.mount_a.mount() + self.mount_a.wait_until_mounted() + + def test_trim_caps(self): + # Trim capability when reconnecting MDS + # =================================== + + count = 500 + # Create lots of files + for i in range(count): + self.mount_a.run_shell(["touch", "f{0}".format(i)]) + + # Populate mount_b's cache + self.mount_b.run_shell(["ls", "-l"]) + + client_id = self.mount_b.get_global_id() + num_caps = self._session_num_caps(client_id) + self.assertGreaterEqual(num_caps, count) + + # Restart MDS. 
client should trim its cache when reconnecting to the MDS + self.fs.mds_fail_restart() + self.fs.wait_for_state('up:active', timeout=MDS_RESTART_GRACE) + + num_caps = self._session_num_caps(client_id) + self.assertLess(num_caps, count, + "should have less than {0} capabilities, have {1}".format( + count, num_caps + )) + + def _is_flockable(self): + a_version_str = get_package_version(self.mount_a.client_remote, "fuse") + b_version_str = get_package_version(self.mount_b.client_remote, "fuse") + flock_version_str = "2.9" + + version_regex = re.compile(r"[0-9\.]+") + a_result = version_regex.match(a_version_str) + self.assertTrue(a_result) + b_result = version_regex.match(b_version_str) + self.assertTrue(b_result) + a_version = version.StrictVersion(a_result.group()) + b_version = version.StrictVersion(b_result.group()) + flock_version=version.StrictVersion(flock_version_str) + + if (a_version >= flock_version and b_version >= flock_version): + log.info("flock locks are available") + return True + else: + log.info("not testing flock locks, machines have versions {av} and {bv}".format( + av=a_version_str,bv=b_version_str)) + return False + + def test_filelock(self): + """ + Check that file lock doesn't get lost after an MDS restart + """ + + flockable = self._is_flockable() + lock_holder = self.mount_a.lock_background(do_flock=flockable) + + self.mount_b.wait_for_visible("background_file-2") + self.mount_b.check_filelock(do_flock=flockable) + + self.fs.mds_fail_restart() + self.fs.wait_for_state('up:active', timeout=MDS_RESTART_GRACE) + + self.mount_b.check_filelock(do_flock=flockable) + + # Tear down the background process + lock_holder.stdin.close() + try: + lock_holder.wait() + except (CommandFailedError, ConnectionLostError): + # We killed it, so it raises an error + pass + + def test_filelock_eviction(self): + """ + Check that file lock held by evicted client is given to + waiting client. + """ + if not self._is_flockable(): + self.skipTest("flock is not available") + + lock_holder = self.mount_a.lock_background() + self.mount_b.wait_for_visible("background_file-2") + self.mount_b.check_filelock() + + lock_taker = self.mount_b.lock_and_release() + # Check the taker is waiting (doesn't get it immediately) + time.sleep(2) + self.assertFalse(lock_holder.finished) + self.assertFalse(lock_taker.finished) + + try: + mount_a_client_id = self.mount_a.get_global_id() + self.fs.mds_asok(['session', 'evict', "%s" % mount_a_client_id]) + + # Evicting mount_a should let mount_b's attempt to take the lock + # succeed + self.wait_until_true(lambda: lock_taker.finished, timeout=10) + finally: + # teardown() doesn't quite handle this case cleanly, so help it out + self.mount_a.kill() + self.mount_a.kill_cleanup() + + # Bring the client back + self.mount_a.mount() + self.mount_a.wait_until_mounted() + + def test_dir_fsync(self): + self._test_fsync(True); + + def test_create_fsync(self): + self._test_fsync(False); + + def _test_fsync(self, dirfsync): + """ + That calls to fsync guarantee visibility of metadata to another + client immediately after the fsyncing client dies. + """ + + # Leave this guy out until he's needed + self.mount_b.umount_wait() + + # Create dir + child dentry on client A, and fsync the dir + path = os.path.join(self.mount_a.mountpoint, "subdir") + self.mount_a.run_python( + dedent(""" + import os + import time + + path = "{path}" + + print "Starting creation..." 
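+            # (each phase below is timed and printed, so the teuthology log
+            # shows where any stall happened)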
+ start = time.time() + + os.mkdir(path) + dfd = os.open(path, os.O_DIRECTORY) + + fd = open(os.path.join(path, "childfile"), "w") + print "Finished creation in {{0}}s".format(time.time() - start) + + print "Starting fsync..." + start = time.time() + if {dirfsync}: + os.fsync(dfd) + else: + os.fsync(fd) + print "Finished fsync in {{0}}s".format(time.time() - start) + """.format(path=path,dirfsync=str(dirfsync))) + ) + + # Immediately kill the MDS and then client A + self.fs.mds_stop() + self.fs.mds_fail() + self.mount_a.kill() + self.mount_a.kill_cleanup() + + # Restart the MDS. Wait for it to come up, it'll have to time out in clientreplay + self.fs.mds_restart() + log.info("Waiting for reconnect...") + self.fs.wait_for_state("up:reconnect") + log.info("Waiting for active...") + self.fs.wait_for_state("up:active", timeout=MDS_RESTART_GRACE + self.mds_reconnect_timeout) + log.info("Reached active...") + + # Is the child dentry visible from mount B? + self.mount_b.mount() + self.mount_b.wait_until_mounted() + self.mount_b.run_shell(["ls", "subdir/childfile"]) diff --git a/src/ceph/qa/tasks/cephfs/test_config_commands.py b/src/ceph/qa/tasks/cephfs/test_config_commands.py new file mode 100644 index 0000000..ce0619f --- /dev/null +++ b/src/ceph/qa/tasks/cephfs/test_config_commands.py @@ -0,0 +1,63 @@ + +from unittest import case +from tasks.cephfs.cephfs_test_case import CephFSTestCase +from tasks.cephfs.fuse_mount import FuseMount + + +class TestConfigCommands(CephFSTestCase): + """ + Test that daemons and clients respond to the otherwise rarely-used + runtime config modification operations. + """ + + CLIENTS_REQUIRED = 1 + MDSS_REQUIRED = 1 + + def test_client_config(self): + """ + That I can successfully issue asok "config set" commands + + :return: + """ + + if not isinstance(self.mount_a, FuseMount): + raise case.SkipTest("Test only applies to FUSE clients") + + test_key = "client_cache_size" + test_val = "123" + self.mount_a.admin_socket(['config', 'set', test_key, test_val]) + out = self.mount_a.admin_socket(['config', 'get', test_key]) + self.assertEqual(out[test_key], test_val) + + self.mount_a.write_n_mb("file.bin", 1); + + # Implicitly asserting that things don't have lockdep error in shutdown + self.mount_a.umount_wait(require_clean=True) + self.fs.mds_stop() + + def test_mds_config_asok(self): + test_key = "mds_max_purge_ops" + test_val = "123" + self.fs.mds_asok(['config', 'set', test_key, test_val]) + out = self.fs.mds_asok(['config', 'get', test_key]) + self.assertEqual(out[test_key], test_val) + + # Implicitly asserting that things don't have lockdep error in shutdown + self.mount_a.umount_wait(require_clean=True) + self.fs.mds_stop() + + def test_mds_config_tell(self): + test_key = "mds_max_purge_ops" + test_val = "123" + + mds_id = self.fs.get_lone_mds_id() + self.fs.mon_manager.raw_cluster_cmd("tell", "mds.{0}".format(mds_id), "injectargs", + "--{0}={1}".format(test_key, test_val)) + + # Read it back with asok because there is no `tell` equivalent + out = self.fs.mds_asok(['config', 'get', test_key]) + self.assertEqual(out[test_key], test_val) + + # Implicitly asserting that things don't have lockdep error in shutdown + self.mount_a.umount_wait(require_clean=True) + self.fs.mds_stop() diff --git a/src/ceph/qa/tasks/cephfs/test_damage.py b/src/ceph/qa/tasks/cephfs/test_damage.py new file mode 100644 index 0000000..380b49c --- /dev/null +++ b/src/ceph/qa/tasks/cephfs/test_damage.py @@ -0,0 +1,548 @@ +import json +import logging +import errno +import re +from 
teuthology.contextutil import MaxWhileTries +from teuthology.exceptions import CommandFailedError +from teuthology.orchestra.run import wait +from tasks.cephfs.fuse_mount import FuseMount +from tasks.cephfs.cephfs_test_case import CephFSTestCase, for_teuthology + +DAMAGED_ON_START = "damaged_on_start" +DAMAGED_ON_LS = "damaged_on_ls" +CRASHED = "server crashed" +NO_DAMAGE = "no damage" +FAILED_CLIENT = "client failed" +FAILED_SERVER = "server failed" + +# An EIO in response to a stat from the client +EIO_ON_LS = "eio" + +# An EIO, but nothing in damage table (not ever what we expect) +EIO_NO_DAMAGE = "eio without damage entry" + + +log = logging.getLogger(__name__) + + +class TestDamage(CephFSTestCase): + def _simple_workload_write(self): + self.mount_a.run_shell(["mkdir", "subdir"]) + self.mount_a.write_n_mb("subdir/sixmegs", 6) + return self.mount_a.stat("subdir/sixmegs") + + def is_marked_damaged(self, rank): + mds_map = self.fs.get_mds_map() + return rank in mds_map['damaged'] + + @for_teuthology #459s + def test_object_deletion(self): + """ + That the MDS has a clean 'damaged' response to loss of any single metadata object + """ + + self._simple_workload_write() + + # Hmm, actually it would be nice to permute whether the metadata pool + # state contains sessions or not, but for the moment close this session + # to avoid waiting through reconnect on every MDS start. + self.mount_a.umount_wait() + for mds_name in self.fs.get_active_names(): + self.fs.mds_asok(["flush", "journal"], mds_name) + + self.fs.mds_stop() + self.fs.mds_fail() + + self.fs.rados(['export', '/tmp/metadata.bin']) + + def is_ignored(obj_id, dentry=None): + """ + A filter to avoid redundantly mutating many similar objects (e.g. + stray dirfrags) or similar dentries (e.g. stray dir dentries) + """ + if re.match("60.\.00000000", obj_id) and obj_id != "600.00000000": + return True + + if dentry and obj_id == "100.00000000": + if re.match("stray.+_head", dentry) and dentry != "stray0_head": + return True + + return False + + def get_path(obj_id, dentry=None): + """ + What filesystem path does this object or dentry correspond to? i.e. + what should I poke to see EIO after damaging it? 
+ """ + + if obj_id == "1.00000000" and dentry == "subdir_head": + return "./subdir" + elif obj_id == "10000000000.00000000" and dentry == "sixmegs_head": + return "./subdir/sixmegs" + + # None means ls will do an "ls -R" in hope of seeing some errors + return None + + objects = self.fs.rados(["ls"]).split("\n") + objects = [o for o in objects if not is_ignored(o)] + + # Find all objects with an OMAP header + omap_header_objs = [] + for o in objects: + header = self.fs.rados(["getomapheader", o]) + # The rados CLI wraps the header output in a hex-printed style + header_bytes = int(re.match("header \((.+) bytes\)", header).group(1)) + if header_bytes > 0: + omap_header_objs.append(o) + + # Find all OMAP key/vals + omap_keys = [] + for o in objects: + keys_str = self.fs.rados(["listomapkeys", o]) + if keys_str: + for key in keys_str.split("\n"): + if not is_ignored(o, key): + omap_keys.append((o, key)) + + # Find objects that have data in their bodies + data_objects = [] + for obj_id in objects: + stat_out = self.fs.rados(["stat", obj_id]) + size = int(re.match(".+, size (.+)$", stat_out).group(1)) + if size > 0: + data_objects.append(obj_id) + + # Define the various forms of damage we will inflict + class MetadataMutation(object): + def __init__(self, obj_id_, desc_, mutate_fn_, expectation_, ls_path=None): + self.obj_id = obj_id_ + self.desc = desc_ + self.mutate_fn = mutate_fn_ + self.expectation = expectation_ + if ls_path is None: + self.ls_path = "." + else: + self.ls_path = ls_path + + def __eq__(self, other): + return self.desc == other.desc + + def __hash__(self): + return hash(self.desc) + + junk = "deadbeef" * 10 + mutations = [] + + # Removals + for obj_id in objects: + if obj_id in [ + # JournalPointers are auto-replaced if missing (same path as upgrade) + "400.00000000", + # Missing dirfrags for non-system dirs result in empty directory + "10000000000.00000000", + # PurgeQueue is auto-created if not found on startup + "500.00000000" + ]: + expectation = NO_DAMAGE + else: + expectation = DAMAGED_ON_START + + log.info("Expectation on rm '{0}' will be '{1}'".format( + obj_id, expectation + )) + + mutations.append(MetadataMutation( + obj_id, + "Delete {0}".format(obj_id), + lambda o=obj_id: self.fs.rados(["rm", o]), + expectation + )) + + # Blatant corruptions + mutations.extend([ + MetadataMutation( + o, + "Corrupt {0}".format(o), + lambda o=o: self.fs.rados(["put", o, "-"], stdin_data=junk), + DAMAGED_ON_START + ) for o in data_objects + ]) + + # Truncations + for obj_id in data_objects: + if obj_id == "500.00000000": + # The PurgeQueue is allowed to be empty: Journaler interprets + # an empty header object as an empty journal. 
+ expectation = NO_DAMAGE + else: + expectation = DAMAGED_ON_START + + mutations.append( + MetadataMutation( + o, + "Truncate {0}".format(o), + lambda o=o: self.fs.rados(["truncate", o, "0"]), + DAMAGED_ON_START + )) + + # OMAP value corruptions + for o, k in omap_keys: + if o.startswith("100."): + # Anything in rank 0's 'mydir' + expectation = DAMAGED_ON_START + else: + expectation = EIO_ON_LS + + mutations.append( + MetadataMutation( + o, + "Corrupt omap key {0}:{1}".format(o, k), + lambda o=o,k=k: self.fs.rados(["setomapval", o, k, junk]), + expectation, + get_path(o, k) + ) + ) + + # OMAP header corruptions + for obj_id in omap_header_objs: + if re.match("60.\.00000000", obj_id) \ + or obj_id in ["1.00000000", "100.00000000", "mds0_sessionmap"]: + expectation = DAMAGED_ON_START + else: + expectation = NO_DAMAGE + + log.info("Expectation on corrupt header '{0}' will be '{1}'".format( + obj_id, expectation + )) + + mutations.append( + MetadataMutation( + obj_id, + "Corrupt omap header on {0}".format(obj_id), + lambda o=obj_id: self.fs.rados(["setomapheader", o, junk]), + expectation + ) + ) + + results = {} + + for mutation in mutations: + log.info("Applying mutation '{0}'".format(mutation.desc)) + + # Reset MDS state + self.mount_a.umount_wait(force=True) + self.fs.mds_stop() + self.fs.mds_fail() + self.fs.mon_manager.raw_cluster_cmd('mds', 'repaired', '0') + + # Reset RADOS pool state + self.fs.rados(['import', '/tmp/metadata.bin']) + + # Inject the mutation + mutation.mutate_fn() + + # Try starting the MDS + self.fs.mds_restart() + + # How long we'll wait between starting a daemon and expecting + # it to make it through startup, and potentially declare itself + # damaged to the mon cluster. + startup_timeout = 60 + + if mutation.expectation not in (EIO_ON_LS, DAMAGED_ON_LS, NO_DAMAGE): + if mutation.expectation == DAMAGED_ON_START: + # The MDS may pass through active before making it to damaged + try: + self.wait_until_true(lambda: self.is_marked_damaged(0), startup_timeout) + except RuntimeError: + pass + + # Wait for MDS to either come up or go into damaged state + try: + self.wait_until_true(lambda: self.is_marked_damaged(0) or self.fs.are_daemons_healthy(), startup_timeout) + except RuntimeError: + crashed = False + # Didn't make it to healthy or damaged, did it crash? + for daemon_id, daemon in self.fs.mds_daemons.items(): + if daemon.proc and daemon.proc.finished: + crashed = True + log.error("Daemon {0} crashed!".format(daemon_id)) + daemon.proc = None # So that subsequent stop() doesn't raise error + if not crashed: + # Didn't go health, didn't go damaged, didn't crash, so what? 
+ raise + else: + log.info("Result: Mutation '{0}' led to crash".format(mutation.desc)) + results[mutation] = CRASHED + continue + if self.is_marked_damaged(0): + log.info("Result: Mutation '{0}' led to DAMAGED state".format(mutation.desc)) + results[mutation] = DAMAGED_ON_START + continue + else: + log.info("Mutation '{0}' did not prevent MDS startup, attempting ls...".format(mutation.desc)) + else: + try: + self.wait_until_true(self.fs.are_daemons_healthy, 60) + except RuntimeError: + log.info("Result: Mutation '{0}' should have left us healthy, actually not.".format(mutation.desc)) + if self.is_marked_damaged(0): + results[mutation] = DAMAGED_ON_START + else: + results[mutation] = FAILED_SERVER + continue + log.info("Daemons came up after mutation '{0}', proceeding to ls".format(mutation.desc)) + + # MDS is up, should go damaged on ls or client mount + self.mount_a.mount() + self.mount_a.wait_until_mounted() + if mutation.ls_path == ".": + proc = self.mount_a.run_shell(["ls", "-R", mutation.ls_path], wait=False) + else: + proc = self.mount_a.stat(mutation.ls_path, wait=False) + + if mutation.expectation == DAMAGED_ON_LS: + try: + self.wait_until_true(lambda: self.is_marked_damaged(0), 60) + log.info("Result: Mutation '{0}' led to DAMAGED state after ls".format(mutation.desc)) + results[mutation] = DAMAGED_ON_LS + except RuntimeError: + if self.fs.are_daemons_healthy(): + log.error("Result: Failed to go damaged on mutation '{0}', actually went active".format( + mutation.desc)) + results[mutation] = NO_DAMAGE + else: + log.error("Result: Failed to go damaged on mutation '{0}'".format(mutation.desc)) + results[mutation] = FAILED_SERVER + + else: + try: + wait([proc], 20) + log.info("Result: Mutation '{0}' did not caused DAMAGED state".format(mutation.desc)) + results[mutation] = NO_DAMAGE + except MaxWhileTries: + log.info("Result: Failed to complete client IO on mutation '{0}'".format(mutation.desc)) + results[mutation] = FAILED_CLIENT + except CommandFailedError as e: + if e.exitstatus == errno.EIO: + log.info("Result: EIO on client") + results[mutation] = EIO_ON_LS + else: + log.info("Result: unexpected error {0} on client".format(e)) + results[mutation] = FAILED_CLIENT + + if mutation.expectation == EIO_ON_LS: + # EIOs mean something handled by DamageTable: assert that it has + # been populated + damage = json.loads( + self.fs.mon_manager.raw_cluster_cmd( + 'tell', 'mds.{0}'.format(self.fs.get_active_names()[0]), "damage", "ls", '--format=json-pretty')) + if len(damage) == 0: + results[mutation] = EIO_NO_DAMAGE + + failures = [(mutation, result) for (mutation, result) in results.items() if mutation.expectation != result] + if failures: + log.error("{0} mutations had unexpected outcomes:".format(len(failures))) + for mutation, result in failures: + log.error(" Expected '{0}' actually '{1}' from '{2}'".format( + mutation.expectation, result, mutation.desc + )) + raise RuntimeError("{0} mutations had unexpected outcomes".format(len(failures))) + else: + log.info("All {0} mutations had expected outcomes".format(len(mutations))) + + def test_damaged_dentry(self): + # Damage to dentrys is interesting because it leaves the + # directory's `complete` flag in a subtle state where + # we have marked the dir complete in order that folks + # can access it, but in actual fact there is a dentry + # missing + self.mount_a.run_shell(["mkdir", "subdir/"]) + + self.mount_a.run_shell(["touch", "subdir/file_undamaged"]) + self.mount_a.run_shell(["touch", "subdir/file_to_be_damaged"]) + + subdir_ino = 
self.mount_a.path_to_ino("subdir") + + self.mount_a.umount_wait() + for mds_name in self.fs.get_active_names(): + self.fs.mds_asok(["flush", "journal"], mds_name) + + self.fs.mds_stop() + self.fs.mds_fail() + + # Corrupt a dentry + junk = "deadbeef" * 10 + dirfrag_obj = "{0:x}.00000000".format(subdir_ino) + self.fs.rados(["setomapval", dirfrag_obj, "file_to_be_damaged_head", junk]) + + # Start up and try to list it + self.fs.mds_restart() + self.fs.wait_for_daemons() + + self.mount_a.mount() + self.mount_a.wait_until_mounted() + dentries = self.mount_a.ls("subdir/") + + # The damaged guy should have disappeared + self.assertEqual(dentries, ["file_undamaged"]) + + # I should get ENOENT if I try and read it normally, because + # the dir is considered complete + try: + self.mount_a.stat("subdir/file_to_be_damaged", wait=True) + except CommandFailedError as e: + self.assertEqual(e.exitstatus, errno.ENOENT) + else: + raise AssertionError("Expected ENOENT") + + # The fact that there is damaged should have bee recorded + damage = json.loads( + self.fs.mon_manager.raw_cluster_cmd( + 'tell', 'mds.{0}'.format(self.fs.get_active_names()[0]), + "damage", "ls", '--format=json-pretty')) + self.assertEqual(len(damage), 1) + damage_id = damage[0]['id'] + + # If I try to create a dentry with the same name as the damaged guy + # then that should be forbidden + try: + self.mount_a.touch("subdir/file_to_be_damaged") + except CommandFailedError as e: + self.assertEqual(e.exitstatus, errno.EIO) + else: + raise AssertionError("Expected EIO") + + # Attempting that touch will clear the client's complete flag, now + # when I stat it I'll get EIO instead of ENOENT + try: + self.mount_a.stat("subdir/file_to_be_damaged", wait=True) + except CommandFailedError as e: + if isinstance(self.mount_a, FuseMount): + self.assertEqual(e.exitstatus, errno.EIO) + else: + # Kernel client handles this case differently + self.assertEqual(e.exitstatus, errno.ENOENT) + else: + raise AssertionError("Expected EIO") + + nfiles = self.mount_a.getfattr("./subdir", "ceph.dir.files") + self.assertEqual(nfiles, "2") + + self.mount_a.umount_wait() + + # Now repair the stats + scrub_json = self.fs.mds_asok(["scrub_path", "/subdir", "repair"]) + log.info(json.dumps(scrub_json, indent=2)) + + self.assertEqual(scrub_json["passed_validation"], False) + self.assertEqual(scrub_json["raw_stats"]["checked"], True) + self.assertEqual(scrub_json["raw_stats"]["passed"], False) + + # Check that the file count is now correct + self.mount_a.mount() + self.mount_a.wait_until_mounted() + nfiles = self.mount_a.getfattr("./subdir", "ceph.dir.files") + self.assertEqual(nfiles, "1") + + # Clean up the omap object + self.fs.rados(["setomapval", dirfrag_obj, "file_to_be_damaged_head", junk]) + + # Clean up the damagetable entry + self.fs.mon_manager.raw_cluster_cmd( + 'tell', 'mds.{0}'.format(self.fs.get_active_names()[0]), + "damage", "rm", "{did}".format(did=damage_id)) + + # Now I should be able to create a file with the same name as the + # damaged guy if I want. 
+ self.mount_a.touch("subdir/file_to_be_damaged") + + def test_open_ino_errors(self): + """ + That errors encountered during opening inos are properly propagated + """ + + self.mount_a.run_shell(["mkdir", "dir1"]) + self.mount_a.run_shell(["touch", "dir1/file1"]) + self.mount_a.run_shell(["mkdir", "dir2"]) + self.mount_a.run_shell(["touch", "dir2/file2"]) + self.mount_a.run_shell(["mkdir", "testdir"]) + self.mount_a.run_shell(["ln", "dir1/file1", "testdir/hardlink1"]) + self.mount_a.run_shell(["ln", "dir2/file2", "testdir/hardlink2"]) + + file1_ino = self.mount_a.path_to_ino("dir1/file1") + file2_ino = self.mount_a.path_to_ino("dir2/file2") + dir2_ino = self.mount_a.path_to_ino("dir2") + + # Ensure everything is written to backing store + self.mount_a.umount_wait() + self.fs.mds_asok(["flush", "journal"]) + + # Drop everything from the MDS cache + self.mds_cluster.mds_stop() + self.fs.journal_tool(['journal', 'reset']) + self.mds_cluster.mds_fail_restart() + self.fs.wait_for_daemons() + + self.mount_a.mount() + + # Case 1: un-decodeable backtrace + + # Validate that the backtrace is present and decodable + self.fs.read_backtrace(file1_ino) + # Go corrupt the backtrace of alpha/target (used for resolving + # bravo/hardlink). + self.fs._write_data_xattr(file1_ino, "parent", "rhubarb") + + # Check that touching the hardlink gives EIO + ran = self.mount_a.run_shell(["stat", "testdir/hardlink1"], wait=False) + try: + ran.wait() + except CommandFailedError: + self.assertTrue("Input/output error" in ran.stderr.getvalue()) + + # Check that an entry is created in the damage table + damage = json.loads( + self.fs.mon_manager.raw_cluster_cmd( + 'tell', 'mds.{0}'.format(self.fs.get_active_names()[0]), + "damage", "ls", '--format=json-pretty')) + self.assertEqual(len(damage), 1) + self.assertEqual(damage[0]['damage_type'], "backtrace") + self.assertEqual(damage[0]['ino'], file1_ino) + + self.fs.mon_manager.raw_cluster_cmd( + 'tell', 'mds.{0}'.format(self.fs.get_active_names()[0]), + "damage", "rm", str(damage[0]['id'])) + + + # Case 2: missing dirfrag for the target inode + + self.fs.rados(["rm", "{0:x}.00000000".format(dir2_ino)]) + + # Check that touching the hardlink gives EIO + ran = self.mount_a.run_shell(["stat", "testdir/hardlink2"], wait=False) + try: + ran.wait() + except CommandFailedError: + self.assertTrue("Input/output error" in ran.stderr.getvalue()) + + # Check that an entry is created in the damage table + damage = json.loads( + self.fs.mon_manager.raw_cluster_cmd( + 'tell', 'mds.{0}'.format(self.fs.get_active_names()[0]), + "damage", "ls", '--format=json-pretty')) + self.assertEqual(len(damage), 2) + if damage[0]['damage_type'] == "backtrace" : + self.assertEqual(damage[0]['ino'], file2_ino) + self.assertEqual(damage[1]['damage_type'], "dir_frag") + self.assertEqual(damage[1]['ino'], dir2_ino) + else: + self.assertEqual(damage[0]['damage_type'], "dir_frag") + self.assertEqual(damage[0]['ino'], dir2_ino) + self.assertEqual(damage[1]['damage_type'], "backtrace") + self.assertEqual(damage[1]['ino'], file2_ino) + + for entry in damage: + self.fs.mon_manager.raw_cluster_cmd( + 'tell', 'mds.{0}'.format(self.fs.get_active_names()[0]), + "damage", "rm", str(entry['id'])) diff --git a/src/ceph/qa/tasks/cephfs/test_data_scan.py b/src/ceph/qa/tasks/cephfs/test_data_scan.py new file mode 100644 index 0000000..a2d3157 --- /dev/null +++ b/src/ceph/qa/tasks/cephfs/test_data_scan.py @@ -0,0 +1,600 @@ + +""" +Test our tools for recovering metadata from the data pool +""" +import json + +import logging 
+import os +from textwrap import dedent +import traceback +from collections import namedtuple, defaultdict + +from teuthology.orchestra.run import CommandFailedError +from tasks.cephfs.cephfs_test_case import CephFSTestCase, for_teuthology + +log = logging.getLogger(__name__) + + +ValidationError = namedtuple("ValidationError", ["exception", "backtrace"]) + + +class Workload(object): + def __init__(self, filesystem, mount): + self._mount = mount + self._filesystem = filesystem + self._initial_state = None + + # Accumulate backtraces for every failed validation, and return them. Backtraces + # are rather verbose, but we only see them when something breaks, and they + # let us see which check failed without having to decorate each check with + # a string + self._errors = [] + + def assert_equal(self, a, b): + try: + if a != b: + raise AssertionError("{0} != {1}".format(a, b)) + except AssertionError as e: + self._errors.append( + ValidationError(e, traceback.format_exc(3)) + ) + + def write(self): + """ + Write the workload files to the mount + """ + raise NotImplementedError() + + def validate(self): + """ + Read from the mount and validate that the workload files are present (i.e. have + survived or been reconstructed from the test scenario) + """ + raise NotImplementedError() + + def damage(self): + """ + Damage the filesystem pools in ways that will be interesting to recover from. By + default just wipe everything in the metadata pool + """ + # Delete every object in the metadata pool + objects = self._filesystem.rados(["ls"]).split("\n") + for o in objects: + self._filesystem.rados(["rm", o]) + + def flush(self): + """ + Called after client unmount, after write: flush whatever you want + """ + self._filesystem.mds_asok(["flush", "journal"]) + + +class SimpleWorkload(Workload): + """ + Single file, single directory, check that it gets recovered and so does its size + """ + def write(self): + self._mount.run_shell(["mkdir", "subdir"]) + self._mount.write_n_mb("subdir/sixmegs", 6) + self._initial_state = self._mount.stat("subdir/sixmegs") + + def validate(self): + self._mount.run_shell(["ls", "subdir"]) + st = self._mount.stat("subdir/sixmegs") + self.assert_equal(st['st_size'], self._initial_state['st_size']) + return self._errors + + +class MovedFile(Workload): + def write(self): + # Create a file whose backtrace disagrees with his eventual position + # in the metadata. We will see that he gets reconstructed in his + # original position according to his backtrace. 
+ self._mount.run_shell(["mkdir", "subdir_alpha"]) + self._mount.run_shell(["mkdir", "subdir_bravo"]) + self._mount.write_n_mb("subdir_alpha/sixmegs", 6) + self._filesystem.mds_asok(["flush", "journal"]) + self._mount.run_shell(["mv", "subdir_alpha/sixmegs", "subdir_bravo/sixmegs"]) + self._initial_state = self._mount.stat("subdir_bravo/sixmegs") + + def flush(self): + pass + + def validate(self): + self.assert_equal(self._mount.ls(), ["subdir_alpha"]) + st = self._mount.stat("subdir_alpha/sixmegs") + self.assert_equal(st['st_size'], self._initial_state['st_size']) + return self._errors + + +class BacktracelessFile(Workload): + def write(self): + self._mount.run_shell(["mkdir", "subdir"]) + self._mount.write_n_mb("subdir/sixmegs", 6) + self._initial_state = self._mount.stat("subdir/sixmegs") + + def flush(self): + # Never flush metadata, so backtrace won't be written + pass + + def validate(self): + ino_name = "%x" % self._initial_state["st_ino"] + + # The inode should be linked into lost+found because we had no path for it + self.assert_equal(self._mount.ls(), ["lost+found"]) + self.assert_equal(self._mount.ls("lost+found"), [ino_name]) + st = self._mount.stat("lost+found/{ino_name}".format(ino_name=ino_name)) + + # We might not have got the name or path, but we should still get the size + self.assert_equal(st['st_size'], self._initial_state['st_size']) + + return self._errors + + +class StripedStashedLayout(Workload): + def __init__(self, fs, m): + super(StripedStashedLayout, self).__init__(fs, m) + + # Nice small stripes so we can quickly do our writes+validates + self.sc = 4 + self.ss = 65536 + self.os = 262144 + + self.interesting_sizes = [ + # Exactly stripe_count objects will exist + self.os * self.sc, + # Fewer than stripe_count objects will exist + self.os * self.sc / 2, + self.os * (self.sc - 1) + self.os / 2, + self.os * (self.sc - 1) + self.os / 2 - 1, + self.os * (self.sc + 1) + self.os / 2, + self.os * (self.sc + 1) + self.os / 2 + 1, + # More than stripe_count objects will exist + self.os * self.sc + self.os * self.sc / 2 + ] + + def write(self): + # Create a dir with a striped layout set on it + self._mount.run_shell(["mkdir", "stripey"]) + + self._mount.setfattr("./stripey", "ceph.dir.layout", + "stripe_unit={ss} stripe_count={sc} object_size={os} pool={pool}".format( + ss=self.ss, os=self.os, sc=self.sc, + pool=self._filesystem.get_data_pool_name() + )) + + # Write files, then flush metadata so that its layout gets written into an xattr + for i, n_bytes in enumerate(self.interesting_sizes): + self._mount.write_test_pattern("stripey/flushed_file_{0}".format(i), n_bytes) + # This is really just validating the validator + self._mount.validate_test_pattern("stripey/flushed_file_{0}".format(i), n_bytes) + self._filesystem.mds_asok(["flush", "journal"]) + + # Write another file in the same way, but this time don't flush the metadata, + # so that it won't have the layout xattr + self._mount.write_test_pattern("stripey/unflushed_file", 1024 * 512) + self._mount.validate_test_pattern("stripey/unflushed_file", 1024 * 512) + + self._initial_state = { + "unflushed_ino": self._mount.path_to_ino("stripey/unflushed_file") + } + + def flush(self): + # Pass because we already selectively flushed during write + pass + + def validate(self): + # The first files should have been recovered into its original location + # with the correct layout: read back correct data + for i, n_bytes in enumerate(self.interesting_sizes): + try: + 
self._mount.validate_test_pattern("stripey/flushed_file_{0}".format(i), n_bytes) + except CommandFailedError as e: + self._errors.append( + ValidationError("File {0} (size {1}): {2}".format(i, n_bytes, e), traceback.format_exc(3)) + ) + + # The unflushed file should have been recovered into lost+found without + # the correct layout: read back junk + ino_name = "%x" % self._initial_state["unflushed_ino"] + self.assert_equal(self._mount.ls("lost+found"), [ino_name]) + try: + self._mount.validate_test_pattern(os.path.join("lost+found", ino_name), 1024 * 512) + except CommandFailedError: + pass + else: + self._errors.append( + ValidationError("Unexpectedly valid data in unflushed striped file", "") + ) + + return self._errors + + +class ManyFilesWorkload(Workload): + def __init__(self, filesystem, mount, file_count): + super(ManyFilesWorkload, self).__init__(filesystem, mount) + self.file_count = file_count + + def write(self): + self._mount.run_shell(["mkdir", "subdir"]) + for n in range(0, self.file_count): + self._mount.write_test_pattern("subdir/{0}".format(n), 6 * 1024 * 1024) + + def validate(self): + for n in range(0, self.file_count): + try: + self._mount.validate_test_pattern("subdir/{0}".format(n), 6 * 1024 * 1024) + except CommandFailedError as e: + self._errors.append( + ValidationError("File {0}: {1}".format(n, e), traceback.format_exc(3)) + ) + + return self._errors + + +class MovedDir(Workload): + def write(self): + # Create a nested dir that we will then move. Two files with two different + # backtraces referring to the moved dir, claiming two different locations for + # it. We will see that only one backtrace wins and the dir ends up with + # single linkage. + self._mount.run_shell(["mkdir", "-p", "grandmother/parent"]) + self._mount.write_n_mb("grandmother/parent/orig_pos_file", 1) + self._filesystem.mds_asok(["flush", "journal"]) + self._mount.run_shell(["mkdir", "grandfather"]) + self._mount.run_shell(["mv", "grandmother/parent", "grandfather"]) + self._mount.write_n_mb("grandfather/parent/new_pos_file", 2) + self._filesystem.mds_asok(["flush", "journal"]) + + self._initial_state = ( + self._mount.stat("grandfather/parent/orig_pos_file"), + self._mount.stat("grandfather/parent/new_pos_file") + ) + + def validate(self): + root_files = self._mount.ls() + self.assert_equal(len(root_files), 1) + self.assert_equal(root_files[0] in ["grandfather", "grandmother"], True) + winner = root_files[0] + st_opf = self._mount.stat("{0}/parent/orig_pos_file".format(winner)) + st_npf = self._mount.stat("{0}/parent/new_pos_file".format(winner)) + + self.assert_equal(st_opf['st_size'], self._initial_state[0]['st_size']) + self.assert_equal(st_npf['st_size'], self._initial_state[1]['st_size']) + + +class MissingZerothObject(Workload): + def write(self): + self._mount.run_shell(["mkdir", "subdir"]) + self._mount.write_n_mb("subdir/sixmegs", 6) + self._initial_state = self._mount.stat("subdir/sixmegs") + + def damage(self): + super(MissingZerothObject, self).damage() + zeroth_id = "{0:x}.00000000".format(self._initial_state['st_ino']) + self._filesystem.rados(["rm", zeroth_id], pool=self._filesystem.get_data_pool_name()) + + def validate(self): + st = self._mount.stat("lost+found/{0:x}".format(self._initial_state['st_ino'])) + self.assert_equal(st['st_size'], self._initial_state['st_size']) + + +class NonDefaultLayout(Workload): + """ + Check that the reconstruction copes with files that have a different + object size in their layout + """ + def write(self): + self._mount.run_shell(["touch", 
"datafile"]) + self._mount.setfattr("./datafile", "ceph.file.layout.object_size", "8388608") + self._mount.run_shell(["dd", "if=/dev/urandom", "of=./datafile", "bs=1M", "count=32"]) + self._initial_state = self._mount.stat("datafile") + + def validate(self): + # Check we got the layout reconstructed properly + object_size = int(self._mount.getfattr( + "./datafile", "ceph.file.layout.object_size")) + self.assert_equal(object_size, 8388608) + + # Check we got the file size reconstructed properly + st = self._mount.stat("datafile") + self.assert_equal(st['st_size'], self._initial_state['st_size']) + + +class TestDataScan(CephFSTestCase): + MDSS_REQUIRED = 2 + + def is_marked_damaged(self, rank): + mds_map = self.fs.get_mds_map() + return rank in mds_map['damaged'] + + def _rebuild_metadata(self, workload, workers=1): + """ + That when all objects in metadata pool are removed, we can rebuild a metadata pool + based on the contents of a data pool, and a client can see and read our files. + """ + + # First, inject some files + + workload.write() + + # Unmount the client and flush the journal: the tool should also cope with + # situations where there is dirty metadata, but we'll test that separately + self.mount_a.umount_wait() + workload.flush() + + # Stop the MDS + self.fs.mds_stop() + self.fs.mds_fail() + + # After recovery, we need the MDS to not be strict about stats (in production these options + # are off by default, but in QA we need to explicitly disable them) + self.fs.set_ceph_conf('mds', 'mds verify scatter', False) + self.fs.set_ceph_conf('mds', 'mds debug scatterstat', False) + + # Apply any data damage the workload wants + workload.damage() + + # Reset the MDS map in case multiple ranks were in play: recovery procedure + # only understands how to rebuild metadata under rank 0 + self.fs.mon_manager.raw_cluster_cmd('fs', 'reset', self.fs.name, + '--yes-i-really-mean-it') + + self.fs.mds_restart() + + def get_state(mds_id): + info = self.mds_cluster.get_mds_info(mds_id) + return info['state'] if info is not None else None + + self.wait_until_true(lambda: self.is_marked_damaged(0), 60) + for mds_id in self.fs.mds_ids: + self.wait_until_equal( + lambda: get_state(mds_id), + "up:standby", + timeout=60) + + self.fs.table_tool([self.fs.name + ":0", "reset", "session"]) + self.fs.table_tool([self.fs.name + ":0", "reset", "snap"]) + self.fs.table_tool([self.fs.name + ":0", "reset", "inode"]) + + # Run the recovery procedure + if False: + with self.assertRaises(CommandFailedError): + # Normal reset should fail when no objects are present, we'll use --force instead + self.fs.journal_tool(["journal", "reset"]) + + self.fs.journal_tool(["journal", "reset", "--force"]) + self.fs.data_scan(["init"]) + self.fs.data_scan(["scan_extents", self.fs.get_data_pool_name()], worker_count=workers) + self.fs.data_scan(["scan_inodes", self.fs.get_data_pool_name()], worker_count=workers) + + # Mark the MDS repaired + self.fs.mon_manager.raw_cluster_cmd('mds', 'repaired', '0') + + # Start the MDS + self.fs.mds_restart() + self.fs.wait_for_daemons() + log.info(str(self.mds_cluster.status())) + + # Mount a client + self.mount_a.mount() + self.mount_a.wait_until_mounted() + + # See that the files are present and correct + errors = workload.validate() + if errors: + log.error("Validation errors found: {0}".format(len(errors))) + for e in errors: + log.error(e.exception) + log.error(e.backtrace) + raise AssertionError("Validation failed, first error: {0}\n{1}".format( + errors[0].exception, errors[0].backtrace + )) 
+ + def test_rebuild_simple(self): + self._rebuild_metadata(SimpleWorkload(self.fs, self.mount_a)) + + def test_rebuild_moved_file(self): + self._rebuild_metadata(MovedFile(self.fs, self.mount_a)) + + def test_rebuild_backtraceless(self): + self._rebuild_metadata(BacktracelessFile(self.fs, self.mount_a)) + + def test_rebuild_moved_dir(self): + self._rebuild_metadata(MovedDir(self.fs, self.mount_a)) + + def test_rebuild_missing_zeroth(self): + self._rebuild_metadata(MissingZerothObject(self.fs, self.mount_a)) + + def test_rebuild_nondefault_layout(self): + self._rebuild_metadata(NonDefaultLayout(self.fs, self.mount_a)) + + def test_stashed_layout(self): + self._rebuild_metadata(StripedStashedLayout(self.fs, self.mount_a)) + + def _dirfrag_keys(self, object_id): + keys_str = self.fs.rados(["listomapkeys", object_id]) + if keys_str: + return keys_str.split("\n") + else: + return [] + + def test_fragmented_injection(self): + """ + That when injecting a dentry into a fragmented directory, we put it in the right fragment. + """ + + self.fs.set_allow_dirfrags(True) + + file_count = 100 + file_names = ["%s" % n for n in range(0, file_count)] + + # Create a directory of `file_count` files, each named after its + # decimal number and containing the string of its decimal number + self.mount_a.run_python(dedent(""" + import os + path = os.path.join("{path}", "subdir") + os.mkdir(path) + for n in range(0, {file_count}): + open(os.path.join(path, "%s" % n), 'w').write("%s" % n) + """.format( + path=self.mount_a.mountpoint, + file_count=file_count + ))) + + dir_ino = self.mount_a.path_to_ino("subdir") + + # Only one MDS should be active! + self.assertEqual(len(self.fs.get_active_names()), 1) + + # Ensure that one directory is fragmented + mds_id = self.fs.get_active_names()[0] + self.fs.mds_asok(["dirfrag", "split", "/subdir", "0/0", "1"], mds_id) + + # Flush journal and stop MDS + self.mount_a.umount_wait() + self.fs.mds_asok(["flush", "journal"], mds_id) + self.fs.mds_stop() + self.fs.mds_fail() + + # Pick a dentry and wipe out its key + # Because I did a 1 bit split, I know one frag will be named <inode>.01000000 + frag_obj_id = "{0:x}.01000000".format(dir_ino) + keys = self._dirfrag_keys(frag_obj_id) + victim_key = keys[7] # arbitrary choice + log.info("victim_key={0}".format(victim_key)) + victim_dentry = victim_key.split("_head")[0] + self.fs.rados(["rmomapkey", frag_obj_id, victim_key]) + + # Start filesystem back up, observe that the file appears to be gone in an `ls` + self.fs.mds_restart() + self.fs.wait_for_daemons() + self.mount_a.mount() + self.mount_a.wait_until_mounted() + files = self.mount_a.run_shell(["ls", "subdir/"]).stdout.getvalue().strip().split("\n") + self.assertListEqual(sorted(files), sorted(list(set(file_names) - set([victim_dentry])))) + + # Stop the filesystem + self.mount_a.umount_wait() + self.fs.mds_stop() + self.fs.mds_fail() + + # Run data-scan, observe that it inserts our dentry back into the correct fragment + # by checking the omap now has the dentry's key again + self.fs.data_scan(["scan_extents", self.fs.get_data_pool_name()]) + self.fs.data_scan(["scan_inodes", self.fs.get_data_pool_name()]) + self.assertIn(victim_key, self._dirfrag_keys(frag_obj_id)) + + # Start the filesystem and check that the dentry we deleted is now once again visible + # and points to the correct file data. 
+ self.fs.mds_restart() + self.fs.wait_for_daemons() + self.mount_a.mount() + self.mount_a.wait_until_mounted() + out = self.mount_a.run_shell(["cat", "subdir/{0}".format(victim_dentry)]).stdout.getvalue().strip() + self.assertEqual(out, victim_dentry) + + # Finally, close the loop by checking our injected dentry survives a merge + mds_id = self.fs.get_active_names()[0] + self.mount_a.ls("subdir") # Do an ls to ensure both frags are in cache so the merge will work + self.fs.mds_asok(["dirfrag", "merge", "/subdir", "0/0"], mds_id) + self.fs.mds_asok(["flush", "journal"], mds_id) + frag_obj_id = "{0:x}.00000000".format(dir_ino) + keys = self._dirfrag_keys(frag_obj_id) + self.assertListEqual(sorted(keys), sorted(["%s_head" % f for f in file_names])) + + @for_teuthology + def test_parallel_execution(self): + self._rebuild_metadata(ManyFilesWorkload(self.fs, self.mount_a, 25), workers=7) + + def test_pg_files(self): + """ + That the pg files command tells us which files are associated with + a particular PG + """ + file_count = 20 + self.mount_a.run_shell(["mkdir", "mydir"]) + self.mount_a.create_n_files("mydir/myfile", file_count) + + # Some files elsewhere in the system that we will ignore + # to check that the tool is filtering properly + self.mount_a.run_shell(["mkdir", "otherdir"]) + self.mount_a.create_n_files("otherdir/otherfile", file_count) + + pgs_to_files = defaultdict(list) + # Rough (slow) reimplementation of the logic + for i in range(0, file_count): + file_path = "mydir/myfile_{0}".format(i) + ino = self.mount_a.path_to_ino(file_path) + obj = "{0:x}.{1:08x}".format(ino, 0) + pgid = json.loads(self.fs.mon_manager.raw_cluster_cmd( + "osd", "map", self.fs.get_data_pool_name(), obj, + "--format=json-pretty" + ))['pgid'] + pgs_to_files[pgid].append(file_path) + log.info("{0}: {1}".format(file_path, pgid)) + + pg_count = self.fs.get_pgs_per_fs_pool() + for pg_n in range(0, pg_count): + pg_str = "{0}.{1}".format(self.fs.get_data_pool_id(), pg_n) + out = self.fs.data_scan(["pg_files", "mydir", pg_str]) + lines = [l for l in out.split("\n") if l] + log.info("{0}: {1}".format(pg_str, lines)) + self.assertSetEqual(set(lines), set(pgs_to_files[pg_str])) + + def test_scan_links(self): + """ + The scan_links command fixes linkage errors + """ + self.mount_a.run_shell(["mkdir", "testdir1"]) + self.mount_a.run_shell(["mkdir", "testdir2"]) + dir1_ino = self.mount_a.path_to_ino("testdir1") + dir2_ino = self.mount_a.path_to_ino("testdir2") + dirfrag1_oid = "{0:x}.00000000".format(dir1_ino) + dirfrag2_oid = "{0:x}.00000000".format(dir2_ino) + + self.mount_a.run_shell(["touch", "testdir1/file1"]) + self.mount_a.run_shell(["ln", "testdir1/file1", "testdir1/link1"]) + self.mount_a.run_shell(["ln", "testdir1/file1", "testdir2/link2"]) + + mds_id = self.fs.get_active_names()[0] + self.fs.mds_asok(["flush", "journal"], mds_id) + + dirfrag1_keys = self._dirfrag_keys(dirfrag1_oid) + + # introduce duplicated primary link + file1_key = "file1_head" + self.assertIn(file1_key, dirfrag1_keys) + file1_omap_data = self.fs.rados(["getomapval", dirfrag1_oid, file1_key, '-']) + self.fs.rados(["setomapval", dirfrag2_oid, file1_key], stdin_data=file1_omap_data) + self.assertIn(file1_key, self._dirfrag_keys(dirfrag2_oid)) + + # remove a remote link, make inode link count incorrect + link1_key = 'link1_head' + self.assertIn(link1_key, dirfrag1_keys) + self.fs.rados(["rmomapkey", dirfrag1_oid, link1_key]) + + # increase good primary link's version + self.mount_a.run_shell(["touch", "testdir1/file1"]) + 
self.mount_a.umount_wait() + + self.fs.mds_asok(["flush", "journal"], mds_id) + self.fs.mds_stop() + self.fs.mds_fail() + + # repair linkage errors + self.fs.data_scan(["scan_links"]) + + # primary link in testdir2 was deleted? + self.assertNotIn(file1_key, self._dirfrag_keys(dirfrag2_oid)) + + self.fs.mds_restart() + self.fs.wait_for_daemons() + + self.mount_a.mount() + self.mount_a.wait_until_mounted() + + # link count was adjusted? + file1_nlink = self.mount_a.path_to_nlink("testdir1/file1") + self.assertEqual(file1_nlink, 2) diff --git a/src/ceph/qa/tasks/cephfs/test_dump_tree.py b/src/ceph/qa/tasks/cephfs/test_dump_tree.py new file mode 100644 index 0000000..6d943f9 --- /dev/null +++ b/src/ceph/qa/tasks/cephfs/test_dump_tree.py @@ -0,0 +1,66 @@ +from tasks.cephfs.cephfs_test_case import CephFSTestCase +import random +import os + +class TestDumpTree(CephFSTestCase): + def get_paths_to_ino(self): + inos = {} + p = self.mount_a.run_shell(["find", "./"]) + paths = p.stdout.getvalue().strip().split() + for path in paths: + inos[path] = self.mount_a.path_to_ino(path, False) + + return inos + + def populate(self): + self.mount_a.run_shell(["git", "clone", + "https://github.com/ceph/ceph-qa-suite"]) + + def test_basic(self): + self.mount_a.run_shell(["mkdir", "parent"]) + self.mount_a.run_shell(["mkdir", "parent/child"]) + self.mount_a.run_shell(["touch", "parent/child/file"]) + self.mount_a.run_shell(["mkdir", "parent/child/grandchild"]) + self.mount_a.run_shell(["touch", "parent/child/grandchild/file"]) + + inos = self.get_paths_to_ino() + tree = self.fs.mds_asok(["dump", "tree", "/parent/child", "1"]) + + target_inos = [inos["./parent/child"], inos["./parent/child/file"], + inos["./parent/child/grandchild"]] + + for ino in tree: + del target_inos[target_inos.index(ino['ino'])] # don't catch! + + assert(len(target_inos) == 0) + + def test_random(self): + random.seed(0) + + self.populate() + inos = self.get_paths_to_ino() + target = random.choice(inos.keys()) + + if target != "./": + target = os.path.dirname(target) + + subtree = [path for path in inos.keys() if path.startswith(target)] + target_inos = [inos[path] for path in subtree] + tree = self.fs.mds_asok(["dump", "tree", target[1:]]) + + for ino in tree: + del target_inos[target_inos.index(ino['ino'])] # don't catch! + + assert(len(target_inos) == 0) + + target_depth = target.count('/') + maxdepth = max([path.count('/') for path in subtree]) - target_depth + depth = random.randint(0, maxdepth) + target_inos = [inos[path] for path in subtree \ + if path.count('/') <= depth + target_depth] + tree = self.fs.mds_asok(["dump", "tree", target[1:], str(depth)]) + + for ino in tree: + del target_inos[target_inos.index(ino['ino'])] # don't catch! 
+ + assert(len(target_inos) == 0) diff --git a/src/ceph/qa/tasks/cephfs/test_exports.py b/src/ceph/qa/tasks/cephfs/test_exports.py new file mode 100644 index 0000000..913999d --- /dev/null +++ b/src/ceph/qa/tasks/cephfs/test_exports.py @@ -0,0 +1,107 @@ +import logging +import time +from tasks.cephfs.fuse_mount import FuseMount +from tasks.cephfs.cephfs_test_case import CephFSTestCase + +log = logging.getLogger(__name__) + +class TestExports(CephFSTestCase): + MDSS_REQUIRED = 2 + + def _wait_subtrees(self, status, rank, test): + timeout = 30 + pause = 2 + test = sorted(test) + for i in range(timeout/pause): + subtrees = self.fs.mds_asok(["get", "subtrees"], mds_id=status.get_rank(self.fs.id, rank)['name']) + subtrees = filter(lambda s: s['dir']['path'].startswith('/'), subtrees) + filtered = sorted([(s['dir']['path'], s['auth_first']) for s in subtrees]) + log.info("%s =?= %s", filtered, test) + if filtered == test: + # Confirm export_pin in output is correct: + for s in subtrees: + self.assertTrue(s['export_pin'] == s['auth_first']) + return subtrees + time.sleep(pause) + raise RuntimeError("rank {0} failed to reach desired subtree state", rank) + + def test_export_pin(self): + self.fs.set_max_mds(2) + self.fs.wait_for_daemons() + + status = self.fs.status() + + self.mount_a.run_shell(["mkdir", "-p", "1/2/3"]) + self._wait_subtrees(status, 0, []) + + # NOP + self.mount_a.setfattr("1", "ceph.dir.pin", "-1") + self._wait_subtrees(status, 0, []) + + # NOP (rank < -1) + self.mount_a.setfattr("1", "ceph.dir.pin", "-2341") + self._wait_subtrees(status, 0, []) + + # pin /1 to rank 1 + self.mount_a.setfattr("1", "ceph.dir.pin", "1") + self._wait_subtrees(status, 1, [('/1', 1)]) + + # Check export_targets is set properly + status = self.fs.status() + log.info(status) + r0 = status.get_rank(self.fs.id, 0) + self.assertTrue(sorted(r0['export_targets']) == [1]) + + # redundant pin /1/2 to rank 1 + self.mount_a.setfattr("1/2", "ceph.dir.pin", "1") + self._wait_subtrees(status, 1, [('/1', 1), ('/1/2', 1)]) + + # change pin /1/2 to rank 0 + self.mount_a.setfattr("1/2", "ceph.dir.pin", "0") + self._wait_subtrees(status, 1, [('/1', 1), ('/1/2', 0)]) + self._wait_subtrees(status, 0, [('/1', 1), ('/1/2', 0)]) + + # change pin /1/2/3 to (presently) non-existent rank 2 + self.mount_a.setfattr("1/2/3", "ceph.dir.pin", "2") + self._wait_subtrees(status, 0, [('/1', 1), ('/1/2', 0)]) + self._wait_subtrees(status, 1, [('/1', 1), ('/1/2', 0)]) + + # change pin /1/2 back to rank 1 + self.mount_a.setfattr("1/2", "ceph.dir.pin", "1") + self._wait_subtrees(status, 1, [('/1', 1), ('/1/2', 1)]) + + # add another directory pinned to 1 + self.mount_a.run_shell(["mkdir", "-p", "1/4/5"]) + self.mount_a.setfattr("1/4/5", "ceph.dir.pin", "1") + self._wait_subtrees(status, 1, [('/1', 1), ('/1/2', 1), ('/1/4/5', 1)]) + + # change pin /1 to 0 + self.mount_a.setfattr("1", "ceph.dir.pin", "0") + self._wait_subtrees(status, 0, [('/1', 0), ('/1/2', 1), ('/1/4/5', 1)]) + + # change pin /1/2 to default (-1); does the subtree root properly respect it's parent pin? 
+ self.mount_a.setfattr("1/2", "ceph.dir.pin", "-1") + self._wait_subtrees(status, 0, [('/1', 0), ('/1/4/5', 1)]) + + if len(list(status.get_standbys())): + self.fs.set_max_mds(3) + self.fs.wait_for_state('up:active', rank=2) + self._wait_subtrees(status, 0, [('/1', 0), ('/1/4/5', 1), ('/1/2/3', 2)]) + + # Check export_targets is set properly + status = self.fs.status() + log.info(status) + r0 = status.get_rank(self.fs.id, 0) + self.assertTrue(sorted(r0['export_targets']) == [1,2]) + r1 = status.get_rank(self.fs.id, 1) + self.assertTrue(sorted(r1['export_targets']) == [0]) + r2 = status.get_rank(self.fs.id, 2) + self.assertTrue(sorted(r2['export_targets']) == []) + + # Test rename + self.mount_a.run_shell(["mkdir", "-p", "a/b", "aa/bb"]) + self.mount_a.setfattr("a", "ceph.dir.pin", "1") + self.mount_a.setfattr("aa/bb", "ceph.dir.pin", "0") + self._wait_subtrees(status, 0, [('/1', 0), ('/1/4/5', 1), ('/1/2/3', 2), ('/a', 1), ('/aa/bb', 0)]) + self.mount_a.run_shell(["mv", "aa", "a/b/"]) + self._wait_subtrees(status, 0, [('/1', 0), ('/1/4/5', 1), ('/1/2/3', 2), ('/a', 1), ('/a/b/aa/bb', 0)]) diff --git a/src/ceph/qa/tasks/cephfs/test_failover.py b/src/ceph/qa/tasks/cephfs/test_failover.py new file mode 100644 index 0000000..9d3392c --- /dev/null +++ b/src/ceph/qa/tasks/cephfs/test_failover.py @@ -0,0 +1,645 @@ +import json +import logging +from unittest import case, SkipTest + +from cephfs_test_case import CephFSTestCase +from teuthology.exceptions import CommandFailedError +from teuthology import misc as teuthology +from tasks.cephfs.fuse_mount import FuseMount + +log = logging.getLogger(__name__) + + +class TestFailover(CephFSTestCase): + CLIENTS_REQUIRED = 1 + MDSS_REQUIRED = 2 + + def test_simple(self): + """ + That when the active MDS is killed, a standby MDS is promoted into + its rank after the grace period. + + This is just a simple unit test, the harder cases are covered + in thrashing tests. + """ + + # Need all my standbys up as well as the active daemons + self.wait_for_daemon_start() + + (original_active, ) = self.fs.get_active_names() + original_standbys = self.mds_cluster.get_standby_daemons() + + # Kill the rank 0 daemon's physical process + self.fs.mds_stop(original_active) + + grace = float(self.fs.get_config("mds_beacon_grace", service_type="mon")) + + # Wait until the monitor promotes his replacement + def promoted(): + active = self.fs.get_active_names() + return active and active[0] in original_standbys + + log.info("Waiting for promotion of one of the original standbys {0}".format( + original_standbys)) + self.wait_until_true( + promoted, + timeout=grace*2) + + # Start the original rank 0 daemon up again, see that he becomes a standby + self.fs.mds_restart(original_active) + self.wait_until_true( + lambda: original_active in self.mds_cluster.get_standby_daemons(), + timeout=60 # Approximately long enough for MDS to start and mon to notice + ) + + def test_client_abort(self): + """ + That a client will respect fuse_require_active_mds and error out + when the cluster appears to be unavailable. 
+ """ + + if not isinstance(self.mount_a, FuseMount): + raise SkipTest("Requires FUSE client to inject client metadata") + + require_active = self.fs.get_config("fuse_require_active_mds", service_type="mon").lower() == "true" + if not require_active: + raise case.SkipTest("fuse_require_active_mds is not set") + + grace = float(self.fs.get_config("mds_beacon_grace", service_type="mon")) + + # Check it's not laggy to begin with + (original_active, ) = self.fs.get_active_names() + self.assertNotIn("laggy_since", self.fs.mon_manager.get_mds_status(original_active)) + + self.mounts[0].umount_wait() + + # Control: that we can mount and unmount usually, while the cluster is healthy + self.mounts[0].mount() + self.mounts[0].wait_until_mounted() + self.mounts[0].umount_wait() + + # Stop the daemon processes + self.fs.mds_stop() + + # Wait for everyone to go laggy + def laggy(): + mdsmap = self.fs.get_mds_map() + for info in mdsmap['info'].values(): + if "laggy_since" not in info: + return False + + return True + + self.wait_until_true(laggy, grace * 2) + with self.assertRaises(CommandFailedError): + self.mounts[0].mount() + + def test_standby_count_wanted(self): + """ + That cluster health warnings are generated by insufficient standbys available. + """ + + # Need all my standbys up as well as the active daemons + self.wait_for_daemon_start() + + grace = float(self.fs.get_config("mds_beacon_grace", service_type="mon")) + + standbys = self.mds_cluster.get_standby_daemons() + self.assertGreaterEqual(len(standbys), 1) + self.fs.mon_manager.raw_cluster_cmd('fs', 'set', self.fs.name, 'standby_count_wanted', str(len(standbys))) + + # Kill a standby and check for warning + victim = standbys.pop() + self.fs.mds_stop(victim) + log.info("waiting for insufficient standby daemon warning") + self.wait_for_health("MDS_INSUFFICIENT_STANDBY", grace*2) + + # restart the standby, see that he becomes a standby, check health clears + self.fs.mds_restart(victim) + self.wait_until_true( + lambda: victim in self.mds_cluster.get_standby_daemons(), + timeout=60 # Approximately long enough for MDS to start and mon to notice + ) + self.wait_for_health_clear(timeout=30) + + # Set it one greater than standbys ever seen + standbys = self.mds_cluster.get_standby_daemons() + self.assertGreaterEqual(len(standbys), 1) + self.fs.mon_manager.raw_cluster_cmd('fs', 'set', self.fs.name, 'standby_count_wanted', str(len(standbys)+1)) + log.info("waiting for insufficient standby daemon warning") + self.wait_for_health("MDS_INSUFFICIENT_STANDBY", grace*2) + + # Set it to 0 + self.fs.mon_manager.raw_cluster_cmd('fs', 'set', self.fs.name, 'standby_count_wanted', '0') + self.wait_for_health_clear(timeout=30) + + + + +class TestStandbyReplay(CephFSTestCase): + MDSS_REQUIRED = 4 + REQUIRE_FILESYSTEM = False + + def set_standby_for(self, leader, follower, replay): + self.set_conf("mds.{0}".format(follower), "mds_standby_for_name", leader) + if replay: + self.set_conf("mds.{0}".format(follower), "mds_standby_replay", "true") + + def get_info_by_name(self, mds_name): + status = self.mds_cluster.status() + info = status.get_mds(mds_name) + if info is None: + log.warn(str(status)) + raise RuntimeError("MDS '{0}' not found".format(mds_name)) + else: + return info + + def test_standby_replay_unused(self): + # Pick out exactly 3 daemons to be run during test + use_daemons = sorted(self.mds_cluster.mds_ids[0:3]) + mds_a, mds_b, mds_c = use_daemons + log.info("Using MDS daemons: {0}".format(use_daemons)) + + # B and C should both follow A, but only one 
will + # really get into standby replay state. + self.set_standby_for(mds_a, mds_b, True) + self.set_standby_for(mds_a, mds_c, True) + + # Create FS and start A + fs_a = self.mds_cluster.newfs("alpha") + self.mds_cluster.mds_restart(mds_a) + fs_a.wait_for_daemons() + self.assertEqual(fs_a.get_active_names(), [mds_a]) + + # Start B, he should go into standby replay + self.mds_cluster.mds_restart(mds_b) + self.wait_for_daemon_start([mds_b]) + info_b = self.get_info_by_name(mds_b) + self.assertEqual(info_b['state'], "up:standby-replay") + self.assertEqual(info_b['standby_for_name'], mds_a) + self.assertEqual(info_b['rank'], 0) + + # Start C, he should go into standby (*not* replay) + self.mds_cluster.mds_restart(mds_c) + self.wait_for_daemon_start([mds_c]) + info_c = self.get_info_by_name(mds_c) + self.assertEqual(info_c['state'], "up:standby") + self.assertEqual(info_c['standby_for_name'], mds_a) + self.assertEqual(info_c['rank'], -1) + + # Kill B, C should go into standby replay + self.mds_cluster.mds_stop(mds_b) + self.mds_cluster.mds_fail(mds_b) + self.wait_until_equal( + lambda: self.get_info_by_name(mds_c)['state'], + "up:standby-replay", + 60) + info_c = self.get_info_by_name(mds_c) + self.assertEqual(info_c['state'], "up:standby-replay") + self.assertEqual(info_c['standby_for_name'], mds_a) + self.assertEqual(info_c['rank'], 0) + + def test_standby_failure(self): + """ + That the failure of a standby-replay daemon happens cleanly + and doesn't interrupt anything else. + """ + # Pick out exactly 2 daemons to be run during test + use_daemons = sorted(self.mds_cluster.mds_ids[0:2]) + mds_a, mds_b = use_daemons + log.info("Using MDS daemons: {0}".format(use_daemons)) + + # Configure two pairs of MDSs that are standby for each other + self.set_standby_for(mds_a, mds_b, True) + self.set_standby_for(mds_b, mds_a, False) + + # Create FS alpha and get mds_a to come up as active + fs_a = self.mds_cluster.newfs("alpha") + self.mds_cluster.mds_restart(mds_a) + fs_a.wait_for_daemons() + self.assertEqual(fs_a.get_active_names(), [mds_a]) + + # Start the standbys + self.mds_cluster.mds_restart(mds_b) + self.wait_for_daemon_start([mds_b]) + + # See the standby come up as the correct rank + info_b = self.get_info_by_name(mds_b) + self.assertEqual(info_b['state'], "up:standby-replay") + self.assertEqual(info_b['standby_for_name'], mds_a) + self.assertEqual(info_b['rank'], 0) + + # Kill the standby + self.mds_cluster.mds_stop(mds_b) + self.mds_cluster.mds_fail(mds_b) + + # See that the standby is gone and the active remains + self.assertEqual(fs_a.get_active_names(), [mds_a]) + mds_map = fs_a.get_mds_map() + self.assertEqual(len(mds_map['info']), 1) + self.assertEqual(mds_map['failed'], []) + self.assertEqual(mds_map['damaged'], []) + self.assertEqual(mds_map['stopped'], []) + + def test_rank_stopped(self): + """ + That when a rank is STOPPED, standby replays for + that rank get torn down + """ + # Pick out exactly 2 daemons to be run during test + use_daemons = sorted(self.mds_cluster.mds_ids[0:4]) + mds_a, mds_b, mds_a_s, mds_b_s = use_daemons + log.info("Using MDS daemons: {0}".format(use_daemons)) + + # a and b both get a standby + self.set_standby_for(mds_a, mds_a_s, True) + self.set_standby_for(mds_b, mds_b_s, True) + + # Create FS alpha and get mds_a to come up as active + fs_a = self.mds_cluster.newfs("alpha") + fs_a.set_max_mds(2) + + self.mds_cluster.mds_restart(mds_a) + self.wait_until_equal(lambda: fs_a.get_active_names(), [mds_a], 30) + self.mds_cluster.mds_restart(mds_b) + 
fs_a.wait_for_daemons() + self.assertEqual(sorted(fs_a.get_active_names()), [mds_a, mds_b]) + + # Start the standbys + self.mds_cluster.mds_restart(mds_b_s) + self.wait_for_daemon_start([mds_b_s]) + self.mds_cluster.mds_restart(mds_a_s) + self.wait_for_daemon_start([mds_a_s]) + info_b_s = self.get_info_by_name(mds_b_s) + self.assertEqual(info_b_s['state'], "up:standby-replay") + info_a_s = self.get_info_by_name(mds_a_s) + self.assertEqual(info_a_s['state'], "up:standby-replay") + + # Shrink the cluster + fs_a.set_max_mds(1) + fs_a.mon_manager.raw_cluster_cmd("mds", "stop", "{0}:1".format(fs_a.name)) + self.wait_until_equal( + lambda: fs_a.get_active_names(), [mds_a], + 60 + ) + + # Both 'b' and 'b_s' should go back to being standbys + self.wait_until_equal( + lambda: self.mds_cluster.get_standby_daemons(), {mds_b, mds_b_s}, + 60 + ) + + +class TestMultiFilesystems(CephFSTestCase): + CLIENTS_REQUIRED = 2 + MDSS_REQUIRED = 4 + + # We'll create our own filesystems and start our own daemons + REQUIRE_FILESYSTEM = False + + def setUp(self): + super(TestMultiFilesystems, self).setUp() + self.mds_cluster.mon_manager.raw_cluster_cmd("fs", "flag", "set", + "enable_multiple", "true", + "--yes-i-really-mean-it") + + def _setup_two(self): + fs_a = self.mds_cluster.newfs("alpha") + fs_b = self.mds_cluster.newfs("bravo") + + self.mds_cluster.mds_restart() + + # Wait for both filesystems to go healthy + fs_a.wait_for_daemons() + fs_b.wait_for_daemons() + + # Reconfigure client auth caps + for mount in self.mounts: + self.mds_cluster.mon_manager.raw_cluster_cmd_result( + 'auth', 'caps', "client.{0}".format(mount.client_id), + 'mds', 'allow', + 'mon', 'allow r', + 'osd', 'allow rw pool={0}, allow rw pool={1}'.format( + fs_a.get_data_pool_name(), fs_b.get_data_pool_name())) + + return fs_a, fs_b + + def test_clients(self): + fs_a, fs_b = self._setup_two() + + # Mount a client on fs_a + self.mount_a.mount(mount_fs_name=fs_a.name) + self.mount_a.write_n_mb("pad.bin", 1) + self.mount_a.write_n_mb("test.bin", 2) + a_created_ino = self.mount_a.path_to_ino("test.bin") + self.mount_a.create_files() + + # Mount a client on fs_b + self.mount_b.mount(mount_fs_name=fs_b.name) + self.mount_b.write_n_mb("test.bin", 1) + b_created_ino = self.mount_b.path_to_ino("test.bin") + self.mount_b.create_files() + + # Check that a non-default filesystem mount survives an MDS + # failover (i.e. that map subscription is continuous, not + # just the first time), reproduces #16022 + old_fs_b_mds = fs_b.get_active_names()[0] + self.mds_cluster.mds_stop(old_fs_b_mds) + self.mds_cluster.mds_fail(old_fs_b_mds) + fs_b.wait_for_daemons() + background = self.mount_b.write_background() + # Raise exception if the write doesn't finish (i.e. 
if client + # has not kept up with MDS failure) + try: + self.wait_until_true(lambda: background.finished, timeout=30) + except RuntimeError: + # The mount is stuck, we'll have to force it to fail cleanly + background.stdin.close() + self.mount_b.umount_wait(force=True) + raise + + self.mount_a.umount_wait() + self.mount_b.umount_wait() + + # See that the client's files went into the correct pool + self.assertTrue(fs_a.data_objects_present(a_created_ino, 1024 * 1024)) + self.assertTrue(fs_b.data_objects_present(b_created_ino, 1024 * 1024)) + + def test_standby(self): + fs_a, fs_b = self._setup_two() + + # Assert that the remaining two MDS daemons are now standbys + a_daemons = fs_a.get_active_names() + b_daemons = fs_b.get_active_names() + self.assertEqual(len(a_daemons), 1) + self.assertEqual(len(b_daemons), 1) + original_a = a_daemons[0] + original_b = b_daemons[0] + expect_standby_daemons = set(self.mds_cluster.mds_ids) - (set(a_daemons) | set(b_daemons)) + + # Need all my standbys up as well as the active daemons + self.wait_for_daemon_start() + self.assertEqual(expect_standby_daemons, self.mds_cluster.get_standby_daemons()) + + # Kill fs_a's active MDS, see a standby take over + self.mds_cluster.mds_stop(original_a) + self.mds_cluster.mon_manager.raw_cluster_cmd("mds", "fail", original_a) + self.wait_until_equal(lambda: len(fs_a.get_active_names()), 1, 30, + reject_fn=lambda v: v > 1) + # Assert that it's a *different* daemon that has now appeared in the map for fs_a + self.assertNotEqual(fs_a.get_active_names()[0], original_a) + + # Kill fs_b's active MDS, see a standby take over + self.mds_cluster.mds_stop(original_b) + self.mds_cluster.mon_manager.raw_cluster_cmd("mds", "fail", original_b) + self.wait_until_equal(lambda: len(fs_b.get_active_names()), 1, 30, + reject_fn=lambda v: v > 1) + # Assert that it's a *different* daemon that has now appeared in the map for fs_a + self.assertNotEqual(fs_b.get_active_names()[0], original_b) + + # Both of the original active daemons should be gone, and all standbys used up + self.assertEqual(self.mds_cluster.get_standby_daemons(), set()) + + # Restart the ones I killed, see them reappear as standbys + self.mds_cluster.mds_restart(original_a) + self.mds_cluster.mds_restart(original_b) + self.wait_until_true( + lambda: {original_a, original_b} == self.mds_cluster.get_standby_daemons(), + timeout=30 + ) + + def test_grow_shrink(self): + # Usual setup... + fs_a, fs_b = self._setup_two() + + # Increase max_mds on fs_b, see a standby take up the role + fs_b.set_max_mds(2) + self.wait_until_equal(lambda: len(fs_b.get_active_names()), 2, 30, + reject_fn=lambda v: v > 2 or v < 1) + + # Increase max_mds on fs_a, see a standby take up the role + fs_a.set_max_mds(2) + self.wait_until_equal(lambda: len(fs_a.get_active_names()), 2, 30, + reject_fn=lambda v: v > 2 or v < 1) + + # Shrink fs_b back to 1, see a daemon go back to standby + fs_b.set_max_mds(1) + fs_b.deactivate(1) + self.wait_until_equal(lambda: len(fs_b.get_active_names()), 1, 30, + reject_fn=lambda v: v > 2 or v < 1) + + # Grow fs_a up to 3, see the former fs_b daemon join it. 
+
+ # Grow fs_a up to 3, see the former fs_b daemon join it.
+ fs_a.set_max_mds(3)
+ self.wait_until_equal(lambda: len(fs_a.get_active_names()), 3, 60,
+ reject_fn=lambda v: v > 3 or v < 2)
+
+ def test_standby_for_name(self):
+ # Pick out exactly 4 daemons to be run during test
+ use_daemons = sorted(self.mds_cluster.mds_ids[0:4])
+ mds_a, mds_b, mds_c, mds_d = use_daemons
+ log.info("Using MDS daemons: {0}".format(use_daemons))
+
+ def set_standby_for(leader, follower, replay):
+ self.set_conf("mds.{0}".format(follower), "mds_standby_for_name", leader)
+ if replay:
+ self.set_conf("mds.{0}".format(follower), "mds_standby_replay", "true")
+
+ # Configure two pairs of MDSs that are standby for each other
+ set_standby_for(mds_a, mds_b, True)
+ set_standby_for(mds_b, mds_a, False)
+ set_standby_for(mds_c, mds_d, True)
+ set_standby_for(mds_d, mds_c, False)
+
+ # Create FS alpha and get mds_a to come up as active
+ fs_a = self.mds_cluster.newfs("alpha")
+ self.mds_cluster.mds_restart(mds_a)
+ fs_a.wait_for_daemons()
+ self.assertEqual(fs_a.get_active_names(), [mds_a])
+
+ # Create FS bravo and get mds_c to come up as active
+ fs_b = self.mds_cluster.newfs("bravo")
+ self.mds_cluster.mds_restart(mds_c)
+ fs_b.wait_for_daemons()
+ self.assertEqual(fs_b.get_active_names(), [mds_c])
+
+ # Start the standbys
+ self.mds_cluster.mds_restart(mds_b)
+ self.mds_cluster.mds_restart(mds_d)
+ self.wait_for_daemon_start([mds_b, mds_d])
+
+ def get_info_by_name(fs, mds_name):
+ mds_map = fs.get_mds_map()
+ for gid_str, info in mds_map['info'].items():
+ if info['name'] == mds_name:
+ return info
+
+ log.warn(json.dumps(mds_map, indent=2))
+ raise RuntimeError("MDS '{0}' not found in filesystem MDSMap".format(mds_name))
+
+ # See both standbys come up as standby replay for the correct ranks
+ # mds_b should be in filesystem alpha following mds_a
+ info_b = get_info_by_name(fs_a, mds_b)
+ self.assertEqual(info_b['state'], "up:standby-replay")
+ self.assertEqual(info_b['standby_for_name'], mds_a)
+ self.assertEqual(info_b['rank'], 0)
+ # mds_d should be in filesystem bravo following mds_c
+ info_d = get_info_by_name(fs_b, mds_d)
+ self.assertEqual(info_d['state'], "up:standby-replay")
+ self.assertEqual(info_d['standby_for_name'], mds_c)
+ self.assertEqual(info_d['rank'], 0)
+
+ # Kill both active daemons
+ self.mds_cluster.mds_stop(mds_a)
+ self.mds_cluster.mds_fail(mds_a)
+ self.mds_cluster.mds_stop(mds_c)
+ self.mds_cluster.mds_fail(mds_c)
+
+ # Wait for standbys to take over
+ fs_a.wait_for_daemons()
+ self.assertEqual(fs_a.get_active_names(), [mds_b])
+ fs_b.wait_for_daemons()
+ self.assertEqual(fs_b.get_active_names(), [mds_d])
+
+ # Start the original active daemons up again
+ self.mds_cluster.mds_restart(mds_a)
+ self.mds_cluster.mds_restart(mds_c)
+ self.wait_for_daemon_start([mds_a, mds_c])
+
+ self.assertEqual(set(self.mds_cluster.get_standby_daemons()),
+ {mds_a, mds_c})
+
+ def test_standby_for_rank(self):
+ use_daemons = sorted(self.mds_cluster.mds_ids[0:4])
+ mds_a, mds_b, mds_c, mds_d = use_daemons
+ log.info("Using MDS daemons: {0}".format(use_daemons))
+
+ def set_standby_for(leader_rank, leader_fs, follower_id):
+ self.set_conf("mds.{0}".format(follower_id),
+ "mds_standby_for_rank", leader_rank)
+
+ fscid = leader_fs.get_namespace_id()
+ self.set_conf("mds.{0}".format(follower_id),
+ "mds_standby_for_fscid", fscid)
+
+ fs_a = self.mds_cluster.newfs("alpha")
+ fs_b = self.mds_cluster.newfs("bravo")
+ set_standby_for(0, fs_a, mds_a)
+ set_standby_for(0, fs_a, mds_b)
+ set_standby_for(0, fs_b, mds_c)
+ set_standby_for(0, fs_b, mds_d)
+
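The mds_standby_for_* keys written by set_standby_for() land in each daemon's ceph.conf section; one way to check afterwards which standby-replay daemon is following which name is to scrape the MDS map, much as the nested get_info_by_name() helper above does. A rough standalone equivalent, assuming the `ceph fs get --format=json` layout whose fields these tests already read:

    import json
    import subprocess

    def standby_replay_followers(fs_name):
        # Returns {daemon_name: followed_name} for daemons in standby-replay.
        out = subprocess.check_output(
            ["ceph", "fs", "get", fs_name, "--format=json"])
        mds_map = json.loads(out)["mdsmap"]
        return dict((info["name"], info.get("standby_for_name"))
                    for info in mds_map["info"].values()
                    if info["state"] == "up:standby-replay")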
self.mds_cluster.mds_restart(mds_a) + fs_a.wait_for_daemons() + self.assertEqual(fs_a.get_active_names(), [mds_a]) + + self.mds_cluster.mds_restart(mds_c) + fs_b.wait_for_daemons() + self.assertEqual(fs_b.get_active_names(), [mds_c]) + + self.mds_cluster.mds_restart(mds_b) + self.mds_cluster.mds_restart(mds_d) + self.wait_for_daemon_start([mds_b, mds_d]) + + self.mds_cluster.mds_stop(mds_a) + self.mds_cluster.mds_fail(mds_a) + self.mds_cluster.mds_stop(mds_c) + self.mds_cluster.mds_fail(mds_c) + + fs_a.wait_for_daemons() + self.assertEqual(fs_a.get_active_names(), [mds_b]) + fs_b.wait_for_daemons() + self.assertEqual(fs_b.get_active_names(), [mds_d]) + + def test_standby_for_fscid(self): + """ + That I can set a standby FSCID with no rank, and the result is + that daemons join any rank for that filesystem. + """ + use_daemons = sorted(self.mds_cluster.mds_ids[0:4]) + mds_a, mds_b, mds_c, mds_d = use_daemons + + log.info("Using MDS daemons: {0}".format(use_daemons)) + + def set_standby_for(leader_fs, follower_id): + fscid = leader_fs.get_namespace_id() + self.set_conf("mds.{0}".format(follower_id), + "mds_standby_for_fscid", fscid) + + # Create two filesystems which should have two ranks each + fs_a = self.mds_cluster.newfs("alpha") + + fs_b = self.mds_cluster.newfs("bravo") + + fs_a.set_max_mds(2) + fs_b.set_max_mds(2) + + # Set all the daemons to have a FSCID assignment but no other + # standby preferences. + set_standby_for(fs_a, mds_a) + set_standby_for(fs_a, mds_b) + set_standby_for(fs_b, mds_c) + set_standby_for(fs_b, mds_d) + + # Now when we start all daemons at once, they should fall into + # ranks in the right filesystem + self.mds_cluster.mds_restart(mds_a) + self.mds_cluster.mds_restart(mds_b) + self.mds_cluster.mds_restart(mds_c) + self.mds_cluster.mds_restart(mds_d) + self.wait_for_daemon_start([mds_a, mds_b, mds_c, mds_d]) + fs_a.wait_for_daemons() + fs_b.wait_for_daemons() + self.assertEqual(set(fs_a.get_active_names()), {mds_a, mds_b}) + self.assertEqual(set(fs_b.get_active_names()), {mds_c, mds_d}) + + def test_standby_for_invalid_fscid(self): + """ + That an invalid standby_fscid does not cause a mon crash + """ + use_daemons = sorted(self.mds_cluster.mds_ids[0:3]) + mds_a, mds_b, mds_c = use_daemons + log.info("Using MDS daemons: {0}".format(use_daemons)) + + def set_standby_for_rank(leader_rank, follower_id): + self.set_conf("mds.{0}".format(follower_id), + "mds_standby_for_rank", leader_rank) + + # Create one fs + fs_a = self.mds_cluster.newfs("cephfs") + + # Get configured mons in the cluster, so we can see if any + # crashed later. + configured_mons = fs_a.mon_manager.get_mon_quorum() + + # Set all the daemons to have a rank assignment but no other + # standby preferences. 
+ set_standby_for_rank(0, mds_a)
+ set_standby_for_rank(0, mds_b)
+
+ # Set third daemon to have invalid fscid assignment and no other
+ # standby preferences
+ invalid_fscid = 123
+ self.set_conf("mds.{0}".format(mds_c), "mds_standby_for_fscid", invalid_fscid)
+
+ # Restart all the daemons so the standby preferences take effect
+ self.mds_cluster.mds_restart(mds_a)
+ self.mds_cluster.mds_restart(mds_b)
+ self.mds_cluster.mds_restart(mds_c)
+ self.wait_for_daemon_start([mds_a, mds_b, mds_c])
+
+ # Stop the active MDS daemon of the filesystem
+ if fs_a.get_active_names() == [mds_a]:
+ self.mds_cluster.mds_stop(mds_a)
+ self.mds_cluster.mds_fail(mds_a)
+ fs_a.wait_for_daemons()
+ else:
+ self.mds_cluster.mds_stop(mds_b)
+ self.mds_cluster.mds_fail(mds_b)
+ fs_a.wait_for_daemons()
+
+ # Get active mons from the cluster
+ active_mons = fs_a.mon_manager.get_mon_quorum()
+
+ # Check that the quorum still matches the originally configured mons
+ self.assertEqual(active_mons, configured_mons,
+ "Not all mons are in quorum; the invalid standby fscid test failed!") diff --git a/src/ceph/qa/tasks/cephfs/test_flush.py b/src/ceph/qa/tasks/cephfs/test_flush.py new file mode 100644 index 0000000..1f84e42 --- /dev/null +++ b/src/ceph/qa/tasks/cephfs/test_flush.py @@ -0,0 +1,113 @@ +
+from textwrap import dedent
+from tasks.cephfs.cephfs_test_case import CephFSTestCase
+from tasks.cephfs.filesystem import ObjectNotFound, ROOT_INO
+
+
+class TestFlush(CephFSTestCase):
+ def test_flush(self):
+ self.mount_a.run_shell(["mkdir", "mydir"])
+ self.mount_a.run_shell(["touch", "mydir/alpha"])
+ dir_ino = self.mount_a.path_to_ino("mydir")
+ file_ino = self.mount_a.path_to_ino("mydir/alpha")
+
+ # Unmount the client so that it isn't still holding caps
+ self.mount_a.umount_wait()
+
+ # Before flush, the dirfrag object does not exist
+ with self.assertRaises(ObjectNotFound):
+ self.fs.list_dirfrag(dir_ino)
+
+ # Before flush, the file's backtrace has not been written
+ with self.assertRaises(ObjectNotFound):
+ self.fs.read_backtrace(file_ino)
+
+ # Before flush, there are no dentries in the root
+ self.assertEqual(self.fs.list_dirfrag(ROOT_INO), [])
+
+ # Execute flush
+ flush_data = self.fs.mds_asok(["flush", "journal"])
+ self.assertEqual(flush_data['return_code'], 0)
+
+ # After flush, the dirfrag object has been created
+ dir_list = self.fs.list_dirfrag(dir_ino)
+ self.assertEqual(dir_list, ["alpha_head"])
+
+ # And the 'mydir' dentry is in the root
+ self.assertEqual(self.fs.list_dirfrag(ROOT_INO), ['mydir_head'])
+
+ # ...and the data object has its backtrace
+ backtrace = self.fs.read_backtrace(file_ino)
+ self.assertEqual(['alpha', 'mydir'], [a['dname'] for a in backtrace['ancestors']])
+ self.assertEqual([dir_ino, 1], [a['dirino'] for a in backtrace['ancestors']])
+ self.assertEqual(file_ino, backtrace['ino'])
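list_dirfrag() above amounts to listing the omap keys on a directory's dirfrag object in the metadata pool; a sketch of doing the same by hand with the rados CLI, assuming an unfragmented directory (frag id 00000000):

    import subprocess

    def list_dirfrag_keys(metadata_pool, dir_ino):
        # Dentries are stored as omap keys named <dname>_head on the object
        # <ino-in-hex>.<frag>, e.g. 10000000000.00000000 for an unfragmented dir.
        obj = "{0:x}.00000000".format(dir_ino)
        out = subprocess.check_output(
            ["rados", "-p", metadata_pool, "listomapkeys", obj])
        return out.splitlines()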
+
+ # ...and the journal is truncated to just a single subtreemap from the
+ # newly created segment
+ summary_output = self.fs.journal_tool(["event", "get", "summary"])
+ try:
+ self.assertEqual(summary_output,
+ dedent(
+ """
+ Events by type:
+ SUBTREEMAP: 1
+ Errors: 0
+ """
+ ).strip())
+ except AssertionError:
+ # In some states, flushing the journal will leave you
+ # an extra event from locks that a client held. This is
+ # correct behaviour: the MDS is flushing the journal,
+ # it's just that new events are getting added too.
+ # In this case, we should nevertheless see a fully
+ # empty journal after a second flush.
+ self.assertEqual(summary_output,
+ dedent(
+ """
+ Events by type:
+ SUBTREEMAP: 1
+ UPDATE: 1
+ Errors: 0
+ """
+ ).strip())
+ flush_data = self.fs.mds_asok(["flush", "journal"])
+ self.assertEqual(flush_data['return_code'], 0)
+ self.assertEqual(self.fs.journal_tool(["event", "get", "summary"]),
+ dedent(
+ """
+ Events by type:
+ SUBTREEMAP: 1
+ Errors: 0
+ """
+ ).strip())
+
+ # Now for deletion!
+ # We will count the RADOS deletions and MDS file purges, to verify that
+ # the expected behaviour is happening as a result of the purge
+ initial_dels = self.fs.mds_asok(['perf', 'dump', 'objecter'])['objecter']['osdop_delete']
+ initial_purges = self.fs.mds_asok(['perf', 'dump', 'mds_cache'])['mds_cache']['strays_enqueued']
+
+ # Use a client to delete a file
+ self.mount_a.mount()
+ self.mount_a.wait_until_mounted()
+ self.mount_a.run_shell(["rm", "-rf", "mydir"])
+
+ # Flush the journal so that the directory inode can be purged
+ flush_data = self.fs.mds_asok(["flush", "journal"])
+ self.assertEqual(flush_data['return_code'], 0)
+
+ # We expect to see both the file and the directory purged as strays
+ self.wait_until_true(
+ lambda: self.fs.mds_asok(['perf', 'dump', 'mds_cache'])['mds_cache']['strays_enqueued'] - initial_purges >= 2,
+ 60)
+
+ # We expect two deletions, one of the dirfrag and one of the backtrace
+ self.wait_until_true(
+ lambda: self.fs.mds_asok(['perf', 'dump', 'objecter'])['objecter']['osdop_delete'] - initial_dels >= 2,
+ 60) # timeout is fairly long to allow for tick+rados latencies
+
+ with self.assertRaises(ObjectNotFound):
+ self.fs.list_dirfrag(dir_ino)
+ with self.assertRaises(ObjectNotFound):
+ self.fs.read_backtrace(file_ino)
+ self.assertEqual(self.fs.list_dirfrag(ROOT_INO), []) diff --git a/src/ceph/qa/tasks/cephfs/test_forward_scrub.py b/src/ceph/qa/tasks/cephfs/test_forward_scrub.py new file mode 100644 index 0000000..ac912dd --- /dev/null +++ b/src/ceph/qa/tasks/cephfs/test_forward_scrub.py @@ -0,0 +1,291 @@ +
+"""
+Test that the forward scrub functionality can traverse metadata and apply
+requested tags, on well-formed metadata.
+
+This is *not* the real testing for forward scrub, which will need to test
+how the functionality responds to damaged metadata.
+ +""" +import json + +import logging +from collections import namedtuple +from textwrap import dedent + +from teuthology.orchestra.run import CommandFailedError +from tasks.cephfs.cephfs_test_case import CephFSTestCase + +import struct + +log = logging.getLogger(__name__) + + +ValidationError = namedtuple("ValidationError", ["exception", "backtrace"]) + + +class TestForwardScrub(CephFSTestCase): + MDSS_REQUIRED = 1 + + def _read_str_xattr(self, pool, obj, attr): + """ + Read a ceph-encoded string from a rados xattr + """ + output = self.fs.rados(["getxattr", obj, attr], pool=pool) + strlen = struct.unpack('i', output[0:4])[0] + return output[4:(4 + strlen)] + + def _get_paths_to_ino(self): + inos = {} + p = self.mount_a.run_shell(["find", "./"]) + paths = p.stdout.getvalue().strip().split() + for path in paths: + inos[path] = self.mount_a.path_to_ino(path) + + return inos + + def test_apply_tag(self): + self.mount_a.run_shell(["mkdir", "parentdir"]) + self.mount_a.run_shell(["mkdir", "parentdir/childdir"]) + self.mount_a.run_shell(["touch", "rfile"]) + self.mount_a.run_shell(["touch", "parentdir/pfile"]) + self.mount_a.run_shell(["touch", "parentdir/childdir/cfile"]) + + # Build a structure mapping path to inode, as we will later want + # to check object by object and objects are named after ino number + inos = self._get_paths_to_ino() + + # Flush metadata: this is a friendly test of forward scrub so we're skipping + # the part where it's meant to cope with dirty metadata + self.mount_a.umount_wait() + self.fs.mds_asok(["flush", "journal"]) + + tag = "mytag" + + # Execute tagging forward scrub + self.fs.mds_asok(["tag", "path", "/parentdir", tag]) + # Wait for completion + import time + time.sleep(10) + # FIXME watching clog isn't a nice mechanism for this, once we have a ScrubMap we'll + # watch that instead + + # Check that dirs were tagged + for dirpath in ["./parentdir", "./parentdir/childdir"]: + self.assertTagged(inos[dirpath], tag, self.fs.get_metadata_pool_name()) + + # Check that files were tagged + for filepath in ["./parentdir/pfile", "./parentdir/childdir/cfile"]: + self.assertTagged(inos[filepath], tag, self.fs.get_data_pool_name()) + + # This guy wasn't in the tag path, shouldn't have been tagged + self.assertUntagged(inos["./rfile"]) + + def assertUntagged(self, ino): + file_obj_name = "{0:x}.00000000".format(ino) + with self.assertRaises(CommandFailedError): + self._read_str_xattr( + self.fs.get_data_pool_name(), + file_obj_name, + "scrub_tag" + ) + + def assertTagged(self, ino, tag, pool): + file_obj_name = "{0:x}.00000000".format(ino) + wrote = self._read_str_xattr( + pool, + file_obj_name, + "scrub_tag" + ) + self.assertEqual(wrote, tag) + + def _validate_linkage(self, expected): + inos = self._get_paths_to_ino() + try: + self.assertDictEqual(inos, expected) + except AssertionError: + log.error("Expected: {0}".format(json.dumps(expected, indent=2))) + log.error("Actual: {0}".format(json.dumps(inos, indent=2))) + raise + + def test_orphan_scan(self): + # Create some files whose metadata we will flush + self.mount_a.run_python(dedent(""" + import os + mount_point = "{mount_point}" + parent = os.path.join(mount_point, "parent") + os.mkdir(parent) + flushed = os.path.join(parent, "flushed") + os.mkdir(flushed) + for f in ["alpha", "bravo", "charlie"]: + open(os.path.join(flushed, f), 'w').write(f) + """.format(mount_point=self.mount_a.mountpoint))) + + inos = self._get_paths_to_ino() + + # Flush journal + # Umount before flush to avoid cap releases putting + # things we 
don't want in the journal later. + self.mount_a.umount_wait() + self.fs.mds_asok(["flush", "journal"]) + + # Create a new inode that's just in the log, i.e. would + # look orphaned to backward scan if backward scan wisnae + # respectin' tha scrub_tag xattr. + self.mount_a.mount() + self.mount_a.run_shell(["mkdir", "parent/unflushed"]) + self.mount_a.run_shell(["dd", "if=/dev/urandom", + "of=./parent/unflushed/jfile", + "bs=1M", "count=8"]) + inos["./parent/unflushed"] = self.mount_a.path_to_ino("./parent/unflushed") + inos["./parent/unflushed/jfile"] = self.mount_a.path_to_ino("./parent/unflushed/jfile") + self.mount_a.umount_wait() + + # Orphan an inode by deleting its dentry + # Our victim will be.... bravo. + self.mount_a.umount_wait() + self.fs.mds_stop() + self.fs.mds_fail() + self.fs.set_ceph_conf('mds', 'mds verify scatter', False) + self.fs.set_ceph_conf('mds', 'mds debug scatterstat', False) + frag_obj_id = "{0:x}.00000000".format(inos["./parent/flushed"]) + self.fs.rados(["rmomapkey", frag_obj_id, "bravo_head"]) + + self.fs.mds_restart() + self.fs.wait_for_daemons() + + # See that the orphaned file is indeed missing from a client's POV + self.mount_a.mount() + damaged_state = self._get_paths_to_ino() + self.assertNotIn("./parent/flushed/bravo", damaged_state) + self.mount_a.umount_wait() + + # Run a tagging forward scrub + tag = "mytag123" + self.fs.mds_asok(["tag", "path", "/parent", tag]) + + # See that the orphan wisnae tagged + self.assertUntagged(inos['./parent/flushed/bravo']) + + # See that the flushed-metadata-and-still-present files are tagged + self.assertTagged(inos['./parent/flushed/alpha'], tag, self.fs.get_data_pool_name()) + self.assertTagged(inos['./parent/flushed/charlie'], tag, self.fs.get_data_pool_name()) + + # See that journalled-but-not-flushed file *was* tagged + self.assertTagged(inos['./parent/unflushed/jfile'], tag, self.fs.get_data_pool_name()) + + # Run cephfs-data-scan targeting only orphans + self.fs.mds_stop() + self.fs.mds_fail() + self.fs.data_scan(["scan_extents", self.fs.get_data_pool_name()]) + self.fs.data_scan([ + "scan_inodes", + "--filter-tag", tag, + self.fs.get_data_pool_name() + ]) + + # After in-place injection stats should be kosher again + self.fs.set_ceph_conf('mds', 'mds verify scatter', True) + self.fs.set_ceph_conf('mds', 'mds debug scatterstat', True) + + # And we should have all the same linkage we started with, + # and no lost+found, and no extra inodes! 
+ self.fs.mds_restart()
+ self.fs.wait_for_daemons()
+ self.mount_a.mount()
+ self._validate_linkage(inos)
+
+ def _stash_inotable(self):
+ # Get all active ranks
+ ranks = self.fs.get_all_mds_rank()
+
+ inotable_dict = {}
+ for rank in ranks:
+ inotable_oid = "mds{rank:d}_".format(rank=rank) + "inotable"
+ print "Trying to fetch inotable object: " + inotable_oid
+
+ #self.fs.get_metadata_object("InoTable", "mds0_inotable")
+ inotable_raw = self.fs.get_metadata_object_raw(inotable_oid)
+ inotable_dict[inotable_oid] = inotable_raw
+ return inotable_dict
+
+ def test_inotable_sync(self):
+ self.mount_a.write_n_mb("file1_sixmegs", 6)
+
+ # Flush journal
+ self.mount_a.umount_wait()
+ self.fs.mds_asok(["flush", "journal"])
+
+ inotable_copy = self._stash_inotable()
+
+ self.mount_a.mount()
+
+ self.mount_a.write_n_mb("file2_sixmegs", 6)
+ self.mount_a.write_n_mb("file3_sixmegs", 6)
+
+ inos = self._get_paths_to_ino()
+
+ # Flush journal
+ self.mount_a.umount_wait()
+ self.fs.mds_asok(["flush", "journal"])
+
+ self.mount_a.umount_wait()
+
+ with self.assert_cluster_log("inode table repaired", invert_match=True):
+ self.fs.mds_asok(["scrub_path", "/", "repair", "recursive"])
+
+ self.mds_cluster.mds_stop()
+ self.mds_cluster.mds_fail()
+
+ # Truncate the journal (to ensure the inotable on disk
+ # is all that will be in the InoTable in memory)
+
+ self.fs.journal_tool(["event", "splice",
+ "--inode={0}".format(inos["./file2_sixmegs"]), "summary"])
+
+ self.fs.journal_tool(["event", "splice",
+ "--inode={0}".format(inos["./file3_sixmegs"]), "summary"])
+
+ # Revert to old inotable.
+ for key, value in inotable_copy.iteritems():
+ self.fs.put_metadata_object_raw(key, value)
+
+ self.mds_cluster.mds_restart()
+ self.fs.wait_for_daemons()
+
+ with self.assert_cluster_log("inode table repaired"):
+ self.fs.mds_asok(["scrub_path", "/", "repair", "recursive"])
+
+ self.mds_cluster.mds_stop()
+ table_text = self.fs.table_tool(["0", "show", "inode"])
+ table = json.loads(table_text)
+ self.assertGreater(
+ table['0']['data']['inotable']['free'][0]['start'],
+ inos['./file3_sixmegs'])
+
+ def test_backtrace_repair(self):
+ """
+ That the MDS can repair an inode's backtrace in the data pool
+ if it is found to be damaged.
+ """ + # Create a file for subsequent checks + self.mount_a.run_shell(["mkdir", "parent_a"]) + self.mount_a.run_shell(["touch", "parent_a/alpha"]) + file_ino = self.mount_a.path_to_ino("parent_a/alpha") + + # That backtrace and layout are written after initial flush + self.fs.mds_asok(["flush", "journal"]) + backtrace = self.fs.read_backtrace(file_ino) + self.assertEqual(['alpha', 'parent_a'], + [a['dname'] for a in backtrace['ancestors']]) + + # Go corrupt the backtrace + self.fs._write_data_xattr(file_ino, "parent", + "oh i'm sorry did i overwrite your xattr?") + + with self.assert_cluster_log("bad backtrace on inode"): + self.fs.mds_asok(["scrub_path", "/", "repair", "recursive"]) + self.fs.mds_asok(["flush", "journal"]) + backtrace = self.fs.read_backtrace(file_ino) + self.assertEqual(['alpha', 'parent_a'], + [a['dname'] for a in backtrace['ancestors']]) diff --git a/src/ceph/qa/tasks/cephfs/test_fragment.py b/src/ceph/qa/tasks/cephfs/test_fragment.py new file mode 100644 index 0000000..a62ef74 --- /dev/null +++ b/src/ceph/qa/tasks/cephfs/test_fragment.py @@ -0,0 +1,232 @@ + + +from tasks.cephfs.cephfs_test_case import CephFSTestCase +from teuthology.orchestra import run + +import logging +log = logging.getLogger(__name__) + + +class TestFragmentation(CephFSTestCase): + CLIENTS_REQUIRED = 1 + MDSS_REQUIRED = 1 + + def get_splits(self): + return self.fs.mds_asok(['perf', 'dump', 'mds'])['mds']['dir_split'] + + def get_merges(self): + return self.fs.mds_asok(['perf', 'dump', 'mds'])['mds']['dir_merge'] + + def get_dir_ino(self, path): + dir_cache = self.fs.read_cache(path, 0) + dir_ino = None + dir_inono = self.mount_a.path_to_ino(path.strip("/")) + for ino in dir_cache: + if ino['ino'] == dir_inono: + dir_ino = ino + break + self.assertIsNotNone(dir_ino) + return dir_ino + + def _configure(self, **kwargs): + """ + Apply kwargs as MDS configuration settings, enable dirfrags + and restart the MDSs. + """ + kwargs['mds_bal_frag'] = "true" + + for k, v in kwargs.items(): + self.ceph_cluster.set_ceph_conf("mds", k, v.__str__()) + + self.fs.set_allow_dirfrags(True) + + self.mds_cluster.mds_fail_restart() + self.fs.wait_for_daemons() + + def test_oversize(self): + """ + That a directory is split when it becomes too large. + """ + + split_size = 20 + merge_size = 5 + + self._configure( + mds_bal_split_size=split_size, + mds_bal_merge_size=merge_size, + mds_bal_split_bits=1 + ) + + self.assertEqual(self.get_splits(), 0) + + self.mount_a.create_n_files("splitdir/file", split_size + 1) + + self.wait_until_true( + lambda: self.get_splits() == 1, + timeout=30 + ) + + frags = self.get_dir_ino("/splitdir")['dirfrags'] + self.assertEqual(len(frags), 2) + self.assertEqual(frags[0]['dirfrag'], "0x10000000000.0*") + self.assertEqual(frags[1]['dirfrag'], "0x10000000000.1*") + self.assertEqual( + sum([len(f['dentries']) for f in frags]), + split_size + 1 + ) + + self.assertEqual(self.get_merges(), 0) + + self.mount_a.run_shell(["rm", "-f", run.Raw("splitdir/file*")]) + + self.wait_until_true( + lambda: self.get_merges() == 1, + timeout=30 + ) + + self.assertEqual(len(self.get_dir_ino("/splitdir")["dirfrags"]), 1) + + def test_rapid_creation(self): + """ + That the fast-splitting limit of 1.5x normal limit is + applied when creating dentries quickly. + """ + + split_size = 100 + merge_size = 1 + + self._configure( + mds_bal_split_size=split_size, + mds_bal_merge_size=merge_size, + mds_bal_split_bits=3, + mds_bal_fragment_size_max=int(split_size * 1.5 + 2) + ) + + # We test this only at a single split level. 
If a client was sending + # IO so fast that it hit a second split before the first split + # was complete, it could violate mds_bal_fragment_size_max -- there + # is a window where the child dirfrags of a split are unfrozen + # (so they can grow), but still have STATE_FRAGMENTING (so they + # can't be split). + + # By writing 4x the split size when the split bits are set + # to 3 (i.e. 4-ways), I am reasonably sure to see precisely + # one split. The test is to check whether that split + # happens soon enough that the client doesn't exceed + # 2x the split_size (the "immediate" split mode should + # kick in at 1.5x the split size). + + self.assertEqual(self.get_splits(), 0) + self.mount_a.create_n_files("splitdir/file", split_size * 4) + self.wait_until_equal( + self.get_splits, + 1, + reject_fn=lambda s: s > 1, + timeout=30 + ) + + def test_deep_split(self): + """ + That when the directory grows many times larger than split size, + the fragments get split again. + """ + + split_size = 100 + merge_size = 1 # i.e. don't merge frag unless its empty + split_bits = 1 + + branch_factor = 2**split_bits + + # Arbitrary: how many levels shall we try fragmenting before + # ending the test? + max_depth = 5 + + self._configure( + mds_bal_split_size=split_size, + mds_bal_merge_size=merge_size, + mds_bal_split_bits=split_bits + ) + + # Each iteration we will create another level of fragments. The + # placement of dentries into fragments is by hashes (i.e. pseudo + # random), so we rely on statistics to get the behaviour that + # by writing about 1.5x as many dentries as the split_size times + # the number of frags, we will get them all to exceed their + # split size and trigger a split. + depth = 0 + files_written = 0 + splits_expected = 0 + while depth < max_depth: + log.info("Writing files for depth {0}".format(depth)) + target_files = branch_factor**depth * int(split_size * 1.5) + create_files = target_files - files_written + + self.ceph_cluster.mon_manager.raw_cluster_cmd("log", + "{0} Writing {1} files (depth={2})".format( + self.__class__.__name__, create_files, depth + )) + self.mount_a.create_n_files("splitdir/file_{0}".format(depth), + create_files) + self.ceph_cluster.mon_manager.raw_cluster_cmd("log", + "{0} Done".format(self.__class__.__name__)) + + files_written += create_files + log.info("Now have {0} files".format(files_written)) + + splits_expected += branch_factor**depth + log.info("Waiting to see {0} splits".format(splits_expected)) + try: + self.wait_until_equal( + self.get_splits, + splits_expected, + timeout=30, + reject_fn=lambda x: x > splits_expected + ) + + frags = self.get_dir_ino("/splitdir")['dirfrags'] + self.assertEqual(len(frags), branch_factor**(depth+1)) + self.assertEqual( + sum([len(f['dentries']) for f in frags]), + target_files + ) + except: + # On failures, log what fragmentation we actually ended + # up with. This block is just for logging, at the end + # we raise the exception again. + frags = self.get_dir_ino("/splitdir")['dirfrags'] + log.info("depth={0} splits_expected={1} files_written={2}".format( + depth, splits_expected, files_written + )) + log.info("Dirfrags:") + for f in frags: + log.info("{0}: {1}".format( + f['dirfrag'], len(f['dentries']) + )) + raise + + depth += 1 + + # Remember the inode number because we will be checking for + # objects later. 
+ dir_inode_no = self.mount_a.path_to_ino("splitdir") + + self.mount_a.run_shell(["rm", "-rf", "splitdir/"]) + self.mount_a.umount_wait() + + self.fs.mds_asok(['flush', 'journal']) + + # Wait for all strays to purge + self.wait_until_equal( + lambda: self.fs.mds_asok(['perf', 'dump', 'mds_cache'] + )['mds_cache']['num_strays'], + 0, + timeout=1200 + ) + # Check that the metadata pool objects for all the myriad + # child fragments are gone + metadata_objs = self.fs.rados(["ls"]) + frag_objs = [] + for o in metadata_objs: + if o.startswith("{0:x}.".format(dir_inode_no)): + frag_objs.append(o) + self.assertListEqual(frag_objs, []) diff --git a/src/ceph/qa/tasks/cephfs/test_full.py b/src/ceph/qa/tasks/cephfs/test_full.py new file mode 100644 index 0000000..e69ccb3 --- /dev/null +++ b/src/ceph/qa/tasks/cephfs/test_full.py @@ -0,0 +1,414 @@ + + +import json +import logging +import os +from textwrap import dedent +import time +from teuthology.orchestra.run import CommandFailedError +from tasks.cephfs.fuse_mount import FuseMount +from tasks.cephfs.cephfs_test_case import CephFSTestCase + + +log = logging.getLogger(__name__) + + +class FullnessTestCase(CephFSTestCase): + CLIENTS_REQUIRED = 2 + + # Subclasses define whether they're filling whole cluster or just data pool + data_only = False + + # Subclasses define how many bytes should be written to achieve fullness + pool_capacity = None + fill_mb = None + + # Subclasses define what fullness means to them + def is_full(self): + raise NotImplementedError() + + def setUp(self): + CephFSTestCase.setUp(self) + + # These tests just use a single active MDS throughout, so remember its ID + # for use in mds_asok calls + self.active_mds_id = self.fs.get_active_names()[0] + + # Capture the initial OSD map epoch for later use + self.initial_osd_epoch = json.loads( + self.fs.mon_manager.raw_cluster_cmd("osd", "dump", "--format=json").strip() + )['epoch'] + + # Check the initial barrier epoch on the MDS: this should be + # set to the latest map at MDS startup. We do this check in + # setUp to get in there before subclasses might touch things + # in their own setUp functions. + self.assertGreaterEqual(self.fs.mds_asok(["status"], mds_id=self.active_mds_id)['osdmap_epoch_barrier'], + self.initial_osd_epoch) + + def test_barrier(self): + """ + That when an OSD epoch barrier is set on an MDS, subsequently + issued capabilities cause clients to update their OSD map to that + epoch. + """ + + # Sync up clients with initial MDS OSD map barrier + self.mount_a.open_no_data("foo") + self.mount_b.open_no_data("bar") + + # Grab mounts' initial OSD epochs: later we will check that + # it hasn't advanced beyond this point. 
+ mount_a_initial_epoch = self.mount_a.get_osd_epoch()[0] + mount_b_initial_epoch = self.mount_b.get_osd_epoch()[0] + + # Freshly mounted at start of test, should be up to date with OSD map + self.assertGreaterEqual(mount_a_initial_epoch, self.initial_osd_epoch) + self.assertGreaterEqual(mount_b_initial_epoch, self.initial_osd_epoch) + + # Set and unset a flag to cause OSD epoch to increment + self.fs.mon_manager.raw_cluster_cmd("osd", "set", "pause") + self.fs.mon_manager.raw_cluster_cmd("osd", "unset", "pause") + + out = self.fs.mon_manager.raw_cluster_cmd("osd", "dump", "--format=json").strip() + new_epoch = json.loads(out)['epoch'] + self.assertNotEqual(self.initial_osd_epoch, new_epoch) + + # Do a metadata operation on clients, witness that they end up with + # the old OSD map from startup time (nothing has prompted client + # to update its map) + self.mount_a.open_no_data("alpha") + self.mount_b.open_no_data("bravo1") + + # Sleep long enough that if the OSD map was propagating it would + # have done so (this is arbitrary because we are 'waiting' for something + # to *not* happen). + time.sleep(30) + + mount_a_epoch, mount_a_barrier = self.mount_a.get_osd_epoch() + self.assertEqual(mount_a_epoch, mount_a_initial_epoch) + mount_b_epoch, mount_b_barrier = self.mount_b.get_osd_epoch() + self.assertEqual(mount_b_epoch, mount_b_initial_epoch) + + # Set a barrier on the MDS + self.fs.mds_asok(["osdmap", "barrier", new_epoch.__str__()], mds_id=self.active_mds_id) + + # Do an operation on client B, witness that it ends up with + # the latest OSD map from the barrier. This shouldn't generate any + # cap revokes to A because B was already the last one to touch + # a file in root. + self.mount_b.run_shell(["touch", "bravo2"]) + self.mount_b.open_no_data("bravo2") + + # Some time passes here because the metadata part of the operation + # completes immediately, while the resulting OSD map update happens + # asynchronously (it's an Objecter::_maybe_request_map) as a result + # of seeing the new epoch barrier. + self.wait_until_equal( + lambda: self.mount_b.get_osd_epoch(), + (new_epoch, new_epoch), + 30, + lambda x: x[0] > new_epoch or x[1] > new_epoch) + + # ...and none of this should have affected the oblivious mount a, + # because it wasn't doing any data or metadata IO + mount_a_epoch, mount_a_barrier = self.mount_a.get_osd_epoch() + self.assertEqual(mount_a_epoch, mount_a_initial_epoch) + + def _data_pool_name(self): + data_pool_names = self.fs.get_data_pool_names() + if len(data_pool_names) > 1: + raise RuntimeError("This test can't handle multiple data pools") + else: + return data_pool_names[0] + + def _test_full(self, easy_case): + """ + - That a client trying to write data to a file is prevented + from doing so with an -EFULL result + - That they are also prevented from creating new files by the MDS. + - That they may delete another file to get the system healthy again + + :param easy_case: if true, delete a successfully written file to + free up space. else, delete the file that experienced + the failed write. + """ + + osd_mon_report_interval_max = int(self.fs.get_config("osd_mon_report_interval_max", service_type='osd')) + + log.info("Writing {0}MB should fill this cluster".format(self.fill_mb)) + + # Fill up the cluster. 
This dd may or may not fail, as it depends on + # how soon the cluster recognises its own fullness + self.mount_a.write_n_mb("large_file_a", self.fill_mb / 2) + try: + self.mount_a.write_n_mb("large_file_b", self.fill_mb / 2) + except CommandFailedError: + log.info("Writing file B failed (full status happened already)") + assert self.is_full() + else: + log.info("Writing file B succeeded (full status will happen soon)") + self.wait_until_true(lambda: self.is_full(), + timeout=osd_mon_report_interval_max * 5) + + # Attempting to write more data should give me ENOSPC + with self.assertRaises(CommandFailedError) as ar: + self.mount_a.write_n_mb("large_file_b", 50, seek=self.fill_mb / 2) + self.assertEqual(ar.exception.exitstatus, 1) # dd returns 1 on "No space" + + # Wait for the MDS to see the latest OSD map so that it will reliably + # be applying the policy of rejecting non-deletion metadata operations + # while in the full state. + osd_epoch = json.loads(self.fs.mon_manager.raw_cluster_cmd("osd", "dump", "--format=json-pretty"))['epoch'] + self.wait_until_true( + lambda: self.fs.mds_asok(['status'], mds_id=self.active_mds_id)['osdmap_epoch'] >= osd_epoch, + timeout=10) + + if not self.data_only: + with self.assertRaises(CommandFailedError): + self.mount_a.write_n_mb("small_file_1", 0) + + # Clear out some space + if easy_case: + self.mount_a.run_shell(['rm', '-f', 'large_file_a']) + self.mount_a.run_shell(['rm', '-f', 'large_file_b']) + else: + # In the hard case it is the file that filled the system. + # Before the new #7317 (ENOSPC, epoch barrier) changes, this + # would fail because the last objects written would be + # stuck in the client cache as objecter operations. + self.mount_a.run_shell(['rm', '-f', 'large_file_b']) + self.mount_a.run_shell(['rm', '-f', 'large_file_a']) + + # Here we are waiting for two things to happen: + # * The MDS to purge the stray folder and execute object deletions + # * The OSDs to inform the mon that they are no longer full + self.wait_until_true(lambda: not self.is_full(), + timeout=osd_mon_report_interval_max * 5) + + # Wait for the MDS to see the latest OSD map so that it will reliably + # be applying the free space policy + osd_epoch = json.loads(self.fs.mon_manager.raw_cluster_cmd("osd", "dump", "--format=json-pretty"))['epoch'] + self.wait_until_true( + lambda: self.fs.mds_asok(['status'], mds_id=self.active_mds_id)['osdmap_epoch'] >= osd_epoch, + timeout=10) + + # Now I should be able to write again + self.mount_a.write_n_mb("large_file", 50, seek=0) + + # Ensure that the MDS keeps its OSD epoch barrier across a restart + + def test_full_different_file(self): + self._test_full(True) + + def test_full_same_file(self): + self._test_full(False) + + def _remote_write_test(self, template): + """ + Run some remote python in a way that's useful for + testing free space behaviour (see test_* methods using this) + """ + file_path = os.path.join(self.mount_a.mountpoint, "full_test_file") + + # Enough to trip the full flag + osd_mon_report_interval_max = int(self.fs.get_config("osd_mon_report_interval_max", service_type='osd')) + mon_tick_interval = int(self.fs.get_config("mon_tick_interval", service_type="mon")) + + # Sufficient data to cause RADOS cluster to go 'full' + log.info("pool capacity {0}, {1}MB should be enough to fill it".format(self.pool_capacity, self.fill_mb)) + + # Long enough for RADOS cluster to notice it is full and set flag on mons + # (report_interval for mon to learn PG stats, tick interval for it to update OSD map, + # factor of 1.5 
for I/O + network latency in committing OSD map and distributing it
+ # to the OSDs)
+ full_wait = (osd_mon_report_interval_max + mon_tick_interval) * 1.5
+
+ # Configs for this test should bring this setting down in order to
+ # run reasonably quickly
+ if osd_mon_report_interval_max > 10:
+ log.warn("This test may run rather slowly unless you decrease "
+ "osd_mon_report_interval_max (5 is a good setting)!")
+
+ self.mount_a.run_python(template.format(
+ fill_mb=self.fill_mb,
+ file_path=file_path,
+ full_wait=full_wait,
+ is_fuse=isinstance(self.mount_a, FuseMount)
+ ))
+
+ def test_full_fclose(self):
+ # A remote script which opens a file handle, fills up the filesystem, and then
+ # checks that ENOSPC errors on buffered writes appear correctly as errors in fsync
+ remote_script = dedent("""
+ import time
+ import datetime
+ import subprocess
+ import os
+
+ # Write some buffered data through before going full, all should be well
+ print "writing some data through which we expect to succeed"
+ bytes = 0
+ f = os.open("{file_path}", os.O_WRONLY | os.O_CREAT)
+ bytes += os.write(f, 'a' * 4096)
+ os.fsync(f)
+ print "fsync'ed data successfully, will now attempt to fill fs"
+
+ # Okay, now we're going to fill up the filesystem, and then keep
+ # writing until we see an error from fsync. As long as we're doing
+ # buffered IO, the error should always only appear from fsync and not
+ # from write
+ full = False
+
+ for n in range(0, {fill_mb}):
+ bytes += os.write(f, 'x' * 1024 * 1024)
+ print "wrote bytes via buffered write, may repeat"
+ print "done writing bytes"
+
+ # OK, now we should sneak in under the full condition
+ # due to the time it takes the OSDs to report to the
+ # mons, and get a successful fsync on our full-making data
+ os.fsync(f)
+ print "successfully fsync'ed prior to getting full state reported"
+
+ # Now wait for the full flag to get set so that our
+ # next flush IO will fail
+ time.sleep(30)
+
+ # A buffered IO, should succeed
+ print "starting buffered write we expect to succeed"
+ os.write(f, 'x' * 4096)
+ print "wrote, now waiting 30s and then doing a close we expect to fail"
+
+ # Wait long enough for a background flush that should fail
+ time.sleep(30)
+
+ if {is_fuse}:
+ # ...and check that the failed background flush is reflected in fclose
+ try:
+ os.close(f)
+ except OSError:
+ print "close() returned an error as expected"
+ else:
+ raise RuntimeError("close() failed to raise error")
+ else:
+ # The kernel cephfs client does not raise errors on fclose
+ os.close(f)
+
+ os.unlink("{file_path}")
+ """)
+ self._remote_write_test(remote_script)
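The same write-until-full pattern, reduced to a standalone sketch (a hypothetical helper, not part of the suite; the tests drive the equivalent remotely via run_python): buffered write() calls can keep succeeding against a full pool, with ENOSPC surfacing at fsync().

    import errno
    import os

    def fill_until_enospc(path):
        # Returns bytes written before the first fsync() failed with ENOSPC.
        # Only sensible against a cluster that is actually going to fill up.
        fd = os.open(path, os.O_WRONLY | os.O_CREAT)
        written = 0
        try:
            while True:
                written += os.write(fd, 'x' * (1024 * 1024))
                os.fsync(fd)  # buffered write may succeed; fsync reports ENOSPC
        except OSError as e:
            if e.errno != errno.ENOSPC:
                raise
            return written
        finally:
            os.close(fd)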
+ def test_full_fsync(self):
+ """
+ That when the full flag is encountered during asynchronous
+ flushes, an fwrite() succeeds but a subsequent fsync()/fclose()
+ returns the ENOSPC error.
+ """
+
+ # A remote script which opens a file handle, fills up the filesystem, and then
+ # checks that ENOSPC errors on buffered writes appear correctly as errors in fsync
+ remote_script = dedent("""
+ import time
+ import datetime
+ import subprocess
+ import os
+
+ # Write some buffered data through before going full, all should be well
+ print "writing some data through which we expect to succeed"
+ bytes = 0
+ f = os.open("{file_path}", os.O_WRONLY | os.O_CREAT)
+ bytes += os.write(f, 'a' * 4096)
+ os.fsync(f)
+ print "fsync'ed data successfully, will now attempt to fill fs"
+
+ # Okay, now we're going to fill up the filesystem, and then keep
+ # writing until we see an error from fsync. As long as we're doing
+ # buffered IO, the error should always only appear from fsync and not
+ # from write
+ full = False
+
+ for n in range(0, {fill_mb} + 1):
+ try:
+ bytes += os.write(f, 'x' * 1024 * 1024)
+ print "wrote bytes via buffered write, moving on to fsync"
+ except OSError as e:
+ print "Unexpected error %s from write() instead of fsync()" % e
+ raise
+
+ try:
+ os.fsync(f)
+ print "fsync'ed successfully"
+ except OSError as e:
+ print "Reached fullness after %.2f MB" % (bytes / (1024.0 * 1024.0))
+ full = True
+ break
+ else:
+ print "Not full yet after %.2f MB" % (bytes / (1024.0 * 1024.0))
+
+ if n > {fill_mb} * 0.8:
+ # Be cautious in the last region where we expect to hit
+ # the full condition, so that we don't overshoot too dramatically
+ print "sleeping a bit as we've exceeded 80% of our expected full ratio"
+ time.sleep({full_wait})
+
+ if not full:
+ raise RuntimeError("Failed to reach fullness after writing %d bytes" % bytes)
+
+ # close() should not raise an error because we already caught it in
+ # fsync. There shouldn't have been any more writeback errors
+ # since then because all IOs got cancelled on the full flag.
+ print "calling close"
+ os.close(f)
+ print "close() did not raise error"
+
+ os.unlink("{file_path}")
+ """)
+
+ self._remote_write_test(remote_script)
+
+
+class TestQuotaFull(FullnessTestCase):
+ """
+ Test per-pool fullness, which indicates quota limits exceeded
+ """
+ pool_capacity = 1024 * 1024 * 32 # arbitrary low-ish limit
+ fill_mb = pool_capacity / (1024 * 1024)
+
+ # We are only testing quota handling on the data pool, not the metadata
+ # pool.
+ data_only = True
+
+ def setUp(self):
+ super(TestQuotaFull, self).setUp()
+
+ pool_name = self.fs.get_data_pool_name()
+ self.fs.mon_manager.raw_cluster_cmd("osd", "pool", "set-quota", pool_name,
+ "max_bytes", "{0}".format(self.pool_capacity))
+
+ def is_full(self):
+ return self.fs.is_pool_full(self.fs.get_data_pool_name())
+
+
+class TestClusterFull(FullnessTestCase):
+ """
+ Test cluster-wide fullness, which indicates that an OSD has become too full
+ """
+ pool_capacity = None
+ REQUIRE_MEMSTORE = True
+
+ def setUp(self):
+ super(TestClusterFull, self).setUp()
+
+ if self.pool_capacity is None:
+ # This is a hack to overcome weird fluctuations in the reported
+ # `max_avail` attribute of pools that sometimes occurs in between
+ # tests (reason as yet unclear, but this dodges the issue)
+ TestClusterFull.pool_capacity = self.fs.get_pool_df(self._data_pool_name())['max_avail']
+ TestClusterFull.fill_mb = int(1.05 * (self.pool_capacity / (1024.0 * 1024.0)))
+
+ def is_full(self):
+ return self.fs.is_full()
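Deleting FullnessTestCase from the module globals (just below) is one way to stop unittest's loader from collecting the abstract base directly; a common alternative, sketched here for comparison only, is to keep the shared logic in a mixin that only the concrete classes combine with the TestCase base:

    import unittest

    class FullnessChecksMixin(object):
        # Shared helpers live on a plain object, invisible to test discovery.
        def assert_eventually_full(self):
            raise NotImplementedError()

    class TestQuotaFullStyle(FullnessChecksMixin, unittest.TestCase):
        def test_noop(self):
            # Concrete subclasses pick up the mixin helpers plus TestCase.
            pass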
+
+# Hide the parent class so that unittest.loader doesn't try to run it.
+del globals()['FullnessTestCase'] diff --git a/src/ceph/qa/tasks/cephfs/test_journal_migration.py b/src/ceph/qa/tasks/cephfs/test_journal_migration.py new file mode 100644 index 0000000..64fe939 --- /dev/null +++ b/src/ceph/qa/tasks/cephfs/test_journal_migration.py @@ -0,0 +1,118 @@ +
+from StringIO import StringIO
+from tasks.cephfs.cephfs_test_case import CephFSTestCase
+from tasks.workunit import task as workunit
+
+JOURNAL_FORMAT_LEGACY = 0
+JOURNAL_FORMAT_RESILIENT = 1
+
+
+class TestJournalMigration(CephFSTestCase):
+ CLIENTS_REQUIRED = 1
+ MDSS_REQUIRED = 2
+
+ def test_journal_migration(self):
+ old_journal_version = JOURNAL_FORMAT_LEGACY
+ new_journal_version = JOURNAL_FORMAT_RESILIENT
+
+ # Pick out two daemons to use
+ mds_a, mds_b = sorted(self.mds_cluster.mds_ids[0:2])
+
+ self.mount_a.umount_wait()
+ self.fs.mds_stop()
+
+ # Enable standby replay, to cover the bug case #8811 where
+ # a standby replay might mistakenly end up trying to rewrite
+ # the journal at the same time as an active daemon.
+ self.fs.set_ceph_conf('mds', 'mds standby replay', "true")
+ self.fs.set_ceph_conf('mds', 'mds standby for rank', "0")
+
+ # Create a filesystem using the older journal format.
+ self.fs.set_ceph_conf('mds', 'mds journal format', old_journal_version)
+ self.fs.recreate()
+ self.fs.mds_restart(mds_id=mds_a)
+ self.fs.wait_for_daemons()
+ self.assertEqual(self.fs.get_active_names(), [mds_a])
+
+ def replay_names():
+ return [s['name']
+ for s in self.fs.status().get_replays(fscid = self.fs.id)]
+
+ # Start the standby and wait for it to come up
+ self.fs.mds_restart(mds_id=mds_b)
+ self.wait_until_equal(
+ replay_names,
+ [mds_b],
+ timeout = 30)
+
+ # Do some client work so that the log is populated with something.
+ with self.mount_a.mounted():
+ self.mount_a.create_files()
+ self.mount_a.check_files() # sanity, this should always pass
+
+ # Run a more substantial workunit so that the length of the log to be
+ # converted spans at least a few segments
+ workunit(self.ctx, {
+ 'clients': {
+ "client.{0}".format(self.mount_a.client_id): ["suites/fsstress.sh"],
+ },
+ "timeout": "3h"
+ })
+
+ # Modify the ceph.conf to ask the MDS to use the new journal format.
+ self.fs.set_ceph_conf('mds', 'mds journal format', new_journal_version)
+
+ # Restart the MDS.
+ self.fs.mds_fail_restart(mds_id=mds_a)
+ self.fs.mds_fail_restart(mds_id=mds_b)
+
+ # This ensures that all daemons come up into a valid state
+ self.fs.wait_for_daemons()
+
+ # Check that files created in the initial client workload are still visible
+ # in a client mount.
+ with self.mount_a.mounted():
+ self.mount_a.check_files()
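The verification below relies on cephfs-journal-tool; a sketch of the same integrity check as a helper, with the "Overall journal integrity: OK" phrasing assumed from the summary format these tests already match with endswith(": OK"):

    import subprocess

    def journal_looks_ok():
        # "journal inspect" reports "Overall journal integrity: OK" when the
        # journal is readable and undamaged.
        out = subprocess.check_output(
            ["cephfs-journal-tool", "journal", "inspect"])
        return out.strip().endswith(": OK")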
+
+ # Verify that the journal really has been rewritten.
+ journal_version = self.fs.get_journal_version()
+ if journal_version != new_journal_version:
+ raise RuntimeError("Journal was not upgraded, version should be {0} but is {1}".format(
+ new_journal_version, journal_version
+ ))
+
+ # Verify that cephfs-journal-tool can now read the rewritten journal
+ inspect_out = self.fs.journal_tool(["journal", "inspect"])
+ if not inspect_out.endswith(": OK"):
+ raise RuntimeError("Unexpected journal-tool result: '{0}'".format(
+ inspect_out
+ ))
+
+ self.fs.journal_tool(["event", "get", "json", "--path", "/tmp/journal.json"])
+ p = self.fs.tool_remote.run(
+ args=[
+ "python",
+ "-c",
+ "import json; print len(json.load(open('/tmp/journal.json')))"
+ ],
+ stdout=StringIO())
+ event_count = int(p.stdout.getvalue().strip())
+ if event_count < 1000:
+ # Approximate value of "lots", expected from having run fsstress
+ raise RuntimeError("Unexpectedly few journal events: {0}".format(event_count))
+
+ # Do some client work to check that writing the log is still working
+ with self.mount_a.mounted():
+ workunit(self.ctx, {
+ 'clients': {
+ "client.{0}".format(self.mount_a.client_id): ["fs/misc/trivial_sync.sh"],
+ },
+ "timeout": "3h"
+ })
+
+ # Check that both an active and a standby replay are still up
+ self.assertEqual(len(replay_names()), 1)
+ self.assertEqual(len(self.fs.get_active_names()), 1)
+ self.assertTrue(self.mds_cluster.mds_daemons[mds_a].running())
+ self.assertTrue(self.mds_cluster.mds_daemons[mds_b].running())
+ diff --git a/src/ceph/qa/tasks/cephfs/test_journal_repair.py b/src/ceph/qa/tasks/cephfs/test_journal_repair.py new file mode 100644 index 0000000..62cbbb0 --- /dev/null +++ b/src/ceph/qa/tasks/cephfs/test_journal_repair.py @@ -0,0 +1,443 @@ +
+"""
+Test our tools for recovering the content of damaged journals
+"""
+
+import json
+import logging
+from textwrap import dedent
+import time
+
+from teuthology.exceptions import CommandFailedError, ConnectionLostError
+from tasks.cephfs.filesystem import ObjectNotFound, ROOT_INO
+from tasks.cephfs.cephfs_test_case import CephFSTestCase, for_teuthology
+from tasks.workunit import task as workunit
+
+log = logging.getLogger(__name__)
+
+
+class TestJournalRepair(CephFSTestCase):
+ MDSS_REQUIRED = 2
+
+ def test_inject_to_empty(self):
+ """
+ That when some dentries are in the journal but nothing is in
+ the backing store, we correctly populate the backing store
+ from the journalled dentries.
+ """ + + # Inject metadata operations + self.mount_a.run_shell(["touch", "rootfile"]) + self.mount_a.run_shell(["mkdir", "subdir"]) + self.mount_a.run_shell(["touch", "subdir/subdirfile"]) + # There are several different paths for handling hardlinks, depending + # on whether an existing dentry (being overwritten) is also a hardlink + self.mount_a.run_shell(["mkdir", "linkdir"]) + + # Test inode -> remote transition for a dentry + self.mount_a.run_shell(["touch", "linkdir/link0"]) + self.mount_a.run_shell(["rm", "-f", "linkdir/link0"]) + self.mount_a.run_shell(["ln", "subdir/subdirfile", "linkdir/link0"]) + + # Test nothing -> remote transition + self.mount_a.run_shell(["ln", "subdir/subdirfile", "linkdir/link1"]) + + # Test remote -> inode transition + self.mount_a.run_shell(["ln", "subdir/subdirfile", "linkdir/link2"]) + self.mount_a.run_shell(["rm", "-f", "linkdir/link2"]) + self.mount_a.run_shell(["touch", "linkdir/link2"]) + + # Test remote -> diff remote transition + self.mount_a.run_shell(["ln", "subdir/subdirfile", "linkdir/link3"]) + self.mount_a.run_shell(["rm", "-f", "linkdir/link3"]) + self.mount_a.run_shell(["ln", "rootfile", "linkdir/link3"]) + + # Test an empty directory + self.mount_a.run_shell(["mkdir", "subdir/subsubdir"]) + self.mount_a.run_shell(["sync"]) + + # Before we unmount, make a note of the inode numbers, later we will + # check that they match what we recover from the journal + rootfile_ino = self.mount_a.path_to_ino("rootfile") + subdir_ino = self.mount_a.path_to_ino("subdir") + linkdir_ino = self.mount_a.path_to_ino("linkdir") + subdirfile_ino = self.mount_a.path_to_ino("subdir/subdirfile") + subsubdir_ino = self.mount_a.path_to_ino("subdir/subsubdir") + + self.mount_a.umount_wait() + + # Stop the MDS + self.fs.mds_stop() + self.fs.mds_fail() + + # Now, the journal should contain the operations, but the backing + # store shouldn't + with self.assertRaises(ObjectNotFound): + self.fs.list_dirfrag(subdir_ino) + self.assertEqual(self.fs.list_dirfrag(ROOT_INO), []) + + # Execute the dentry recovery, this should populate the backing store + self.fs.journal_tool(['event', 'recover_dentries', 'list']) + + # Dentries in ROOT_INO are present + self.assertEqual(sorted(self.fs.list_dirfrag(ROOT_INO)), sorted(['rootfile_head', 'subdir_head', 'linkdir_head'])) + self.assertEqual(self.fs.list_dirfrag(subdir_ino), ['subdirfile_head', 'subsubdir_head']) + self.assertEqual(sorted(self.fs.list_dirfrag(linkdir_ino)), + sorted(['link0_head', 'link1_head', 'link2_head', 'link3_head'])) + + # Now check the MDS can read what we wrote: truncate the journal + # and start the mds. + self.fs.journal_tool(['journal', 'reset']) + self.fs.mds_fail_restart() + self.fs.wait_for_daemons() + + # List files + self.mount_a.mount() + self.mount_a.wait_until_mounted() + + # First ls -R to populate MDCache, such that hardlinks will + # resolve properly (recover_dentries does not create backtraces, + # so ordinarily hardlinks to inodes that happen not to have backtraces + # will be invisible in readdir). 
+ # FIXME: hook in forward scrub here to regenerate backtraces + proc = self.mount_a.run_shell(['ls', '-R']) + self.mount_a.umount_wait() # remount to clear client cache before our second ls + self.mount_a.mount() + self.mount_a.wait_until_mounted() + + proc = self.mount_a.run_shell(['ls', '-R']) + self.assertEqual(proc.stdout.getvalue().strip(), + dedent(""" + .: + linkdir + rootfile + subdir + + ./linkdir: + link0 + link1 + link2 + link3 + + ./subdir: + subdirfile + subsubdir + + ./subdir/subsubdir: + """).strip()) + + # Check the correct inos were preserved by path + self.assertEqual(rootfile_ino, self.mount_a.path_to_ino("rootfile")) + self.assertEqual(subdir_ino, self.mount_a.path_to_ino("subdir")) + self.assertEqual(subdirfile_ino, self.mount_a.path_to_ino("subdir/subdirfile")) + self.assertEqual(subsubdir_ino, self.mount_a.path_to_ino("subdir/subsubdir")) + + # Check that the hard link handling came out correctly + self.assertEqual(self.mount_a.path_to_ino("linkdir/link0"), subdirfile_ino) + self.assertEqual(self.mount_a.path_to_ino("linkdir/link1"), subdirfile_ino) + self.assertNotEqual(self.mount_a.path_to_ino("linkdir/link2"), subdirfile_ino) + self.assertEqual(self.mount_a.path_to_ino("linkdir/link3"), rootfile_ino) + + # Create a new file, ensure it is not issued the same ino as one of the + # recovered ones + self.mount_a.run_shell(["touch", "afterwards"]) + new_ino = self.mount_a.path_to_ino("afterwards") + self.assertNotIn(new_ino, [rootfile_ino, subdir_ino, subdirfile_ino]) + + # Check that we can do metadata ops in the recovered directory + self.mount_a.run_shell(["touch", "subdir/subsubdir/subsubdirfile"]) + + @for_teuthology # 308s + def test_reset(self): + """ + That after forcibly modifying the backing store, we can get back into + a good state by resetting the MDSMap. + + The scenario is that we have two active MDSs, and we lose the journals. Once + we have completely lost confidence in the integrity of the metadata, we want to + return the system to a single-MDS state to go into a scrub to recover what we + can. + """ + + # Set max_mds to 2 + self.fs.set_max_mds(2) + + # See that we have two active MDSs + self.wait_until_equal(lambda: len(self.fs.get_active_names()), 2, 30, + reject_fn=lambda v: v > 2 or v < 1) + active_mds_names = self.fs.get_active_names() + + # Switch off any unneeded MDS daemons + for unneeded_mds in set(self.mds_cluster.mds_ids) - set(active_mds_names): + self.mds_cluster.mds_stop(unneeded_mds) + self.mds_cluster.mds_fail(unneeded_mds) + + # Create a dir on each rank + self.mount_a.run_shell(["mkdir", "alpha"]) + self.mount_a.run_shell(["mkdir", "bravo"]) + self.mount_a.setfattr("alpha/", "ceph.dir.pin", "0") + self.mount_a.setfattr("bravo/", "ceph.dir.pin", "1") + + def subtrees_assigned(): + got_subtrees = self.fs.mds_asok(["get", "subtrees"], mds_id=active_mds_names[0]) + + for s in got_subtrees: + if s['dir']['path'] == '/bravo': + if s['auth_first'] == 1: + return True + else: + # Should not happen + raise RuntimeError("/bravo is subtree but not rank 1!") + + return False + + # Ensure the pinning has taken effect and the /bravo dir is now + # migrated to rank 1. + self.wait_until_true(subtrees_assigned, 30) + + # Do some IO (this should be split across ranks according to + # the rank-pinned dirs) + self.mount_a.create_n_files("alpha/file", 1000) + self.mount_a.create_n_files("bravo/file", 1000) + + # Flush the journals so that we have some backing store data + # belonging to one MDS, and some to the other MDS. 
+ for mds_name in active_mds_names: + self.fs.mds_asok(["flush", "journal"], mds_name) + + # Stop (hard) the second MDS daemon + self.fs.mds_stop(active_mds_names[1]) + + # Wipe out the tables for MDS rank 1 so that it is broken and can't start + # (this is the simulated failure that we will demonstrate that the disaster + # recovery tools can get us back from) + self.fs.erase_metadata_objects(prefix="mds1_") + + # Try to access files from the client + blocked_ls = self.mount_a.run_shell(["ls", "-R"], wait=False) + + # Check that this "ls -R" blocked rather than completing: indicates + # it got stuck trying to access subtrees which were on the now-dead MDS. + log.info("Sleeping to check ls is blocked...") + time.sleep(60) + self.assertFalse(blocked_ls.finished) + + # This mount is now useless because it will depend on MDS rank 1, and MDS rank 1 + # is not coming back. Kill it. + log.info("Killing mount, it's blocked on the MDS we killed") + self.mount_a.kill() + self.mount_a.kill_cleanup() + try: + # Now that the mount is dead, the ls -R should error out. + blocked_ls.wait() + except (CommandFailedError, ConnectionLostError): + # The ConnectionLostError case is for kernel client, where + # killing the mount also means killing the node. + pass + + # See that the second MDS will crash when it starts and tries to + # acquire rank 1 + damaged_id = active_mds_names[1] + self.fs.mds_restart(damaged_id) + + # The daemon taking the damaged rank should start starting, then + # restart back into standby after asking the mon to mark the rank + # damaged. + def is_marked_damaged(): + mds_map = self.fs.get_mds_map() + return 1 in mds_map['damaged'] + + self.wait_until_true(is_marked_damaged, 60) + + def get_state(): + info = self.mds_cluster.get_mds_info(damaged_id) + return info['state'] if info is not None else None + + self.wait_until_equal( + get_state, + "up:standby", + timeout=60) + + self.fs.mds_stop(damaged_id) + self.fs.mds_fail(damaged_id) + + # Now give up and go through a disaster recovery procedure + self.fs.mds_stop(active_mds_names[0]) + self.fs.mds_fail(active_mds_names[0]) + # Invoke recover_dentries quietly, because otherwise log spews millions of lines + self.fs.journal_tool(["event", "recover_dentries", "summary"], rank=0, quiet=True) + self.fs.journal_tool(["event", "recover_dentries", "summary"], rank=1, quiet=True) + self.fs.table_tool(["0", "reset", "session"]) + self.fs.journal_tool(["journal", "reset"], rank=0) + self.fs.erase_mds_objects(1) + self.fs.mon_manager.raw_cluster_cmd('fs', 'reset', self.fs.name, + '--yes-i-really-mean-it') + + # Bring an MDS back online, mount a client, and see that we can walk the full + # filesystem tree again + self.fs.mds_fail_restart(active_mds_names[0]) + self.wait_until_equal(lambda: self.fs.get_active_names(), [active_mds_names[0]], 30, + reject_fn=lambda v: len(v) > 1) + self.mount_a.mount() + self.mount_a.run_shell(["ls", "-R"], wait=True) + + def test_table_tool(self): + active_mdss = self.fs.get_active_names() + self.assertEqual(len(active_mdss), 1) + mds_name = active_mdss[0] + + self.mount_a.run_shell(["touch", "foo"]) + self.fs.mds_asok(["flush", "journal"], mds_name) + + log.info(self.fs.table_tool(["all", "show", "inode"])) + log.info(self.fs.table_tool(["all", "show", "snap"])) + log.info(self.fs.table_tool(["all", "show", "session"])) + + # Inode table should always be the same because initial state + # and choice of inode are deterministic. 
+ # Should see one inode consumed
+ self.assertEqual(
+ json.loads(self.fs.table_tool(["all", "show", "inode"])),
+ {"0": {
+ "data": {
+ "version": 2,
+ "inotable": {
+ "projected_free": [
+ {"start": 1099511628777,
+ "len": 1099511626775}],
+ "free": [
+ {"start": 1099511628777,
+ "len": 1099511626775}]}},
+ "result": 0}}
+ )
+
+ # Should see one session
+ session_data = json.loads(self.fs.table_tool(
+ ["all", "show", "session"]))
+ self.assertEqual(len(session_data["0"]["data"]["Sessions"]), 1)
+ self.assertEqual(session_data["0"]["result"], 0)
+
+ # Should see no snaps
+ self.assertEqual(
+ json.loads(self.fs.table_tool(["all", "show", "snap"])),
+ {"version": 0,
+ "snapserver": {"last_snap": 1,
+ "pending_noop": [],
+ "snaps": [],
+ "need_to_purge": {},
+ "pending_update": [],
+ "pending_destroy": []},
+ "result": 0}
+ )
+
+ # Reset everything
+ for table in ["session", "inode", "snap"]:
+ self.fs.table_tool(["all", "reset", table])
+
+ log.info(self.fs.table_tool(["all", "show", "inode"]))
+ log.info(self.fs.table_tool(["all", "show", "snap"]))
+ log.info(self.fs.table_tool(["all", "show", "session"]))
+
+ # Should see 0 sessions
+ session_data = json.loads(self.fs.table_tool(
+ ["all", "show", "session"]))
+ self.assertEqual(len(session_data["0"]["data"]["Sessions"]), 0)
+ self.assertEqual(session_data["0"]["result"], 0)
+
+ # Should see the entire inode range now marked free
+ self.assertEqual(
+ json.loads(self.fs.table_tool(["all", "show", "inode"])),
+ {"0": {"data": {"version": 1,
+ "inotable": {"projected_free": [
+ {"start": 1099511627776,
+ "len": 1099511627776}],
+ "free": [
+ {"start": 1099511627776,
+ "len": 1099511627776}]}},
+ "result": 0}}
+ )
+
+ # Should see no snaps
+ self.assertEqual(
+ json.loads(self.fs.table_tool(["all", "show", "snap"])),
+ {"version": 1,
+ "snapserver": {"last_snap": 1,
+ "pending_noop": [],
+ "snaps": [],
+ "need_to_purge": {},
+ "pending_update": [],
+ "pending_destroy": []},
+ "result": 0}
+ )
+
+ def test_table_tool_take_inos(self):
+ initial_range_start = 1099511627776
+ initial_range_len = 1099511627776
+ # Initially a completely clear range
+ self.assertEqual(
+ json.loads(self.fs.table_tool(["all", "show", "inode"])),
+ {"0": {"data": {"version": 0,
+ "inotable": {"projected_free": [
+ {"start": initial_range_start,
+ "len": initial_range_len}],
+ "free": [
+ {"start": initial_range_start,
+ "len": initial_range_len}]}},
+ "result": 0}}
+ )
+
+ # Remove some
+ self.assertEqual(
+ json.loads(self.fs.table_tool(["all", "take_inos", "{0}".format(initial_range_start + 100)])),
+ {"0": {"data": {"version": 1,
+ "inotable": {"projected_free": [
+ {"start": initial_range_start + 101,
+ "len": initial_range_len - 101}],
+ "free": [
+ {"start": initial_range_start + 101,
+ "len": initial_range_len - 101}]}},
+ "result": 0}}
+ )
+
+ @for_teuthology # Hack: "for_teuthology" because .sh doesn't work outside teuth
+ def test_journal_smoke(self):
+ workunit(self.ctx, {
+ 'clients': {
+ "client.{0}".format(self.mount_a.client_id): [
+ "fs/misc/trivial_sync.sh"],
+ },
+ "timeout": "1h"
+ })
+
+ for mount in self.mounts:
+ mount.umount_wait()
+
+ self.fs.mds_stop()
+ self.fs.mds_fail()
+
+ # journal tool smoke
+ workunit(self.ctx, {
+ 'clients': {
+ "client.{0}".format(self.mount_a.client_id): [
+ "suites/cephfs_journal_tool_smoke.sh"],
+ },
+ "timeout": "1h"
+ })
+
+ self.fs.mds_restart()
+ self.fs.wait_for_daemons()
+
+ self.mount_a.mount()
+
+ # trivial sync on mount a
+ workunit(self.ctx, {
+ 'clients': {
+ 
"client.{0}".format(self.mount_a.client_id): [ + "fs/misc/trivial_sync.sh"], + }, + "timeout": "1h" + }) + diff --git a/src/ceph/qa/tasks/cephfs/test_mantle.py b/src/ceph/qa/tasks/cephfs/test_mantle.py new file mode 100644 index 0000000..6cd86ad --- /dev/null +++ b/src/ceph/qa/tasks/cephfs/test_mantle.py @@ -0,0 +1,109 @@ +from tasks.cephfs.cephfs_test_case import CephFSTestCase +import json +import logging + +log = logging.getLogger(__name__) +failure = "using old balancer; mantle failed for balancer=" +success = "mantle balancer version changed: " + +class TestMantle(CephFSTestCase): + def start_mantle(self): + self.wait_for_health_clear(timeout=30) + self.fs.set_max_mds(2) + self.wait_until_equal(lambda: len(self.fs.get_active_names()), 2, 30, + reject_fn=lambda v: v > 2 or v < 1) + + for m in self.fs.get_active_names(): + self.fs.mds_asok(['config', 'set', 'debug_objecter', '20'], mds_id=m) + self.fs.mds_asok(['config', 'set', 'debug_ms', '0'], mds_id=m) + self.fs.mds_asok(['config', 'set', 'debug_mds', '0'], mds_id=m) + self.fs.mds_asok(['config', 'set', 'debug_mds_balancer', '5'], mds_id=m) + + def push_balancer(self, obj, lua_code, expect): + self.fs.mon_manager.raw_cluster_cmd_result('fs', 'set', self.fs.name, 'balancer', obj) + self.fs.rados(["put", obj, "-"], stdin_data=lua_code) + with self.assert_cluster_log(failure + obj + " " + expect): + log.info("run a " + obj + " balancer that expects=" + expect) + + def test_version_empty(self): + self.start_mantle() + expect = " : (2) No such file or directory" + + ret = self.fs.mon_manager.raw_cluster_cmd_result('fs', 'set', self.fs.name, 'balancer') + assert(ret == 22) # EINVAL + + self.fs.mon_manager.raw_cluster_cmd_result('fs', 'set', self.fs.name, 'balancer', " ") + with self.assert_cluster_log(failure + " " + expect): pass + + def test_version_not_in_rados(self): + self.start_mantle() + expect = failure + "ghost.lua : (2) No such file or directory" + self.fs.mon_manager.raw_cluster_cmd_result('fs', 'set', self.fs.name, 'balancer', "ghost.lua") + with self.assert_cluster_log(expect): pass + + def test_balancer_invalid(self): + self.start_mantle() + expect = ": (22) Invalid argument" + + lua_code = "this is invalid lua code!" 
+ self.push_balancer("invalid.lua", lua_code, expect) + + lua_code = "BAL_LOG()" + self.push_balancer("invalid_log.lua", lua_code, expect) + + lua_code = "BAL_LOG(0)" + self.push_balancer("invalid_log_again.lua", lua_code, expect) + + def test_balancer_valid(self): + self.start_mantle() + lua_code = "BAL_LOG(0, \"test\")\nreturn {3, 4}" + self.fs.mon_manager.raw_cluster_cmd_result('fs', 'set', self.fs.name, 'balancer', "valid.lua") + self.fs.rados(["put", "valid.lua", "-"], stdin_data=lua_code) + with self.assert_cluster_log(success + "valid.lua"): + log.info("run a valid.lua balancer") + + def test_return_invalid(self): + self.start_mantle() + expect = ": (22) Invalid argument" + + lua_code = "return \"hello\"" + self.push_balancer("string.lua", lua_code, expect) + + lua_code = "return 3" + self.push_balancer("number.lua", lua_code, expect) + + lua_code = "return {}" + self.push_balancer("dict_empty.lua", lua_code, expect) + + lua_code = "return {\"this\", \"is\", \"a\", \"test\"}" + self.push_balancer("dict_of_strings.lua", lua_code, expect) + + lua_code = "return {3, \"test\"}" + self.push_balancer("dict_of_mixed.lua", lua_code, expect) + + lua_code = "return {3}" + self.push_balancer("not_enough_numbers.lua", lua_code, expect) + + lua_code = "return {3, 4, 5, 6, 7, 8, 9}" + self.push_balancer("too_many_numbers.lua", lua_code, expect) + + def test_dead_osd(self): + self.start_mantle() + expect = " : (110) Connection timed out" + + # kill the OSDs so that the balancer pull from RADOS times out + osd_map = json.loads(self.fs.mon_manager.raw_cluster_cmd('osd', 'dump', '--format=json-pretty')) + for i in range(0, len(osd_map['osds'])): + self.fs.mon_manager.raw_cluster_cmd_result('osd', 'down', str(i)) + self.fs.mon_manager.raw_cluster_cmd_result('osd', 'out', str(i)) + + # trigger a pull from RADOS + self.fs.mon_manager.raw_cluster_cmd_result('fs', 'set', self.fs.name, 'balancer', "valid.lua") + + # make the timeout a little longer since dead OSDs spam ceph -w + with self.assert_cluster_log(failure + "valid.lua" + expect, timeout=30): + log.info("run a balancer that should timeout") + + # cleanup + for i in range(0, len(osd_map['osds'])): + self.fs.mon_manager.raw_cluster_cmd_result('osd', 'in', str(i)) diff --git a/src/ceph/qa/tasks/cephfs/test_misc.py b/src/ceph/qa/tasks/cephfs/test_misc.py new file mode 100644 index 0000000..d857cfd --- /dev/null +++ b/src/ceph/qa/tasks/cephfs/test_misc.py @@ -0,0 +1,149 @@ + +from unittest import SkipTest +from tasks.cephfs.fuse_mount import FuseMount +from tasks.cephfs.cephfs_test_case import CephFSTestCase +from teuthology.orchestra.run import CommandFailedError +import errno +import time +import json + + +class TestMisc(CephFSTestCase): + CLIENTS_REQUIRED = 2 + + LOAD_SETTINGS = ["mds_session_autoclose"] + mds_session_autoclose = None + + def test_getattr_caps(self): + """ + Check if MDS recognizes the 'mask' parameter of open request. + The paramter allows client to request caps when opening file + """ + + if not isinstance(self.mount_a, FuseMount): + raise SkipTest("Require FUSE client") + + # Enable debug. Client will requests CEPH_CAP_XATTR_SHARED + # on lookup/open + self.mount_b.umount_wait() + self.set_conf('client', 'client debug getattr caps', 'true') + self.mount_b.mount() + self.mount_b.wait_until_mounted() + + # create a file and hold it open. MDS will issue CEPH_CAP_EXCL_* + # to mount_a + p = self.mount_a.open_background("testfile") + self.mount_b.wait_for_visible("testfile") + + # this tiggers a lookup request and an open request. 
The debug + # code will check if lookup/open reply contains xattrs + self.mount_b.run_shell(["cat", "testfile"]) + + self.mount_a.kill_background(p) + + def test_fs_new(self): + data_pool_name = self.fs.get_data_pool_name() + + self.fs.mds_stop() + self.fs.mds_fail() + + self.fs.mon_manager.raw_cluster_cmd('fs', 'rm', self.fs.name, + '--yes-i-really-mean-it') + + self.fs.mon_manager.raw_cluster_cmd('osd', 'pool', 'delete', + self.fs.metadata_pool_name, + self.fs.metadata_pool_name, + '--yes-i-really-really-mean-it') + self.fs.mon_manager.raw_cluster_cmd('osd', 'pool', 'create', + self.fs.metadata_pool_name, + self.fs.get_pgs_per_fs_pool().__str__()) + + dummyfile = '/etc/fstab' + + self.fs.put_metadata_object_raw("key", dummyfile) + + def get_pool_df(fs, name): + try: + return fs.get_pool_df(name)['objects'] > 0 + except RuntimeError as e: + return False + + self.wait_until_true(lambda: get_pool_df(self.fs, self.fs.metadata_pool_name), timeout=30) + + try: + self.fs.mon_manager.raw_cluster_cmd('fs', 'new', self.fs.name, + self.fs.metadata_pool_name, + data_pool_name) + except CommandFailedError as e: + self.assertEqual(e.exitstatus, errno.EINVAL) + else: + raise AssertionError("Expected EINVAL") + + self.fs.mon_manager.raw_cluster_cmd('fs', 'new', self.fs.name, + self.fs.metadata_pool_name, + data_pool_name, "--force") + + self.fs.mon_manager.raw_cluster_cmd('fs', 'rm', self.fs.name, + '--yes-i-really-mean-it') + + + self.fs.mon_manager.raw_cluster_cmd('osd', 'pool', 'delete', + self.fs.metadata_pool_name, + self.fs.metadata_pool_name, + '--yes-i-really-really-mean-it') + self.fs.mon_manager.raw_cluster_cmd('osd', 'pool', 'create', + self.fs.metadata_pool_name, + self.fs.get_pgs_per_fs_pool().__str__()) + self.fs.mon_manager.raw_cluster_cmd('fs', 'new', self.fs.name, + self.fs.metadata_pool_name, + data_pool_name) + + def test_evict_client(self): + """ + Check that a slow client session won't get evicted if it's the + only session + """ + + self.mount_b.umount_wait() + ls_data = self.fs.mds_asok(['session', 'ls']) + self.assert_session_count(1, ls_data) + + self.mount_a.kill() + self.mount_a.kill_cleanup() + + time.sleep(self.mds_session_autoclose * 1.5) + ls_data = self.fs.mds_asok(['session', 'ls']) + self.assert_session_count(1, ls_data) + + self.mount_a.mount() + self.mount_a.wait_until_mounted() + self.mount_b.mount() + self.mount_b.wait_until_mounted() + + ls_data = self._session_list() + self.assert_session_count(2, ls_data) + + self.mount_a.kill() + self.mount_a.kill_cleanup() + + time.sleep(self.mds_session_autoclose * 1.5) + ls_data = self.fs.mds_asok(['session', 'ls']) + self.assert_session_count(1, ls_data) + + def test_filtered_df(self): + pool_name = self.fs.get_data_pool_name() + raw_df = self.fs.get_pool_df(pool_name) + raw_avail = float(raw_df["max_avail"]) + out = self.fs.mon_manager.raw_cluster_cmd('osd', 'pool', 'get', + pool_name, 'size', + '-f', 'json-pretty') + j = json.loads(out) + pool_size = int(j['size']) + + proc = self.mount_a.run_shell(['df', '.']) + output = proc.stdout.getvalue() + fs_avail = output.split('\n')[1].split()[3] + fs_avail = float(fs_avail) * 1024 + + ratio = raw_avail / fs_avail + assert 0.9 < ratio < 1.1 diff --git a/src/ceph/qa/tasks/cephfs/test_pool_perm.py b/src/ceph/qa/tasks/cephfs/test_pool_perm.py new file mode 100644 index 0000000..22775e7 --- /dev/null +++ b/src/ceph/qa/tasks/cephfs/test_pool_perm.py @@ -0,0 +1,113 @@ +from textwrap import dedent +from teuthology.exceptions import CommandFailedError +from tasks.cephfs.cephfs_test_case 
import CephFSTestCase +import os + + +class TestPoolPerm(CephFSTestCase): + def test_pool_perm(self): + self.mount_a.run_shell(["touch", "test_file"]) + + file_path = os.path.join(self.mount_a.mountpoint, "test_file") + + remote_script = dedent(""" + import os + import errno + + fd = os.open("{path}", os.O_RDWR) + try: + if {check_read}: + ret = os.read(fd, 1024) + else: + os.write(fd, 'content') + except OSError, e: + if e.errno != errno.EPERM: + raise + else: + raise RuntimeError("client does not check permission of data pool") + """) + + client_name = "client.{0}".format(self.mount_a.client_id) + + # set data pool read only + self.fs.mon_manager.raw_cluster_cmd_result( + 'auth', 'caps', client_name, 'mds', 'allow', 'mon', 'allow r', 'osd', + 'allow r pool={0}'.format(self.fs.get_data_pool_name())) + + self.mount_a.umount_wait() + self.mount_a.mount() + self.mount_a.wait_until_mounted() + + # write should fail + self.mount_a.run_python(remote_script.format(path=file_path, check_read=str(False))) + + # set data pool write only + self.fs.mon_manager.raw_cluster_cmd_result( + 'auth', 'caps', client_name, 'mds', 'allow', 'mon', 'allow r', 'osd', + 'allow w pool={0}'.format(self.fs.get_data_pool_name())) + + self.mount_a.umount_wait() + self.mount_a.mount() + self.mount_a.wait_until_mounted() + + # read should fail + self.mount_a.run_python(remote_script.format(path=file_path, check_read=str(True))) + + def test_forbidden_modification(self): + """ + That a client who does not have the capability for setting + layout pools is prevented from doing so. + """ + + # Set up + client_name = "client.{0}".format(self.mount_a.client_id) + new_pool_name = "data_new" + self.fs.add_data_pool(new_pool_name) + + self.mount_a.run_shell(["touch", "layoutfile"]) + self.mount_a.run_shell(["mkdir", "layoutdir"]) + + # Set MDS 'rw' perms: missing 'p' means no setting pool layouts + self.fs.mon_manager.raw_cluster_cmd_result( + 'auth', 'caps', client_name, 'mds', 'allow rw', 'mon', 'allow r', + 'osd', + 'allow rw pool={0},allow rw pool={1}'.format( + self.fs.get_data_pool_names()[0], + self.fs.get_data_pool_names()[1], + )) + + self.mount_a.umount_wait() + self.mount_a.mount() + self.mount_a.wait_until_mounted() + + with self.assertRaises(CommandFailedError): + self.mount_a.setfattr("layoutfile", "ceph.file.layout.pool", + new_pool_name) + with self.assertRaises(CommandFailedError): + self.mount_a.setfattr("layoutdir", "ceph.dir.layout.pool", + new_pool_name) + self.mount_a.umount_wait() + + # Set MDS 'rwp' perms: should now be able to set layouts + self.fs.mon_manager.raw_cluster_cmd_result( + 'auth', 'caps', client_name, 'mds', 'allow rwp', 'mon', 'allow r', + 'osd', + 'allow rw pool={0},allow rw pool={1}'.format( + self.fs.get_data_pool_names()[0], + self.fs.get_data_pool_names()[1], + )) + self.mount_a.mount() + self.mount_a.wait_until_mounted() + self.mount_a.setfattr("layoutfile", "ceph.file.layout.pool", + new_pool_name) + self.mount_a.setfattr("layoutdir", "ceph.dir.layout.pool", + new_pool_name) + self.mount_a.umount_wait() + + def tearDown(self): + self.fs.mon_manager.raw_cluster_cmd_result( + 'auth', 'caps', "client.{0}".format(self.mount_a.client_id), + 'mds', 'allow', 'mon', 'allow r', 'osd', + 'allow rw pool={0}'.format(self.fs.get_data_pool_names()[0])) + super(TestPoolPerm, self).tearDown() + diff --git a/src/ceph/qa/tasks/cephfs/test_quota.py b/src/ceph/qa/tasks/cephfs/test_quota.py new file mode 100644 index 0000000..ee11c58 --- /dev/null +++ b/src/ceph/qa/tasks/cephfs/test_quota.py @@ -0,0 
+1,106 @@ + +from cephfs_test_case import CephFSTestCase + +from teuthology.exceptions import CommandFailedError + +class TestQuota(CephFSTestCase): + CLIENTS_REQUIRED = 2 + MDSS_REQUIRED = 1 + + def test_remote_update_getfattr(self): + """ + That quota changes made from one client are visible to another + client looking at ceph.quota xattrs + """ + self.mount_a.run_shell(["mkdir", "subdir"]) + + self.assertEqual( + self.mount_a.getfattr("./subdir", "ceph.quota.max_files"), + None) + self.assertEqual( + self.mount_b.getfattr("./subdir", "ceph.quota.max_files"), + None) + + self.mount_a.setfattr("./subdir", "ceph.quota.max_files", "10") + self.assertEqual( + self.mount_a.getfattr("./subdir", "ceph.quota.max_files"), + "10") + + # Should be visible as soon as setxattr operation completes on + # mds (we get here sooner because setfattr gets an early reply) + self.wait_until_equal( + lambda: self.mount_b.getfattr("./subdir", "ceph.quota.max_files"), + "10", timeout=10) + + def test_remote_update_df(self): + """ + That when a client modifies the quota on a directory used + as another client's root, the other client sees the change + reflected in their statfs output. + """ + + self.mount_b.umount_wait() + + self.mount_a.run_shell(["mkdir", "subdir"]) + + size_before = 1024 * 1024 * 128 + self.mount_a.setfattr("./subdir", "ceph.quota.max_bytes", + "%s" % size_before) + + self.mount_b.mount(mount_path="/subdir") + + self.assertDictEqual( + self.mount_b.df(), + { + "total": size_before, + "used": 0, + "available": size_before + }) + + size_after = 1024 * 1024 * 256 + self.mount_a.setfattr("./subdir", "ceph.quota.max_bytes", + "%s" % size_after) + + # Should be visible as soon as setxattr operation completes on + # mds (we get here sooner because setfattr gets an early reply) + self.wait_until_equal( + lambda: self.mount_b.df(), + { + "total": size_after, + "used": 0, + "available": size_after + }, + timeout=10 + ) + + def test_remote_update_write(self): + """ + That when a client modifies the quota on a directory used + as another client's root, the other client sees the effect + of the change when writing data. 
+ """ + + self.mount_a.run_shell(["mkdir", "subdir_files"]) + self.mount_a.run_shell(["mkdir", "subdir_data"]) + + # Set some nice high quotas that mount_b's initial operations + # will be well within + self.mount_a.setfattr("./subdir_files", "ceph.quota.max_files", "100") + self.mount_a.setfattr("./subdir_data", "ceph.quota.max_bytes", "104857600") + + # Do some writes within my quota + self.mount_b.create_n_files("subdir_files/file", 20) + self.mount_b.write_n_mb("subdir_data/file", 20) + + # Set quotas lower than what mount_b already wrote, it should + # refuse to write more once it's seen them + self.mount_a.setfattr("./subdir_files", "ceph.quota.max_files", "10") + self.mount_a.setfattr("./subdir_data", "ceph.quota.max_bytes", "1048576") + + # Do some writes that would have been okay within the old quota, + # but are forbidden under the new quota + with self.assertRaises(CommandFailedError): + self.mount_b.create_n_files("subdir_files/file", 40) + with self.assertRaises(CommandFailedError): + self.mount_b.write_n_mb("subdir_data/file", 40) + diff --git a/src/ceph/qa/tasks/cephfs/test_readahead.py b/src/ceph/qa/tasks/cephfs/test_readahead.py new file mode 100644 index 0000000..31e7bf1 --- /dev/null +++ b/src/ceph/qa/tasks/cephfs/test_readahead.py @@ -0,0 +1,31 @@ +import logging +from tasks.cephfs.fuse_mount import FuseMount +from tasks.cephfs.cephfs_test_case import CephFSTestCase + +log = logging.getLogger(__name__) + + +class TestReadahead(CephFSTestCase): + def test_flush(self): + if not isinstance(self.mount_a, FuseMount): + self.skipTest("FUSE needed for measuring op counts") + + # Create 32MB file + self.mount_a.run_shell(["dd", "if=/dev/urandom", "of=foo", "bs=1M", "count=32"]) + + # Unmount and remount the client to flush cache + self.mount_a.umount_wait() + self.mount_a.mount() + self.mount_a.wait_until_mounted() + + initial_op_r = self.mount_a.admin_socket(['perf', 'dump', 'objecter'])['objecter']['op_r'] + self.mount_a.run_shell(["dd", "if=foo", "of=/dev/null", "bs=128k", "count=32"]) + op_r = self.mount_a.admin_socket(['perf', 'dump', 'objecter'])['objecter']['op_r'] + assert op_r >= initial_op_r + op_r -= initial_op_r + log.info("read operations: {0}".format(op_r)) + + # with exponentially increasing readahead, we should see fewer than 10 operations + # but this test simply checks if the client is doing a remote read for each local read + if op_r >= 32: + raise RuntimeError("readahead not working") diff --git a/src/ceph/qa/tasks/cephfs/test_recovery_pool.py b/src/ceph/qa/tasks/cephfs/test_recovery_pool.py new file mode 100644 index 0000000..097342a --- /dev/null +++ b/src/ceph/qa/tasks/cephfs/test_recovery_pool.py @@ -0,0 +1,220 @@ + +""" +Test our tools for recovering metadata from the data pool into an alternate pool +""" +import json + +import logging +import os +from textwrap import dedent +import traceback +from collections import namedtuple, defaultdict + +from teuthology.orchestra.run import CommandFailedError +from tasks.cephfs.cephfs_test_case import CephFSTestCase, for_teuthology + +log = logging.getLogger(__name__) + + +ValidationError = namedtuple("ValidationError", ["exception", "backtrace"]) + + +class OverlayWorkload(object): + def __init__(self, orig_fs, recovery_fs, orig_mount, recovery_mount): + self._orig_fs = orig_fs + self._recovery_fs = recovery_fs + self._orig_mount = orig_mount + self._recovery_mount = recovery_mount + self._initial_state = None + + # Accumulate backtraces for every failed validation, and return them. 
Backtraces + # are rather verbose, but we only see them when something breaks, and they + # let us see which check failed without having to decorate each check with + # a string + self._errors = [] + + def assert_equal(self, a, b): + try: + if a != b: + raise AssertionError("{0} != {1}".format(a, b)) + except AssertionError as e: + self._errors.append( + ValidationError(e, traceback.format_exc(3)) + ) + + def write(self): + """ + Write the workload files to the mount + """ + raise NotImplementedError() + + def validate(self): + """ + Read from the mount and validate that the workload files are present (i.e. have + survived or been reconstructed from the test scenario) + """ + raise NotImplementedError() + + def damage(self): + """ + Damage the filesystem pools in ways that will be interesting to recover from. By + default just wipe everything in the metadata pool + """ + # Delete every object in the metadata pool + objects = self._orig_fs.rados(["ls"]).split("\n") + for o in objects: + self._orig_fs.rados(["rm", o]) + + def flush(self): + """ + Called after client unmount, after write: flush whatever you want + """ + self._orig_fs.mds_asok(["flush", "journal"]) + self._recovery_fs.mds_asok(["flush", "journal"]) + + +class SimpleOverlayWorkload(OverlayWorkload): + """ + Single file, single directory, check that it gets recovered and so does its size + """ + def write(self): + self._orig_mount.run_shell(["mkdir", "subdir"]) + self._orig_mount.write_n_mb("subdir/sixmegs", 6) + self._initial_state = self._orig_mount.stat("subdir/sixmegs") + + def validate(self): + self._recovery_mount.run_shell(["ls", "subdir"]) + st = self._recovery_mount.stat("subdir/sixmegs") + self.assert_equal(st['st_size'], self._initial_state['st_size']) + return self._errors + +class TestRecoveryPool(CephFSTestCase): + MDSS_REQUIRED = 2 + CLIENTS_REQUIRED = 2 + REQUIRE_RECOVERY_FILESYSTEM = True + + def is_marked_damaged(self, rank): + mds_map = self.fs.get_mds_map() + return rank in mds_map['damaged'] + + def _rebuild_metadata(self, workload, other_pool=None, workers=1): + """ + That when all objects in metadata pool are removed, we can rebuild a metadata pool + based on the contents of a data pool, and a client can see and read our files. 
+ """ + + # First, inject some files + + workload.write() + + # Unmount the client and flush the journal: the tool should also cope with + # situations where there is dirty metadata, but we'll test that separately + self.mount_a.umount_wait() + self.mount_b.umount_wait() + workload.flush() + + # Create the alternate pool if requested + recovery_fs = self.recovery_fs.name + recovery_pool = self.recovery_fs.get_metadata_pool_name() + self.recovery_fs.data_scan(['init', '--force-init', + '--filesystem', recovery_fs, + '--alternate-pool', recovery_pool]) + self.recovery_fs.mon_manager.raw_cluster_cmd('-s') + self.recovery_fs.table_tool([recovery_fs + ":0", "reset", "session"]) + self.recovery_fs.table_tool([recovery_fs + ":0", "reset", "snap"]) + self.recovery_fs.table_tool([recovery_fs + ":0", "reset", "inode"]) + + # Stop the MDS + self.fs.mds_stop() + self.fs.mds_fail() + + # After recovery, we need the MDS to not be strict about stats (in production these options + # are off by default, but in QA we need to explicitly disable them) + self.fs.set_ceph_conf('mds', 'mds verify scatter', False) + self.fs.set_ceph_conf('mds', 'mds debug scatterstat', False) + + # Apply any data damage the workload wants + workload.damage() + + # Reset the MDS map in case multiple ranks were in play: recovery procedure + # only understands how to rebuild metadata under rank 0 + self.fs.mon_manager.raw_cluster_cmd('fs', 'reset', self.fs.name, + '--yes-i-really-mean-it') + + def get_state(mds_id): + info = self.mds_cluster.get_mds_info(mds_id) + return info['state'] if info is not None else None + + self.fs.table_tool([self.fs.name + ":0", "reset", "session"]) + self.fs.table_tool([self.fs.name + ":0", "reset", "snap"]) + self.fs.table_tool([self.fs.name + ":0", "reset", "inode"]) + + # Run the recovery procedure + if False: + with self.assertRaises(CommandFailedError): + # Normal reset should fail when no objects are present, we'll use --force instead + self.fs.journal_tool(["journal", "reset"]) + + self.fs.mds_stop() + self.fs.data_scan(['scan_extents', '--alternate-pool', + recovery_pool, '--filesystem', self.fs.name, + self.fs.get_data_pool_name()]) + self.fs.data_scan(['scan_inodes', '--alternate-pool', + recovery_pool, '--filesystem', self.fs.name, + '--force-corrupt', '--force-init', + self.fs.get_data_pool_name()]) + self.fs.journal_tool(['--rank=' + self.fs.name + ":0", 'event', + 'recover_dentries', 'list', + '--alternate-pool', recovery_pool]) + + self.fs.data_scan(['init', '--force-init', '--filesystem', + self.fs.name]) + self.fs.data_scan(['scan_inodes', '--filesystem', self.fs.name, + '--force-corrupt', '--force-init', + self.fs.get_data_pool_name()]) + self.fs.journal_tool(['--rank=' + self.fs.name + ":0", 'event', + 'recover_dentries', 'list']) + + self.fs.journal_tool(['--rank=' + recovery_fs + ":0", 'journal', + 'reset', '--force']) + self.fs.journal_tool(['--rank=' + self.fs.name + ":0", 'journal', + 'reset', '--force']) + self.fs.mon_manager.raw_cluster_cmd('mds', 'repaired', + recovery_fs + ":0") + + # Mark the MDS repaired + self.fs.mon_manager.raw_cluster_cmd('mds', 'repaired', '0') + + # Start the MDS + self.fs.mds_restart() + self.recovery_fs.mds_restart() + self.fs.wait_for_daemons() + self.recovery_fs.wait_for_daemons() + for mds_id in self.recovery_fs.mds_ids: + self.fs.mon_manager.raw_cluster_cmd('tell', "mds." + mds_id, + 'injectargs', '--debug-mds=20') + self.fs.mon_manager.raw_cluster_cmd('daemon', "mds." 
+ mds_id, + 'scrub_path', '/', + 'recursive', 'repair') + log.info(str(self.mds_cluster.status())) + + # Mount a client + self.mount_a.mount() + self.mount_b.mount(mount_fs_name=recovery_fs) + self.mount_a.wait_until_mounted() + self.mount_b.wait_until_mounted() + + # See that the files are present and correct + errors = workload.validate() + if errors: + log.error("Validation errors found: {0}".format(len(errors))) + for e in errors: + log.error(e.exception) + log.error(e.backtrace) + raise AssertionError("Validation failed, first error: {0}\n{1}".format( + errors[0].exception, errors[0].backtrace + )) + + def test_rebuild_simple(self): + self._rebuild_metadata(SimpleOverlayWorkload(self.fs, self.recovery_fs, + self.mount_a, self.mount_b)) diff --git a/src/ceph/qa/tasks/cephfs/test_scrub_checks.py b/src/ceph/qa/tasks/cephfs/test_scrub_checks.py new file mode 100644 index 0000000..a2de527 --- /dev/null +++ b/src/ceph/qa/tasks/cephfs/test_scrub_checks.py @@ -0,0 +1,245 @@ +""" +MDS admin socket scrubbing-related tests. +""" +import json +import logging +import errno +import time +from teuthology.exceptions import CommandFailedError +import os +from tasks.cephfs.cephfs_test_case import CephFSTestCase + +log = logging.getLogger(__name__) + + +class TestScrubChecks(CephFSTestCase): + """ + Run flush and scrub commands on the specified files in the filesystem. This + task will run through a sequence of operations, but it is not comprehensive + on its own -- it doesn't manipulate the mds cache state to test on both + in- and out-of-memory parts of the hierarchy. So it's designed to be run + multiple times within a single test run, so that the test can manipulate + memory state. + + Usage: + mds_scrub_checks: + mds_rank: 0 + path: path/to/test/dir + client: 0 + run_seq: [0-9]+ + + Increment the run_seq on subsequent invocations within a single test run; + it uses that value to generate unique folder and file names. 
+ """ + + MDSS_REQUIRED = 1 + CLIENTS_REQUIRED = 1 + + def test_scrub_checks(self): + self._checks(0) + self._checks(1) + + def _checks(self, run_seq): + mds_rank = 0 + test_dir = "scrub_test_path" + + abs_test_path = "/{0}".format(test_dir) + + log.info("mountpoint: {0}".format(self.mount_a.mountpoint)) + client_path = os.path.join(self.mount_a.mountpoint, test_dir) + log.info("client_path: {0}".format(client_path)) + + log.info("Cloning repo into place") + repo_path = self.clone_repo(self.mount_a, client_path) + + log.info("Initiating mds_scrub_checks on mds.{id_}, " + + "test_path {path}, run_seq {seq}".format( + id_=mds_rank, path=abs_test_path, seq=run_seq) + ) + + + success_validator = lambda j, r: self.json_validator(j, r, "return_code", 0) + + nep = "{test_path}/i/dont/exist".format(test_path=abs_test_path) + self.asok_command(mds_rank, "flush_path {nep}".format(nep=nep), + lambda j, r: self.json_validator(j, r, "return_code", -errno.ENOENT)) + self.asok_command(mds_rank, "scrub_path {nep}".format(nep=nep), + lambda j, r: self.json_validator(j, r, "return_code", -errno.ENOENT)) + + test_repo_path = "{test_path}/ceph-qa-suite".format(test_path=abs_test_path) + dirpath = "{repo_path}/suites".format(repo_path=test_repo_path) + + if run_seq == 0: + log.info("First run: flushing {dirpath}".format(dirpath=dirpath)) + command = "flush_path {dirpath}".format(dirpath=dirpath) + self.asok_command(mds_rank, command, success_validator) + command = "scrub_path {dirpath}".format(dirpath=dirpath) + self.asok_command(mds_rank, command, success_validator) + + filepath = "{repo_path}/suites/fs/verify/validater/valgrind.yaml".format( + repo_path=test_repo_path) + if run_seq == 0: + log.info("First run: flushing {filepath}".format(filepath=filepath)) + command = "flush_path {filepath}".format(filepath=filepath) + self.asok_command(mds_rank, command, success_validator) + command = "scrub_path {filepath}".format(filepath=filepath) + self.asok_command(mds_rank, command, success_validator) + + filepath = "{repo_path}/suites/fs/basic/clusters/fixed-3-cephfs.yaml". 
\
+ format(repo_path=test_repo_path)
+ command = "scrub_path {filepath}".format(filepath=filepath)
+ self.asok_command(mds_rank, command,
+ lambda j, r: self.json_validator(j, r, "performed_validation",
+ False))
+
+ if run_seq == 0:
+ log.info("First run: flushing base dir /")
+ command = "flush_path /"
+ self.asok_command(mds_rank, command, success_validator)
+ command = "scrub_path /"
+ self.asok_command(mds_rank, command, success_validator)
+
+ new_dir = "{repo_path}/new_dir_{i}".format(repo_path=repo_path, i=run_seq)
+ test_new_dir = "{repo_path}/new_dir_{i}".format(repo_path=test_repo_path,
+ i=run_seq)
+ self.mount_a.run_shell(["mkdir", new_dir])
+ command = "flush_path {dir}".format(dir=test_new_dir)
+ self.asok_command(mds_rank, command, success_validator)
+
+ new_file = "{repo_path}/new_file_{i}".format(repo_path=repo_path,
+ i=run_seq)
+ test_new_file = "{repo_path}/new_file_{i}".format(repo_path=test_repo_path,
+ i=run_seq)
+ self.mount_a.write_n_mb(new_file, 1)
+
+ command = "flush_path {file}".format(file=test_new_file)
+ self.asok_command(mds_rank, command, success_validator)
+
+ # check that scrub fails on errors
+ ino = self.mount_a.path_to_ino(new_file)
+ rados_obj_name = "{ino:x}.00000000".format(ino=ino)
+ command = "scrub_path {file}".format(file=test_new_file)
+
+ # Missing parent xattr -> ENODATA
+ self.fs.rados(["rmxattr", rados_obj_name, "parent"], pool=self.fs.get_data_pool_name())
+ self.asok_command(mds_rank, command,
+ lambda j, r: self.json_validator(j, r, "return_code", -errno.ENODATA))
+
+ # Missing object -> ENOENT
+ self.fs.rados(["rm", rados_obj_name], pool=self.fs.get_data_pool_name())
+ self.asok_command(mds_rank, command,
+ lambda j, r: self.json_validator(j, r, "return_code", -errno.ENOENT))
+
+ command = "flush_path /"
+ self.asok_command(mds_rank, command, success_validator)
+
+ def test_scrub_repair(self):
+ mds_rank = 0
+ test_dir = "scrub_repair_path"
+
+ self.mount_a.run_shell(["sudo", "mkdir", test_dir])
+ self.mount_a.run_shell(["sudo", "touch", "{0}/file".format(test_dir)])
+ dir_objname = "{:x}.00000000".format(self.mount_a.path_to_ino(test_dir))
+
+ self.mount_a.umount_wait()
+
+ # flush journal entries to dirfrag objects, and expire the journal
+ self.fs.mds_asok(['flush', 'journal'])
+ self.fs.mds_stop()
+
+ # remove the dentry from the dirfrag, causing an incorrect fragstat/rstat
+ self.fs.rados(["rmomapkey", dir_objname, "file_head"],
+ pool=self.fs.get_metadata_pool_name())
+
+ self.fs.mds_fail_restart()
+ self.fs.wait_for_daemons()
+
+ self.mount_a.mount()
+ self.mount_a.wait_until_mounted()
+
+ # the fragstat indicates the directory is not empty, so rmdir should fail
+ with self.assertRaises(CommandFailedError) as ar:
+ self.mount_a.run_shell(["sudo", "rmdir", test_dir])
+ self.assertEqual(ar.exception.exitstatus, 1)
+
+ self.asok_command(mds_rank, "scrub_path /{0} repair".format(test_dir),
+ lambda j, r: self.json_validator(j, r, "return_code", 0))
+
+ # wait a few seconds for the background repair
+ time.sleep(10)
+
+ # the fragstat should now be fixed
+ self.mount_a.run_shell(["sudo", "rmdir", test_dir])
+
+ @staticmethod
+ def json_validator(json_out, rc, element, expected_value):
+ if rc != 0:
+ return False, "asok command returned error {rc}".format(rc=rc)
+ element_value = json_out.get(element)
+ if element_value != expected_value:
+ return False, "unexpectedly got {jv} instead of {ev}!".format(
+ jv=element_value, ev=expected_value)
+ return True, "Succeeded"
+
+ def asok_command(self, mds_rank, command, validator):
+ log.info("Running command 
'{command}'".format(command=command)) + + command_list = command.split() + + # we just assume there's an active mds for every rank + mds_id = self.fs.get_active_names()[mds_rank] + proc = self.fs.mon_manager.admin_socket('mds', mds_id, + command_list, check_status=False) + rout = proc.exitstatus + sout = proc.stdout.getvalue() + + if sout.strip(): + jout = json.loads(sout) + else: + jout = None + + log.info("command '{command}' got response code " + + "'{rout}' and stdout '{sout}'".format( + command=command, rout=rout, sout=sout)) + + success, errstring = validator(jout, rout) + + if not success: + raise AsokCommandFailedError(command, rout, jout, errstring) + + return jout + + def clone_repo(self, client_mount, path): + repo = "ceph-qa-suite" + repo_path = os.path.join(path, repo) + client_mount.run_shell(["mkdir", "-p", path]) + + try: + client_mount.stat(repo_path) + except CommandFailedError: + client_mount.run_shell([ + "git", "clone", '--branch', 'giant', + "http://github.com/ceph/{repo}".format(repo=repo), + "{path}/{repo}".format(path=path, repo=repo) + ]) + + return repo_path + + +class AsokCommandFailedError(Exception): + """ + Exception thrown when we get an unexpected response + on an admin socket command + """ + + def __init__(self, command, rc, json_out, errstring): + self.command = command + self.rc = rc + self.json = json_out + self.errstring = errstring + + def __str__(self): + return "Admin socket: {command} failed with rc={rc}," + \ + "json output={json}, because '{es}'".format( + command=self.command, rc=self.rc, + json=self.json, es=self.errstring) diff --git a/src/ceph/qa/tasks/cephfs/test_sessionmap.py b/src/ceph/qa/tasks/cephfs/test_sessionmap.py new file mode 100644 index 0000000..9d12ab6 --- /dev/null +++ b/src/ceph/qa/tasks/cephfs/test_sessionmap.py @@ -0,0 +1,235 @@ +from StringIO import StringIO +import json +import logging +from unittest import SkipTest + +from tasks.cephfs.fuse_mount import FuseMount +from teuthology.exceptions import CommandFailedError +from tasks.cephfs.cephfs_test_case import CephFSTestCase + +log = logging.getLogger(__name__) + + +class TestSessionMap(CephFSTestCase): + CLIENTS_REQUIRED = 2 + MDSS_REQUIRED = 2 + + def test_tell_session_drop(self): + """ + That when a `tell` command is sent using the python CLI, + its MDS session is gone after it terminates + """ + self.mount_a.umount_wait() + self.mount_b.umount_wait() + + mds_id = self.fs.get_lone_mds_id() + self.fs.mon_manager.raw_cluster_cmd("tell", "mds.{0}".format(mds_id), "session", "ls") + + ls_data = self.fs.mds_asok(['session', 'ls']) + self.assertEqual(len(ls_data), 0) + + def _get_thread_count(self, mds_id): + remote = self.fs.mds_daemons[mds_id].remote + + ps_txt = remote.run( + args=["ps", "-ww", "axo", "nlwp,cmd"], + stdout=StringIO() + ).stdout.getvalue().strip() + lines = ps_txt.split("\n")[1:] + + for line in lines: + if "ceph-mds" in line and not "daemon-helper" in line: + if line.find("-i {0}".format(mds_id)) != -1: + log.info("Found ps line for daemon: {0}".format(line)) + return int(line.split()[0]) + + raise RuntimeError("No process found in ps output for MDS {0}: {1}".format( + mds_id, ps_txt + )) + + def test_tell_conn_close(self): + """ + That when a `tell` command is sent using the python CLI, + the thread count goes back to where it started (i.e. 
we aren't + leaving connections open) + """ + self.mount_a.umount_wait() + self.mount_b.umount_wait() + + mds_id = self.fs.get_lone_mds_id() + + initial_thread_count = self._get_thread_count(mds_id) + self.fs.mon_manager.raw_cluster_cmd("tell", "mds.{0}".format(mds_id), "session", "ls") + final_thread_count = self._get_thread_count(mds_id) + + self.assertEqual(initial_thread_count, final_thread_count) + + def test_mount_conn_close(self): + """ + That when a client unmounts, the thread count on the MDS goes back + to what it was before the client mounted + """ + self.mount_a.umount_wait() + self.mount_b.umount_wait() + + mds_id = self.fs.get_lone_mds_id() + + initial_thread_count = self._get_thread_count(mds_id) + self.mount_a.mount() + self.mount_a.wait_until_mounted() + self.assertGreater(self._get_thread_count(mds_id), initial_thread_count) + self.mount_a.umount_wait() + final_thread_count = self._get_thread_count(mds_id) + + self.assertEqual(initial_thread_count, final_thread_count) + + def test_version_splitting(self): + """ + That when many sessions are updated, they are correctly + split into multiple versions to obey mds_sessionmap_keys_per_op + """ + + # Start umounted + self.mount_a.umount_wait() + self.mount_b.umount_wait() + + # Configure MDS to write one OMAP key at once + self.set_conf('mds', 'mds_sessionmap_keys_per_op', 1) + self.fs.mds_fail_restart() + self.fs.wait_for_daemons() + + # I would like two MDSs, so that I can do an export dir later + self.fs.set_max_mds(2) + self.fs.wait_for_daemons() + + active_mds_names = self.fs.get_active_names() + rank_0_id = active_mds_names[0] + rank_1_id = active_mds_names[1] + log.info("Ranks 0 and 1 are {0} and {1}".format( + rank_0_id, rank_1_id)) + + # Bring the clients back + self.mount_a.mount() + self.mount_b.mount() + self.mount_a.create_files() # Kick the client into opening sessions + self.mount_b.create_files() + + # See that they've got sessions + self.assert_session_count(2, mds_id=rank_0_id) + + # See that we persist their sessions + self.fs.mds_asok(["flush", "journal"], rank_0_id) + table_json = json.loads(self.fs.table_tool(["0", "show", "session"])) + log.info("SessionMap: {0}".format(json.dumps(table_json, indent=2))) + self.assertEqual(table_json['0']['result'], 0) + self.assertEqual(len(table_json['0']['data']['Sessions']), 2) + + # Now, induce a "force_open_sessions" event by exporting a dir + self.mount_a.run_shell(["mkdir", "bravo"]) + self.mount_a.run_shell(["touch", "bravo/file"]) + self.mount_b.run_shell(["ls", "-l", "bravo/file"]) + + def get_omap_wrs(): + return self.fs.mds_asok(['perf', 'dump', 'objecter'], rank_1_id)['objecter']['omap_wr'] + + # Flush so that there are no dirty sessions on rank 1 + self.fs.mds_asok(["flush", "journal"], rank_1_id) + + # Export so that we get a force_open to rank 1 for the two sessions from rank 0 + initial_omap_wrs = get_omap_wrs() + self.fs.mds_asok(['export', 'dir', '/bravo', '1'], rank_0_id) + + # This is the critical (if rather subtle) check: that in the process of doing an export dir, + # we hit force_open_sessions, and as a result we end up writing out the sessionmap. There + # will be two sessions dirtied here, and because we have set keys_per_op to 1, we should see + # a single session get written out (the first of the two, triggered by the second getting marked + # dirty) + # The number of writes is two per session, because the header (sessionmap version) update and + # KV write both count. 
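The check that follows relies on diffing the objecter's omap_wr perf counter before and after the export. That before/after-counter idiom appears throughout this suite; a condensed sketch of it, where mds_asok is a stand-in for the Filesystem.mds_asok helper these tests use:

    import time

    def wait_for_counter_delta(mds_asok, subsys, name, baseline, expected, timeout=10):
        # Poll `perf dump` until the named counter has advanced past the
        # baseline by exactly `expected`, e.g. objecter/omap_wr after an export.
        for _ in range(timeout):
            value = mds_asok(['perf', 'dump', subsys, name])[subsys][name]
            if value - baseline == expected:
                return
            time.sleep(1)
        raise RuntimeError("{0}.{1} did not advance by {2}".format(subsys, name, expected))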
+ self.wait_until_true(
+ lambda: get_omap_wrs() - initial_omap_wrs == 2,
+ timeout=10 # Long enough for an export to get acked
+ )
+
+ # Now end our sessions and check the backing sessionmap is updated correctly
+ self.mount_a.umount_wait()
+ self.mount_b.umount_wait()
+
+ # In-memory sessionmap check
+ self.assert_session_count(0, mds_id=rank_0_id)
+
+ # On-disk sessionmap check
+ self.fs.mds_asok(["flush", "journal"], rank_0_id)
+ table_json = json.loads(self.fs.table_tool(["0", "show", "session"]))
+ log.info("SessionMap: {0}".format(json.dumps(table_json, indent=2)))
+ self.assertEqual(table_json['0']['result'], 0)
+ self.assertEqual(len(table_json['0']['data']['Sessions']), 0)
+
+ def _sudo_write_file(self, remote, path, data):
+ """
+ Write data to a remote file as super user
+
+ :param remote: Remote site.
+ :param path: Path on the remote being written to.
+ :param data: Data to be written.
+ """
+ remote.run(
+ args=[
+ 'sudo',
+ 'python',
+ '-c',
+ 'import shutil, sys; shutil.copyfileobj(sys.stdin, file(sys.argv[1], "wb"))',
+ path,
+ ],
+ stdin=data,
+ )
+
+ def _configure_auth(self, mount, id_name, mds_caps, osd_caps=None, mon_caps=None):
+ """
+ Set up auth credentials for a client mount, and write out the keyring
+ for the client to use.
+ """
+
+ if osd_caps is None:
+ osd_caps = "allow rw"
+
+ if mon_caps is None:
+ mon_caps = "allow r"
+
+ out = self.fs.mon_manager.raw_cluster_cmd(
+ "auth", "get-or-create", "client.{name}".format(name=id_name),
+ "mds", mds_caps,
+ "osd", osd_caps,
+ "mon", mon_caps
+ )
+ mount.client_id = id_name
+ self._sudo_write_file(mount.client_remote, mount.get_keyring_path(), out)
+ self.set_conf("client.{name}".format(name=id_name), "keyring", mount.get_keyring_path())
+
+ def test_session_reject(self):
+ if not isinstance(self.mount_a, FuseMount):
+ raise SkipTest("Requires FUSE client to inject client metadata")
+
+ self.mount_a.run_shell(["mkdir", "foo"])
+ self.mount_a.run_shell(["mkdir", "foo/bar"])
+ self.mount_a.umount_wait()
+
+ # Mount B will be the rejected client
+ self.mount_b.umount_wait()
+
+ # Configure a client that is limited to /foo/bar
+ self._configure_auth(self.mount_b, "badguy", "allow rw path=/foo/bar")
+ # Check that it can mount that dir and do IO
+ self.mount_b.mount(mount_path="/foo/bar")
+ self.mount_b.wait_until_mounted()
+ self.mount_b.create_destroy()
+ self.mount_b.umount_wait()
+
+ # Configure the client to claim that its mount point metadata is /baz
+ self.set_conf("client.badguy", "client_metadata", "root=/baz")
+ # Try to mount the client, see that it fails
+ with self.assert_cluster_log("client session with invalid root '/baz' denied"):
+ with self.assertRaises(CommandFailedError):
+ self.mount_b.mount(mount_path="/foo/bar")
diff --git a/src/ceph/qa/tasks/cephfs/test_strays.py b/src/ceph/qa/tasks/cephfs/test_strays.py
new file mode 100644
index 0000000..b64f3e9
--- /dev/null
+++ b/src/ceph/qa/tasks/cephfs/test_strays.py
@@ -0,0 +1,1049 @@
+import json
+import time
+import logging
+from textwrap import dedent
+import datetime
+import gevent
+
+from teuthology.orchestra.run import CommandFailedError, Raw
+from tasks.cephfs.cephfs_test_case import CephFSTestCase, for_teuthology
+
+log = logging.getLogger(__name__)
+
+
+class TestStrays(CephFSTestCase):
+ MDSS_REQUIRED = 2
+
+ OPS_THROTTLE = 1
+ FILES_THROTTLE = 2
+
+ # Range of different file sizes used in the throttle test's workload
+ throttle_workload_size_range = 16
+
+ @for_teuthology
+ def 
test_ops_throttle(self): + self._test_throttling(self.OPS_THROTTLE) + + @for_teuthology + def test_files_throttle(self): + self._test_throttling(self.FILES_THROTTLE) + + def test_dir_deletion(self): + """ + That when deleting a bunch of dentries and the containing + directory, everything gets purged. + Catches cases where the client might e.g. fail to trim + the unlinked dir from its cache. + """ + file_count = 1000 + create_script = dedent(""" + import os + + mount_path = "{mount_path}" + subdir = "delete_me" + size = {size} + file_count = {file_count} + os.mkdir(os.path.join(mount_path, subdir)) + for i in xrange(0, file_count): + filename = "{{0}}_{{1}}.bin".format(i, size) + f = open(os.path.join(mount_path, subdir, filename), 'w') + f.write(size * 'x') + f.close() + """.format( + mount_path=self.mount_a.mountpoint, + size=1024, + file_count=file_count + )) + + self.mount_a.run_python(create_script) + + # That the dirfrag object is created + self.fs.mds_asok(["flush", "journal"]) + dir_ino = self.mount_a.path_to_ino("delete_me") + self.assertTrue(self.fs.dirfrag_exists(dir_ino, 0)) + + # Remove everything + self.mount_a.run_shell(["rm", "-rf", "delete_me"]) + self.fs.mds_asok(["flush", "journal"]) + + # That all the removed files get created as strays + strays = self.get_mdc_stat("strays_created") + self.assertEqual(strays, file_count + 1) + + # That the strays all get enqueued for purge + self.wait_until_equal( + lambda: self.get_mdc_stat("strays_enqueued"), + strays, + timeout=600 + + ) + + # That all the purge operations execute + self.wait_until_equal( + lambda: self.get_stat("purge_queue", "pq_executed"), + strays, + timeout=600 + ) + + # That finally, the directory metadata object is gone + self.assertFalse(self.fs.dirfrag_exists(dir_ino, 0)) + + # That finally, the data objects are all gone + self.await_data_pool_empty() + + def _test_throttling(self, throttle_type): + self.data_log = [] + try: + return self._do_test_throttling(throttle_type) + except: + for l in self.data_log: + log.info(",".join([l_.__str__() for l_ in l])) + raise + + def _do_test_throttling(self, throttle_type): + """ + That the mds_max_purge_ops setting is respected + """ + + def set_throttles(files, ops): + """ + Helper for updating ops/files limits, and calculating effective + ops_per_pg setting to give the same ops limit. + """ + self.set_conf('mds', 'mds_max_purge_files', "%d" % files) + self.set_conf('mds', 'mds_max_purge_ops', "%d" % ops) + + pgs = self.fs.mon_manager.get_pool_property( + self.fs.get_data_pool_name(), + "pg_num" + ) + ops_per_pg = float(ops) / pgs + self.set_conf('mds', 'mds_max_purge_ops_per_pg', "%s" % ops_per_pg) + + # Test conditions depend on what we're going to be exercising. + # * Lift the threshold on whatever throttle we are *not* testing, so + # that the throttle of interest is the one that will be the bottleneck + # * Create either many small files (test file count throttling) or fewer + # large files (test op throttling) + if throttle_type == self.OPS_THROTTLE: + set_throttles(files=100000000, ops=16) + size_unit = 1024 * 1024 # big files, generate lots of ops + file_multiplier = 100 + elif throttle_type == self.FILES_THROTTLE: + # The default value of file limit is pretty permissive, so to avoid + # the test running too fast, create lots of files and set the limit + # pretty low. 
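As the set_throttles() helper above notes, mds_max_purge_ops_per_pg is derived from the desired global ops limit by dividing through by the data pool's pg_num, so that the per-PG throttle expresses the same cap. A toy illustration of the arithmetic (the pg_num of 8 is hypothetical):

    # Hypothetical: a 16-op global purge limit over an 8-PG data pool.
    ops, pgs = 16, 8
    ops_per_pg = float(ops) / pgs    # 2.0, what set_throttles() would pass
    assert ops_per_pg * pgs == ops   # the per-PG setting reproduces the global cap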
+ set_throttles(ops=100000000, files=6)
+ size_unit = 1024 # small, numerous files
+ file_multiplier = 200
+ else:
+ raise NotImplementedError(throttle_type)
+
+ # Pick up config changes
+ self.fs.mds_fail_restart()
+ self.fs.wait_for_daemons()
+
+ create_script = dedent("""
+ import os
+
+ mount_path = "{mount_path}"
+ subdir = "delete_me"
+ size_unit = {size_unit}
+ file_multiplier = {file_multiplier}
+ os.mkdir(os.path.join(mount_path, subdir))
+ for i in xrange(0, file_multiplier):
+ for size in xrange(0, {size_range}*size_unit, size_unit):
+ filename = "{{0}}_{{1}}.bin".format(i, size / size_unit)
+ f = open(os.path.join(mount_path, subdir, filename), 'w')
+ f.write(size * 'x')
+ f.close()
+ """.format(
+ mount_path=self.mount_a.mountpoint,
+ size_unit=size_unit,
+ file_multiplier=file_multiplier,
+ size_range=self.throttle_workload_size_range
+ ))
+
+ self.mount_a.run_python(create_script)
+
+ # We will run the deletion in the background, to reduce the risk of it completing before
+ # we have started monitoring the stray statistics.
+ def background():
+ self.mount_a.run_shell(["rm", "-rf", "delete_me"])
+ self.fs.mds_asok(["flush", "journal"])
+
+ background_thread = gevent.spawn(background)
+
+ total_inodes = file_multiplier * self.throttle_workload_size_range + 1
+ mds_max_purge_ops = int(self.fs.get_config("mds_max_purge_ops", 'mds'))
+ mds_max_purge_files = int(self.fs.get_config("mds_max_purge_files", 'mds'))
+
+ # During this phase we look for the concurrent ops to exceed half
+ # the limit (a heuristic) and not exceed the limit (a correctness
+ # condition).
+ purge_timeout = 600
+ elapsed = 0
+ files_high_water = 0
+ ops_high_water = 0
+
+ while True:
+ stats = self.fs.mds_asok(['perf', 'dump'])
+ mdc_stats = stats['mds_cache']
+ pq_stats = stats['purge_queue']
+ if elapsed >= purge_timeout:
+ raise RuntimeError("Timeout waiting for {0} inodes to purge, stats:{1}".format(total_inodes, mdc_stats))
+
+ num_strays = mdc_stats['num_strays']
+ num_strays_purging = pq_stats['pq_executing']
+ num_purge_ops = pq_stats['pq_executing_ops']
+
+ self.data_log.append([datetime.datetime.now(), num_strays, num_strays_purging, num_purge_ops])
+
+ files_high_water = max(files_high_water, num_strays_purging)
+ ops_high_water = max(ops_high_water, num_purge_ops)
+
+ total_strays_created = mdc_stats['strays_created']
+ total_strays_purged = pq_stats['pq_executed']
+
+ if total_strays_purged == total_inodes:
+ log.info("Complete purge in {0} seconds".format(elapsed))
+ break
+ elif total_strays_purged > total_inodes:
+ raise RuntimeError("Saw more strays than expected, mdc stats: {0}".format(mdc_stats))
+ else:
+ if throttle_type == self.OPS_THROTTLE:
+ # 11 is filer_max_purge_ops plus one for the backtrace:
+ # the limit is allowed to be overshot by this much.
+ if num_purge_ops > mds_max_purge_ops + 11:
+ raise RuntimeError("num_purge_ops violates threshold {0}/{1}".format(
+ num_purge_ops, mds_max_purge_ops
+ ))
+ elif throttle_type == self.FILES_THROTTLE:
+ if num_strays_purging > mds_max_purge_files:
+ raise RuntimeError("num_strays_purging violates threshold {0}/{1}".format(
+ num_strays_purging, mds_max_purge_files
+ ))
+ else:
+ raise NotImplementedError(throttle_type)
+
+ log.info("Waiting for purge to complete {0}/{1}, {2}/{3}".format(
+ num_strays_purging, num_strays,
+ total_strays_purged, total_strays_created
+ ))
+ time.sleep(1)
+ elapsed += 1
+
+ background_thread.join()
+
+ # Check that we got up to a respectable rate during the purge. 
This is totally
+ # racy, but should be safe-ish unless the cluster is pathologically slow, or
+ # insanely fast such that the deletions all pass before we have polled the
+ # statistics.
+ if throttle_type == self.OPS_THROTTLE:
+ if ops_high_water < mds_max_purge_ops / 2:
+ raise RuntimeError("Ops in flight high water is unexpectedly low ({0} / {1})".format(
+ ops_high_water, mds_max_purge_ops
+ ))
+ elif throttle_type == self.FILES_THROTTLE:
+ if files_high_water < mds_max_purge_files / 2:
+ raise RuntimeError("Files in flight high water is unexpectedly low ({0} / {1})".format(
+ files_high_water, mds_max_purge_files
+ ))
+
+ # Sanity check all MDC stray stats
+ stats = self.fs.mds_asok(['perf', 'dump'])
+ mdc_stats = stats['mds_cache']
+ pq_stats = stats['purge_queue']
+ self.assertEqual(mdc_stats['num_strays'], 0)
+ self.assertEqual(mdc_stats['num_strays_delayed'], 0)
+ self.assertEqual(pq_stats['pq_executing'], 0)
+ self.assertEqual(pq_stats['pq_executing_ops'], 0)
+ self.assertEqual(mdc_stats['strays_created'], total_inodes)
+ self.assertEqual(mdc_stats['strays_enqueued'], total_inodes)
+ self.assertEqual(pq_stats['pq_executed'], total_inodes)
+
+ def get_mdc_stat(self, name, mds_id=None):
+ return self.get_stat("mds_cache", name, mds_id)
+
+ def get_stat(self, subsys, name, mds_id=None):
+ return self.fs.mds_asok(['perf', 'dump', subsys, name],
+ mds_id=mds_id)[subsys][name]
+
+ def _wait_for_counter(self, subsys, counter, expect_val, timeout=60,
+ mds_id=None):
+ self.wait_until_equal(
+ lambda: self.get_stat(subsys, counter, mds_id),
+ expect_val=expect_val, timeout=timeout,
+ reject_fn=lambda x: x > expect_val
+ )
+
+ def test_open_inode(self):
+ """
+ That the case of a dentry unlinked while a client holds an
+ inode open is handled correctly.
+
+ The inode should be moved into a stray dentry, while the original
+ dentry and directory should be purged.
+
+ The inode's data should be purged when the client eventually closes
+ it. 
+ """ + mount_a_client_id = self.mount_a.get_global_id() + + # Write some bytes to a file + size_mb = 8 + + # Hold the file open + p = self.mount_a.open_background("open_file") + self.mount_a.write_n_mb("open_file", size_mb) + open_file_ino = self.mount_a.path_to_ino("open_file") + + self.assertEqual(self.get_session(mount_a_client_id)['num_caps'], 2) + + # Unlink the dentry + self.mount_a.run_shell(["rm", "-f", "open_file"]) + + # Wait to see the stray count increment + self.wait_until_equal( + lambda: self.get_mdc_stat("num_strays"), + expect_val=1, timeout=60, reject_fn=lambda x: x > 1) + + # See that while the stray count has incremented, none have passed + # on to the purge queue + self.assertEqual(self.get_mdc_stat("strays_created"), 1) + self.assertEqual(self.get_mdc_stat("strays_enqueued"), 0) + + # See that the client still holds 2 caps + self.assertEqual(self.get_session(mount_a_client_id)['num_caps'], 2) + + # See that the data objects remain in the data pool + self.assertTrue(self.fs.data_objects_present(open_file_ino, size_mb * 1024 * 1024)) + + # Now close the file + self.mount_a.kill_background(p) + + # Wait to see the client cap count decrement + self.wait_until_equal( + lambda: self.get_session(mount_a_client_id)['num_caps'], + expect_val=1, timeout=60, reject_fn=lambda x: x > 2 or x < 1 + ) + # Wait to see the purge counter increment, stray count go to zero + self._wait_for_counter("mds_cache", "strays_enqueued", 1) + self.wait_until_equal( + lambda: self.get_mdc_stat("num_strays"), + expect_val=0, timeout=6, reject_fn=lambda x: x > 1 + ) + self._wait_for_counter("purge_queue", "pq_executed", 1) + + # See that the data objects no longer exist + self.assertTrue(self.fs.data_objects_absent(open_file_ino, size_mb * 1024 * 1024)) + + self.await_data_pool_empty() + + def test_hardlink_reintegration(self): + """ + That removal of primary dentry of hardlinked inode results + in reintegration of inode into the previously-remote dentry, + rather than lingering as a stray indefinitely. + """ + # Write some bytes to file_a + size_mb = 8 + self.mount_a.run_shell(["mkdir", "dir_1"]) + self.mount_a.write_n_mb("dir_1/file_a", size_mb) + ino = self.mount_a.path_to_ino("dir_1/file_a") + + # Create a hardlink named file_b + self.mount_a.run_shell(["mkdir", "dir_2"]) + self.mount_a.run_shell(["ln", "dir_1/file_a", "dir_2/file_b"]) + self.assertEqual(self.mount_a.path_to_ino("dir_2/file_b"), ino) + + # Flush journal + self.fs.mds_asok(['flush', 'journal']) + + # See that backtrace for the file points to the file_a path + pre_unlink_bt = self.fs.read_backtrace(ino) + self.assertEqual(pre_unlink_bt['ancestors'][0]['dname'], "file_a") + + # empty mds cache. otherwise mds reintegrates stray when unlink finishes + self.mount_a.umount_wait() + self.fs.mds_asok(['flush', 'journal']) + self.fs.mds_fail_restart() + self.fs.wait_for_daemons() + self.mount_a.mount() + + # Unlink file_a + self.mount_a.run_shell(["rm", "-f", "dir_1/file_a"]) + + # See that a stray was created + self.assertEqual(self.get_mdc_stat("num_strays"), 1) + self.assertEqual(self.get_mdc_stat("strays_created"), 1) + + # Wait, see that data objects are still present (i.e. 
that the + # stray did not advance to purging given time) + time.sleep(30) + self.assertTrue(self.fs.data_objects_present(ino, size_mb * 1024 * 1024)) + self.assertEqual(self.get_mdc_stat("strays_enqueued"), 0) + + # See that before reintegration, the inode's backtrace points to a stray dir + self.fs.mds_asok(['flush', 'journal']) + self.assertTrue(self.get_backtrace_path(ino).startswith("stray")) + + last_reintegrated = self.get_mdc_stat("strays_reintegrated") + + # Do a metadata operation on the remaining link (mv is heavy handed, but + # others like touch may be satisfied from caps without poking MDS) + self.mount_a.run_shell(["mv", "dir_2/file_b", "dir_2/file_c"]) + + # Stray reintegration should happen as a result of the eval_remote call + # on responding to a client request. + self.wait_until_equal( + lambda: self.get_mdc_stat("num_strays"), + expect_val=0, + timeout=60 + ) + + # See the reintegration counter increment + curr_reintegrated = self.get_mdc_stat("strays_reintegrated") + self.assertGreater(curr_reintegrated, last_reintegrated) + last_reintegrated = curr_reintegrated + + # Flush the journal + self.fs.mds_asok(['flush', 'journal']) + + # See that the backtrace for the file points to the remaining link's path + post_reint_bt = self.fs.read_backtrace(ino) + self.assertEqual(post_reint_bt['ancestors'][0]['dname'], "file_c") + + # mds should reintegrates stray when unlink finishes + self.mount_a.run_shell(["ln", "dir_2/file_c", "dir_2/file_d"]) + self.mount_a.run_shell(["rm", "-f", "dir_2/file_c"]) + + # Stray reintegration should happen as a result of the notify_stray call + # on completion of unlink + self.wait_until_equal( + lambda: self.get_mdc_stat("num_strays"), + expect_val=0, + timeout=60 + ) + + # See the reintegration counter increment + curr_reintegrated = self.get_mdc_stat("strays_reintegrated") + self.assertGreater(curr_reintegrated, last_reintegrated) + last_reintegrated = curr_reintegrated + + # Flush the journal + self.fs.mds_asok(['flush', 'journal']) + + # See that the backtrace for the file points to the newest link's path + post_reint_bt = self.fs.read_backtrace(ino) + self.assertEqual(post_reint_bt['ancestors'][0]['dname'], "file_d") + + # Now really delete it + self.mount_a.run_shell(["rm", "-f", "dir_2/file_d"]) + self._wait_for_counter("mds_cache", "strays_enqueued", 1) + self._wait_for_counter("purge_queue", "pq_executed", 1) + + self.assert_purge_idle() + self.assertTrue(self.fs.data_objects_absent(ino, size_mb * 1024 * 1024)) + + # We caused the inode to go stray 3 times + self.assertEqual(self.get_mdc_stat("strays_created"), 3) + # We purged it at the last + self.assertEqual(self.get_mdc_stat("strays_enqueued"), 1) + + def test_mv_hardlink_cleanup(self): + """ + That when doing a rename from A to B, and B has hardlinks, + then we make a stray for B which is then reintegrated + into one of his hardlinks. 
+ """ + # Create file_a, file_b, and a hardlink to file_b + size_mb = 8 + self.mount_a.write_n_mb("file_a", size_mb) + file_a_ino = self.mount_a.path_to_ino("file_a") + + self.mount_a.write_n_mb("file_b", size_mb) + file_b_ino = self.mount_a.path_to_ino("file_b") + + self.mount_a.run_shell(["ln", "file_b", "linkto_b"]) + self.assertEqual(self.mount_a.path_to_ino("linkto_b"), file_b_ino) + + # mv file_a file_b + self.mount_a.run_shell(["mv", "file_a", "file_b"]) + + # Stray reintegration should happen as a result of the notify_stray call on + # completion of rename + self.wait_until_equal( + lambda: self.get_mdc_stat("num_strays"), + expect_val=0, + timeout=60 + ) + + self.assertEqual(self.get_mdc_stat("strays_created"), 1) + self.assertGreaterEqual(self.get_mdc_stat("strays_reintegrated"), 1) + + # No data objects should have been deleted, as both files still have linkage. + self.assertTrue(self.fs.data_objects_present(file_a_ino, size_mb * 1024 * 1024)) + self.assertTrue(self.fs.data_objects_present(file_b_ino, size_mb * 1024 * 1024)) + + self.fs.mds_asok(['flush', 'journal']) + + post_reint_bt = self.fs.read_backtrace(file_b_ino) + self.assertEqual(post_reint_bt['ancestors'][0]['dname'], "linkto_b") + + def _setup_two_ranks(self): + # Set up two MDSs + self.fs.set_max_mds(2) + + # See that we have two active MDSs + self.wait_until_equal(lambda: len(self.fs.get_active_names()), 2, 30, + reject_fn=lambda v: v > 2 or v < 1) + + active_mds_names = self.fs.get_active_names() + rank_0_id = active_mds_names[0] + rank_1_id = active_mds_names[1] + log.info("Ranks 0 and 1 are {0} and {1}".format( + rank_0_id, rank_1_id)) + + # Get rid of other MDS daemons so that it's easier to know which + # daemons to expect in which ranks after restarts + for unneeded_mds in set(self.mds_cluster.mds_ids) - {rank_0_id, rank_1_id}: + self.mds_cluster.mds_stop(unneeded_mds) + self.mds_cluster.mds_fail(unneeded_mds) + + return rank_0_id, rank_1_id + + def _force_migrate(self, to_id, path, watch_ino): + """ + :param to_id: MDS id to move it to + :param path: Filesystem path (string) to move + :param watch_ino: Inode number to look for at destination to confirm move + :return: None + """ + self.mount_a.run_shell(["setfattr", "-n", "ceph.dir.pin", "-v", "1", path]) + + # Poll the MDS cache dump to watch for the export completing + migrated = False + migrate_timeout = 60 + migrate_elapsed = 0 + while not migrated: + data = self.fs.mds_asok(["dump", "cache"], to_id) + for inode_data in data: + if inode_data['ino'] == watch_ino: + log.debug("Found ino in cache: {0}".format(json.dumps(inode_data, indent=2))) + if inode_data['is_auth'] is True: + migrated = True + break + + if not migrated: + if migrate_elapsed > migrate_timeout: + raise RuntimeError("Migration hasn't happened after {0}s!".format(migrate_elapsed)) + else: + migrate_elapsed += 1 + time.sleep(1) + + def _is_stopped(self, rank): + mds_map = self.fs.get_mds_map() + return rank not in [i['rank'] for i in mds_map['info'].values()] + + def test_purge_on_shutdown(self): + """ + That when an MDS rank is shut down, its purge queue is + drained in the process. 
+ """ + rank_0_id, rank_1_id = self._setup_two_ranks() + + self.set_conf("mds.{0}".format(rank_1_id), 'mds_max_purge_files', "0") + self.mds_cluster.mds_fail_restart(rank_1_id) + self.fs.wait_for_daemons() + + file_count = 5 + + self.mount_a.create_n_files("delete_me/file", file_count) + + self._force_migrate(rank_1_id, "delete_me", + self.mount_a.path_to_ino("delete_me/file_0")) + + self.mount_a.run_shell(["rm", "-rf", Raw("delete_me/*")]) + self.mount_a.umount_wait() + + # See all the strays go into purge queue + self._wait_for_counter("mds_cache", "strays_created", file_count, mds_id=rank_1_id) + self._wait_for_counter("mds_cache", "strays_enqueued", file_count, mds_id=rank_1_id) + self.assertEqual(self.get_stat("mds_cache", "num_strays", mds_id=rank_1_id), 0) + + # See nothing get purged from the purge queue (yet) + time.sleep(10) + self.assertEqual(self.get_stat("purge_queue", "pq_executed", mds_id=rank_1_id), 0) + + # Shut down rank 1 + self.fs.set_max_mds(1) + self.fs.deactivate(1) + + # It shouldn't proceed past stopping because its still not allowed + # to purge + time.sleep(10) + self.assertEqual(self.get_stat("purge_queue", "pq_executed", mds_id=rank_1_id), 0) + self.assertFalse(self._is_stopped(1)) + + # Permit the daemon to start purging again + self.fs.mon_manager.raw_cluster_cmd('tell', 'mds.{0}'.format(rank_1_id), + 'injectargs', + "--mds_max_purge_files 100") + + # It should now proceed through shutdown + self.wait_until_true( + lambda: self._is_stopped(1), + timeout=60 + ) + + # ...and in the process purge all that data + self.await_data_pool_empty() + + def test_migration_on_shutdown(self): + """ + That when an MDS rank is shut down, any non-purgeable strays + get migrated to another rank. + """ + + rank_0_id, rank_1_id = self._setup_two_ranks() + + # Create a non-purgeable stray in a ~mds1 stray directory + # by doing a hard link and deleting the original file + self.mount_a.run_shell(["mkdir", "dir_1", "dir_2"]) + self.mount_a.run_shell(["touch", "dir_1/original"]) + self.mount_a.run_shell(["ln", "dir_1/original", "dir_2/linkto"]) + + self._force_migrate(rank_1_id, "dir_1", + self.mount_a.path_to_ino("dir_1/original")) + + # empty mds cache. otherwise mds reintegrates stray when unlink finishes + self.mount_a.umount_wait() + self.fs.mds_asok(['flush', 'journal'], rank_0_id) + self.fs.mds_asok(['flush', 'journal'], rank_1_id) + self.fs.mds_fail_restart() + self.fs.wait_for_daemons() + + active_mds_names = self.fs.get_active_names() + rank_0_id = active_mds_names[0] + rank_1_id = active_mds_names[1] + + self.mount_a.mount() + + self.mount_a.run_shell(["rm", "-f", "dir_1/original"]) + self.mount_a.umount_wait() + + self._wait_for_counter("mds_cache", "strays_created", 1, + mds_id=rank_1_id) + + # Shut down rank 1 + self.fs.mon_manager.raw_cluster_cmd_result('mds', 'set', "max_mds", "1") + self.fs.mon_manager.raw_cluster_cmd_result('mds', 'deactivate', "1") + + # Wait til we get to a single active MDS mdsmap state + self.wait_until_true(lambda: self._is_stopped(1), timeout=120) + + # See that the stray counter on rank 0 has incremented + self.assertEqual(self.get_mdc_stat("strays_created", rank_0_id), 1) + + def assert_backtrace(self, ino, expected_path): + """ + Assert that the backtrace in the data pool for an inode matches + an expected /foo/bar path. 
+ """ + expected_elements = expected_path.strip("/").split("/") + bt = self.fs.read_backtrace(ino) + actual_elements = list(reversed([dn['dname'] for dn in bt['ancestors']])) + self.assertListEqual(expected_elements, actual_elements) + + def get_backtrace_path(self, ino): + bt = self.fs.read_backtrace(ino) + elements = reversed([dn['dname'] for dn in bt['ancestors']]) + return "/".join(elements) + + def assert_purge_idle(self): + """ + Assert that the MDS perf counters indicate no strays exist and + no ongoing purge activity. Sanity check for when PurgeQueue should + be idle. + """ + mdc_stats = self.fs.mds_asok(['perf', 'dump', "mds_cache"])['mds_cache'] + pq_stats = self.fs.mds_asok(['perf', 'dump', "purge_queue"])['purge_queue'] + self.assertEqual(mdc_stats["num_strays"], 0) + self.assertEqual(mdc_stats["num_strays_delayed"], 0) + self.assertEqual(pq_stats["pq_executing"], 0) + self.assertEqual(pq_stats["pq_executing_ops"], 0) + + def test_mv_cleanup(self): + """ + That when doing a rename from A to B, and B has no hardlinks, + then we make a stray for B and purge him. + """ + # Create file_a and file_b, write some to both + size_mb = 8 + self.mount_a.write_n_mb("file_a", size_mb) + file_a_ino = self.mount_a.path_to_ino("file_a") + self.mount_a.write_n_mb("file_b", size_mb) + file_b_ino = self.mount_a.path_to_ino("file_b") + + self.fs.mds_asok(['flush', 'journal']) + self.assert_backtrace(file_a_ino, "file_a") + self.assert_backtrace(file_b_ino, "file_b") + + # mv file_a file_b + self.mount_a.run_shell(['mv', 'file_a', 'file_b']) + + # See that stray counter increments + self.assertEqual(self.get_mdc_stat("strays_created"), 1) + # Wait for purge counter to increment + self._wait_for_counter("mds_cache", "strays_enqueued", 1) + self._wait_for_counter("purge_queue", "pq_executed", 1) + + self.assert_purge_idle() + + # file_b should have been purged + self.assertTrue(self.fs.data_objects_absent(file_b_ino, size_mb * 1024 * 1024)) + + # Backtrace should have updated from file_a to file_b + self.fs.mds_asok(['flush', 'journal']) + self.assert_backtrace(file_a_ino, "file_b") + + # file_a's data should still exist + self.assertTrue(self.fs.data_objects_present(file_a_ino, size_mb * 1024 * 1024)) + + def _pool_df(self, pool_name): + """ + Return a dict like + { + "kb_used": 0, + "bytes_used": 0, + "max_avail": 19630292406, + "objects": 0 + } + + :param pool_name: Which pool (must exist) + """ + out = self.fs.mon_manager.raw_cluster_cmd("df", "--format=json-pretty") + for p in json.loads(out)['pools']: + if p['name'] == pool_name: + return p['stats'] + + raise RuntimeError("Pool '{0}' not found".format(pool_name)) + + def await_data_pool_empty(self): + self.wait_until_true( + lambda: self._pool_df( + self.fs.get_data_pool_name() + )['objects'] == 0, + timeout=60) + + def test_snapshot_remove(self): + """ + That removal of a snapshot that references a now-unlinked file results + in purging on the stray for the file. 
+ """ + # Enable snapshots + self.fs.mon_manager.raw_cluster_cmd("mds", "set", "allow_new_snaps", "true", + "--yes-i-really-mean-it") + + # Create a dir with a file in it + size_mb = 8 + self.mount_a.run_shell(["mkdir", "snapdir"]) + self.mount_a.run_shell(["mkdir", "snapdir/subdir"]) + self.mount_a.write_test_pattern("snapdir/subdir/file_a", size_mb * 1024 * 1024) + file_a_ino = self.mount_a.path_to_ino("snapdir/subdir/file_a") + + # Snapshot the dir + self.mount_a.run_shell(["mkdir", "snapdir/.snap/snap1"]) + + # Cause the head revision to deviate from the snapshot + self.mount_a.write_n_mb("snapdir/subdir/file_a", size_mb) + + # Flush the journal so that backtraces, dirfrag objects will actually be written + self.fs.mds_asok(["flush", "journal"]) + + # Unlink the file + self.mount_a.run_shell(["rm", "-f", "snapdir/subdir/file_a"]) + self.mount_a.run_shell(["rmdir", "snapdir/subdir"]) + + # Unmount the client because when I come back to check the data is still + # in the file I don't want to just see what's in the page cache. + self.mount_a.umount_wait() + + self.assertEqual(self.get_mdc_stat("strays_created"), 2) + + # FIXME: at this stage we see a purge and the stray count drops to + # zero, but there's actually still a stray, so at the very + # least the StrayManager stats code is slightly off + + self.mount_a.mount() + + # See that the data from the snapshotted revision of the file is still present + # and correct + self.mount_a.validate_test_pattern("snapdir/.snap/snap1/subdir/file_a", size_mb * 1024 * 1024) + + # Remove the snapshot + self.mount_a.run_shell(["rmdir", "snapdir/.snap/snap1"]) + + # Purging file_a doesn't happen until after we've flushed the journal, because + # it is referenced by the snapshotted subdir, and the snapshot isn't really + # gone until the journal references to it are gone + self.fs.mds_asok(["flush", "journal"]) + + # Wait for purging to complete, which requires the OSDMap to propagate to the OSDs. + # See also: http://tracker.ceph.com/issues/20072 + self.wait_until_true( + lambda: self.fs.data_objects_absent(file_a_ino, size_mb * 1024 * 1024), + timeout=60 + ) + + # See that a purge happens now + self._wait_for_counter("mds_cache", "strays_enqueued", 2) + self._wait_for_counter("purge_queue", "pq_executed", 2) + + self.await_data_pool_empty() + + def test_fancy_layout(self): + """ + purge stray file with fancy layout + """ + + file_name = "fancy_layout_file" + self.mount_a.run_shell(["touch", file_name]) + + file_layout = "stripe_unit=1048576 stripe_count=4 object_size=8388608" + self.mount_a.setfattr(file_name, "ceph.file.layout", file_layout) + + # 35MB requires 7 objects + size_mb = 35 + self.mount_a.write_n_mb(file_name, size_mb) + + self.mount_a.run_shell(["rm", "-f", file_name]) + self.fs.mds_asok(["flush", "journal"]) + + # can't use self.fs.data_objects_absent here, it does not support fancy layout + self.await_data_pool_empty() + + def test_dirfrag_limit(self): + """ + That the directory fragment size cannot exceed mds_bal_fragment_size_max (using a limit of 50 in all configurations). + + That fragmentation (forced) will allow more entries to be created. + + That unlinking fails when the stray directory fragment becomes too large and that unlinking may continue once those strays are purged. 
+ """ + + self.fs.set_allow_dirfrags(True) + + LOW_LIMIT = 50 + for mds in self.fs.get_daemon_names(): + self.fs.mds_asok(["config", "set", "mds_bal_fragment_size_max", str(LOW_LIMIT)], mds) + + try: + self.mount_a.run_python(dedent(""" + import os + path = os.path.join("{path}", "subdir") + os.mkdir(path) + for n in range(0, {file_count}): + open(os.path.join(path, "%s" % n), 'w').write("%s" % n) + """.format( + path=self.mount_a.mountpoint, + file_count=LOW_LIMIT+1 + ))) + except CommandFailedError: + pass # ENOSPAC + else: + raise RuntimeError("fragment size exceeded") + + # Now test that we can go beyond the limit if we fragment the directory + + self.mount_a.run_python(dedent(""" + import os + path = os.path.join("{path}", "subdir2") + os.mkdir(path) + for n in range(0, {file_count}): + open(os.path.join(path, "%s" % n), 'w').write("%s" % n) + dfd = os.open(path, os.O_DIRECTORY) + os.fsync(dfd) + """.format( + path=self.mount_a.mountpoint, + file_count=LOW_LIMIT + ))) + + # Ensure that subdir2 is fragmented + mds_id = self.fs.get_active_names()[0] + self.fs.mds_asok(["dirfrag", "split", "/subdir2", "0/0", "1"], mds_id) + + # remount+flush (release client caps) + self.mount_a.umount_wait() + self.fs.mds_asok(["flush", "journal"], mds_id) + self.mount_a.mount() + self.mount_a.wait_until_mounted() + + # Create 50% more files than the current fragment limit + self.mount_a.run_python(dedent(""" + import os + path = os.path.join("{path}", "subdir2") + for n in range({file_count}, ({file_count}*3)//2): + open(os.path.join(path, "%s" % n), 'w').write("%s" % n) + """.format( + path=self.mount_a.mountpoint, + file_count=LOW_LIMIT + ))) + + # Now test the stray directory size is limited and recovers + strays_before = self.get_mdc_stat("strays_created") + try: + self.mount_a.run_python(dedent(""" + import os + path = os.path.join("{path}", "subdir3") + os.mkdir(path) + for n in range({file_count}): + fpath = os.path.join(path, "%s" % n) + f = open(fpath, 'w') + f.write("%s" % n) + f.close() + os.unlink(fpath) + """.format( + path=self.mount_a.mountpoint, + file_count=LOW_LIMIT*10 # 10 stray directories, should collide before this count + ))) + except CommandFailedError: + pass # ENOSPAC + else: + raise RuntimeError("fragment size exceeded") + + strays_after = self.get_mdc_stat("strays_created") + self.assertGreaterEqual(strays_after-strays_before, LOW_LIMIT) + + self._wait_for_counter("mds_cache", "strays_enqueued", strays_after) + self._wait_for_counter("purge_queue", "pq_executed", strays_after) + + self.mount_a.run_python(dedent(""" + import os + path = os.path.join("{path}", "subdir4") + os.mkdir(path) + for n in range({file_count}): + fpath = os.path.join(path, "%s" % n) + f = open(fpath, 'w') + f.write("%s" % n) + f.close() + os.unlink(fpath) + """.format( + path=self.mount_a.mountpoint, + file_count=LOW_LIMIT + ))) + + def test_purge_queue_upgrade(self): + """ + That when starting on a system with no purge queue in the metadata + pool, we silently create one. + :return: + """ + + self.mds_cluster.mds_stop() + self.mds_cluster.mds_fail() + self.fs.rados(["rm", "500.00000000"]) + self.mds_cluster.mds_restart() + self.fs.wait_for_daemons() + + def test_purge_queue_op_rate(self): + """ + A busy purge queue is meant to aggregate operations sufficiently + that our RADOS ops to the metadata pool are not O(files). Check + that that is so. 
+ :return: + """ + + # For low rates of deletion, the rate of metadata ops actually + # will be o(files), so to see the desired behaviour we have to give + # the system a significant quantity, i.e. an order of magnitude + # more than the number of files it will purge at one time. + + max_purge_files = 2 + + self.set_conf('mds', 'mds_bal_frag', 'false') + self.set_conf('mds', 'mds_max_purge_files', "%d" % max_purge_files) + self.fs.mds_fail_restart() + self.fs.wait_for_daemons() + + phase_1_files = 256 + phase_2_files = 512 + + self.mount_a.run_shell(["mkdir", "phase1"]) + self.mount_a.create_n_files("phase1/file", phase_1_files) + + self.mount_a.run_shell(["mkdir", "phase2"]) + self.mount_a.create_n_files("phase2/file", phase_2_files) + + def unlink_and_count_ops(path, expected_deletions): + initial_ops = self.get_stat("objecter", "op") + initial_pq_executed = self.get_stat("purge_queue", "pq_executed") + + self.mount_a.run_shell(["rm", "-rf", path]) + + self._wait_for_counter( + "purge_queue", "pq_executed", initial_pq_executed + expected_deletions + ) + + final_ops = self.get_stat("objecter", "op") + + # Calculation of the *overhead* operations, i.e. do not include + # the operations where we actually delete files. + return final_ops - initial_ops - expected_deletions + + self.fs.mds_asok(['flush', 'journal']) + phase1_ops = unlink_and_count_ops("phase1/", phase_1_files + 1) + + self.fs.mds_asok(['flush', 'journal']) + phase2_ops = unlink_and_count_ops("phase2/", phase_2_files + 1) + + log.info("Phase 1: {0}".format(phase1_ops)) + log.info("Phase 2: {0}".format(phase2_ops)) + + # The success criterion is that deleting double the number + # of files doesn't generate double the number of overhead ops + # -- this comparison is a rough approximation of that rule. + self.assertTrue(phase2_ops < phase1_ops * 1.25) + + # Finally, check that our activity did include properly quiescing + # the queue (i.e. call to Journaler::write_head in the right place), + # by restarting the MDS and checking that it doesn't try re-executing + # any of the work we did. + self.fs.mds_asok(['flush', 'journal']) # flush to ensure no strays + # hanging around + self.fs.mds_fail_restart() + self.fs.wait_for_daemons() + time.sleep(10) + self.assertEqual(self.get_stat("purge_queue", "pq_executed"), 0) + + def test_replicated_delete_speed(self): + """ + That deletions of replicated metadata are not pathologically slow + """ + rank_0_id, rank_1_id = self._setup_two_ranks() + + self.set_conf("mds.{0}".format(rank_1_id), 'mds_max_purge_files', "0") + self.mds_cluster.mds_fail_restart(rank_1_id) + self.fs.wait_for_daemons() + + file_count = 10 + + self.mount_a.create_n_files("delete_me/file", file_count) + + self._force_migrate(rank_1_id, "delete_me", + self.mount_a.path_to_ino("delete_me/file_0")) + + begin = datetime.datetime.now() + self.mount_a.run_shell(["rm", "-rf", Raw("delete_me/*")]) + end = datetime.datetime.now() + + # What we're really checking here is that we are completing client + # operations immediately rather than delaying until the next tick. 
+ tick_period = float(self.fs.get_config("mds_tick_interval", + service_type="mds")) + + duration = (end - begin).total_seconds() + self.assertLess(duration, (file_count * tick_period) * 0.25) + diff --git a/src/ceph/qa/tasks/cephfs/test_volume_client.py b/src/ceph/qa/tasks/cephfs/test_volume_client.py new file mode 100644 index 0000000..0876af9 --- /dev/null +++ b/src/ceph/qa/tasks/cephfs/test_volume_client.py @@ -0,0 +1,1016 @@ +import json +import logging +import time +import os +from textwrap import dedent +from tasks.cephfs.cephfs_test_case import CephFSTestCase +from tasks.cephfs.fuse_mount import FuseMount +from teuthology.exceptions import CommandFailedError + +log = logging.getLogger(__name__) + + +class TestVolumeClient(CephFSTestCase): + # One for looking at the global filesystem, one for being + # the VolumeClient, two for mounting the created shares + CLIENTS_REQUIRED = 4 + + def _volume_client_python(self, client, script, vol_prefix=None, ns_prefix=None): + # Can't dedent this *and* the script we pass in, because they might have different + # levels of indentation to begin with, so leave this string zero-indented + if vol_prefix: + vol_prefix = "\"" + vol_prefix + "\"" + if ns_prefix: + ns_prefix = "\"" + ns_prefix + "\"" + return client.run_python(""" +from ceph_volume_client import CephFSVolumeClient, VolumePath +import logging +log = logging.getLogger("ceph_volume_client") +log.addHandler(logging.StreamHandler()) +log.setLevel(logging.DEBUG) +vc = CephFSVolumeClient("manila", "{conf_path}", "ceph", {vol_prefix}, {ns_prefix}) +vc.connect() +{payload} +vc.disconnect() + """.format(payload=script, conf_path=client.config_path, vol_prefix=vol_prefix, ns_prefix=ns_prefix)) + + def _sudo_write_file(self, remote, path, data): + """ + Write data to a remote file as super user + + :param remote: Remote site. + :param path: Path on the remote being written to. + :param data: Data to be written. + + Both perms and owner are passed directly to chmod. + """ + remote.run( + args=[ + 'sudo', + 'python', + '-c', + 'import shutil, sys; shutil.copyfileobj(sys.stdin, file(sys.argv[1], "wb"))', + path, + ], + stdin=data, + ) + + def _configure_vc_auth(self, mount, id_name): + """ + Set up auth credentials for the VolumeClient user + """ + out = self.fs.mon_manager.raw_cluster_cmd( + "auth", "get-or-create", "client.{name}".format(name=id_name), + "mds", "allow *", + "osd", "allow rw", + "mon", "allow *" + ) + mount.client_id = id_name + self._sudo_write_file(mount.client_remote, mount.get_keyring_path(), out) + self.set_conf("client.{name}".format(name=id_name), "keyring", mount.get_keyring_path()) + + def _configure_guest_auth(self, volumeclient_mount, guest_mount, + guest_entity, mount_path, + namespace_prefix=None, readonly=False, + tenant_id=None): + """ + Set up auth credentials for the guest client to mount a volume. + + :param volumeclient_mount: mount used as the handle for driving + volumeclient. + :param guest_mount: mount used by the guest client. + :param guest_entity: auth ID used by the guest client. + :param mount_path: path of the volume. + :param namespace_prefix: name prefix of the RADOS namespace, which + is used for the volume's layout. + :param readonly: defaults to False. If set to 'True' only read-only + mount access is granted to the guest. + :param tenant_id: (OpenStack) tenant ID of the guest client. 
+ """ + + head, volume_id = os.path.split(mount_path) + head, group_id = os.path.split(head) + head, volume_prefix = os.path.split(head) + volume_prefix = "/" + volume_prefix + + # Authorize the guest client's auth ID to mount the volume. + key = self._volume_client_python(volumeclient_mount, dedent(""" + vp = VolumePath("{group_id}", "{volume_id}") + auth_result = vc.authorize(vp, "{guest_entity}", readonly={readonly}, + tenant_id="{tenant_id}") + print auth_result['auth_key'] + """.format( + group_id=group_id, + volume_id=volume_id, + guest_entity=guest_entity, + readonly=readonly, + tenant_id=tenant_id)), volume_prefix, namespace_prefix + ) + + # CephFSVolumeClient's authorize() does not return the secret + # key to a caller who isn't multi-tenant aware. Explicitly + # query the key for such a client. + if not tenant_id: + key = self.fs.mon_manager.raw_cluster_cmd( + "auth", "get-key", "client.{name}".format(name=guest_entity), + ) + + # The guest auth ID should exist. + existing_ids = [a['entity'] for a in self.auth_list()] + self.assertIn("client.{0}".format(guest_entity), existing_ids) + + # Create keyring file for the guest client. + keyring_txt = dedent(""" + [client.{guest_entity}] + key = {key} + + """.format( + guest_entity=guest_entity, + key=key + )) + guest_mount.client_id = guest_entity + self._sudo_write_file(guest_mount.client_remote, + guest_mount.get_keyring_path(), + keyring_txt) + + # Add a guest client section to the ceph config file. + self.set_conf("client.{0}".format(guest_entity), "client quota", "True") + self.set_conf("client.{0}".format(guest_entity), "debug client", "20") + self.set_conf("client.{0}".format(guest_entity), "debug objecter", "20") + self.set_conf("client.{0}".format(guest_entity), + "keyring", guest_mount.get_keyring_path()) + + def test_default_prefix(self): + group_id = "grpid" + volume_id = "volid" + DEFAULT_VOL_PREFIX = "volumes" + DEFAULT_NS_PREFIX = "fsvolumens_" + + self.mount_b.umount_wait() + self._configure_vc_auth(self.mount_b, "manila") + + #create a volume with default prefix + self._volume_client_python(self.mount_b, dedent(""" + vp = VolumePath("{group_id}", "{volume_id}") + vc.create_volume(vp, 10, data_isolated=True) + """.format( + group_id=group_id, + volume_id=volume_id, + ))) + + # The dir should be created + self.mount_a.stat(os.path.join(DEFAULT_VOL_PREFIX, group_id, volume_id)) + + #namespace should be set + ns_in_attr = self.mount_a.getfattr(os.path.join(DEFAULT_VOL_PREFIX, group_id, volume_id), "ceph.dir.layout.pool_namespace") + namespace = "{0}{1}".format(DEFAULT_NS_PREFIX, volume_id) + self.assertEqual(namespace, ns_in_attr) + + + def test_lifecycle(self): + """ + General smoke test for create, extend, destroy + """ + + # I'm going to use mount_c later as a guest for mounting the created + # shares + self.mounts[2].umount_wait() + + # I'm going to leave mount_b unmounted and just use it as a handle for + # driving volumeclient. It's a little hacky but we don't have a more + # general concept for librados/libcephfs clients as opposed to full + # blown mounting clients. 
+ self.mount_b.umount_wait() + self._configure_vc_auth(self.mount_b, "manila") + + guest_entity = "guest" + group_id = "grpid" + volume_id = "volid" + + volume_prefix = "/myprefix" + namespace_prefix = "mynsprefix_" + + # Create a 100MB volume + volume_size = 100 + mount_path = self._volume_client_python(self.mount_b, dedent(""" + vp = VolumePath("{group_id}", "{volume_id}") + create_result = vc.create_volume(vp, 1024*1024*{volume_size}) + print create_result['mount_path'] + """.format( + group_id=group_id, + volume_id=volume_id, + volume_size=volume_size + )), volume_prefix, namespace_prefix) + + # The dir should be created + self.mount_a.stat(os.path.join("myprefix", group_id, volume_id)) + + # Authorize and configure credentials for the guest to mount the + # the volume. + self._configure_guest_auth(self.mount_b, self.mounts[2], guest_entity, + mount_path, namespace_prefix) + self.mounts[2].mount(mount_path=mount_path) + + # The kernel client doesn't have the quota-based df behaviour, + # or quotas at all, so only exercise the client behaviour when + # running fuse. + if isinstance(self.mounts[2], FuseMount): + # df should see volume size, same as the quota set on volume's dir + self.assertEqual(self.mounts[2].df()['total'], + volume_size * 1024 * 1024) + self.assertEqual( + self.mount_a.getfattr( + os.path.join(volume_prefix.strip("/"), group_id, volume_id), + "ceph.quota.max_bytes"), + "%s" % (volume_size * 1024 * 1024)) + + # df granularity is 4MB block so have to write at least that much + data_bin_mb = 4 + self.mounts[2].write_n_mb("data.bin", data_bin_mb) + + # Write something outside volume to check this space usage is + # not reported in the volume's DF. + other_bin_mb = 8 + self.mount_a.write_n_mb("other.bin", other_bin_mb) + + # global: df should see all the writes (data + other). 
This is a > + # rather than a == because the global spaced used includes all pools + def check_df(): + used = self.mount_a.df()['used'] + return used >= (other_bin_mb * 1024 * 1024) + + self.wait_until_true(check_df, timeout=30) + + # Hack: do a metadata IO to kick rstats + self.mounts[2].run_shell(["touch", "foo"]) + + # volume: df should see the data_bin_mb consumed from quota, same + # as the rbytes for the volume's dir + self.wait_until_equal( + lambda: self.mounts[2].df()['used'], + data_bin_mb * 1024 * 1024, timeout=60) + self.wait_until_equal( + lambda: self.mount_a.getfattr( + os.path.join(volume_prefix.strip("/"), group_id, volume_id), + "ceph.dir.rbytes"), + "%s" % (data_bin_mb * 1024 * 1024), timeout=60) + + # sync so that file data are persist to rados + self.mounts[2].run_shell(["sync"]) + + # Our data should stay in particular rados namespace + pool_name = self.mount_a.getfattr(os.path.join("myprefix", group_id, volume_id), "ceph.dir.layout.pool") + namespace = "{0}{1}".format(namespace_prefix, volume_id) + ns_in_attr = self.mount_a.getfattr(os.path.join("myprefix", group_id, volume_id), "ceph.dir.layout.pool_namespace") + self.assertEqual(namespace, ns_in_attr) + + objects_in_ns = set(self.fs.rados(["ls"], pool=pool_name, namespace=namespace).split("\n")) + self.assertNotEqual(objects_in_ns, set()) + + # De-authorize the guest + self._volume_client_python(self.mount_b, dedent(""" + vp = VolumePath("{group_id}", "{volume_id}") + vc.deauthorize(vp, "{guest_entity}") + vc.evict("{guest_entity}") + """.format( + group_id=group_id, + volume_id=volume_id, + guest_entity=guest_entity + )), volume_prefix, namespace_prefix) + + # Once deauthorized, the client should be unable to do any more metadata ops + # The way that the client currently behaves here is to block (it acts like + # it has lost network, because there is nothing to tell it that is messages + # are being dropped because it's identity is gone) + background = self.mounts[2].write_n_mb("rogue.bin", 1, wait=False) + time.sleep(10) # Approximate check for 'stuck' as 'still running after 10s' + self.assertFalse(background.finished) + + # After deauthorisation, the client ID should be gone (this was the only + # volume it was authorised for) + self.assertNotIn("client.{0}".format(guest_entity), [e['entity'] for e in self.auth_list()]) + + # Clean up the dead mount (ceph-fuse's behaviour here is a bit undefined) + self.mounts[2].kill() + self.mounts[2].kill_cleanup() + try: + background.wait() + except CommandFailedError: + # We killed the mount out from under you + pass + + self._volume_client_python(self.mount_b, dedent(""" + vp = VolumePath("{group_id}", "{volume_id}") + vc.delete_volume(vp) + vc.purge_volume(vp) + """.format( + group_id=group_id, + volume_id=volume_id, + )), volume_prefix, namespace_prefix) + + def test_idempotency(self): + """ + That the volumeclient interface works when calling everything twice + """ + self.mount_b.umount_wait() + self._configure_vc_auth(self.mount_b, "manila") + + guest_entity = "guest" + group_id = "grpid" + volume_id = "volid" + self._volume_client_python(self.mount_b, dedent(""" + vp = VolumePath("{group_id}", "{volume_id}") + vc.create_volume(vp, 10) + vc.create_volume(vp, 10) + vc.authorize(vp, "{guest_entity}") + vc.authorize(vp, "{guest_entity}") + vc.deauthorize(vp, "{guest_entity}") + vc.deauthorize(vp, "{guest_entity}") + vc.delete_volume(vp) + vc.delete_volume(vp) + vc.purge_volume(vp) + vc.purge_volume(vp) + + vc.create_volume(vp, 10, data_isolated=True) + 
vc.create_volume(vp, 10, data_isolated=True) + vc.authorize(vp, "{guest_entity}") + vc.authorize(vp, "{guest_entity}") + vc.deauthorize(vp, "{guest_entity}") + vc.deauthorize(vp, "{guest_entity}") + vc.evict("{guest_entity}") + vc.evict("{guest_entity}") + vc.delete_volume(vp, data_isolated=True) + vc.delete_volume(vp, data_isolated=True) + vc.purge_volume(vp, data_isolated=True) + vc.purge_volume(vp, data_isolated=True) + """.format( + group_id=group_id, + volume_id=volume_id, + guest_entity=guest_entity + ))) + + def test_data_isolated(self): + """ + That data isolated shares get their own pool + :return: + """ + + # Because the teuthology config template sets mon_max_pg_per_osd to + # 10000 (i.e. it just tries to ignore health warnings), reset it to something + # sane before using volume_client, to avoid creating pools with absurdly large + # numbers of PGs. + self.set_conf("global", "mon max pg per osd", "300") + for mon_daemon_state in self.ctx.daemons.iter_daemons_of_role('mon'): + mon_daemon_state.restart() + + self.mount_b.umount_wait() + self._configure_vc_auth(self.mount_b, "manila") + + # Calculate how many PGs we'll expect the new volume pool to have + osd_map = json.loads(self.fs.mon_manager.raw_cluster_cmd('osd', 'dump', '--format=json-pretty')) + max_per_osd = int(self.fs.get_config('mon_max_pg_per_osd')) + osd_count = len(osd_map['osds']) + max_overall = osd_count * max_per_osd + + existing_pg_count = 0 + for p in osd_map['pools']: + existing_pg_count += p['pg_num'] + + expected_pg_num = (max_overall - existing_pg_count) / 10 + log.info("max_per_osd {0}".format(max_per_osd)) + log.info("osd_count {0}".format(osd_count)) + log.info("max_overall {0}".format(max_overall)) + log.info("existing_pg_count {0}".format(existing_pg_count)) + log.info("expected_pg_num {0}".format(expected_pg_num)) + + pools_a = json.loads(self.fs.mon_manager.raw_cluster_cmd("osd", "dump", "--format=json-pretty"))['pools'] + + group_id = "grpid" + volume_id = "volid" + self._volume_client_python(self.mount_b, dedent(""" + vp = VolumePath("{group_id}", "{volume_id}") + vc.create_volume(vp, 10, data_isolated=True) + """.format( + group_id=group_id, + volume_id=volume_id, + ))) + + pools_b = json.loads(self.fs.mon_manager.raw_cluster_cmd("osd", "dump", "--format=json-pretty"))['pools'] + + # Should have created one new pool + new_pools = set(p['pool_name'] for p in pools_b) - set([p['pool_name'] for p in pools_a]) + self.assertEqual(len(new_pools), 1) + + # It should have followed the heuristic for PG count + # (this is an overly strict test condition, so we may want to remove + # it at some point as/when the logic gets fancier) + created_pg_num = self.fs.mon_manager.get_pool_property(list(new_pools)[0], "pg_num") + self.assertEqual(expected_pg_num, created_pg_num) + + def test_15303(self): + """ + Reproducer for #15303 "Client holds incorrect complete flag on dir + after losing caps" (http://tracker.ceph.com/issues/15303) + """ + for m in self.mounts: + m.umount_wait() + + # Create a dir on mount A + self.mount_a.mount() + self.mount_a.run_shell(["mkdir", "parent1"]) + self.mount_a.run_shell(["mkdir", "parent2"]) + self.mount_a.run_shell(["mkdir", "parent1/mydir"]) + + # Put some files in it from mount B + self.mount_b.mount() + self.mount_b.run_shell(["touch", "parent1/mydir/afile"]) + self.mount_b.umount_wait() + + # List the dir's contents on mount A + self.assertListEqual(self.mount_a.ls("parent1/mydir"), + ["afile"]) + + def test_evict_client(self): + """ + That a volume client can be evicted based 
on its auth ID and the volume + path it has mounted. + """ + + if not isinstance(self.mount_a, FuseMount): + self.skipTest("Requires FUSE client to inject client metadata") + + # mounts[1] would be used as handle for driving VolumeClient. mounts[2] + # and mounts[3] would be used as guests to mount the volumes/shares. + + for i in range(1, 4): + self.mounts[i].umount_wait() + + volumeclient_mount = self.mounts[1] + self._configure_vc_auth(volumeclient_mount, "manila") + guest_mounts = (self.mounts[2], self.mounts[3]) + + guest_entity = "guest" + group_id = "grpid" + mount_paths = [] + volume_ids = [] + + # Create two volumes. Authorize 'guest' auth ID to mount the two + # volumes. Mount the two volumes. Write data to the volumes. + for i in range(2): + # Create volume. + volume_ids.append("volid_{0}".format(str(i))) + mount_paths.append( + self._volume_client_python(volumeclient_mount, dedent(""" + vp = VolumePath("{group_id}", "{volume_id}") + create_result = vc.create_volume(vp, 10 * 1024 * 1024) + print create_result['mount_path'] + """.format( + group_id=group_id, + volume_id=volume_ids[i] + )))) + + # Authorize 'guest' auth ID to mount the volume. + self._configure_guest_auth(volumeclient_mount, guest_mounts[i], + guest_entity, mount_paths[i]) + + # Mount the volume. + guest_mounts[i].mountpoint_dir_name = 'mnt.{id}.{suffix}'.format( + id=guest_entity, suffix=str(i)) + guest_mounts[i].mount(mount_path=mount_paths[i]) + guest_mounts[i].write_n_mb("data.bin", 1) + + + # Evict client, guest_mounts[0], using auth ID 'guest' and has mounted + # one volume. + self._volume_client_python(self.mount_b, dedent(""" + vp = VolumePath("{group_id}", "{volume_id}") + vc.deauthorize(vp, "{guest_entity}") + vc.evict("{guest_entity}", volume_path=vp) + """.format( + group_id=group_id, + volume_id=volume_ids[0], + guest_entity=guest_entity + ))) + + # Evicted guest client, guest_mounts[0], should not be able to do + # anymore metadata ops. It should start failing all operations + # when it sees that its own address is in the blacklist. + try: + guest_mounts[0].write_n_mb("rogue.bin", 1) + except CommandFailedError: + pass + else: + raise RuntimeError("post-eviction write should have failed!") + + # The blacklisted guest client should now be unmountable + guest_mounts[0].umount_wait() + + # Guest client, guest_mounts[1], using the same auth ID 'guest', but + # has mounted the other volume, should be able to use its volume + # unaffected. + guest_mounts[1].write_n_mb("data.bin.1", 1) + + # Cleanup. + for i in range(2): + self._volume_client_python(volumeclient_mount, dedent(""" + vp = VolumePath("{group_id}", "{volume_id}") + vc.deauthorize(vp, "{guest_entity}") + vc.delete_volume(vp) + vc.purge_volume(vp) + """.format( + group_id=group_id, + volume_id=volume_ids[i], + guest_entity=guest_entity + ))) + + + def test_purge(self): + """ + Reproducer for #15266, exception trying to purge volumes that + contain non-ascii filenames. + + Additionally test any other purge corner cases here. + """ + # I'm going to leave mount_b unmounted and just use it as a handle for + # driving volumeclient. It's a little hacky but we don't have a more + # general concept for librados/libcephfs clients as opposed to full + # blown mounting clients. 
+ self.mount_b.umount_wait() + self._configure_vc_auth(self.mount_b, "manila") + + group_id = "grpid" + # Use a unicode volume ID (like Manila), to reproduce #15266 + volume_id = u"volid" + + # Create + mount_path = self._volume_client_python(self.mount_b, dedent(""" + vp = VolumePath("{group_id}", u"{volume_id}") + create_result = vc.create_volume(vp, 10) + print create_result['mount_path'] + """.format( + group_id=group_id, + volume_id=volume_id + ))) + + # Strip leading "/" + mount_path = mount_path[1:] + + # A file with non-ascii characters + self.mount_a.run_shell(["touch", os.path.join(mount_path, u"b\u00F6b")]) + + # A file with no permissions to do anything + self.mount_a.run_shell(["touch", os.path.join(mount_path, "noperms")]) + self.mount_a.run_shell(["chmod", "0000", os.path.join(mount_path, "noperms")]) + + self._volume_client_python(self.mount_b, dedent(""" + vp = VolumePath("{group_id}", u"{volume_id}") + vc.delete_volume(vp) + vc.purge_volume(vp) + """.format( + group_id=group_id, + volume_id=volume_id + ))) + + # Check it's really gone + self.assertEqual(self.mount_a.ls("volumes/_deleting"), []) + self.assertEqual(self.mount_a.ls("volumes/"), ["_deleting", group_id]) + + def test_readonly_authorization(self): + """ + That guest clients can be restricted to read-only mounts of volumes. + """ + + volumeclient_mount = self.mounts[1] + guest_mount = self.mounts[2] + volumeclient_mount.umount_wait() + guest_mount.umount_wait() + + # Configure volumeclient_mount as the handle for driving volumeclient. + self._configure_vc_auth(volumeclient_mount, "manila") + + guest_entity = "guest" + group_id = "grpid" + volume_id = "volid" + + # Create a volume. + mount_path = self._volume_client_python(volumeclient_mount, dedent(""" + vp = VolumePath("{group_id}", "{volume_id}") + create_result = vc.create_volume(vp, 1024*1024*10) + print create_result['mount_path'] + """.format( + group_id=group_id, + volume_id=volume_id, + ))) + + # Authorize and configure credentials for the guest to mount the + # the volume with read-write access. + self._configure_guest_auth(volumeclient_mount, guest_mount, guest_entity, + mount_path, readonly=False) + + # Mount the volume, and write to it. + guest_mount.mount(mount_path=mount_path) + guest_mount.write_n_mb("data.bin", 1) + + # Change the guest auth ID's authorization to read-only mount access. + self._volume_client_python(volumeclient_mount, dedent(""" + vp = VolumePath("{group_id}", "{volume_id}") + vc.deauthorize(vp, "{guest_entity}") + """.format( + group_id=group_id, + volume_id=volume_id, + guest_entity=guest_entity + ))) + self._configure_guest_auth(volumeclient_mount, guest_mount, guest_entity, + mount_path, readonly=True) + + # The effect of the change in access level to read-only is not + # immediate. The guest sees the change only after a remount of + # the volume. + guest_mount.umount_wait() + guest_mount.mount(mount_path=mount_path) + + # Read existing content of the volume. + self.assertListEqual(guest_mount.ls(guest_mount.mountpoint), ["data.bin"]) + # Cannot write into read-only volume. + with self.assertRaises(CommandFailedError): + guest_mount.write_n_mb("rogue.bin", 1) + + def test_get_authorized_ids(self): + """ + That for a volume, the authorized IDs and their access levels + can be obtained using CephFSVolumeClient's get_authorized_ids(). + """ + volumeclient_mount = self.mounts[1] + volumeclient_mount.umount_wait() + + # Configure volumeclient_mount as the handle for driving volumeclient. 
+ self._configure_vc_auth(volumeclient_mount, "manila") + + group_id = "grpid" + volume_id = "volid" + guest_entity_1 = "guest1" + guest_entity_2 = "guest2" + + log.info("print group ID: {0}".format(group_id)) + + # Create a volume. + auths = self._volume_client_python(volumeclient_mount, dedent(""" + vp = VolumePath("{group_id}", "{volume_id}") + vc.create_volume(vp, 1024*1024*10) + auths = vc.get_authorized_ids(vp) + print auths + """.format( + group_id=group_id, + volume_id=volume_id, + ))) + # Check the list of authorized IDs for the volume. + expected_result = None + self.assertEqual(str(expected_result), auths) + + # Allow two auth IDs access to the volume. + auths = self._volume_client_python(volumeclient_mount, dedent(""" + vp = VolumePath("{group_id}", "{volume_id}") + vc.authorize(vp, "{guest_entity_1}", readonly=False) + vc.authorize(vp, "{guest_entity_2}", readonly=True) + auths = vc.get_authorized_ids(vp) + print auths + """.format( + group_id=group_id, + volume_id=volume_id, + guest_entity_1=guest_entity_1, + guest_entity_2=guest_entity_2, + ))) + # Check the list of authorized IDs and their access levels. + expected_result = [(u'guest1', u'rw'), (u'guest2', u'r')] + self.assertItemsEqual(str(expected_result), auths) + + # Disallow both the auth IDs' access to the volume. + auths = self._volume_client_python(volumeclient_mount, dedent(""" + vp = VolumePath("{group_id}", "{volume_id}") + vc.deauthorize(vp, "{guest_entity_1}") + vc.deauthorize(vp, "{guest_entity_2}") + auths = vc.get_authorized_ids(vp) + print auths + """.format( + group_id=group_id, + volume_id=volume_id, + guest_entity_1=guest_entity_1, + guest_entity_2=guest_entity_2, + ))) + # Check the list of authorized IDs for the volume. + expected_result = None + self.assertItemsEqual(str(expected_result), auths) + + def test_multitenant_volumes(self): + """ + That volume access can be restricted to a tenant. + + That metadata used to enforce tenant isolation of + volumes is stored as a two-way mapping between auth + IDs and volumes that they're authorized to access. + """ + volumeclient_mount = self.mounts[1] + volumeclient_mount.umount_wait() + + # Configure volumeclient_mount as the handle for driving volumeclient. + self._configure_vc_auth(volumeclient_mount, "manila") + + group_id = "groupid" + volume_id = "volumeid" + + # Guest clients belonging to different tenants, but using the same + # auth ID. + auth_id = "guest" + guestclient_1 = { + "auth_id": auth_id, + "tenant_id": "tenant1", + } + guestclient_2 = { + "auth_id": auth_id, + "tenant_id": "tenant2", + } + + # Create a volume. + self._volume_client_python(volumeclient_mount, dedent(""" + vp = VolumePath("{group_id}", "{volume_id}") + vc.create_volume(vp, 1024*1024*10) + """.format( + group_id=group_id, + volume_id=volume_id, + ))) + + # Check that volume metadata file is created on volume creation. + vol_metadata_filename = "_{0}:{1}.meta".format(group_id, volume_id) + self.assertIn(vol_metadata_filename, self.mounts[0].ls("volumes")) + + # Authorize 'guestclient_1', using auth ID 'guest' and belonging to + # 'tenant1', with 'rw' access to the volume. 
+ self._volume_client_python(volumeclient_mount, dedent(""" + vp = VolumePath("{group_id}", "{volume_id}") + vc.authorize(vp, "{auth_id}", tenant_id="{tenant_id}") + """.format( + group_id=group_id, + volume_id=volume_id, + auth_id=guestclient_1["auth_id"], + tenant_id=guestclient_1["tenant_id"] + ))) + + # Check that auth metadata file for auth ID 'guest', is + # created on authorizing 'guest' access to the volume. + auth_metadata_filename = "${0}.meta".format(guestclient_1["auth_id"]) + self.assertIn(auth_metadata_filename, self.mounts[0].ls("volumes")) + + # Verify that the auth metadata file stores the tenant ID that the + # auth ID belongs to, the auth ID's authorized access levels + # for different volumes, versioning details, etc. + expected_auth_metadata = { + u"version": 2, + u"compat_version": 1, + u"dirty": False, + u"tenant_id": u"tenant1", + u"volumes": { + u"groupid/volumeid": { + u"dirty": False, + u"access_level": u"rw", + } + } + } + + auth_metadata = self._volume_client_python(volumeclient_mount, dedent(""" + vp = VolumePath("{group_id}", "{volume_id}") + auth_metadata = vc._auth_metadata_get("{auth_id}") + print auth_metadata + """.format( + group_id=group_id, + volume_id=volume_id, + auth_id=guestclient_1["auth_id"], + ))) + + self.assertItemsEqual(str(expected_auth_metadata), auth_metadata) + + # Verify that the volume metadata file stores info about auth IDs + # and their access levels to the volume, versioning details, etc. + expected_vol_metadata = { + u"version": 2, + u"compat_version": 1, + u"auths": { + u"guest": { + u"dirty": False, + u"access_level": u"rw" + } + } + } + + vol_metadata = self._volume_client_python(volumeclient_mount, dedent(""" + vp = VolumePath("{group_id}", "{volume_id}") + volume_metadata = vc._volume_metadata_get(vp) + print volume_metadata + """.format( + group_id=group_id, + volume_id=volume_id, + ))) + self.assertItemsEqual(str(expected_vol_metadata), vol_metadata) + + # Cannot authorize 'guestclient_2' to access the volume. + # It uses auth ID 'guest', which has already been used by a + # 'guestclient_1' belonging to an another tenant for accessing + # the volume. + with self.assertRaises(CommandFailedError): + self._volume_client_python(volumeclient_mount, dedent(""" + vp = VolumePath("{group_id}", "{volume_id}") + vc.authorize(vp, "{auth_id}", tenant_id="{tenant_id}") + """.format( + group_id=group_id, + volume_id=volume_id, + auth_id=guestclient_2["auth_id"], + tenant_id=guestclient_2["tenant_id"] + ))) + + # Check that auth metadata file is cleaned up on removing + # auth ID's only access to a volume. + self._volume_client_python(volumeclient_mount, dedent(""" + vp = VolumePath("{group_id}", "{volume_id}") + vc.deauthorize(vp, "{guest_entity}") + """.format( + group_id=group_id, + volume_id=volume_id, + guest_entity=guestclient_1["auth_id"] + ))) + + self.assertNotIn(auth_metadata_filename, self.mounts[0].ls("volumes")) + + # Check that volume metadata file is cleaned up on volume deletion. + self._volume_client_python(volumeclient_mount, dedent(""" + vp = VolumePath("{group_id}", "{volume_id}") + vc.delete_volume(vp) + """.format( + group_id=group_id, + volume_id=volume_id, + ))) + self.assertNotIn(vol_metadata_filename, self.mounts[0].ls("volumes")) + + def test_recover_metadata(self): + """ + That volume client can recover from partial auth updates using + metadata files, which store auth info and its update status info. 
+ """ + volumeclient_mount = self.mounts[1] + volumeclient_mount.umount_wait() + + # Configure volumeclient_mount as the handle for driving volumeclient. + self._configure_vc_auth(volumeclient_mount, "manila") + + group_id = "groupid" + volume_id = "volumeid" + + guestclient = { + "auth_id": "guest", + "tenant_id": "tenant", + } + + # Create a volume. + self._volume_client_python(volumeclient_mount, dedent(""" + vp = VolumePath("{group_id}", "{volume_id}") + vc.create_volume(vp, 1024*1024*10) + """.format( + group_id=group_id, + volume_id=volume_id, + ))) + + # Authorize 'guestclient' access to the volume. + self._volume_client_python(volumeclient_mount, dedent(""" + vp = VolumePath("{group_id}", "{volume_id}") + vc.authorize(vp, "{auth_id}", tenant_id="{tenant_id}") + """.format( + group_id=group_id, + volume_id=volume_id, + auth_id=guestclient["auth_id"], + tenant_id=guestclient["tenant_id"] + ))) + + # Check that auth metadata file for auth ID 'guest' is created. + auth_metadata_filename = "${0}.meta".format(guestclient["auth_id"]) + self.assertIn(auth_metadata_filename, self.mounts[0].ls("volumes")) + + # Induce partial auth update state by modifying the auth metadata file, + # and then run recovery procedure. + self._volume_client_python(volumeclient_mount, dedent(""" + vp = VolumePath("{group_id}", "{volume_id}") + auth_metadata = vc._auth_metadata_get("{auth_id}") + auth_metadata['dirty'] = True + vc._auth_metadata_set("{auth_id}", auth_metadata) + vc.recover() + """.format( + group_id=group_id, + volume_id=volume_id, + auth_id=guestclient["auth_id"], + ))) + + def test_put_object(self): + vc_mount = self.mounts[1] + vc_mount.umount_wait() + self._configure_vc_auth(vc_mount, "manila") + + obj_data = 'test data' + obj_name = 'test_vc_obj_1' + pool_name = self.fs.get_data_pool_names()[0] + + self._volume_client_python(vc_mount, dedent(""" + vc.put_object("{pool_name}", "{obj_name}", b"{obj_data}") + """.format( + pool_name = pool_name, + obj_name = obj_name, + obj_data = obj_data + ))) + + read_data = self.fs.rados(['get', obj_name, '-'], pool=pool_name) + self.assertEqual(obj_data, read_data) + + def test_get_object(self): + vc_mount = self.mounts[1] + vc_mount.umount_wait() + self._configure_vc_auth(vc_mount, "manila") + + obj_data = 'test_data' + obj_name = 'test_vc_ob_2' + pool_name = self.fs.get_data_pool_names()[0] + + self.fs.rados(['put', obj_name, '-'], pool=pool_name, stdin_data=obj_data) + + self._volume_client_python(vc_mount, dedent(""" + data_read = vc.get_object("{pool_name}", "{obj_name}") + assert data_read == b"{obj_data}" + """.format( + pool_name = pool_name, + obj_name = obj_name, + obj_data = obj_data + ))) + + def test_delete_object(self): + vc_mount = self.mounts[1] + vc_mount.umount_wait() + self._configure_vc_auth(vc_mount, "manila") + + obj_data = 'test data' + obj_name = 'test_vc_obj_3' + pool_name = self.fs.get_data_pool_names()[0] + + self.fs.rados(['put', obj_name, '-'], pool=pool_name, stdin_data=obj_data) + + self._volume_client_python(vc_mount, dedent(""" + data_read = vc.delete_object("{pool_name}", "{obj_name}") + """.format( + pool_name = pool_name, + obj_name = obj_name, + ))) + + with self.assertRaises(CommandFailedError): + self.fs.rados(['stat', obj_name], pool=pool_name) + + # Check idempotency -- no error raised trying to delete non-existent + # object + self._volume_client_python(vc_mount, dedent(""" + data_read = vc.delete_object("{pool_name}", "{obj_name}") + """.format( + pool_name = pool_name, + obj_name = obj_name, + ))) + + def 
test_21501(self): + """ + Reproducer for #21501 "ceph_volume_client: sets invalid caps for + existing IDs with no caps" (http://tracker.ceph.com/issues/21501) + """ + + vc_mount = self.mounts[1] + vc_mount.umount_wait() + + # Configure vc_mount as the handle for driving volumeclient + self._configure_vc_auth(vc_mount, "manila") + + # Create a volume + group_id = "grpid" + volume_id = "volid" + mount_path = self._volume_client_python(vc_mount, dedent(""" + vp = VolumePath("{group_id}", "{volume_id}") + create_result = vc.create_volume(vp, 1024*1024*10) + print create_result['mount_path'] + """.format( + group_id=group_id, + volume_id=volume_id + ))) + + # Create an auth ID with no caps + guest_id = '21501' + self.fs.mon_manager.raw_cluster_cmd_result( + 'auth', 'get-or-create', 'client.{0}'.format(guest_id)) + + guest_mount = self.mounts[2] + guest_mount.umount_wait() + + # Set auth caps for the auth ID using the volumeclient + self._configure_guest_auth(vc_mount, guest_mount, guest_id, mount_path) + + # Mount the volume in the guest using the auth ID to assert that the + # auth caps are valid + guest_mount.mount(mount_path=mount_path) |