Diffstat (limited to 'src/ceph/qa/tasks/cephfs/test_recovery_pool.py')
-rw-r--r--  src/ceph/qa/tasks/cephfs/test_recovery_pool.py  220
1 file changed, 0 insertions, 220 deletions
diff --git a/src/ceph/qa/tasks/cephfs/test_recovery_pool.py b/src/ceph/qa/tasks/cephfs/test_recovery_pool.py
deleted file mode 100644
index 097342a..0000000
--- a/src/ceph/qa/tasks/cephfs/test_recovery_pool.py
+++ /dev/null
@@ -1,220 +0,0 @@
-
-"""
-Test our tools for recovering metadata from the data pool into an alternate pool
-"""
-import logging
-import traceback
-from collections import namedtuple
-
-from teuthology.orchestra.run import CommandFailedError
-from tasks.cephfs.cephfs_test_case import CephFSTestCase
-
-log = logging.getLogger(__name__)
-
-
-ValidationError = namedtuple("ValidationError", ["exception", "backtrace"])
-
-
-class OverlayWorkload(object):
-    def __init__(self, orig_fs, recovery_fs, orig_mount, recovery_mount):
-        self._orig_fs = orig_fs
-        self._recovery_fs = recovery_fs
-        self._orig_mount = orig_mount
-        self._recovery_mount = recovery_mount
-        self._initial_state = None
-
-        # Accumulate backtraces for every failed validation, and return them.
-        # Backtraces are rather verbose, but we only see them when something
-        # breaks, and they let us see which check failed without having to
-        # decorate each check with a string.
-        self._errors = []
-
-    def assert_equal(self, a, b):
-        try:
-            if a != b:
-                raise AssertionError("{0} != {1}".format(a, b))
-        except AssertionError as e:
-            self._errors.append(
-                ValidationError(e, traceback.format_exc(3))
-            )
-
-    def write(self):
-        """
-        Write the workload files to the mount
-        """
-        raise NotImplementedError()
-
-    def validate(self):
-        """
-        Read from the mount and validate that the workload files are present
-        (i.e. have survived or been reconstructed from the test scenario)
-        """
-        raise NotImplementedError()
-
-    def damage(self):
-        """
-        Damage the filesystem pools in ways that will be interesting to
-        recover from.  By default just wipe everything in the metadata pool.
-        """
-        # Delete every object in the metadata pool
-        objects = self._orig_fs.rados(["ls"]).split("\n")
-        for o in objects:
-            self._orig_fs.rados(["rm", o])
-
-    def flush(self):
-        """
-        Called after client unmount, after write: flush whatever you want
-        """
-        self._orig_fs.mds_asok(["flush", "journal"])
-        self._recovery_fs.mds_asok(["flush", "journal"])
-
-
-class SimpleOverlayWorkload(OverlayWorkload):
-    """
-    A single file in a single directory: check that the file is recovered and
-    that its size is correct.
-    """
-    def write(self):
-        self._orig_mount.run_shell(["mkdir", "subdir"])
-        self._orig_mount.write_n_mb("subdir/sixmegs", 6)
-        self._initial_state = self._orig_mount.stat("subdir/sixmegs")
-
-    def validate(self):
-        self._recovery_mount.run_shell(["ls", "subdir"])
-        st = self._recovery_mount.stat("subdir/sixmegs")
-        self.assert_equal(st['st_size'], self._initial_state['st_size'])
-        return self._errors
-
-class TestRecoveryPool(CephFSTestCase):
-    MDSS_REQUIRED = 2
-    CLIENTS_REQUIRED = 2
-    REQUIRE_RECOVERY_FILESYSTEM = True
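-    # REQUIRE_RECOVERY_FILESYSTEM makes the test framework create a second,
-    # "recovery" filesystem (self.recovery_fs) with its own metadata pool
-    # alongside self.fs; mount_b is later re-mounted against the recovery
-    # filesystem to validate the result.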
-
-    def is_marked_damaged(self, rank):
-        mds_map = self.fs.get_mds_map()
-        return rank in mds_map['damaged']
-
-    def _rebuild_metadata(self, workload, other_pool=None, workers=1):
-        """
-        Verify that when all objects in the metadata pool are removed, we can
-        rebuild a metadata pool based on the contents of a data pool, and that
-        a client can then see and read our files.
-        """
-
-        # First, inject some files
-        workload.write()
-
-        # Unmount the clients and flush the journals: the tool should also
-        # cope with situations where there is dirty metadata, but we'll test
-        # that separately
-        self.mount_a.umount_wait()
-        self.mount_b.umount_wait()
-        workload.flush()
-
-        # Prepare the alternate (recovery) metadata pool and reset its
-        # session, snap and inode tables
-        recovery_fs = self.recovery_fs.name
-        recovery_pool = self.recovery_fs.get_metadata_pool_name()
-        self.recovery_fs.data_scan(['init', '--force-init',
-                                    '--filesystem', recovery_fs,
-                                    '--alternate-pool', recovery_pool])
-        self.recovery_fs.mon_manager.raw_cluster_cmd('-s')
-        self.recovery_fs.table_tool([recovery_fs + ":0", "reset", "session"])
-        self.recovery_fs.table_tool([recovery_fs + ":0", "reset", "snap"])
-        self.recovery_fs.table_tool([recovery_fs + ":0", "reset", "inode"])
-
-        # Stop the MDS
-        self.fs.mds_stop()
-        self.fs.mds_fail()
-
-        # After recovery, we need the MDS to not be strict about stats (in
-        # production these options are off by default, but in QA we need to
-        # explicitly disable them)
-        self.fs.set_ceph_conf('mds', 'mds verify scatter', False)
-        self.fs.set_ceph_conf('mds', 'mds debug scatterstat', False)
-
-        # Apply any data damage the workload wants
-        workload.damage()
-
-        # Reset the MDS map in case multiple ranks were in play: recovery
-        # procedure only understands how to rebuild metadata under rank 0
-        self.fs.mon_manager.raw_cluster_cmd('fs', 'reset', self.fs.name,
-                                            '--yes-i-really-mean-it')
-
-        def get_state(mds_id):
-            info = self.mds_cluster.get_mds_info(mds_id)
-            return info['state'] if info is not None else None
-
-        self.fs.table_tool([self.fs.name + ":0", "reset", "session"])
-        self.fs.table_tool([self.fs.name + ":0", "reset", "snap"])
-        self.fs.table_tool([self.fs.name + ":0", "reset", "inode"])
-
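-        # The rebuild works from what survives in the data pool and the old
-        # journal: "scan_extents" infers file sizes and mtimes from the data
-        # pool objects, "scan_inodes" regenerates inode metadata from the
-        # backtraces stored on those objects (written here into the alternate
-        # pool), and "recover_dentries" salvages whatever dentries can still
-        # be read out of the old journal.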
-        # Run the recovery procedure
-        if False:
-            with self.assertRaises(CommandFailedError):
-                # Normal reset should fail when no objects are present; we'll
-                # use --force instead
-                self.fs.journal_tool(["journal", "reset"])
-
-        self.fs.mds_stop()
-        self.fs.data_scan(['scan_extents', '--alternate-pool',
-                           recovery_pool, '--filesystem', self.fs.name,
-                           self.fs.get_data_pool_name()])
-        self.fs.data_scan(['scan_inodes', '--alternate-pool',
-                           recovery_pool, '--filesystem', self.fs.name,
-                           '--force-corrupt', '--force-init',
-                           self.fs.get_data_pool_name()])
-        self.fs.journal_tool(['--rank=' + self.fs.name + ":0", 'event',
-                              'recover_dentries', 'list',
-                              '--alternate-pool', recovery_pool])
-
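-        # Repeat the rebuild without --alternate-pool, i.e. directly into the
-        # original filesystem's own metadata pool, so that both filesystems
-        # end up with recovered metadata.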
-        self.fs.data_scan(['init', '--force-init', '--filesystem',
-                           self.fs.name])
-        self.fs.data_scan(['scan_inodes', '--filesystem', self.fs.name,
-                           '--force-corrupt', '--force-init',
-                           self.fs.get_data_pool_name()])
-        self.fs.journal_tool(['--rank=' + self.fs.name + ":0", 'event',
-                              'recover_dentries', 'list'])
-
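-        # The old journals do not match the rebuilt metadata, so give each
-        # rank a fresh, empty journal; --force skips reading the (possibly
-        # unrecoverable) old journal first.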
-        self.fs.journal_tool(['--rank=' + recovery_fs + ":0", 'journal',
-                              'reset', '--force'])
-        self.fs.journal_tool(['--rank=' + self.fs.name + ":0", 'journal',
-                              'reset', '--force'])
-        self.fs.mon_manager.raw_cluster_cmd('mds', 'repaired',
-                                            recovery_fs + ":0")
-
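-        # "mds repaired" clears any damaged flag on the given rank so that the
-        # mons will assign an MDS daemon to it again.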
-        # Mark the MDS repaired
-        self.fs.mon_manager.raw_cluster_cmd('mds', 'repaired', '0')
-
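-        # Finally, restart the daemons, turn up MDS debugging, and run a
-        # recursive repair scrub from the root: the offline tools do not
-        # recover everything (e.g. recursive statistics), so the online scrub
-        # is left to repair the rest.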
-        # Start the MDS
-        self.fs.mds_restart()
-        self.recovery_fs.mds_restart()
-        self.fs.wait_for_daemons()
-        self.recovery_fs.wait_for_daemons()
-        for mds_id in self.recovery_fs.mds_ids:
-            self.fs.mon_manager.raw_cluster_cmd('tell', "mds." + mds_id,
-                                                'injectargs', '--debug-mds=20')
-            self.fs.mon_manager.raw_cluster_cmd('daemon', "mds." + mds_id,
-                                                'scrub_path', '/',
-                                                'recursive', 'repair')
-        log.info(str(self.mds_cluster.status()))
-
-        # Mount a client against each filesystem
-        self.mount_a.mount()
-        self.mount_b.mount(mount_fs_name=recovery_fs)
-        self.mount_a.wait_until_mounted()
-        self.mount_b.wait_until_mounted()
-
-        # See that the files are present and correct
-        errors = workload.validate()
-        if errors:
-            log.error("Validation errors found: {0}".format(len(errors)))
-            for e in errors:
-                log.error(e.exception)
-                log.error(e.backtrace)
-            raise AssertionError("Validation failed, first error: {0}\n{1}".format(
-                errors[0].exception, errors[0].backtrace
-            ))
-
-    def test_rebuild_simple(self):
-        self._rebuild_metadata(SimpleOverlayWorkload(self.fs, self.recovery_fs,
-                                                     self.mount_a, self.mount_b))