author    Qiaowei Ren <qiaowei.ren@intel.com>    2018-01-04 13:43:33 +0800
committer Qiaowei Ren <qiaowei.ren@intel.com>    2018-01-05 11:59:39 +0800
commit    812ff6ca9fcd3e629e49d4328905f33eee8ca3f5
tree      04ece7b4da00d9d2f98093774594f4057ae561d4 /src/ceph/qa/tasks/mds_creation_failure.py
parent    15280273faafb77777eab341909a3f495cf248d9
initial code repo
This patch creates the initial code repo. For ceph, the luminous stable release is used as the base code, and subsequent changes and optimizations for ceph will be added on top of it. For opensds, any changes can currently be upstreamed to the original opensds repo (https://github.com/opensds/opensds), so stor4nfv will directly clone the opensds code to deploy the stor4nfv environment. The deployment scripts based on ceph and opensds will be kept in the 'ci' directory.

Change-Id: I46a32218884c75dda2936337604ff03c554648e4
Signed-off-by: Qiaowei Ren <qiaowei.ren@intel.com>
Diffstat (limited to 'src/ceph/qa/tasks/mds_creation_failure.py')
-rw-r--r--    src/ceph/qa/tasks/mds_creation_failure.py    85
1 file changed, 85 insertions, 0 deletions
diff --git a/src/ceph/qa/tasks/mds_creation_failure.py b/src/ceph/qa/tasks/mds_creation_failure.py
new file mode 100644
index 0000000..d1de156
--- /dev/null
+++ b/src/ceph/qa/tasks/mds_creation_failure.py
@@ -0,0 +1,85 @@
+
+import logging
+import contextlib
+import time
+import ceph_manager
+from teuthology import misc
+from teuthology.orchestra.run import CommandFailedError, Raw
+
+log = logging.getLogger(__name__)
+
+
+@contextlib.contextmanager
+def task(ctx, config):
+ """
+ Go through filesystem creation with a synthetic failure in an MDS
+ in its 'up:creating' state, to exercise the retry behaviour.
+ """
+ # Grab handles to the teuthology objects of interest
+ mdslist = list(misc.all_roles_of_type(ctx.cluster, 'mds'))
+ if len(mdslist) != 1:
+ # Require exactly one MDS, the code path for creation failure when
+ # a standby is available is different
+ raise RuntimeError("This task requires exactly one MDS")
+
+    mds_id = mdslist[0]
+    (mds_remote,) = ctx.cluster.only('mds.{_id}'.format(_id=mds_id)).remotes.iterkeys()
+    manager = ceph_manager.CephManager(
+        mds_remote, ctx=ctx, logger=log.getChild('ceph_manager'),
+    )
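+    # (CephManager wraps cluster administration for the test; the
+    # raw_cluster_cmd() calls below run 'ceph ...' CLI commands from mds_remote.)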
+
+    # Stop MDS
+    manager.raw_cluster_cmd('mds', 'set', "max_mds", "0")
+    mds = ctx.daemons.get_daemon('mds', mds_id)
+    mds.stop()
+    manager.raw_cluster_cmd('mds', 'fail', mds_id)
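+    # ('mds fail' marks the stopped daemon as failed in the MDS map, so the
+    # cluster no longer counts it as up when the filesystem is recreated below.)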
+
+    # Reset the filesystem so that next start will go into CREATING
+    manager.raw_cluster_cmd('fs', 'rm', "default", "--yes-i-really-mean-it")
+    manager.raw_cluster_cmd('fs', 'new', "default", "metadata", "data")
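+    # (The new "default" filesystem reuses the existing "metadata" and "data"
+    # pools but has never had its metadata written, so the next MDS to claim
+    # rank 0 will pass through the up:creating state.)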
+
+    # Start the MDS with mds_kill_create_at set; it will crash during creation
+    mds.restart_with_args(["--mds_kill_create_at=1"])
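+    # (mds_kill_create_at is a debug injection point: the daemon aborts partway
+    # through its creation sequence, so wait_for_exit() below sees a failure.)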
+    try:
+        mds.wait_for_exit()
+    except CommandFailedError as e:
+        if e.exitstatus == 1:
+            log.info("MDS creation killed as expected")
+        else:
+            log.error("Unexpected status code %s" % e.exitstatus)
+            raise
+
+    # Since I have intentionally caused a crash, I will clean up the resulting core
+    # file to avoid task.internal.coredump seeing it as a failure.
+    log.info("Removing core file from synthetic MDS failure")
+    mds_remote.run(args=['rm', '-f', Raw("{archive}/coredump/*.core".format(archive=misc.get_archive_dir(ctx)))])
+
+    # It should have left the MDS map state still in CREATING
+    status = manager.get_mds_status(mds_id)
+    assert status['state'] == 'up:creating'
+
+    # Start the MDS again without the kill flag set; it should proceed with creation successfully
+    mds.restart()
+
+    # Wait for state ACTIVE
+    t = 0
+    create_timeout = 120
+    while True:
+        status = manager.get_mds_status(mds_id)
+        if status['state'] == 'up:active':
+            log.info("MDS creation completed successfully")
+            break
+        elif status['state'] == 'up:creating':
+            log.info("MDS still in creating state")
+            if t > create_timeout:
+                log.error("Creating did not complete within %ss" % create_timeout)
+                raise RuntimeError("Creating did not complete within %ss" % create_timeout)
+            t += 1
+            time.sleep(1)
+        else:
+            log.error("Unexpected MDS state: %s" % status['state'])
+            assert(status['state'] in ['up:active', 'up:creating'])
+
+    # The system should be back up in a happy healthy state; go ahead and run any further tasks
+    # inside this context.
+    yield
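
The wait loop at the end of the task is a plain poll-until-state-or-timeout pattern. As a standalone illustration only (hypothetical helper, not part of this patch), the same pattern can be written as:

import time

def wait_for_state(get_state, target, timeout=120, interval=1):
    """Poll get_state() until it returns target, or raise after roughly timeout seconds."""
    elapsed = 0
    while True:
        state = get_state()
        if state == target:
            return
        if elapsed >= timeout:
            raise RuntimeError("timed out after %ss waiting for %s (last state: %s)"
                               % (timeout, target, state))
        time.sleep(interval)
        elapsed += interval

# Usage sketch against the objects in the patch above:
# wait_for_state(lambda: manager.get_mds_status(mds_id)['state'], 'up:active')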