Diffstat (limited to 'src/ceph/qa/tasks/mgr/test_failover.py')
-rw-r--r--  src/ceph/qa/tasks/mgr/test_failover.py | 144
1 file changed, 144 insertions(+), 0 deletions(-)
diff --git a/src/ceph/qa/tasks/mgr/test_failover.py b/src/ceph/qa/tasks/mgr/test_failover.py
new file mode 100644
index 0000000..0dd9cb7
--- /dev/null
+++ b/src/ceph/qa/tasks/mgr/test_failover.py
@@ -0,0 +1,144 @@
+
+import logging
+import json
+
+from tasks.mgr.mgr_test_case import MgrTestCase
+
+
+log = logging.getLogger(__name__)
+
+
+class TestFailover(MgrTestCase):
+    MGRS_REQUIRED = 2
+
+    def test_timeout(self):
+        """
+        That when an active mgr stops responding, a standby is promoted
+        after mon_mgr_beacon_grace.
+        """
+
+        # Query which mgr is active
+        original_active = self.mgr_cluster.get_active_id()
+        original_standbys = self.mgr_cluster.get_standby_ids()
+
+        # Stop that daemon
+        self.mgr_cluster.mgr_stop(original_active)
+
+        # Assert that the other mgr becomes active
+        self.wait_until_true(
+            lambda: self.mgr_cluster.get_active_id() in original_standbys,
+            timeout=60
+        )
+
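+        # Bring the old active back up; since another mgr now holds the
+        # active role, it should rejoin the cluster as a standby.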
+        self.mgr_cluster.mgr_restart(original_active)
+        self.wait_until_true(
+            lambda: original_active in self.mgr_cluster.get_standby_ids(),
+            timeout=10
+        )
+
+    def test_timeout_nostandby(self):
+        """
+        That when an active mgr stops responding, and no standby is
+        available, the active mgr is removed from the map anyway.
+        """
+        # Query which mgr is active
+        original_active = self.mgr_cluster.get_active_id()
+        original_standbys = self.mgr_cluster.get_standby_ids()
+
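+        # Take every standby out of the picture: stop the daemon so it goes
+        # quiet, then fail it so the mon drops it from the map right away
+        # (mgr_fail is assumed here to wrap "ceph mgr fail").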
+        for s in original_standbys:
+            self.mgr_cluster.mgr_stop(s)
+            self.mgr_cluster.mgr_fail(s)
+
+        self.assertListEqual(self.mgr_cluster.get_standby_ids(), [])
+        self.assertEqual(self.mgr_cluster.get_active_id(), original_active)
+
+        grace = int(self.mgr_cluster.get_config("mon_mgr_beacon_grace"))
+        log.info("Should time out in about {0} seconds".format(grace))
+
+        self.mgr_cluster.mgr_stop(original_active)
+
+        # Now wait for the mon to notice the mgr is gone and remove it
+        # from the map.
+        self.wait_until_equal(
+            lambda: self.mgr_cluster.get_active_id(),
+            "",
+            timeout=grace * 2
+        )
+
+        self.assertListEqual(self.mgr_cluster.get_standby_ids(), [])
+        self.assertEqual(self.mgr_cluster.get_active_id(), "")
+
+    def test_explicit_fail(self):
+        """
+        That when a user explicitly fails a daemon, a standby immediately
+        replaces it.
+        :return:
+        """
+        # Query which mgr is active
+        original_active = self.mgr_cluster.get_active_id()
+        original_standbys = self.mgr_cluster.get_standby_ids()
+
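+        # Explicitly fail the active mgr via the mon (mgr_fail presumably
+        # issues "ceph mgr fail <id>"); the daemon itself keeps running,
+        # so it can re-register afterwards.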
+        self.mgr_cluster.mgr_fail(original_active)
+
+        # A standby should take over
+        self.wait_until_true(
+            lambda: self.mgr_cluster.get_active_id() in original_standbys,
+            timeout=60
+        )
+
+        # The one we failed should come back as a standby (it isn't
+        # really dead)
+        self.wait_until_true(
+            lambda: original_active in self.mgr_cluster.get_standby_ids(),
+            timeout=10
+        )
+
+        # Both daemons should have fully populated metadata
+        # (regression test for http://tracker.ceph.com/issues/21260)
+        meta = json.loads(self.mgr_cluster.mon_manager.raw_cluster_cmd(
+            "mgr", "metadata"))
+        id_to_meta = dict([(i['id'], i) for i in meta])
+        for i in [original_active] + original_standbys:
+            self.assertIn(i, id_to_meta)
+            self.assertIn('ceph_version', id_to_meta[i])
+
+        # We should be able to fail back over again: this exercises
+        # our re-initialization of the Python runtime within
+        # a single process lifetime.
+
+        # Get rid of any bystander standbys so that the original_active
+        # will be selected as next active.
+        new_active = self.mgr_cluster.get_active_id()
+        for daemon in original_standbys:
+            if daemon != new_active:
+                self.mgr_cluster.mgr_stop(daemon)
+                self.mgr_cluster.mgr_fail(daemon)
+
+        self.assertListEqual(self.mgr_cluster.get_standby_ids(),
+                             [original_active])
+
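+        # With original_active as the only remaining standby, stopping and
+        # failing the current active should hand the role back to it.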
+        self.mgr_cluster.mgr_stop(new_active)
+        self.mgr_cluster.mgr_fail(new_active)
+
+        self.assertEqual(self.mgr_cluster.get_active_id(), original_active)
+        self.assertEqual(self.mgr_cluster.get_standby_ids(), [])
+
+    def test_standby_timeout(self):
+        """
+        That when a standby daemon stops sending beacons, it is
+        removed from the list of standbys.
+        :return:
+        """
+        original_active = self.mgr_cluster.get_active_id()
+        original_standbys = self.mgr_cluster.get_standby_ids()
+
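+        # Stop the victim without explicitly failing it, so the mon has to
+        # notice the missing beacons on its own before dropping it.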
+        victim = original_standbys[0]
+        self.mgr_cluster.mgr_stop(victim)
+
+        expect_standbys = set(original_standbys) - {victim}
+
+        self.wait_until_true(
+            lambda: set(self.mgr_cluster.get_standby_ids()) == expect_standbys,
+            timeout=60
+        )
+        self.assertEqual(self.mgr_cluster.get_active_id(), original_active)