summaryrefslogtreecommitdiffstats
path: root/src/ceph/qa/tasks/mgr
diff options
context:
space:
mode:
authorQiaowei Ren <qiaowei.ren@intel.com>2018-01-04 13:43:33 +0800
committerQiaowei Ren <qiaowei.ren@intel.com>2018-01-05 11:59:39 +0800
commit812ff6ca9fcd3e629e49d4328905f33eee8ca3f5 (patch)
tree04ece7b4da00d9d2f98093774594f4057ae561d4 /src/ceph/qa/tasks/mgr
parent15280273faafb77777eab341909a3f495cf248d9 (diff)
initial code repo
This patch creates initial code repo. For ceph, luminous stable release will be used for base code, and next changes and optimization for ceph will be added to it. For opensds, currently any changes can be upstreamed into original opensds repo (https://github.com/opensds/opensds), and so stor4nfv will directly clone opensds code to deploy stor4nfv environment. And the scripts for deployment based on ceph and opensds will be put into 'ci' directory. Change-Id: I46a32218884c75dda2936337604ff03c554648e4 Signed-off-by: Qiaowei Ren <qiaowei.ren@intel.com>
Diffstat (limited to 'src/ceph/qa/tasks/mgr')
-rw-r--r--src/ceph/qa/tasks/mgr/__init__.py0
-rw-r--r--src/ceph/qa/tasks/mgr/mgr_test_case.py170
-rw-r--r--src/ceph/qa/tasks/mgr/test_dashboard.py70
-rw-r--r--src/ceph/qa/tasks/mgr/test_failover.py144
-rw-r--r--src/ceph/qa/tasks/mgr/test_module_selftest.py74
5 files changed, 458 insertions, 0 deletions
diff --git a/src/ceph/qa/tasks/mgr/__init__.py b/src/ceph/qa/tasks/mgr/__init__.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/src/ceph/qa/tasks/mgr/__init__.py
diff --git a/src/ceph/qa/tasks/mgr/mgr_test_case.py b/src/ceph/qa/tasks/mgr/mgr_test_case.py
new file mode 100644
index 0000000..ec3f98d
--- /dev/null
+++ b/src/ceph/qa/tasks/mgr/mgr_test_case.py
@@ -0,0 +1,170 @@
+
+from unittest import case
+import json
+import logging
+
+from teuthology import misc
+from tasks.ceph_test_case import CephTestCase
+
+# TODO move definition of CephCluster away from the CephFS stuff
+from tasks.cephfs.filesystem import CephCluster
+
+
+log = logging.getLogger(__name__)
+
+
+class MgrCluster(CephCluster):
+ def __init__(self, ctx):
+ super(MgrCluster, self).__init__(ctx)
+ self.mgr_ids = list(misc.all_roles_of_type(ctx.cluster, 'mgr'))
+
+ if len(self.mgr_ids) == 0:
+ raise RuntimeError(
+ "This task requires at least one manager daemon")
+
+ self.mgr_daemons = dict(
+ [(mgr_id, self._ctx.daemons.get_daemon('mgr', mgr_id)) for mgr_id
+ in self.mgr_ids])
+
+ def mgr_stop(self, mgr_id):
+ self.mgr_daemons[mgr_id].stop()
+
+ def mgr_fail(self, mgr_id):
+ self.mon_manager.raw_cluster_cmd("mgr", "fail", mgr_id)
+
+ def mgr_restart(self, mgr_id):
+ self.mgr_daemons[mgr_id].restart()
+
+ def get_mgr_map(self):
+ status = json.loads(
+ self.mon_manager.raw_cluster_cmd("status", "--format=json-pretty"))
+
+ return status["mgrmap"]
+
+ def get_active_id(self):
+ return self.get_mgr_map()["active_name"]
+
+ def get_standby_ids(self):
+ return [s['name'] for s in self.get_mgr_map()["standbys"]]
+
+ def set_module_localized_conf(self, module, mgr_id, key, val):
+ self.mon_manager.raw_cluster_cmd("config-key", "set",
+ "mgr/{0}/{1}/{2}".format(
+ module, mgr_id, key
+ ), val)
+
+
+class MgrTestCase(CephTestCase):
+ MGRS_REQUIRED = 1
+
+ def setUp(self):
+ super(MgrTestCase, self).setUp()
+
+ # The test runner should have populated this
+ assert self.mgr_cluster is not None
+
+ if len(self.mgr_cluster.mgr_ids) < self.MGRS_REQUIRED:
+ raise case.SkipTest("Only have {0} manager daemons, "
+ "{1} are required".format(
+ len(self.mgr_cluster.mgr_ids), self.MGRS_REQUIRED))
+
+ # Restart all the daemons
+ for daemon in self.mgr_cluster.mgr_daemons.values():
+ daemon.stop()
+
+ for mgr_id in self.mgr_cluster.mgr_ids:
+ self.mgr_cluster.mgr_fail(mgr_id)
+
+ for daemon in self.mgr_cluster.mgr_daemons.values():
+ daemon.restart()
+
+ # Wait for an active to come up
+ self.wait_until_true(lambda: self.mgr_cluster.get_active_id() != "",
+ timeout=20)
+
+ expect_standbys = set(self.mgr_cluster.mgr_ids) \
+ - {self.mgr_cluster.get_active_id()}
+ self.wait_until_true(
+ lambda: set(self.mgr_cluster.get_standby_ids()) == expect_standbys,
+ timeout=20)
+
+ def _load_module(self, module_name):
+ loaded = json.loads(self.mgr_cluster.mon_manager.raw_cluster_cmd(
+ "mgr", "module", "ls"))['enabled_modules']
+ if module_name in loaded:
+ # The enable command is idempotent, but our wait for a restart
+ # isn't, so let's return now if it's already loaded
+ return
+
+ initial_gid = self.mgr_cluster.get_mgr_map()['active_gid']
+ self.mgr_cluster.mon_manager.raw_cluster_cmd("mgr", "module", "enable",
+ module_name)
+
+ # Wait for the module to load
+ def has_restarted():
+ mgr_map = self.mgr_cluster.get_mgr_map()
+ done = mgr_map['active_gid'] != initial_gid and mgr_map['available']
+ if done:
+ log.info("Restarted after module load (new active {0}/{1})".format(
+ mgr_map['active_name'] , mgr_map['active_gid']))
+ return done
+ self.wait_until_true(has_restarted, timeout=30)
+
+
+ def _get_uri(self, service_name):
+ # Little dict hack so that I can assign into this from
+ # the get_or_none function
+ mgr_map = {'x': None}
+
+ def _get_or_none():
+ mgr_map['x'] = self.mgr_cluster.get_mgr_map()
+ result = mgr_map['x']['services'].get(service_name, None)
+ return result
+
+ self.wait_until_true(lambda: _get_or_none() is not None, 30)
+
+ uri = mgr_map['x']['services'][service_name]
+
+ log.info("Found {0} at {1} (daemon {2}/{3})".format(
+ service_name, uri, mgr_map['x']['active_name'],
+ mgr_map['x']['active_gid']))
+
+ return uri
+
+
+ def _assign_ports(self, module_name, config_name, min_port=7789):
+ """
+ To avoid the need to run lots of hosts in teuthology tests to
+ get different URLs per mgr, we will hand out different ports
+ to each mgr here.
+
+ This is already taken care of for us when running in a vstart
+ environment.
+ """
+ # Start handing out ports well above Ceph's range.
+ assign_port = min_port
+
+ for mgr_id in self.mgr_cluster.mgr_ids:
+ self.mgr_cluster.mgr_stop(mgr_id)
+ self.mgr_cluster.mgr_fail(mgr_id)
+
+ for mgr_id in self.mgr_cluster.mgr_ids:
+ log.info("Using port {0} for {1} on mgr.{2}".format(
+ assign_port, module_name, mgr_id
+ ))
+ self.mgr_cluster.set_module_localized_conf(module_name, mgr_id,
+ config_name,
+ str(assign_port))
+ assign_port += 1
+
+ for mgr_id in self.mgr_cluster.mgr_ids:
+ self.mgr_cluster.mgr_restart(mgr_id)
+
+ def is_available():
+ mgr_map = self.mgr_cluster.get_mgr_map()
+ done = mgr_map['available']
+ if done:
+ log.info("Available after assign ports (new active {0}/{1})".format(
+ mgr_map['active_name'] , mgr_map['active_gid']))
+ return done
+ self.wait_until_true(is_available, timeout=30)
diff --git a/src/ceph/qa/tasks/mgr/test_dashboard.py b/src/ceph/qa/tasks/mgr/test_dashboard.py
new file mode 100644
index 0000000..3b8a2cc
--- /dev/null
+++ b/src/ceph/qa/tasks/mgr/test_dashboard.py
@@ -0,0 +1,70 @@
+
+
+from mgr_test_case import MgrTestCase
+
+import logging
+import requests
+
+
+log = logging.getLogger(__name__)
+
+
+class TestDashboard(MgrTestCase):
+ MGRS_REQUIRED = 3
+
+ def test_standby(self):
+ self._assign_ports("dashboard", "server_port")
+ self._load_module("dashboard")
+
+ original_active = self.mgr_cluster.get_active_id()
+
+ original_uri = self._get_uri("dashboard")
+ log.info("Originally running at {0}".format(original_uri))
+
+ self.mgr_cluster.mgr_fail(original_active)
+
+ failed_over_uri = self._get_uri("dashboard")
+ log.info("After failover running at {0}".format(original_uri))
+
+ self.assertNotEqual(original_uri, failed_over_uri)
+
+ # The original active daemon should have come back up as a standby
+ # and be doing redirects to the new active daemon
+ r = requests.get(original_uri, allow_redirects=False)
+ self.assertEqual(r.status_code, 303)
+ self.assertEqual(r.headers['Location'], failed_over_uri)
+
+ def test_urls(self):
+ self._assign_ports("dashboard", "server_port")
+ self._load_module("dashboard")
+
+ base_uri = self._get_uri("dashboard")
+
+ # This is a very simple smoke test to check that the dashboard can
+ # give us a 200 response to requests. We're not testing that
+ # the content is correct or even renders!
+
+ urls = [
+ "/health",
+ "/servers",
+ "/osd/",
+ "/osd/perf/0",
+ "/rbd_mirroring",
+ "/rbd_iscsi"
+ ]
+
+ failures = []
+
+ for url in urls:
+ r = requests.get(base_uri + url, allow_redirects=False)
+ if r.status_code >= 300 and r.status_code < 400:
+ log.error("Unexpected redirect to: {0} (from {1})".format(
+ r.headers['Location'], base_uri))
+ if r.status_code != 200:
+ failures.append(url)
+
+ log.info("{0}: {1} ({2} bytes)".format(
+ url, r.status_code, len(r.content)
+ ))
+
+ self.assertListEqual(failures, [])
diff --git a/src/ceph/qa/tasks/mgr/test_failover.py b/src/ceph/qa/tasks/mgr/test_failover.py
new file mode 100644
index 0000000..0dd9cb7
--- /dev/null
+++ b/src/ceph/qa/tasks/mgr/test_failover.py
@@ -0,0 +1,144 @@
+
+import logging
+import json
+
+from tasks.mgr.mgr_test_case import MgrTestCase
+
+
+log = logging.getLogger(__name__)
+
+
+class TestFailover(MgrTestCase):
+ MGRS_REQUIRED = 2
+
+ def test_timeout(self):
+ """
+ That when an active mgr stops responding, a standby is promoted
+ after mon_mgr_beacon_grace.
+ """
+
+ # Query which mgr is active
+ original_active = self.mgr_cluster.get_active_id()
+ original_standbys = self.mgr_cluster.get_standby_ids()
+
+ # Stop that daemon
+ self.mgr_cluster.mgr_stop(original_active)
+
+ # Assert that the other mgr becomes active
+ self.wait_until_true(
+ lambda: self.mgr_cluster.get_active_id() in original_standbys,
+ timeout=60
+ )
+
+ self.mgr_cluster.mgr_restart(original_active)
+ self.wait_until_true(
+ lambda: original_active in self.mgr_cluster.get_standby_ids(),
+ timeout=10
+ )
+
+ def test_timeout_nostandby(self):
+ """
+ That when an active mgr stop responding, and no standby is
+ available, the active mgr is removed from the map anyway.
+ """
+ # Query which mgr is active
+ original_active = self.mgr_cluster.get_active_id()
+ original_standbys = self.mgr_cluster.get_standby_ids()
+
+ for s in original_standbys:
+ self.mgr_cluster.mgr_stop(s)
+ self.mgr_cluster.mgr_fail(s)
+
+ self.assertListEqual(self.mgr_cluster.get_standby_ids(), [])
+ self.assertEqual(self.mgr_cluster.get_active_id(), original_active)
+
+ grace = int(self.mgr_cluster.get_config("mon_mgr_beacon_grace"))
+ log.info("Should time out in about {0} seconds".format(grace))
+
+ self.mgr_cluster.mgr_stop(original_active)
+
+ # Now wait for the mon to notice the mgr is gone and remove it
+ # from the map.
+ self.wait_until_equal(
+ lambda: self.mgr_cluster.get_active_id(),
+ "",
+ timeout=grace * 2
+ )
+
+ self.assertListEqual(self.mgr_cluster.get_standby_ids(), [])
+ self.assertEqual(self.mgr_cluster.get_active_id(), "")
+
+ def test_explicit_fail(self):
+ """
+ That when a user explicitly fails a daemon, a standby immediately
+ replaces it.
+ :return:
+ """
+ # Query which mgr is active
+ original_active = self.mgr_cluster.get_active_id()
+ original_standbys = self.mgr_cluster.get_standby_ids()
+
+ self.mgr_cluster.mgr_fail(original_active)
+
+ # A standby should take over
+ self.wait_until_true(
+ lambda: self.mgr_cluster.get_active_id() in original_standbys,
+ timeout=60
+ )
+
+ # The one we failed should come back as a standby (he isn't
+ # really dead)
+ self.wait_until_true(
+ lambda: original_active in self.mgr_cluster.get_standby_ids(),
+ timeout=10
+ )
+
+ # Both daemons should have fully populated metadata
+ # (regression test for http://tracker.ceph.com/issues/21260)
+ meta = json.loads(self.mgr_cluster.mon_manager.raw_cluster_cmd(
+ "mgr", "metadata"))
+ id_to_meta = dict([(i['id'], i) for i in meta])
+ for i in [original_active] + original_standbys:
+ self.assertIn(i, id_to_meta)
+ self.assertIn('ceph_version', id_to_meta[i])
+
+ # We should be able to fail back over again: the exercises
+ # our re-initialization of the python runtime within
+ # a single process lifetime.
+
+ # Get rid of any bystander standbys so that the original_active
+ # will be selected as next active.
+ new_active = self.mgr_cluster.get_active_id()
+ for daemon in original_standbys:
+ if daemon != new_active:
+ self.mgr_cluster.mgr_stop(daemon)
+ self.mgr_cluster.mgr_fail(daemon)
+
+ self.assertListEqual(self.mgr_cluster.get_standby_ids(),
+ [original_active])
+
+ self.mgr_cluster.mgr_stop(new_active)
+ self.mgr_cluster.mgr_fail(new_active)
+
+ self.assertEqual(self.mgr_cluster.get_active_id(), original_active)
+ self.assertEqual(self.mgr_cluster.get_standby_ids(), [])
+
+ def test_standby_timeout(self):
+ """
+ That when a standby daemon stops sending beacons, it is
+ removed from the list of standbys
+ :return:
+ """
+ original_active = self.mgr_cluster.get_active_id()
+ original_standbys = self.mgr_cluster.get_standby_ids()
+
+ victim = original_standbys[0]
+ self.mgr_cluster.mgr_stop(victim)
+
+ expect_standbys = set(original_standbys) - {victim}
+
+ self.wait_until_true(
+ lambda: set(self.mgr_cluster.get_standby_ids()) == expect_standbys,
+ timeout=60
+ )
+ self.assertEqual(self.mgr_cluster.get_active_id(), original_active)
diff --git a/src/ceph/qa/tasks/mgr/test_module_selftest.py b/src/ceph/qa/tasks/mgr/test_module_selftest.py
new file mode 100644
index 0000000..2776fb8
--- /dev/null
+++ b/src/ceph/qa/tasks/mgr/test_module_selftest.py
@@ -0,0 +1,74 @@
+
+import time
+import requests
+
+from tasks.mgr.mgr_test_case import MgrTestCase
+
+
+class TestModuleSelftest(MgrTestCase):
+ """
+ That modules with a self-test command can be loaded and execute it
+ without errors.
+
+ This is not a substitute for really testing the modules, but it
+ is quick and is designed to catch regressions that could occur
+ if data structures change in a way that breaks how the modules
+ touch them.
+ """
+ MGRS_REQUIRED = 1
+
+ def _selftest_plugin(self, module_name):
+ self._load_module(module_name)
+
+ # Execute the module's self-test routine
+ self.mgr_cluster.mon_manager.raw_cluster_cmd(module_name, "self-test")
+
+ def test_zabbix(self):
+ self._selftest_plugin("zabbix")
+
+ def test_prometheus(self):
+ self._selftest_plugin("prometheus")
+
+ def test_influx(self):
+ self._selftest_plugin("influx")
+
+ def test_selftest_run(self):
+ self._load_module("selftest")
+ self.mgr_cluster.mon_manager.raw_cluster_cmd("mgr", "self-test", "run")
+
+ def test_selftest_command_spam(self):
+ # Use the selftest module to stress the mgr daemon
+ self._load_module("selftest")
+
+ # Use the dashboard to test that the mgr is still able to do its job
+ self._assign_ports("dashboard", "server_port")
+ self._load_module("dashboard")
+
+ original_active = self.mgr_cluster.get_active_id()
+ original_standbys = self.mgr_cluster.get_standby_ids()
+
+ self.mgr_cluster.mon_manager.raw_cluster_cmd("mgr", "self-test",
+ "background", "start",
+ "command_spam")
+
+ dashboard_uri = self._get_uri("dashboard")
+
+ delay = 10
+ periods = 10
+ for i in range(0, periods):
+ t1 = time.time()
+ # Check that an HTTP module remains responsive
+ r = requests.get(dashboard_uri)
+ self.assertEqual(r.status_code, 200)
+
+ # Check that a native non-module command remains responsive
+ self.mgr_cluster.mon_manager.raw_cluster_cmd("osd", "df")
+
+ time.sleep(delay - (time.time() - t1))
+
+ self.mgr_cluster.mon_manager.raw_cluster_cmd("mgr", "self-test",
+ "background", "stop")
+
+ # Check that all mgr daemons are still running
+ self.assertEqual(original_active, self.mgr_cluster.get_active_id())
+ self.assertEqual(original_standbys, self.mgr_cluster.get_standby_ids())