src/ceph/qa/tasks/systemd.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142

"""
Systemd test
"""
import contextlib
import logging
import re
import time

from cStringIO import StringIO
from teuthology.orchestra import run
from teuthology.misc import reconnect, get_first_mon, wait_until_healthy

log = logging.getLogger(__name__)


@contextlib.contextmanager
def task(ctx, config):
    """
      - tasks:
          ceph-deploy:
          systemd:

    Test ceph systemd services can start, stop and restart and
    check for any failed services and report back errors
    """
    for remote, roles in ctx.cluster.remotes.iteritems():
        remote.run(args=['sudo', 'ps', '-eaf', run.Raw('|'),
                         'grep', 'ceph'])
        r = remote.run(args=['sudo', 'systemctl', 'list-units', run.Raw('|'),
                             'grep', 'ceph'], stdout=StringIO(),
                       check_status=False)
        log.info(r.stdout.getvalue())
        if r.stdout.getvalue().find('failed'):
            log.info("Ceph services in failed state")

        # test overall service stop and start using ceph.target
        # ceph.target tests are meant for ceph systemd tests
        # and not actual process testing using 'ps'
        log.info("Stopping all Ceph services")
        remote.run(args=['sudo', 'systemctl', 'stop', 'ceph.target'])
        r = remote.run(args=['sudo', 'systemctl', 'status', 'ceph.target'],
                       stdout=StringIO(), check_status=False)
        log.info(r.stdout.getvalue())
        log.info("Checking process status")
        r = remote.run(args=['sudo', 'ps', '-eaf', run.Raw('|'),
                             'grep', 'ceph'], stdout=StringIO())
        if r.stdout.getvalue().find('Active: inactive'):
            log.info("Sucessfully stopped all ceph services")
        else:
            log.info("Failed to stop ceph services")

        log.info("Starting all Ceph services")
        remote.run(args=['sudo', 'systemctl', 'start', 'ceph.target'])
        r = remote.run(args=['sudo', 'systemctl', 'status', 'ceph.target'],
                       stdout=StringIO())
        log.info(r.stdout.getvalue())
        if r.stdout.getvalue().find('Active: active'):
            log.info("Sucessfully started all Ceph services")
        else:
            log.info("info", "Failed to start Ceph services")
        r = remote.run(args=['sudo', 'ps', '-eaf', run.Raw('|'),
                             'grep', 'ceph'], stdout=StringIO())
        log.info(r.stdout.getvalue())
        time.sleep(4)

        # test individual services start stop
        name = remote.shortname
        mon_name = 'ceph-mon@' + name + '.service'
        mds_name = 'ceph-mds@' + name + '.service'
        mgr_name = 'ceph-mgr@' + name + '.service'
        mon_role_name = 'mon.' + name
        mds_role_name = 'mds.' + name
        mgr_role_name = 'mgr.' + name
        m_osd = re.search('--id (\d+) --setuser ceph', r.stdout.getvalue())
        if m_osd:
            osd_service = 'ceph-osd@{m}.service'.format(m=m_osd.group(1))
            remote.run(args=['sudo', 'systemctl', 'status',
                             osd_service])
            remote.run(args=['sudo', 'systemctl', 'stop',
                             osd_service])
            time.sleep(4)  # immediate check will result in deactivating state
            r = remote.run(args=['sudo', 'systemctl', 'status', osd_service],
                           stdout=StringIO(), check_status=False)
            log.info(r.stdout.getvalue())
            if r.stdout.getvalue().find('Active: inactive'):
                log.info("Sucessfully stopped single osd ceph service")
            else:
                log.info("Failed to stop ceph osd services")
            remote.run(args=['sudo', 'systemctl', 'start',
                             osd_service])
            time.sleep(4)
        if mon_role_name in roles:
            remote.run(args=['sudo', 'systemctl', 'status', mon_name])
            remote.run(args=['sudo', 'systemctl', 'stop', mon_name])
            time.sleep(4)  # immediate check will result in deactivating state
            r = remote.run(args=['sudo', 'systemctl', 'status', mon_name],
                           stdout=StringIO(), check_status=False)
            if r.stdout.getvalue().find('Active: inactive'):
                log.info("Sucessfully stopped single mon ceph service")
            else:
                log.info("Failed to stop ceph mon service")
            remote.run(args=['sudo', 'systemctl', 'start', mon_name])
            time.sleep(4)
        if mgr_role_name in roles:
            remote.run(args=['sudo', 'systemctl', 'status', mgr_name])
            remote.run(args=['sudo', 'systemctl', 'stop', mgr_name])
            time.sleep(4)  # immediate check will result in deactivating state
            r = remote.run(args=['sudo', 'systemctl', 'status', mgr_name],
                           stdout=StringIO(), check_status=False)
            if r.stdout.getvalue().find('Active: inactive'):
                log.info("Sucessfully stopped single ceph mgr service")
            else:
                log.info("Failed to stop ceph mgr service")
            remote.run(args=['sudo', 'systemctl', 'start', mgr_name])
            time.sleep(4)
        if mds_role_name in roles:
            remote.run(args=['sudo', 'systemctl', 'status', mds_name])
            remote.run(args=['sudo', 'systemctl', 'stop', mds_name])
            time.sleep(4)  # immediate check will result in deactivating state
            r = remote.run(args=['sudo', 'systemctl', 'status', mds_name],
                           stdout=StringIO(), check_status=False)
            if r.stdout.getvalue().find('Active: inactive'):
                log.info("Sucessfully stopped single ceph mds service")
            else:
                log.info("Failed to stop ceph mds service")
            remote.run(args=['sudo', 'systemctl', 'start', mds_name])
            time.sleep(4)

    # reboot all nodes and verify the systemd units restart
    # workunit that runs would fail if any of the systemd unit doesnt start
    ctx.cluster.run(args='sudo reboot', wait=False, check_status=False)
    # avoid immediate reconnect
    time.sleep(120)
    reconnect(ctx, 480)  # reconnect all nodes
    # for debug info
    ctx.cluster.run(args=['sudo', 'ps', '-eaf', run.Raw('|'),
                          'grep', 'ceph'])
    # wait for HEALTH_OK
    mon = get_first_mon(ctx, config)
    (mon_remote,) = ctx.cluster.only(mon).remotes.iterkeys()
    wait_until_healthy(ctx, mon_remote, use_sudo=True)
    yield