aboutsummaryrefslogtreecommitdiffstats
path: root/patches/opnfv-fuel/0022-ipmi_adapter-simplify-retry-if-command-fails.patch
blob: c1617f047092793528490a43d47d35ce2bcf055a (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
From: Josep Puigdemont <josep.puigdemont@enea.com>
Date: Fri, 6 May 2016 12:09:58 +0200
Subject: [PATCH] ipmi_adapter: simplify, retry if command fails

The method node_get_state has been added to the IpmiAdapter class.

In addition, now the power on/off methods will try several times to
perform their IPMI command before giving up, instead of bailing out at
the first error.

After the power on/off command is completed, the method will wait until
the node is in the desired state.

FIXME: a command could potentially take several minutes if the defaults
are used; each IPMI command can take 1 minute, and there can be three
commands issued per operation, one of which may be retried 20 times with
the current defaults. Ideally we would use eventlet or something similar
to allow each command a limited time to execute:
    with eventlet.timeout.Timeout(seconds) as t:
        power_on/off_command

FIXME: There is a potential deadlock situation when issuing the command
and then checking the status, as someone could have intervened in
between the two commands.

Signed-off-by: Josep Puigdemont <josep.puigdemont@enea.com>
---
 deploy/dha_adapters/ipmi_adapter.py | 101 +++++++++++++++---------------------
 1 file changed, 42 insertions(+), 59 deletions(-)

diff --git a/deploy/dha_adapters/ipmi_adapter.py b/deploy/dha_adapters/ipmi_adapter.py
index 8fda4f9..283bd57 100644
--- a/deploy/dha_adapters/ipmi_adapter.py
+++ b/deploy/dha_adapters/ipmi_adapter.py
@@ -1,5 +1,6 @@
 ###############################################################################
 # Copyright (c) 2015 Ericsson AB and others.
+#           (c) 2016 Enea Software AB
 # szilard.cserey@ericsson.com
 # All rights reserved. This program and the accompanying materials
 # are made available under the terms of the Apache License, Version 2.0
@@ -20,8 +21,10 @@ from common import (
 
 class IpmiAdapter(HardwareAdapter):
 
-    def __init__(self, yaml_path):
+    def __init__(self, yaml_path, attempts=20, delay=3):
         super(IpmiAdapter, self).__init__(yaml_path)
+        self.attempts = attempts
+        self.delay = delay
 
     def get_access_info(self, node_id):
         ip = self.get_node_property(node_id, 'ipmiIp')
@@ -40,69 +43,46 @@ class IpmiAdapter(HardwareAdapter):
         mac_list.append(self.get_node_property(node_id, 'pxeMac').lower())
         return mac_list
 
+    def node_get_state(self, node_id):
+        state = exec_cmd('%s chassis power status' % self.ipmi_cmd(node_id),
+                         attempts=self.attempts, delay=self.delay,
+                         verbose=True)
+        return state
+
+    def __node_power_cmd__(self, node_id, cmd):
+        expected = 'Chassis Power is %s' % cmd
+        if self.node_get_state(node_id) == expected:
+            return
+
+        pow_cmd = '%s chassis power %s' % (self.ipmi_cmd(node_id), cmd)
+        exec_cmd(pow_cmd, attempts=self.attempts, delay=self.delay,
+                 verbose=True)
+
+        attempts = self.attempts
+        while attempts:
+            state = self.node_get_state(node_id)
+            attempts -= 1
+            if state == expected:
+                return
+            elif attempts != 0:
+                # reinforce our will, but allow the command to fail,
+                # we know our message got across once already...
+                exec_cmd(pow_cmd, check=False)
+
+        err('Could not set chassis %s for node %s' % (cmd, node_id))
+
     def node_power_on(self, node_id):
-        WAIT_LOOP = 200
-        SLEEP_TIME = 3
         log('Power ON Node %s' % node_id)
-        cmd_prefix = self.ipmi_cmd(node_id)
-        state = exec_cmd('%s chassis power status' % cmd_prefix)
-        if state == 'Chassis Power is off':
-            exec_cmd('%s chassis power on' % cmd_prefix)
-            done = False
-            for i in range(WAIT_LOOP):
-                state, _ = exec_cmd('%s chassis power status' % cmd_prefix,
-                                    False)
-                if state == 'Chassis Power is on':
-                    done = True
-                    break
-                else:
-                    time.sleep(SLEEP_TIME)
-            if not done:
-                err('Could Not Power ON Node %s' % node_id)
+        self.__node_power_cmd__(node_id, 'on')
 
     def node_power_off(self, node_id):
-        WAIT_LOOP = 200
-        SLEEP_TIME = 3
         log('Power OFF Node %s' % node_id)
-        cmd_prefix = self.ipmi_cmd(node_id)
-        state = exec_cmd('%s chassis power status' % cmd_prefix)
-        if state == 'Chassis Power is on':
-            done = False
-            exec_cmd('%s chassis power off' % cmd_prefix)
-            for i in range(WAIT_LOOP):
-                state, _ = exec_cmd('%s chassis power status' % cmd_prefix,
-                                    False)
-                if state == 'Chassis Power is off':
-                    done = True
-                    break
-                else:
-                    time.sleep(SLEEP_TIME)
-            if not done:
-                err('Could Not Power OFF Node %s' % node_id)
+        self.__node_power_cmd__(node_id, 'off')
 
     def node_reset(self, node_id):
-        WAIT_LOOP = 600
         log('RESET Node %s' % node_id)
-        cmd_prefix = self.ipmi_cmd(node_id)
-        state = exec_cmd('%s chassis power status' % cmd_prefix)
-        if state == 'Chassis Power is on':
-            was_shut_off = False
-            done = False
-            exec_cmd('%s chassis power reset' % cmd_prefix)
-            for i in range(WAIT_LOOP):
-                state, _ = exec_cmd('%s chassis power status' % cmd_prefix,
-                                    False)
-                if state == 'Chassis Power is off':
-                    was_shut_off = True
-                elif state == 'Chassis Power is on' and was_shut_off:
-                    done = True
-                    break
-                time.sleep(1)
-            if not done:
-                err('Could Not RESET Node %s' % node_id)
-        else:
-            err('Cannot RESET Node %s because it\'s not Active, state: %s'
-                % (node_id, state))
+        cmd = '%s chassis power reset' % self.ipmi_cmd(node_id)
+        exec_cmd(cmd, attempts=self.attempts, delay=self.delay, verbose=True)
 
     def node_set_boot_order(self, node_id, boot_order_list):
         log('Set boot order %s on Node %s' % (boot_order_list, node_id))
@@ -111,9 +91,12 @@ class IpmiAdapter(HardwareAdapter):
         for dev in boot_order_list:
             if dev == 'pxe':
                 exec_cmd('%s chassis bootdev pxe options=persistent'
-                         % cmd_prefix)
+                         % cmd_prefix, attempts=self.attempts, delay=self.delay,
+                         verbose=True)
             elif dev == 'iso':
-                exec_cmd('%s chassis bootdev cdrom' % cmd_prefix)
+                exec_cmd('%s chassis bootdev cdrom' % cmd_prefix,
+                         attempts=self.attempts, delay=self.delay, verbose=True)
             elif dev == 'disk':
                 exec_cmd('%s chassis bootdev disk options=persistent'
-                         % cmd_prefix)
+                         % cmd_prefix, attempts=self.attempts, delay=self.delay,
+                         verbose=True)