From: Josep Puigdemont Date: Fri, 6 May 2016 12:09:58 +0200 Subject: [PATCH] ipmi_adapter: simplify, retry if command fails The method node_get_state has been added to the IpmiAdapter class. In addition, now the power on/off methods will try several times to perform their IPMI command before giving up, instead of bailing out at the first error. After the power on/off command is completed, the method will wait until the node is in the desired state. FIXME: a command could potentially take several minutes if the defaults are used; each IPMI command can take 1 minute, and there can be three commands issued per operation, one of them may be retried 20 times with the current defaults. Ideally we would use eventlet or something similar to allow each command a limited time to execute: with eventlet.timeout.Timeout(seconds) as t: power_on/off_command FIXME: There is a potential dead-lock situation by issuing the command and then checking the status, as someone could have intervened in between the two commands. Signed-off-by: Josep Puigdemont --- deploy/dha_adapters/ipmi_adapter.py | 101 +++++++++++++++--------------------- 1 file changed, 42 insertions(+), 59 deletions(-) diff --git a/deploy/dha_adapters/ipmi_adapter.py b/deploy/dha_adapters/ipmi_adapter.py index 8fda4f9..283bd57 100644 --- a/deploy/dha_adapters/ipmi_adapter.py +++ b/deploy/dha_adapters/ipmi_adapter.py @@ -1,5 +1,6 @@ ############################################################################### # Copyright (c) 2015 Ericsson AB and others. +# (c) 2016 Enea Software AB # szilard.cserey@ericsson.com # All rights reserved. 
This program and the accompanying materials # are made available under the terms of the Apache License, Version 2.0 @@ -20,8 +21,10 @@ from common import ( class IpmiAdapter(HardwareAdapter): - def __init__(self, yaml_path): + def __init__(self, yaml_path, attempts=20, delay=3): super(IpmiAdapter, self).__init__(yaml_path) + self.attempts = attempts + self.delay = delay def get_access_info(self, node_id): ip = self.get_node_property(node_id, 'ipmiIp') @@ -40,69 +43,46 @@ class IpmiAdapter(HardwareAdapter): mac_list.append(self.get_node_property(node_id, 'pxeMac').lower()) return mac_list + def node_get_state(self, node_id): + state = exec_cmd('%s chassis power status' % self.ipmi_cmd(node_id), + attempts=self.attempts, delay=self.delay, + verbose=True) + return state + + def __node_power_cmd__(self, node_id, cmd): + expected = 'Chassis Power is %s' % cmd + if self.node_get_state(node_id) == expected: + return + + pow_cmd = '%s chassis power %s' % (self.ipmi_cmd(node_id), cmd) + exec_cmd(pow_cmd, attempts=self.attempts, delay=self.delay, + verbose=True) + + attempts = self.attempts + while attempts: + state = self.node_get_state(node_id) + attempts -= 1 + if state == expected: + return + elif attempts != 0: + # reinforce our will, but allow the command to fail, + # we know our message got across once already... 
+ exec_cmd(pow_cmd, check=False) + + err('Could not set chassis %s for node %s' % (cmd, node_id)) + def node_power_on(self, node_id): - WAIT_LOOP = 200 - SLEEP_TIME = 3 log('Power ON Node %s' % node_id) - cmd_prefix = self.ipmi_cmd(node_id) - state = exec_cmd('%s chassis power status' % cmd_prefix) - if state == 'Chassis Power is off': - exec_cmd('%s chassis power on' % cmd_prefix) - done = False - for i in range(WAIT_LOOP): - state, _ = exec_cmd('%s chassis power status' % cmd_prefix, - False) - if state == 'Chassis Power is on': - done = True - break - else: - time.sleep(SLEEP_TIME) - if not done: - err('Could Not Power ON Node %s' % node_id) + self.__node_power_cmd__(node_id, 'on') def node_power_off(self, node_id): - WAIT_LOOP = 200 - SLEEP_TIME = 3 log('Power OFF Node %s' % node_id) - cmd_prefix = self.ipmi_cmd(node_id) - state = exec_cmd('%s chassis power status' % cmd_prefix) - if state == 'Chassis Power is on': - done = False - exec_cmd('%s chassis power off' % cmd_prefix) - for i in range(WAIT_LOOP): - state, _ = exec_cmd('%s chassis power status' % cmd_prefix, - False) - if state == 'Chassis Power is off': - done = True - break - else: - time.sleep(SLEEP_TIME) - if not done: - err('Could Not Power OFF Node %s' % node_id) + self.__node_power_cmd__(node_id, 'off') def node_reset(self, node_id): - WAIT_LOOP = 600 log('RESET Node %s' % node_id) - cmd_prefix = self.ipmi_cmd(node_id) - state = exec_cmd('%s chassis power status' % cmd_prefix) - if state == 'Chassis Power is on': - was_shut_off = False - done = False - exec_cmd('%s chassis power reset' % cmd_prefix) - for i in range(WAIT_LOOP): - state, _ = exec_cmd('%s chassis power status' % cmd_prefix, - False) - if state == 'Chassis Power is off': - was_shut_off = True - elif state == 'Chassis Power is on' and was_shut_off: - done = True - break - time.sleep(1) - if not done: - err('Could Not RESET Node %s' % node_id) - else: - err('Cannot RESET Node %s because it\'s not Active, state: %s' - % (node_id, 
state)) + cmd = '%s chassis power reset' % self.ipmi_cmd(node_id) + exec_cmd(cmd, attempts=self.attempts, delay=self.delay, verbose=True) def node_set_boot_order(self, node_id, boot_order_list): log('Set boot order %s on Node %s' % (boot_order_list, node_id)) @@ -111,9 +91,12 @@ class IpmiAdapter(HardwareAdapter): for dev in boot_order_list: if dev == 'pxe': exec_cmd('%s chassis bootdev pxe options=persistent' - % cmd_prefix) + % cmd_prefix, attempts=self.attempts, delay=self.delay, + verbose=True) elif dev == 'iso': - exec_cmd('%s chassis bootdev cdrom' % cmd_prefix) + exec_cmd('%s chassis bootdev cdrom' % cmd_prefix, + attempts=self.attempts, delay=self.delay, verbose=True) elif dev == 'disk': exec_cmd('%s chassis bootdev disk options=persistent' - % cmd_prefix) + % cmd_prefix, attempts=self.attempts, delay=self.delay, + verbose=True)