From: Josep Puigdemont Date: Fri, 6 May 2016 12:09:58 +0200 Subject: [PATCH] ipmi_adapter: simplify, retry if command fails The method node_get_state has been added to the IpmiAdapter class. In addition, now the power on/off methods will try several times to perform their IPMI command before giving up, instead of bailing out at the first error. After the power on/off command is completed, the method will wait until the node is in the desired state. NOTE: a command could potentially take several minutes if the defaults are used; each IPMI command can take up to 1 minute, and there can be 3 commands issued per operation, one of them may be retried 20 times with the current defaults. Ideally we would use eventlet or something similar to allow each command a limited time to execute, instead: with eventlet.timeout.Timeout(seconds) as t: power_on/off_command Signed-off-by: Josep Puigdemont --- deploy/dha_adapters/ipmi_adapter.py | 101 +++++++++++++++--------------------- 1 file changed, 42 insertions(+), 59 deletions(-) diff --git a/deploy/dha_adapters/ipmi_adapter.py b/deploy/dha_adapters/ipmi_adapter.py index 8fda4f9..283bd57 100644 --- a/deploy/dha_adapters/ipmi_adapter.py +++ b/deploy/dha_adapters/ipmi_adapter.py @@ -1,5 +1,6 @@ ############################################################################### # Copyright (c) 2015 Ericsson AB and others. +# (c) 2016 Enea Software AB # szilard.cserey@ericsson.com # All rights reserved. 
This program and the accompanying materials # are made available under the terms of the Apache License, Version 2.0 @@ -20,8 +21,10 @@ from common import ( class IpmiAdapter(HardwareAdapter): - def __init__(self, yaml_path): + def __init__(self, yaml_path, attempts=20, delay=3): super(IpmiAdapter, self).__init__(yaml_path) + self.attempts = attempts + self.delay = delay def get_access_info(self, node_id): ip = self.get_node_property(node_id, 'ipmiIp') @@ -40,69 +43,46 @@ class IpmiAdapter(HardwareAdapter): mac_list.append(self.get_node_property(node_id, 'pxeMac').lower()) return mac_list + def node_get_state(self, node_id): + state = exec_cmd('%s chassis power status' % self.ipmi_cmd(node_id), + attempts=self.attempts, delay=self.delay, + verbose=True) + return state + + def __node_power_cmd__(self, node_id, cmd): + expected = 'Chassis Power is %s' % cmd + if self.node_get_state(node_id) == expected: + return + + pow_cmd = '%s chassis power %s' % (self.ipmi_cmd(node_id), cmd) + exec_cmd(pow_cmd, attempts=self.attempts, delay=self.delay, + verbose=True) + + attempts = self.attempts + while attempts: + state = self.node_get_state(node_id) + attempts -= 1 + if state == expected: + return + elif attempts != 0: + # reinforce our will, but allow the command to fail, + # we know our message got across once already... 
+ exec_cmd(pow_cmd, check=False) + + err('Could not set chassis %s for node %s' % (cmd, node_id)) + def node_power_on(self, node_id): - WAIT_LOOP = 200 - SLEEP_TIME = 3 log('Power ON Node %s' % node_id) - cmd_prefix = self.ipmi_cmd(node_id) - state = exec_cmd('%s chassis power status' % cmd_prefix) - if state == 'Chassis Power is off': - exec_cmd('%s chassis power on' % cmd_prefix) - done = False - for i in range(WAIT_LOOP): - state, _ = exec_cmd('%s chassis power status' % cmd_prefix, - False) - if state == 'Chassis Power is on': - done = True - break - else: - time.sleep(SLEEP_TIME) - if not done: - err('Could Not Power ON Node %s' % node_id) + self.__node_power_cmd__(node_id, 'on') def node_power_off(self, node_id): - WAIT_LOOP = 200 - SLEEP_TIME = 3 log('Power OFF Node %s' % node_id) - cmd_prefix = self.ipmi_cmd(node_id) - state = exec_cmd('%s chassis power status' % cmd_prefix) - if state == 'Chassis Power is on': - done = False - exec_cmd('%s chassis power off' % cmd_prefix) - for i in range(WAIT_LOOP): - state, _ = exec_cmd('%s chassis power status' % cmd_prefix, - False) - if state == 'Chassis Power is off': - done = True - break - else: - time.sleep(SLEEP_TIME) - if not done: - err('Could Not Power OFF Node %s' % node_id) + self.__node_power_cmd__(node_id, 'off') def node_reset(self, node_id): - WAIT_LOOP = 600 log('RESET Node %s' % node_id) - cmd_prefix = self.ipmi_cmd(node_id) - state = exec_cmd('%s chassis power status' % cmd_prefix) - if state == 'Chassis Power is on': - was_shut_off = False - done = False - exec_cmd('%s chassis power reset' % cmd_prefix) - for i in range(WAIT_LOOP): - state, _ = exec_cmd('%s chassis power status' % cmd_prefix, - False) - if state == 'Chassis Power is off': - was_shut_off = True - elif state == 'Chassis Power is on' and was_shut_off: - done = True - break - time.sleep(1) - if not done: - err('Could Not RESET Node %s' % node_id) - else: - err('Cannot RESET Node %s because it\'s not Active, state: %s' - % (node_id, 
state)) + cmd = '%s chassis power reset' % self.ipmi_cmd(node_id) + exec_cmd(cmd, attempts=self.attempts, delay=self.delay, verbose=True) def node_set_boot_order(self, node_id, boot_order_list): log('Set boot order %s on Node %s' % (boot_order_list, node_id)) @@ -111,9 +91,12 @@ class IpmiAdapter(HardwareAdapter): for dev in boot_order_list: if dev == 'pxe': exec_cmd('%s chassis bootdev pxe options=persistent' - % cmd_prefix) + % cmd_prefix, attempts=self.attempts, delay=self.delay, + verbose=True) elif dev == 'iso': - exec_cmd('%s chassis bootdev cdrom' % cmd_prefix) + exec_cmd('%s chassis bootdev cdrom' % cmd_prefix, + attempts=self.attempts, delay=self.delay, verbose=True) elif dev == 'disk': exec_cmd('%s chassis bootdev disk options=persistent' - % cmd_prefix) + % cmd_prefix, attempts=self.attempts, delay=self.delay, + verbose=True)