From 874ca3f24d349ee844bb7339dda34eddb2e91c73 Mon Sep 17 00:00:00 2001 From: Josep Puigdemont Date: Fri, 6 May 2016 03:28:26 +0200 Subject: common.py: allow specifying number of attempts in exec_cmd Some commands executed by exec_cmd may fail because of a temporary cause, and it may be desirable to retry the same command several times until it succeeds. One example of this are the ipmitool commands, which may fail temorarily on some targets if they get too many requests simultaneously. In this patch three new optional parameters are introduced to the function signature, which do not break backward compatibility: attempts: which indicates how many times the command should be run if it returns a non-zero value*, and defaults to 1 (as today). delay: which indicates the delay in seconds between attempts, and defaults to 5 seconds. verbose: It will print the remaining attempts left for the current command if set to True. * It may be desirable to add yet another parameter to indicate what return value should be considered an error, but non-zero for now seems a reasonable default. Signed-off-by: Josep Puigdemont --- deploy/common.py | 24 +++++++++++++++++------- 1 file changed, 17 insertions(+), 7 deletions(-) diff --git a/deploy/common.py b/deploy/common.py index 41b4e274e..3cd3e0e6e 100644 --- a/deploy/common.py +++ b/deploy/common.py @@ -16,6 +16,7 @@ import argparse import shutil import stat import errno +import time N = {'id': 0, 'status': 1, 'name': 2, 'cluster': 3, 'ip': 4, 'mac': 5, 'roles': 6, 'pending_roles': 7, 'online': 8, 'group_id': 9} @@ -37,13 +38,22 @@ out_handler.setFormatter(formatter) LOG.addHandler(out_handler) os.chmod(LOGFILE, stat.S_IRWXU | stat.S_IRWXG | stat.S_IRWXO) -def exec_cmd(cmd, check=True): - process = subprocess.Popen(cmd, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - shell=True) - (response, stderr) = process.communicate() - return_code = process.returncode +def exec_cmd(cmd, check=True, attempts=1, delay=5, verbose=False): + # a negative value means forever + while attempts != 0: + attempts = attempts - 1 + process = subprocess.Popen(cmd, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + shell=True) + (response, stderr) = process.communicate() + return_code = process.returncode + if return_code == 0 or attempts == 0: + break + time.sleep(delay) + if verbose: + log('%d attempts left: %s' % (attempts, cmd)) + response = response.strip() if check: if return_code > 0: -- cgit From 2ea2690762b4a1e547d2ca71674125c7e101d7fd Mon Sep 17 00:00:00 2001 From: Josep Puigdemont Date: Fri, 6 May 2016 12:09:58 +0200 Subject: ipmi_adapter: simplify, retry if command fails The method get_node_state has been added to the the IpmiAdapter class. In addition, now the power on/off methods will try several times to perform their IPMI command before giving up, instead of bailing out at the first error. After the power on/off command is completed, the method will wait until the node is in the desired state. NOTE: a command could potentially take several minutes if the defaults are used; each IPMI command can take up to 1 minute, and there can be 3 commands issued per operation, one of them may be retried 20 times with the current defaults. Ideally we would use eventlet or something similar to allow each command a limited time to execute, instead: with eventlet.timeout.Timeout(seconds) as t: power_on/off_command Signed-off-by: Josep Puigdemont --- deploy/dha_adapters/ipmi_adapter.py | 101 +++++++++++++++--------------------- 1 file changed, 42 insertions(+), 59 deletions(-) diff --git a/deploy/dha_adapters/ipmi_adapter.py b/deploy/dha_adapters/ipmi_adapter.py index 4bd6bd378..6ce4012f4 100644 --- a/deploy/dha_adapters/ipmi_adapter.py +++ b/deploy/dha_adapters/ipmi_adapter.py @@ -1,5 +1,6 @@ ############################################################################### # Copyright (c) 2015 Ericsson AB and others. +# (c) 2016 Enea Software AB # szilard.cserey@ericsson.com # All rights reserved. This program and the accompanying materials # are made available under the terms of the Apache License, Version 2.0 @@ -20,8 +21,10 @@ from common import ( class IpmiAdapter(HardwareAdapter): - def __init__(self, yaml_path): + def __init__(self, yaml_path, attempts=20, delay=3): super(IpmiAdapter, self).__init__(yaml_path) + self.attempts = attempts + self.delay = delay def get_access_info(self, node_id): ip = self.get_node_property(node_id, 'ipmiIp') @@ -43,69 +46,46 @@ class IpmiAdapter(HardwareAdapter): mac_list.append(self.get_node_property(node_id, 'pxeMac').lower()) return mac_list + def node_get_state(self, node_id): + state = exec_cmd('%s chassis power status' % self.ipmi_cmd(node_id), + attempts=self.attempts, delay=self.delay, + verbose=True) + return state + + def _node_power_cmd(self, node_id, cmd): + expected = 'Chassis Power is %s' % cmd + if self.node_get_state(node_id) == expected: + return + + pow_cmd = '%s chassis power %s' % (self.ipmi_cmd(node_id), cmd) + exec_cmd(pow_cmd, attempts=self.attempts, delay=self.delay, + verbose=True) + + attempts = self.attempts + while attempts: + state = self.node_get_state(node_id) + attempts -= 1 + if state == expected: + return + elif attempts != 0: + # reinforce our will, but allow the command to fail, + # we know our message got across once already... + exec_cmd(pow_cmd, check=False) + + err('Could not set chassis %s for node %s' % (cmd, node_id)) + def node_power_on(self, node_id): - WAIT_LOOP = 200 - SLEEP_TIME = 3 log('Power ON Node %s' % node_id) - cmd_prefix = self.ipmi_cmd(node_id) - state = exec_cmd('%s chassis power status' % cmd_prefix) - if state == 'Chassis Power is off': - exec_cmd('%s chassis power on' % cmd_prefix) - done = False - for i in range(WAIT_LOOP): - state, _ = exec_cmd('%s chassis power status' % cmd_prefix, - False) - if state == 'Chassis Power is on': - done = True - break - else: - time.sleep(SLEEP_TIME) - if not done: - err('Could Not Power ON Node %s' % node_id) + self._node_power_cmd(node_id, 'on') def node_power_off(self, node_id): - WAIT_LOOP = 200 - SLEEP_TIME = 3 log('Power OFF Node %s' % node_id) - cmd_prefix = self.ipmi_cmd(node_id) - state = exec_cmd('%s chassis power status' % cmd_prefix) - if state == 'Chassis Power is on': - done = False - exec_cmd('%s chassis power off' % cmd_prefix) - for i in range(WAIT_LOOP): - state, _ = exec_cmd('%s chassis power status' % cmd_prefix, - False) - if state == 'Chassis Power is off': - done = True - break - else: - time.sleep(SLEEP_TIME) - if not done: - err('Could Not Power OFF Node %s' % node_id) + self._node_power_cmd(node_id, 'off') def node_reset(self, node_id): - WAIT_LOOP = 600 log('RESET Node %s' % node_id) - cmd_prefix = self.ipmi_cmd(node_id) - state = exec_cmd('%s chassis power status' % cmd_prefix) - if state == 'Chassis Power is on': - was_shut_off = False - done = False - exec_cmd('%s chassis power reset' % cmd_prefix) - for i in range(WAIT_LOOP): - state, _ = exec_cmd('%s chassis power status' % cmd_prefix, - False) - if state == 'Chassis Power is off': - was_shut_off = True - elif state == 'Chassis Power is on' and was_shut_off: - done = True - break - time.sleep(1) - if not done: - err('Could Not RESET Node %s' % node_id) - else: - err('Cannot RESET Node %s because it\'s not Active, state: %s' - % (node_id, state)) + cmd = '%s chassis power reset' % self.ipmi_cmd(node_id) + exec_cmd(cmd, attempts=self.attempts, delay=self.delay, verbose=True) def node_set_boot_order(self, node_id, boot_order_list): log('Set boot order %s on Node %s' % (boot_order_list, node_id)) @@ -114,9 +94,12 @@ class IpmiAdapter(HardwareAdapter): for dev in boot_order_list: if dev == 'pxe': exec_cmd('%s chassis bootdev pxe options=persistent' - % cmd_prefix) + % cmd_prefix, attempts=self.attempts, delay=self.delay, + verbose=True) elif dev == 'iso': - exec_cmd('%s chassis bootdev cdrom' % cmd_prefix) + exec_cmd('%s chassis bootdev cdrom' % cmd_prefix, + attempts=self.attempts, delay=self.delay, verbose=True) elif dev == 'disk': exec_cmd('%s chassis bootdev disk options=persistent' - % cmd_prefix) + % cmd_prefix, attempts=self.attempts, delay=self.delay, + verbose=True) -- cgit