diff options
Diffstat (limited to 'deploy/cloud/deployment.py')
-rw-r--r-- | deploy/cloud/deployment.py | 232 |
1 files changed, 232 insertions, 0 deletions
diff --git a/deploy/cloud/deployment.py b/deploy/cloud/deployment.py new file mode 100644 index 000000000..ecccc241f --- /dev/null +++ b/deploy/cloud/deployment.py @@ -0,0 +1,232 @@ +############################################################################### +# Copyright (c) 2015 Ericsson AB and others. +# szilard.cserey@ericsson.com +# All rights reserved. This program and the accompanying materials +# are made available under the terms of the Apache License, Version 2.0 +# which accompanies this distribution, and is available at +# http://www.apache.org/licenses/LICENSE-2.0 +############################################################################### + +import time +import re +import json + +from common import ( + N, + E, + exec_cmd, + run_proc, + run_proc_wait_terminated, + run_proc_kill, + parse, + err, + log, + delete, +) + +SEARCH_TEXT = '(err)' +LOG_FILE = '/var/log/puppet.log' +GREP_LINES_OF_LEADING_CONTEXT = 100 +GREP_LINES_OF_TRAILING_CONTEXT = 100 +LIST_OF_CHAR_TO_BE_ESCAPED = ['[', ']', '"'] + + +class DeployNotStart(Exception): + """Unable to start deployment""" + + +class NodesGoOffline(Exception): + """Nodes goes offline during deployment""" + + +class Deployment(object): + + def __init__(self, dea, yaml_config_dir, env_id, node_id_roles_dict, + no_health_check, deploy_timeout): + self.dea = dea + self.yaml_config_dir = yaml_config_dir + self.env_id = env_id + self.node_id_roles_dict = node_id_roles_dict + self.no_health_check = no_health_check + self.deploy_timeout = deploy_timeout + self.pattern = re.compile( + '\d\d\d\d-\d\d-\d\d\s\d\d:\d\d:\d\d') + + def collect_error_logs(self): + for node_id, roles_blade in self.node_id_roles_dict.iteritems(): + log_list = [] + cmd = ('ssh -q node-%s grep \'"%s"\' %s' + % (node_id, SEARCH_TEXT, LOG_FILE)) + results, _ = exec_cmd(cmd, False) + for result in results.splitlines(): + log_msg = '' + + sub_cmd = '"%s" %s' % (result, LOG_FILE) + for c in LIST_OF_CHAR_TO_BE_ESCAPED: + sub_cmd = sub_cmd.replace(c, '\%s' % c) + grep_cmd = ('grep -B%s %s' + % (GREP_LINES_OF_LEADING_CONTEXT, sub_cmd)) + cmd = ('ssh -q node-%s "%s"' % (node_id, grep_cmd)) + + details, _ = exec_cmd(cmd, False) + details_list = details.splitlines() + + found_prev_log = False + for i in range(len(details_list) - 2, -1, -1): + if self.pattern.match(details_list[i]): + found_prev_log = True + break + if found_prev_log: + log_msg += '\n'.join(details_list[i:-1]) + '\n' + + grep_cmd = ('grep -A%s %s' + % (GREP_LINES_OF_TRAILING_CONTEXT, sub_cmd)) + cmd = ('ssh -q node-%s "%s"' % (node_id, grep_cmd)) + + details, _ = exec_cmd(cmd, False) + details_list = details.splitlines() + + found_next_log = False + for i in range(1, len(details_list)): + if self.pattern.match(details_list[i]): + found_next_log = True + break + if found_next_log: + log_msg += '\n'.join(details_list[:i]) + else: + log_msg += details + + if log_msg: + log_list.append(log_msg) + + if log_list: + role = ('controller' if 'controller' in roles_blade[0] + else 'compute host') + log('_' * 40 + 'Errors in node-%s %s' % (node_id, role) + + '_' * 40) + for log_msg in log_list: + print(log_msg + '\n') + + def run_deploy(self): + SLEEP_TIME = 60 + abort_after = 60 * int(self.deploy_timeout) + start = time.time() + + log('Starting deployment of environment %s' % self.env_id) + deploy_id = None + ready = False + timeout = False + + attempts = 0 + while attempts < 3: + try: + if time.time() > start + abort_after: + timeout = True + break + if not deploy_id: + deploy_id = self._start_deploy_task() + sts, prg, msg = self._deployment_status(deploy_id) + if sts == 'error': + log('Error during deployment: {}'.format(msg)) + break + if sts == 'running': + log('Environmnent deploymnet progress: {}%'.format(prg)) + elif sts == 'ready': + ready = True + break + time.sleep(SLEEP_TIME) + except (DeployNotStart, NodesGoOffline) as e: + log(e) + attempts += 1 + deploy_id = None + time.sleep(SLEEP_TIME * attempts) + + if timeout: + err('Deployment timed out, environment %s is not operational, ' + 'snapshot will not be performed' + % self.env_id) + if ready: + log('Environment %s successfully deployed' + % self.env_id) + else: + self.collect_error_logs() + err('Deployment failed, environment %s is not operational' + % self.env_id, self.collect_logs) + + def _start_deploy_task(self): + out, _ = exec_cmd('fuel2 env deploy {}'.format(self.env_id), False) + id = self._deployment_task_id(out) + return id + + def _deployment_task_id(self, response): + response = str(response) + if response.startswith('Deployment task with id'): + for s in response.split(): + if s.isdigit(): + return int(s) + raise DeployNotStart('Unable to start deployment: {}'.format(response)) + + def _deployment_status(self, id): + task = self._task_fields(id) + if task['status'] == 'error': + if task['message'].endswith( + 'offline. Remove them from environment and try again.'): + raise NodesGoOffline(task['message']) + return task['status'], task['progress'], task['message'] + + def _task_fields(self, id): + try: + out, _ = exec_cmd('fuel2 task show {} -f json'.format(id), False) + task_info = json.loads(out) + properties = {} + # for 9.0 this can be list of dicts or dict + # see https://bugs.launchpad.net/fuel/+bug/1625518 + if isinstance(task_info, list): + for d in task_info: + properties.update({d['Field']: d['Value']}) + else: + return task_info + return properties + except ValueError as e: + err('Unable to fetch task info: {}'.format(e)) + + def collect_logs(self): + log('Cleaning out any previous deployment logs') + exec_cmd('rm -f /var/log/remote/fuel-snapshot-*', False) + exec_cmd('rm -f /root/deploy-*', False) + log('Generating Fuel deploy snap-shot') + if exec_cmd('fuel snapshot < /dev/null &> snapshot.log', False)[1] <> 0: + log('Could not create a Fuel snapshot') + else: + exec_cmd('mv /root/fuel-snapshot* /var/log/remote/', False) + + log('Collecting all Fuel Snapshot & deploy log files') + r, _ = exec_cmd('tar -czhf /root/deploy-%s.log.tar.gz /var/log/remote' % time.strftime("%Y%m%d-%H%M%S"), False) + log(r) + + def verify_node_status(self): + node_list = parse(exec_cmd('fuel --env %s node' % self.env_id)) + failed_nodes = [] + for node in node_list: + if node[N['status']] != 'ready': + failed_nodes.append((node[N['id']], node[N['status']])) + + if failed_nodes: + summary = '' + for node, status in failed_nodes: + summary += '[node %s, status %s]\n' % (node, status) + err('Deployment failed: %s' % summary, self.collect_logs) + + def health_check(self): + log('Now running sanity and smoke health checks') + r = exec_cmd('fuel health --env %s --check sanity,smoke --force' % self.env_id) + log(r) + if 'failure' in r: + err('Healthcheck failed!', self.collect_logs) + + def deploy(self): + self.run_deploy() + self.verify_node_status() + if not self.no_health_check: + self.health_check() + self.collect_logs() |