diff options
Diffstat (limited to 'doctor_tests')
27 files changed, 1395 insertions, 446 deletions
diff --git a/doctor_tests/admin_tool/__init__.py b/doctor_tests/admin_tool/__init__.py index e8b12817..3417a334 100644 --- a/doctor_tests/admin_tool/__init__.py +++ b/doctor_tests/admin_tool/__init__.py @@ -8,16 +8,16 @@ ############################################################################## from oslo_config import cfg from oslo_utils import importutils - +import os OPTS = [ cfg.StrOpt('type', - default='sample', - choices=['sample'], + default=os.environ.get('ADMIN_TOOL_TYPE', 'sample'), + choices=['sample', 'fenix'], help='the component of doctor admin_tool', required=True), cfg.StrOpt('ip', - default='127.0.0.1', + default='0.0.0.0', help='the ip of admin_tool', required=True), cfg.IntOpt('port', diff --git a/doctor_tests/admin_tool/fenix/Dockerfile b/doctor_tests/admin_tool/fenix/Dockerfile new file mode 100644 index 00000000..202380eb --- /dev/null +++ b/doctor_tests/admin_tool/fenix/Dockerfile @@ -0,0 +1,34 @@ +FROM gliderlabs/alpine:3.6 + +ARG BRANCH=master +ARG OPENSTACK=master + +EXPOSE 12347 + +RUN echo "Building Fenix container against OpenStack $OPENSTACK" && \ + echo "Building Fenix with $BRANCH" && \ + mkdir /etc/fenix && \ + mkdir -p /var/tmp/fenix +WORKDIR /var/tmp/fenix +COPY fenix*.conf /etc/fenix/ + +RUN apk --no-cache add ca-certificates && \ + apk --no-cache add --update python3 sshpass py-pip git curl && \ + apk --no-cache add --virtual .build-deps --update \ + python3-dev build-base linux-headers libffi-dev \ + openssl-dev libjpeg-turbo-dev && \ + curl https://opendev.org/openstack/requirements/raw/branch/$OPENSTACK/upper-constraints.txt > upper-constraints.txt && \ + if [ ! -e /usr/bin/pip ]; then ln -s pip3 /usr/bin/pip ; fi && \ + if [[ ! -e /usr/bin/python ]]; then ln -sf /usr/bin/python3 /usr/bin/python; fi && \ + pip3 install --upgrade pip && \ + pip3 install alembic aodhclient decorator flask Flask-RESTful eventlet jsonschema \ + keystoneauth1 keystonemiddleware python-novaclient oslo.config pecan \ + oslo.db oslo.log oslo.messaging oslo.serialization oslo.service oslo_policy \ + oslotest oslo.utils pbr pymysql six sqlalchemy -cupper-constraints.txt && \ + git clone https://opendev.org/x/fenix -b $BRANCH /fenix && \ + rm -fr /var/tmp/fenix +COPY run /fenix +COPY keystonercv3 /fenix +WORKDIR /fenix +RUN python3 setup.py install +CMD ./run diff --git a/doctor_tests/admin_tool/fenix/run b/doctor_tests/admin_tool/fenix/run new file mode 100755 index 00000000..50ae68e7 --- /dev/null +++ b/doctor_tests/admin_tool/fenix/run @@ -0,0 +1,32 @@ +#!/bin/sh +. keystonercv3 + +# Start the first process +nohup python3 /fenix/fenix/cmd/engine.py > /var/log/fenix-engine.log& +status=$? +if [ $status -ne 0 ]; then + echo "Failed to start engine.py: $status" + exit $status +fi + +# Start the second process +nohup python3 /fenix/fenix/cmd/api.py > /var/log/fenix-api.log& +status=$? +if [ $status -ne 0 ]; then + echo "Failed to start api.py: $status" + exit $status +fi + +echo "started Fenix: engine and api" +while sleep 60; do + ps aux |grep "cmd/engine.py" |grep -q -v grep + PROCESS_1_STATUS=$? + ps aux |grep "cmd/api.py" |grep -q -v grep + PROCESS_2_STATUS=$? + # If the greps above find anything, they exit with 0 status + # If they are not both 0, then something is wrong + if [ $PROCESS_1_STATUS -ne 0 -o $PROCESS_2_STATUS -ne 0 ]; then + echo "One of the processes has already exited." + exit 1 + fi +done diff --git a/doctor_tests/admin_tool/sample.py b/doctor_tests/admin_tool/sample.py index 892a4c83..a71f43a1 100644 --- a/doctor_tests/admin_tool/sample.py +++ b/doctor_tests/admin_tool/sample.py @@ -59,7 +59,7 @@ class AdminMain(Thread): self.parent = parent self.log = log self.conf = conf - self.url = 'http://0.0.0.0:%s' % conf.admin_tool.port + self.url = 'http://%s:%s' % (conf.admin_tool.ip, conf.admin_tool.port) self.projects_state = dict() # current state for each project self.proj_server_actions = dict() # actions for each project server self.projects_servers = dict() # servers processed in current state @@ -86,6 +86,7 @@ class AdminMain(Thread): driver='messaging', topics=['notifications']) self.notif_admin = self.notif_admin.prepare(publisher_id='admin_tool') + self.stopped = False self.log.info('Admin tool session %s initialized' % self.session_id) def cleanup(self): @@ -116,14 +117,15 @@ class AdminMain(Thread): if self._projects_not_in_wanted_states(wanted_states): self.log.error('Admin tool session %s: projects in invalid states ' '%s' % (self.session_id, self.projects_state)) - raise Exception('Admin tool session %s: not all projects in states' - ' %s' % (self.session_id, wanted_states)) + return False else: self.log.info('all projects replied') + return True def _project_notify(self, project_id, instance_ids, allowed_actions, actions_at, state, metadata): - reply_url = '%s/%s/maintenance' % (self.url, project_id) + reply_url = '%s/maintenance/%s/%s' % (self.url, self.session_id, + project_id) payload = dict(project_id=project_id, instance_ids=instance_ids, @@ -148,11 +150,12 @@ class AdminMain(Thread): self.notif_admin.info({'some': 'context'}, 'maintenance.host', payload) - def down_scale(self): + def in_scale(self): for project in self.projects_servers: - self.log.info('DOWN_SCALE to project %s' % project) + self.log.info('SCALE_IN to project %s' % project) self.log.debug('instance_ids %s' % self.projects_servers[project]) - instance_ids = '%s/%s/maintenance' % (self.url, project) + instance_ids = '%s/maintenance/%s/%s' % (self.url, self.session_id, + project) allowed_actions = [] wait_seconds = 120 actions_at = (datetime.datetime.utcnow() + @@ -163,18 +166,20 @@ class AdminMain(Thread): self._project_notify(project, instance_ids, allowed_actions, actions_at, state, metadata) - allowed_states = ['ACK_DOWN_SCALE', 'NACK_DOWN_SCALE'] - self.wait_projects_state(allowed_states, wait_seconds) - if self.projects_not_in_state('ACK_DOWN_SCALE'): - raise Exception('Admin tool session %s: all states not ' - 'ACK_DOWN_SCALE %s' % - (self.session_id, self.projects_state)) + allowed_states = ['ACK_SCALE_IN', 'NACK_SCALE_IN'] + if not self.wait_projects_state(allowed_states, wait_seconds): + self.state = 'MAINTENANCE_FAILED' + if self.projects_not_in_state('ACK_SCALE_IN'): + self.log.error('%s: all states not ACK_SCALE_IN' % + self.session_id) + self.state = 'MAINTENANCE_FAILED' def maintenance(self): for project in self.projects_servers: self.log.info('\nMAINTENANCE to project %s\n' % project) self.log.debug('instance_ids %s' % self.projects_servers[project]) - instance_ids = '%s/%s/maintenance' % (self.url, project) + instance_ids = '%s/maintenance/%s/%s' % (self.url, self.session_id, + project) allowed_actions = [] actions_at = self.maintenance_at state = self.state @@ -190,16 +195,18 @@ class AdminMain(Thread): allowed_actions, actions_at, state, metadata) allowed_states = ['ACK_MAINTENANCE', 'NACK_MAINTENANCE'] - self.wait_projects_state(allowed_states, wait_seconds) + if not self.wait_projects_state(allowed_states, wait_seconds): + self.state = 'MAINTENANCE_FAILED' if self.projects_not_in_state('ACK_MAINTENANCE'): - raise Exception('Admin tool session %s: all states not ' - 'ACK_MAINTENANCE %s' % - (self.session_id, self.projects_state)) + self.log.error('%s: all states not ACK_MAINTENANCE' % + self.session_id) + self.state = 'MAINTENANCE_FAILED' def maintenance_complete(self): for project in self.projects_servers: self.log.info('MAINTENANCE_COMPLETE to project %s' % project) - instance_ids = '%s/%s/maintenance' % (self.url, project) + instance_ids = '%s/maintenance/%s/%s' % (self.url, self.session_id, + project) allowed_actions = [] wait_seconds = 120 actions_at = (datetime.datetime.utcnow() + @@ -212,13 +219,14 @@ class AdminMain(Thread): metadata) allowed_states = ['ACK_MAINTENANCE_COMPLETE', 'NACK_MAINTENANCE_COMPLETE'] - self.wait_projects_state(allowed_states, wait_seconds) + if not self.wait_projects_state(allowed_states, wait_seconds): + self.state = 'MAINTENANCE_FAILED' if self.projects_not_in_state('ACK_MAINTENANCE_COMPLETE'): - raise Exception('Admin tool session %s: all states not ' - 'ACK_MAINTENANCE_COMPLETE %s' % - (self.session_id, self.projects_state)) + self.log.error('%s: all states not ACK_MAINTENANCE_COMPLETE' % + self.session_id) + self.state = 'MAINTENANCE_FAILED' - def need_down_scale(self, host_servers): + def need_in_scale(self, host_servers): room_for_instances = 0 for host in host_servers: instances = 0 @@ -267,7 +275,8 @@ class AdminMain(Thread): self.projects_servers[project] = projects_servers[project].copy() self.log.info('%s to project %s' % (state, project)) self.project_servers_log_info(project, projects_servers) - instance_ids = '%s/%s/maintenance' % (self.url, project) + instance_ids = '%s/maintenance/%s/%s' % (self.url, self.session_id, + project) allowed_actions = ['MIGRATE', 'LIVE_MIGRATE', 'OWN_ACTION'] wait_seconds = 120 actions_at = (datetime.datetime.utcnow() + @@ -278,11 +287,14 @@ class AdminMain(Thread): allowed_actions, actions_at, state, metadata) allowed_states = [state_ack, state_nack] - self.wait_projects_state(allowed_states, wait_seconds) - if self.projects_not_in_state(state_ack): - raise Exception('Admin tool session %s: all states not %s %s' % - (self.session_id, state_ack, self.projects_state)) - self.actions_to_have_empty_host(host) + if not self.wait_projects_state(allowed_states, wait_seconds): + self.state = 'MAINTENANCE_FAILED' + elif self.projects_not_in_state(state_ack): + self.log.error('%s: all states not %s' % + (self.session_id, state_ack)) + self.state = 'MAINTENANCE_FAILED' + else: + self.actions_to_have_empty_host(host) def notify_action_done(self, project, instance_id): instance_ids = instance_id @@ -463,7 +475,8 @@ class AdminMain(Thread): time.sleep(5) def run(self): - while self.state != 'MAINTENANCE_COMPLETE': + while (self.state not in ['MAINTENANCE_DONE', 'MAINTENANCE_FAILED'] and + not self.stopped): self.log.info('--==session %s: processing state %s==--' % (self.session_id, self.state)) if self.state == 'MAINTENANCE': @@ -474,7 +487,8 @@ class AdminMain(Thread): raise Exception('all projects do not listen maintenance ' 'alarm') self.maintenance() - + if self.state == 'MAINTENANCE_FAILED': + continue maint_at = self.str_to_datetime(self.maintenance_at) if maint_at > datetime.datetime.utcnow(): time_now = (datetime.datetime.utcnow().strftime( @@ -492,14 +506,14 @@ class AdminMain(Thread): # True -> PLANNED_MAINTENANCE # False -> check if we can migrate VMs to get empty host # True -> PREPARE_MAINTENANCE - # False -> DOWN_SCALE + # False -> SCALE_IN maintenance_empty_hosts = ([h for h in self.hosts if h not in host_servers]) if len(maintenance_empty_hosts) == 0: - if self.need_down_scale(host_servers): + if self.need_in_scale(host_servers): self.log.info('Need to down scale') - self.state = 'DOWN_SCALE' + self.state = 'SCALE_IN' else: self.log.info('Free capacity, but need empty host') self.state = 'PREPARE_MAINTENANCE' @@ -508,14 +522,17 @@ class AdminMain(Thread): self.state = 'PLANNED_MAINTENANCE' self.log.info('--==State change from MAINTENANCE to %s==--' % self.state) - elif self.state == 'DOWN_SCALE': + elif self.state == 'SCALE_IN': # Test case is hard coded to have all compute capacity used # We need to down scale to have one empty compute host - self.down_scale() + self.update_server_info() + self.in_scale() + if self.state == 'MAINTENANCE_FAILED': + continue self.state = 'PREPARE_MAINTENANCE' host_servers = self.update_server_info() self.servers_log_info(host_servers) - self.log.info('--==State change from DOWN_SCALE to' + self.log.info('--==State change from SCALE_IN to' ' %s==--' % self.state) elif self.state == 'PREPARE_MAINTENANCE': @@ -527,7 +544,7 @@ class AdminMain(Thread): host_servers]) if len(maintenance_empty_hosts) == 0: self.log.info('no empty hosts for maintenance') - if self.need_down_scale(host_servers): + if self.need_in_scale(host_servers): raise Exception('Admin tool session %s: Not enough ' 'free capacity for maintenance' % self.session_id) @@ -535,6 +552,8 @@ class AdminMain(Thread): if host: self.make_compute_host_empty(host, host_servers[host], 'PREPARE_MAINTENANCE') + if self.state == 'MAINTENANCE_FAILED': + continue else: # We do not currently support another down scale if # first was not enough @@ -566,6 +585,7 @@ class AdminMain(Thread): maintenance_empty_hosts.append(host) self.log.info('--==Start to maintain empty hosts==--\n%s' % maintenance_empty_hosts) + self.update_server_info() for host in maintenance_empty_hosts: # scheduler has problems, let's see if just down scaled # host is really empty @@ -586,6 +606,8 @@ class AdminMain(Thread): self.log.info('PLANNED_MAINTENANCE host %s' % host) self.make_compute_host_empty(host, host_servers[host], 'PLANNED_MAINTENANCE') + if self.state == 'MAINTENANCE_FAILED': + continue self.log.info('IN_MAINTENANCE host %s' % host) self._admin_notify(admin_project, host, 'IN_MAINTENANCE', self.session_id) @@ -603,14 +625,16 @@ class AdminMain(Thread): self.log.info('Projects still need to up scale back to full ' 'capcity') self.maintenance_complete() + if self.state == 'MAINTENANCE_FAILED': + continue host_servers = self.update_server_info() self.servers_log_info(host_servers) - self.state = 'MAINTENANCE_COMPLETE' + self.state = 'MAINTENANCE_DONE' else: raise Exception('Admin tool session %s: session in invalid ' 'state %s' % (self.session_id, self.state)) - self.log.info('--==Maintenance session %s: ' - 'MAINTENANCE SESSION COMPLETE==--' % self.session_id) + self.log.info('--==Maintenance session %s: %s==--' % + (self.session_id, self.state)) def project_input(self, project_id, data): self.log.debug('Admin tool session %s: project %s input' % @@ -637,7 +661,6 @@ class AdminTool(Thread): self.admin_tool = admin_tool self.log = log self.conf = conf - self.port = self.conf.admin_tool.port self.maint_sessions = {} self.projects = {} self.maintenance_hosts = [] @@ -650,63 +673,55 @@ class AdminTool(Thread): def admin_maintenance_api_post(): data = json.loads(request.data.decode('utf8')) self.log.info('maintenance message: %s' % data) - if 'session_id' in data: - if data['state'] == 'REMOVE_MAINTENANCE_SESSION': - session_id = data['session_id'] - self.log.info('remove session %s' - % session_id) - self.maint_sessions[session_id].cleanup() - self.maint_sessions[session_id].stop() - del self.maint_sessions[session_id] - else: - session_id = str(generate_uuid()) - self.log.info('creating session: %s' % session_id) - self.maint_sessions[session_id] = ( - AdminMain(self.trasport_url, - session_id, - data, - self, - self.conf, - self.log)) - self.maint_sessions[session_id].start() + session_id = str(generate_uuid()) + self.log.info('creating session: %s' % session_id) + self.maint_sessions[session_id] = ( + AdminMain(self.trasport_url, + session_id, + data, + self, + self.conf, + self.log)) + self.maint_sessions[session_id].start() reply = json.dumps({'session_id': session_id, 'state': 'ACK_%s' % data['state']}) self.log.debug('reply: %s' % reply) return reply, 200, None - @app.route('/maintenance', methods=['GET']) - def admin_maintenance_api_get(): - data = json.loads(request.data.decode('utf8')) - self.log.debug('Admin get maintenance: %s' % data) - session_id = data['session_id'] + @app.route('/maintenance/<session_id>', methods=['GET']) + def admin_maintenance_api_get(session_id=None): + self.log.debug('Admin get maintenance') reply = json.dumps({'state': self.maint_sessions[session_id].state}) - self.log.debug('reply: %s' % reply) + self.log.info('reply: %s' % reply) return reply, 200, None - @app.route('/<projet_id>/maintenance', methods=['PUT']) - def project_maintenance_api_put(projet_id=None): + @app.route('/maintenance/<session_id>/<projet_id>', methods=['PUT']) + def project_maintenance_api_put(session_id=None, projet_id=None): data = json.loads(request.data.decode('utf8')) self.log.debug('%s project put: %s' % (projet_id, data)) - self.project_input(projet_id, data) + self.project_input(session_id, projet_id, data) return 'OK' - @app.route('/<projet_id>/maintenance', methods=['GET']) - def project_maintenance_api_get(projet_id=None): - data = json.loads(request.data.decode('utf8')) - self.log.debug('%s project get %s' % (projet_id, data)) - instances = self.project_get_instances(projet_id, data) + @app.route('/maintenance/<session_id>/<projet_id>', methods=['GET']) + def project_maintenance_api_get(session_id=None, projet_id=None): + self.log.debug('%s project get %s' % (projet_id, session_id)) + instances = self.project_get_instances(session_id, projet_id) reply = json.dumps({'instance_ids': instances}) self.log.debug('%s reply: %s' % (projet_id, reply)) return reply, 200, None + @app.route('/maintenance/<session_id>', methods=['DELETE']) + def remove_session(session_id=None): + self.log.info('remove session %s' + % session_id) + self.maint_sessions[session_id].cleanup() + self.maint_sessions[session_id].stop() + del self.maint_sessions[session_id] + return 'OK' + @app.route('/shutdown', methods=['POST']) def shutdown(): - for session in self.maint_sessions: - self.log.info('shutdown admin tool session %s thread' % - session) - self.maint_sessions[session].cleanup() - self.maint_sessions[session].stop() self.log.info('shutdown admin_tool server at %s' % time.time()) func = request.environ.get('werkzeug.server.shutdown') if func is None: @@ -714,13 +729,11 @@ class AdminTool(Thread): func() return 'admin_tool app shutting down...' - app.run(host='0.0.0.0', port=self.port) + app.run(host=self.conf.admin_tool.ip, port=self.conf.admin_tool.port) - def project_input(self, project_id, data): - session_id = data['session_id'] + def project_input(self, session_id, project_id, data): self.maint_sessions[session_id].project_input(project_id, data) - def project_get_instances(self, project_id, data): - session_id = data['session_id'] + def project_get_instances(self, session_id, project_id): return self.maint_sessions[session_id].project_get_instances( project_id) diff --git a/doctor_tests/app_manager/__init__.py b/doctor_tests/app_manager/__init__.py index 717d6587..c2f75918 100644 --- a/doctor_tests/app_manager/__init__.py +++ b/doctor_tests/app_manager/__init__.py @@ -8,12 +8,13 @@ ############################################################################## from oslo_config import cfg from oslo_utils import importutils +import os OPTS = [ cfg.StrOpt('type', - default='sample', - choices=['sample'], + default=os.environ.get('APP_MANAGER_TYPE', 'sample'), + choices=['sample', 'vnfm'], help='the component of doctor app manager', required=True), cfg.StrOpt('ip', @@ -28,7 +29,8 @@ OPTS = [ _app_manager_name_class_mapping = { - 'sample': 'doctor_tests.app_manager.sample.SampleAppManager' + 'sample': 'doctor_tests.app_manager.sample.SampleAppManager', + 'vnfm': 'doctor_tests.app_manager.vnfm.VNFM', } diff --git a/doctor_tests/app_manager/sample.py b/doctor_tests/app_manager/sample.py index 94926ee2..7ca35b97 100644 --- a/doctor_tests/app_manager/sample.py +++ b/doctor_tests/app_manager/sample.py @@ -17,6 +17,7 @@ import requests from doctor_tests.app_manager.base import BaseAppManager from doctor_tests.identity_auth import get_identity_auth from doctor_tests.identity_auth import get_session +from doctor_tests.os_clients import neutron_client from doctor_tests.os_clients import nova_client @@ -56,12 +57,16 @@ class AppManager(Thread): self.app_manager = app_manager self.log = log self.intance_ids = None + self.auth = get_identity_auth(project=self.conf.doctor_project) + self.session = get_session(auth=self.auth) + self.nova = nova_client(self.conf.nova_version, + self.session) + self.neutron = neutron_client(session=self.session) self.headers = { 'Content-Type': 'application/json', 'Accept': 'application/json'} - self.auth = get_identity_auth(project=self.conf.doctor_project) - self.nova = nova_client(self.conf.nova_version, - get_session(auth=self.auth)) + if self.conf.admin_tool.type == 'fenix': + self.headers['X-Auth-Token'] = self.session.get_token() self.orig_number_of_instances = self.number_of_instances() self.ha_instances = self.get_ha_instances() self.floating_ip = None @@ -85,7 +90,13 @@ class AppManager(Thread): if instance.id != self.active_instance_id: self.log.info('Switch over to: %s %s' % (instance.name, instance.id)) - instance.add_floating_ip(self.floating_ip) + # Deprecated, need to use neutron instead + # instance.add_floating_ip(self.floating_ip) + port = self.neutron.list_ports(device_id=instance.id)['ports'][0]['id'] # noqa + floating_id = self.neutron.list_floatingips(floating_ip_address=self.floating_ip)['floatingips'][0]['id'] # noqa + self.neutron.update_floatingip(floating_id, {'floatingip': {'port_id': port}}) # noqa + # Have to update ha_instances as floating_ip changed + self.ha_instances = self.get_ha_instances() self.active_instance_id = instance.id break @@ -114,8 +125,7 @@ class AppManager(Thread): for t in data['reason_data']['event']['traits']}) def get_session_instance_ids(self, url, session_id): - data = {'session_id': session_id} - ret = requests.get(url, data=json.dumps(data), headers=self.headers) + ret = requests.get(url, data=None, headers=self.headers) if ret.status_code != 200: raise Exception(ret.text) self.log.info('get_instance_ids %s' % ret.json()) @@ -155,7 +165,7 @@ class AppManager(Thread): data = json.loads(request.data.decode('utf8')) try: payload = self._alarm_traits_decoder(data) - except: + except Exception: payload = ({t[0]: t[2] for t in data['reason_data']['event']['traits']}) self.log.error('cannot parse alarm data: %s' % payload) @@ -177,12 +187,12 @@ class AppManager(Thread): reply['instance_ids'] = instance_ids reply_state = 'ACK_MAINTENANCE' - elif state == 'DOWN_SCALE': + elif state == 'SCALE_IN': # scale down 2 isntances that is VCPUS equaling to single # compute node self.scale_instances(-2) reply['instance_ids'] = self.get_instance_ids() - reply_state = 'ACK_DOWN_SCALE' + reply_state = 'ACK_SCALE_IN' elif state == 'MAINTENANCE_COMPLETE': # possibly need to upscale diff --git a/doctor_tests/app_manager/vnfm.py b/doctor_tests/app_manager/vnfm.py new file mode 100644 index 00000000..68fdbb88 --- /dev/null +++ b/doctor_tests/app_manager/vnfm.py @@ -0,0 +1,441 @@ +############################################################################## +# Copyright (c) 2018 Nokia Corporation and others. +# +# All rights reserved. This program and the accompanying materials +# are made available under the terms of the Apache License, Version 2.0 +# which accompanies this distribution, and is available at +# http://www.apache.org/licenses/LICENSE-2.0 +############################################################################## +from flask import Flask +from flask import request +import json +import requests +from threading import Thread +import time +import uuid +import yaml + +from doctor_tests.app_manager.base import BaseAppManager +from doctor_tests.identity_auth import get_identity_auth +from doctor_tests.identity_auth import get_session +from doctor_tests.os_clients import neutron_client +from doctor_tests.os_clients import nova_client +from doctor_tests.os_clients import keystone_client + + +class VNFM(BaseAppManager): + + def __init__(self, stack, conf, log): + super(VNFM, self).__init__(conf, log) + self.stack = stack + self.app = None + + def start(self): + self.log.info('VNFM start......') + self.app = VNFManager(self.stack, self.conf, self, self.log) + self.app.start() + + def stop(self): + self.log.info('VNFM stop......') + if not self.app: + return + self.app.delete_constraints() + headers = { + 'Content-Type': 'application/json', + 'Accept': 'application/json', + } + url = 'http://%s:%d/shutdown'\ + % (self.conf.app_manager.ip, + self.conf.app_manager.port) + requests.post(url, data='', headers=headers) + + +class VNFManager(Thread): + + def __init__(self, stack, conf, app_manager, log): + Thread.__init__(self) + self.stack = stack + self.conf = conf + self.port = self.conf.app_manager.port + self.app_manager = app_manager + self.log = log + self.intance_ids = None + self.auth = get_identity_auth(project=self.conf.doctor_project) + self.session = get_session(auth=self.auth) + self.keystone = keystone_client( + self.conf.keystone_version, self.session) + self.nova = nova_client(self.conf.nova_version, + self.session) + self.neutron = neutron_client(session=self.session) + self.headers = { + 'Content-Type': 'application/json', + 'Accept': 'application/json'} + if self.conf.admin_tool.type == 'fenix': + self.headers['X-Auth-Token'] = self.session.get_token() + self.orig_number_of_instances = self.number_of_instances() + # List of instances + self.ha_instances = [] + self.nonha_instances = [] + # Different instance_id specific constraints {instanse_id: {},...} + self.instance_constraints = None + # Update existing instances to instance lists + self.update_instances() + nonha_instances = len(self.nonha_instances) + if nonha_instances < 7: + self.scale = 2 + self.max_impacted = 2 + else: + self.scale = int((nonha_instances) / 2) + self.max_impacted = self.scale - 1 + self.log.info('Init nonha_instances: %s scale: %s: max_impacted %s' % + (nonha_instances, self.scale, self.max_impacted)) + # Different instance groups constraints dict + self.ha_group = None + self.nonha_group = None + # Floating IP used in HA instance + self.floating_ip = None + # VNF project_id + self.project_id = None + # HA instance_id that is active / has floating IP + self.active_instance_id = self.active_instance_id() + + services = self.keystone.services.list() + for service in services: + if service.type == 'maintenance': + self.log.info('maintenance service: %s:%s type %s' + % (service.name, service.id, service.type)) + maint_id = service.id + self.maint_endpoint = [ep.url for ep in self.keystone.endpoints.list() + if ep.service_id == maint_id and + ep.interface == 'public'][0] + self.log.info('maintenance endpoint: %s' % self.maint_endpoint) + self.update_constraints_lock = False + self.update_constraints() + + def delete_remote_instance_constraints(self, instance_id): + url = "%s/instance/%s" % (self.maint_endpoint, instance_id) + self.log.info('DELETE: %s' % url) + ret = requests.delete(url, data=None, headers=self.headers) + if ret.status_code != 200 and ret.status_code != 204: + raise Exception(ret.text) + + def update_remote_instance_constraints(self, instance): + url = "%s/instance/%s" % (self.maint_endpoint, instance["instance_id"]) + self.log.info('PUT: %s' % url) + ret = requests.put(url, data=json.dumps(instance), + headers=self.headers) + if ret.status_code != 200 and ret.status_code != 204: + raise Exception(ret.text) + + def delete_remote_group_constraints(self, instance_group): + url = "%s/instance_group/%s" % (self.maint_endpoint, + instance_group["group_id"]) + self.log.info('DELETE: %s' % url) + ret = requests.delete(url, data=None, headers=self.headers) + if ret.status_code != 200 and ret.status_code != 204: + raise Exception(ret.text) + + def update_remote_group_constraints(self, instance_group): + url = "%s/instance_group/%s" % (self.maint_endpoint, + instance_group["group_id"]) + self.log.info('PUT: %s' % url) + ret = requests.put(url, data=json.dumps(instance_group), + headers=self.headers) + if ret.status_code != 200 and ret.status_code != 204: + raise Exception(ret.text) + + def delete_constraints(self): + if self.conf.admin_tool.type == 'fenix': + self.headers['X-Auth-Token'] = self.session.get_token() + for instance_id in self.instance_constraints: + self.delete_remote_instance_constraints(instance_id) + self.delete_remote_group_constraints(self.nonha_group) + self.delete_remote_group_constraints(self.ha_group) + + def update_constraints(self): + while self.update_constraints_lock: + self.log.info('Waiting update_constraints_lock...') + time.sleep(1) + self.update_constraints_lock = True + self.log.info('Update constraints') + if self.project_id is None: + self.project_id = self.keystone.projects.list( + name=self.conf.doctor_project)[0].id + if self.nonha_group is None: + # Nova does not support groupping instances that do not belong to + # anti-affinity server_groups. Anyhow all instances need groupping + self.nonha_group = { + "group_id": str(uuid.uuid4()), + "project_id": self.project_id, + "group_name": "doctor_nonha_app_group", + "anti_affinity_group": False, + "max_instances_per_host": 0, + "max_impacted_members": self.max_impacted, + "recovery_time": 2, + "resource_mitigation": True} + self.log.info('create doctor_nonha_app_group constraints: %s' + % self.nonha_group) + self.update_remote_group_constraints(self.nonha_group) + if self.ha_group is None: + group_id = [sg.id for sg in self.nova.server_groups.list() + if sg.name == "doctor_ha_app_group"][0] + self.ha_group = { + "group_id": group_id, + "project_id": self.project_id, + "group_name": "doctor_ha_app_group", + "anti_affinity_group": True, + "max_instances_per_host": 1, + "max_impacted_members": 1, + "recovery_time": 4, + "resource_mitigation": True} + self.log.info('create doctor_ha_app_group constraints: %s' + % self.ha_group) + self.update_remote_group_constraints(self.ha_group) + instance_constraints = {} + for ha_instance in self.ha_instances: + instance = { + "instance_id": ha_instance.id, + "project_id": self.project_id, + "group_id": self.ha_group["group_id"], + "instance_name": ha_instance.name, + "max_interruption_time": 120, + "migration_type": "MIGRATE", + "resource_mitigation": True, + "lead_time": 40} + self.log.info('create ha instance constraints: %s' + % instance) + instance_constraints[ha_instance.id] = instance + for nonha_instance in self.nonha_instances: + instance = { + "instance_id": nonha_instance.id, + "project_id": self.project_id, + "group_id": self.nonha_group["group_id"], + "instance_name": nonha_instance.name, + "max_interruption_time": 120, + "migration_type": "MIGRATE", + "resource_mitigation": True, + "lead_time": 40} + self.log.info('create nonha instance constraints: %s' + % instance) + instance_constraints[nonha_instance.id] = instance + if not self.instance_constraints: + # Initial instance constraints + self.log.info('create initial instances constraints...') + for instance in [instance_constraints[i] for i + in instance_constraints]: + self.update_remote_instance_constraints(instance) + self.instance_constraints = instance_constraints.copy() + else: + self.log.info('check instances constraints changes...') + added = [i for i in instance_constraints.keys() + if i not in self.instance_constraints] + deleted = [i for i in self.instance_constraints.keys() + if i not in instance_constraints] + modified = [i for i in instance_constraints.keys() + if (i not in added and i not in deleted and + instance_constraints[i] != + self.instance_constraints[i])] + for instance_id in deleted: + self.delete_remote_instance_constraints(instance_id) + updated = added + modified + for instance in [instance_constraints[i] for i in updated]: + self.update_remote_instance_constraints(instance) + if updated or deleted: + # Some instance constraints have changed + self.instance_constraints = instance_constraints.copy() + self.update_constraints_lock = False + + def active_instance_id(self): + # Need rertry as it takes time after heat template done before + # Floating IP in place + retry = 5 + while retry > 0: + for instance in self.ha_instances: + network_interfaces = next(iter(instance.addresses.values())) + for network_interface in network_interfaces: + _type = network_interface.get('OS-EXT-IPS:type') + if _type == "floating": + if not self.floating_ip: + self.floating_ip = network_interface.get('addr') + self.log.debug('active_instance: %s %s' % + (instance.name, instance.id)) + return instance.id + time.sleep(2) + self.update_instances() + retry -= 1 + raise Exception("No active instance found") + + def switch_over_ha_instance(self): + for instance in self.ha_instances: + if instance.id != self.active_instance_id: + self.log.info('Switch over to: %s %s' % (instance.name, + instance.id)) + # Deprecated, need to use neutron instead + # instance.add_floating_ip(self.floating_ip) + port = self.neutron.list_ports(device_id=instance.id)['ports'][0]['id'] # noqa + floating_id = self.neutron.list_floatingips(floating_ip_address=self.floating_ip)['floatingips'][0]['id'] # noqa + self.neutron.update_floatingip(floating_id, {'floatingip': {'port_id': port}}) # noqa + # Have to update ha_instances as floating_ip changed + self.update_instances() + self.active_instance_id = instance.id + break + + def get_instance_ids(self): + ret = list() + for instance in self.nova.servers.list(detailed=False): + ret.append(instance.id) + return ret + + def update_instances(self): + instances = self.nova.servers.list(detailed=True) + self.ha_instances = [i for i in instances + if "doctor_ha_app_" in i.name] + self.nonha_instances = [i for i in instances + if "doctor_nonha_app_" in i.name] + + def _alarm_data_decoder(self, data): + if "[" in data or "{" in data: + # string to list or dict removing unicode + data = yaml.load(data.replace("u'", "'")) + return data + + def _alarm_traits_decoder(self, data): + return ({str(t[0]): self._alarm_data_decoder(str(t[2])) + for t in data['reason_data']['event']['traits']}) + + def get_session_instance_ids(self, url, session_id): + ret = requests.get(url, data=None, headers=self.headers) + if ret.status_code != 200: + raise Exception(ret.text) + self.log.info('get_instance_ids %s' % ret.json()) + return ret.json()['instance_ids'] + + def scale_instances(self, number_of_instances): + number_of_instances_before = self.number_of_instances() + + parameters = self.stack.parameters + parameters['nonha_intances'] += number_of_instances + self.stack.update(self.stack.stack_name, + self.stack.stack_id, + self.stack.template, + parameters=parameters, + files=self.stack.files) + + number_of_instances_after = self.number_of_instances() + if (number_of_instances_before + number_of_instances != + number_of_instances_after): + self.log.error('scale_instances with: %d from: %d ends up to: %d' + % (number_of_instances, number_of_instances_before, + number_of_instances_after)) + raise Exception('scale_instances failed') + + self.log.info('scaled instances from %d to %d' % + (number_of_instances_before, + number_of_instances_after)) + + def number_of_instances(self): + return len(self.nova.servers.list(detailed=False)) + + def run(self): + app = Flask('VNFM') + + @app.route('/maintenance', methods=['POST']) + def maintenance_alarm(): + data = json.loads(request.data.decode('utf8')) + try: + payload = self._alarm_traits_decoder(data) + except Exception: + payload = ({t[0]: t[2] for t in + data['reason_data']['event']['traits']}) + self.log.error('cannot parse alarm data: %s' % payload) + raise Exception('VNFM cannot parse alarm.' + 'Possibly trait data over 256 char') + + self.log.info('VNFM received data = %s' % payload) + + state = payload['state'] + reply_state = None + reply = dict() + + self.log.info('VNFM state: %s' % state) + + if state == 'MAINTENANCE': + instance_ids = (self.get_session_instance_ids( + payload['instance_ids'], + payload['session_id'])) + my_instance_ids = self.get_instance_ids() + invalid_instances = ( + [instance_id for instance_id in instance_ids + if instance_id not in my_instance_ids]) + if invalid_instances: + self.log.error('Invalid instances: %s' % invalid_instances) + reply_state = 'NACK_MAINTENANCE' + else: + reply_state = 'ACK_MAINTENANCE' + + elif state == 'SCALE_IN': + # scale down "self.scale" instances that is VCPUS equaling + # at least a single compute node + self.scale_instances(-self.scale) + reply_state = 'ACK_SCALE_IN' + + elif state == 'MAINTENANCE_COMPLETE': + # possibly need to upscale + number_of_instances = self.number_of_instances() + if self.orig_number_of_instances > number_of_instances: + scale_instances = (self.orig_number_of_instances - + number_of_instances) + self.scale_instances(scale_instances) + reply_state = 'ACK_MAINTENANCE_COMPLETE' + + elif state == 'PREPARE_MAINTENANCE': + # TBD from contraints + if "MIGRATE" not in payload['allowed_actions']: + raise Exception('MIGRATE not supported') + instance_ids = payload['instance_ids'][0] + self.log.info('VNFM got instance: %s' % instance_ids) + if instance_ids == self.active_instance_id: + self.switch_over_ha_instance() + # optional also in contraints + reply['instance_action'] = "MIGRATE" + reply_state = 'ACK_PREPARE_MAINTENANCE' + + elif state == 'PLANNED_MAINTENANCE': + # TBD from contraints + if "MIGRATE" not in payload['allowed_actions']: + raise Exception('MIGRATE not supported') + instance_ids = payload['instance_ids'][0] + self.log.info('VNFM got instance: %s' % instance_ids) + if instance_ids == self.active_instance_id: + self.switch_over_ha_instance() + # optional also in contraints + reply['instance_action'] = "MIGRATE" + reply_state = 'ACK_PLANNED_MAINTENANCE' + + elif state == 'INSTANCE_ACTION_DONE': + # TBD was action done in allowed window + self.log.info('%s' % payload['instance_ids']) + else: + raise Exception('VNFM received event with' + ' unknown state %s' % state) + + if reply_state: + if self.conf.admin_tool.type == 'fenix': + self.headers['X-Auth-Token'] = self.session.get_token() + reply['state'] = reply_state + url = payload['reply_url'] + self.log.info('VNFM reply: %s' % reply) + requests.put(url, data=json.dumps(reply), headers=self.headers) + + return 'OK' + + @app.route('/shutdown', methods=['POST']) + def shutdown(): + self.log.info('shutdown VNFM server at %s' % time.time()) + func = request.environ.get('werkzeug.server.shutdown') + if func is None: + raise RuntimeError('Not running with the Werkzeug Server') + func() + return 'VNFM shutting down...' + + app.run(host="0.0.0.0", port=self.port) diff --git a/doctor_tests/common/constants.py b/doctor_tests/common/constants.py index 088ff633..201f3fc4 100644 --- a/doctor_tests/common/constants.py +++ b/doctor_tests/common/constants.py @@ -12,6 +12,10 @@ from collections import namedtuple Host = namedtuple('Host', ['name', 'ip']) +def is_fenix(conf): + return conf.admin_tool.type == 'fenix' + + class Inspector(object): CONGRESS = 'congress' SAMPLE = 'sample' diff --git a/doctor_tests/common/utils.py b/doctor_tests/common/utils.py index 1a8840dd..67ca4f4b 100644 --- a/doctor_tests/common/utils.py +++ b/doctor_tests/common/utils.py @@ -10,6 +10,7 @@ import json import os import paramiko import re +import subprocess def load_json_file(full_path): @@ -97,6 +98,27 @@ class SSHClient(object): ftp.close() +class LocalSSH(object): + + def __init__(self, log): + self.log = log + self.log.info('Init local ssh client') + + def ssh(self, cmd): + ret = 0 + output = "%s failed!!!" % cmd + try: + output = subprocess.check_output((cmd), shell=True, + universal_newlines=True) + except subprocess.CalledProcessError: + ret = 1 + return ret, output + + def scp(self, src_file, dst_file): + return subprocess.check_output("cp %s %s" % (src_file, dst_file), + shell=True) + + def run_async(func): from threading import Thread from functools import wraps diff --git a/doctor_tests/image.py b/doctor_tests/image.py index 9961b22d..50841ef6 100644 --- a/doctor_tests/image.py +++ b/doctor_tests/image.py @@ -7,7 +7,11 @@ # http://www.apache.org/licenses/LICENSE-2.0 ############################################################################## import os -import urllib.request +try: + from urllib.request import urlopen +except Exception: + from urllib2 import urlopen + from oslo_config import cfg @@ -46,11 +50,14 @@ class Image(object): def create(self): self.log.info('image create start......') - images = {image.name: image for image in self.glance.images.list()} + if self.conf.image_name == 'cirros': + cirros = [image for image in images if 'cirros' in image] + if cirros: + self.conf.image_name = cirros[0] if self.conf.image_name not in images: if not os.path.exists(self.conf.image_filename): - resp = urllib.request.urlopen(self.conf.image_download_url) + resp = urlopen(self.conf.image_download_url) with open(self.conf.image_filename, "wb") as file: file.write(resp.read()) self.image = \ diff --git a/doctor_tests/inspector/__init__.py b/doctor_tests/inspector/__init__.py index 31291baf..50365a61 100644 --- a/doctor_tests/inspector/__init__.py +++ b/doctor_tests/inspector/__init__.py @@ -42,6 +42,10 @@ _inspector_name_class_mapping = { } -def get_inspector(conf, log): +def get_inspector(conf, log, transport_url=None): inspector_class = _inspector_name_class_mapping[conf.inspector.type] - return importutils.import_object(inspector_class, conf, log) + if conf.inspector.type == 'sample': + return importutils.import_object(inspector_class, conf, log, + transport_url) + else: + return importutils.import_object(inspector_class, conf, log) diff --git a/doctor_tests/inspector/sample.py b/doctor_tests/inspector/sample.py index a55a12b7..c44db95d 100644 --- a/doctor_tests/inspector/sample.py +++ b/doctor_tests/inspector/sample.py @@ -10,6 +10,7 @@ import collections from flask import Flask from flask import request import json +import oslo_messaging import time from threading import Thread import requests @@ -26,7 +27,7 @@ from doctor_tests.inspector.base import BaseInspector class SampleInspector(BaseInspector): event_type = 'compute.host.down' - def __init__(self, conf, log): + def __init__(self, conf, log, trasport_url): super(SampleInspector, self).__init__(conf, log) self.inspector_url = self.get_inspector_url() self.novaclients = list() @@ -43,6 +44,17 @@ class SampleInspector(BaseInspector): self.hostnames = list() self.app = None + try: + transport = oslo_messaging.get_notification_transport(self.conf, + trasport_url) + self.notif = oslo_messaging.Notifier(transport, + 'compute.instance.update', + driver='messaging', + topics=['notifications']) + self.notif = self.notif.prepare(publisher_id='sample') + except Exception: + self.notif = None + def _init_novaclients(self): self.NUMBER_OF_CLIENTS = self.conf.instance_count auth = get_identity_auth(project=self.conf.doctor_project) @@ -54,13 +66,13 @@ class SampleInspector(BaseInspector): def _init_servers_list(self): self.servers.clear() opts = {'all_tenants': True} - servers = self.nova.servers.list(search_opts=opts) + servers = self.nova.servers.list(detailed=True, search_opts=opts) for server in servers: try: host = server.__dict__.get('OS-EXT-SRV-ATTR:host') self.servers[host].append(server) self.log.debug('get hostname=%s from server=%s' - % (host, server)) + % (host, str(server.name))) except Exception as e: self.log.info('can not get hostname from server=%s, error=%s' % (server, e)) @@ -97,10 +109,14 @@ class SampleInspector(BaseInspector): event_type = event['type'] if event_type == self.event_type: self.hostnames.append(hostname) + if self.notif is not None: + thr0 = self._send_notif(hostname) thr1 = self._disable_compute_host(hostname) thr2 = self._vms_reset_state('error', hostname) if self.conf.inspector.update_neutron_port_dp_status: thr3 = self._set_ports_data_plane_status('DOWN', hostname) + if self.notif is not None: + thr0.join() thr1.join() thr2.join() if self.conf.inspector.update_neutron_port_dp_status: @@ -119,7 +135,7 @@ class SampleInspector(BaseInspector): def maintenance(self, data): try: payload = self._alarm_traits_decoder(data) - except: + except Exception: payload = ({t[0]: t[2] for t in data['reason_data']['event']['traits']}) self.log.error('cannot parse alarm data: %s' % payload) @@ -156,8 +172,8 @@ class SampleInspector(BaseInspector): nova.servers.reset_state(server, state) vmdown_time = time.time() self.vm_down_time = vmdown_time - self.log.info('doctor mark vm(%s) error at %s' - % (server, vmdown_time)) + self.log.info('doctor mark vm(%s) %s at %s' + % (server, state, vmdown_time)) thrs = [] for nova, server in zip(self.novaclients, self.servers[hostname]): @@ -167,6 +183,26 @@ class SampleInspector(BaseInspector): t.join() @utils.run_async + def _send_notif(self, hostname): + + @utils.run_async + def _send_notif(server): + payload = dict(tenant_id=server.tenant_id, + instance_id=server.id, + state="error") + self.notif.info({'some': 'context'}, 'compute.instance.update', + payload) + self.log.info('doctor compute.instance.update vm(%s) error %s' + % (server, time.time())) + + thrs = [] + for server in self.servers[hostname]: + t = _send_notif(server) + thrs.append(t) + for t in thrs: + t.join() + + @utils.run_async def _set_ports_data_plane_status(self, status, hostname): body = {'data_plane_status': status} diff --git a/doctor_tests/installer/__init__.py b/doctor_tests/installer/__init__.py index 2b9ad83d..00a01667 100644 --- a/doctor_tests/installer/__init__.py +++ b/doctor_tests/installer/__init__.py @@ -13,8 +13,8 @@ from oslo_utils import importutils OPTS = [ cfg.StrOpt('type', - default=os.environ.get('INSTALLER_TYPE', 'local'), - choices=['local', 'apex', 'daisy', 'fuel'], + default=os.environ.get('INSTALLER_TYPE', 'devstack'), + choices=['apex', 'daisy', 'fuel', 'devstack'], help='the type of installer', required=True), cfg.StrOpt('ip', @@ -28,10 +28,10 @@ OPTS = [ _installer_name_class_mapping = { - 'local': 'doctor_tests.installer.local.LocalInstaller', 'apex': 'doctor_tests.installer.apex.ApexInstaller', 'daisy': 'doctor_tests.installer.daisy.DaisyInstaller', - 'fuel': 'doctor_tests.installer.mcp.McpInstaller' + 'fuel': 'doctor_tests.installer.mcp.McpInstaller', + 'devstack': 'doctor_tests.installer.devstack.DevstackInstaller' } diff --git a/doctor_tests/installer/apex.py b/doctor_tests/installer/apex.py index 2aa81ff9..3ec2100c 100644 --- a/doctor_tests/installer/apex.py +++ b/doctor_tests/installer/apex.py @@ -6,10 +6,11 @@ # which accompanies this distribution, and is available at # http://www.apache.org/licenses/LICENSE-2.0 ############################################################################## -import re import time from doctor_tests.common.constants import Inspector +from doctor_tests.common.constants import is_fenix +from doctor_tests.common.utils import get_doctor_test_root_dir from doctor_tests.common.utils import SSHClient from doctor_tests.installer.base import BaseInstaller @@ -20,6 +21,7 @@ class ApexInstaller(BaseInstaller): cm_set_script = 'set_config.py' nc_set_compute_script = 'set_compute_config.py' cg_set_script = 'set_congress.py' + fe_set_script = 'set_fenix.sh' cm_restore_script = 'restore_config.py' nc_restore_compute_script = 'restore_compute_config.py' cg_restore_script = 'restore_congress.py' @@ -36,13 +38,13 @@ class ApexInstaller(BaseInstaller): self.key_file = None self.controllers = list() self.computes = list() - self.controller_clients = list() - self.compute_clients = list() def setup(self): self.log.info('Setup Apex installer start......') self.key_file = self.get_ssh_key_from_installer() self._get_overcloud_conf() + if is_fenix(self.conf): + self._copy_overcloudrc_to_controllers() self.create_flavor() self.set_apply_patches() self.setup_stunnel() @@ -56,6 +58,11 @@ class ApexInstaller(BaseInstaller): key_path = '/home/stack/.ssh/id_rsa' return self._get_ssh_key(self.client, key_path) + def _copy_overcloudrc_to_controllers(self): + for ip in self.controllers: + cmd = "scp overcloudrc %s@%s:" % (self.node_user_name, ip) + self._run_cmd_remote(self.client, cmd) + def _get_overcloud_conf(self): self.log.info('Get overcloud config details from Apex installer' '......') @@ -83,26 +90,6 @@ class ApexInstaller(BaseInstaller): host_ips = self._run_cmd_remote(self.client, command) return host_ips[0] - def get_transport_url(self): - client = SSHClient(self.controllers[0], self.node_user_name, - key_filename=self.key_file) - if self.use_containers: - ncbase = "/var/lib/config-data/puppet-generated/nova" - else: - ncbase = "" - command = 'sudo grep "^transport_url" %s/etc/nova/nova.conf' % ncbase - - ret, url = client.ssh(command) - if ret: - raise Exception('Exec command to get host ip from controller(%s)' - 'in Apex installer failed, ret=%s, output=%s' - % (self.controllers[0], ret, url)) - # need to use ip instead of hostname - ret = (re.sub("@.*:", "@%s:" % self.controllers[0], - url[0].split("=", 1)[1])) - self.log.debug('get_transport_url %s' % ret) - return ret - def _set_docker_restart_cmd(self, service): # There can be multiple instances running so need to restart all cmd = "for container in `sudo docker ps | grep " @@ -113,6 +100,7 @@ class ApexInstaller(BaseInstaller): def set_apply_patches(self): self.log.info('Set apply patches start......') + fenix_files = None set_scripts = [self.cm_set_script] @@ -127,6 +115,10 @@ class ApexInstaller(BaseInstaller): if self.conf.test_case != 'fault_management': if self.use_containers: restart_cmd += self._set_docker_restart_cmd("nova-scheduler") + if is_fenix(self.conf): + set_scripts.append(self.fe_set_script) + testdir = get_doctor_test_root_dir() + fenix_files = ["Dockerfile", "run"] else: restart_cmd += ' openstack-nova-scheduler.service' set_scripts.append(self.nc_set_compute_script) @@ -141,29 +133,34 @@ class ApexInstaller(BaseInstaller): for node_ip in self.controllers: client = SSHClient(node_ip, self.node_user_name, key_filename=self.key_file) - self.controller_clients.append(client) + if fenix_files is not None: + for fenix_file in fenix_files: + src_file = '{0}/{1}/{2}'.format(testdir, + 'admin_tool/fenix', + fenix_file) + client.scp(src_file, fenix_file) self._run_apply_patches(client, restart_cmd, set_scripts, python=self.python) + time.sleep(5) + + self.log.info('Set apply patches start......') if self.conf.test_case != 'fault_management': if self.use_containers: - restart_cmd = self._set_docker_restart_cmd("nova-compute") + restart_cmd = self._set_docker_restart_cmd("nova") else: restart_cmd = 'sudo systemctl restart' \ ' openstack-nova-compute.service' for node_ip in self.computes: client = SSHClient(node_ip, self.node_user_name, key_filename=self.key_file) - self.compute_clients.append(client) self._run_apply_patches(client, restart_cmd, [self.nc_set_compute_script], python=self.python) - - if self.conf.test_case != 'fault_management': - time.sleep(10) + time.sleep(5) def restore_apply_patches(self): self.log.info('restore apply patches start......') @@ -192,39 +189,22 @@ class ApexInstaller(BaseInstaller): restart_cmd += ' openstack-congress-server.service' restore_scripts.append(self.cg_restore_script) - for client, node_ip in zip(self.controller_clients, self.controllers): - retry = 0 - while retry < 2: - try: - self._run_apply_patches(client, - restart_cmd, - restore_scripts, - python=self.python) - except Exception: - if retry > 0: - raise Exception("SSHClient to %s feiled" % node_ip) - client = SSHClient(node_ip, self.node_user_name, - key_filename=self.key_file) - retry += 1 - break + for node_ip in self.controllers: + client = SSHClient(node_ip, self.node_user_name, + key_filename=self.key_file) + self._run_apply_patches(client, + restart_cmd, + restore_scripts, + python=self.python) + if self.conf.test_case != 'fault_management': if self.use_containers: restart_cmd = self._set_docker_restart_cmd("nova-compute") else: restart_cmd = 'sudo systemctl restart' \ ' openstack-nova-compute.service' - for client, node_ip in zip(self.compute_clients, self.computes): - retry = 0 - while retry < 2: - try: - self._run_apply_patches( - client, restart_cmd, - [self.nc_restore_compute_script], - python=self.python) - except Exception: - if retry > 0: - raise Exception("SSHClient to %s feiled" % node_ip) - client = SSHClient(node_ip, self.node_user_name, - key_filename=self.key_file) - retry += 1 - break + for node_ip in self.computes: + self._run_apply_patches( + client, restart_cmd, + [self.nc_restore_compute_script], + python=self.python) diff --git a/doctor_tests/installer/base.py b/doctor_tests/installer/base.py index 30435931..de4d2f2e 100644 --- a/doctor_tests/installer/base.py +++ b/doctor_tests/installer/base.py @@ -14,8 +14,9 @@ import pwd import six import stat import subprocess +import time -from doctor_tests.common.utils import get_doctor_test_root_dir +from doctor_tests.common import utils from doctor_tests.identity_auth import get_session from doctor_tests.os_clients import nova_client @@ -75,7 +76,7 @@ class BaseInstaller(object): cmd = ("ssh -o UserKnownHostsFile=/dev/null" " -o StrictHostKeyChecking=no" " -i %s %s@%s -R %s:localhost:%s" - " sleep %s > ssh_tunnel.%s" + " sleep %s > ssh_tunnel.%s.%s" " 2>&1 < /dev/null " % (self.key_file, self.node_user_name, @@ -83,9 +84,28 @@ class BaseInstaller(object): port, port, tunnel_uptime, - node_ip)) + node_ip, + port)) server = subprocess.Popen('exec ' + cmd, shell=True) self.servers.append(server) + if self.conf.admin_tool.type == 'fenix': + port = self.conf.admin_tool.port + self.log.info('tunnel for port %s' % port) + cmd = ("ssh -o UserKnownHostsFile=/dev/null" + " -o StrictHostKeyChecking=no" + " -i %s %s@%s -L %s:localhost:%s" + " sleep %s > ssh_tunnel.%s.%s" + " 2>&1 < /dev/null " + % (self.key_file, + self.node_user_name, + node_ip, + port, + port, + tunnel_uptime, + node_ip, + port)) + server = subprocess.Popen('exec ' + cmd, shell=True) + self.servers.append(server) def _get_ssh_key(self, client, key_path): self.log.info('Get SSH keys from %s installer......' @@ -96,7 +116,8 @@ class BaseInstaller(object): % self.conf.installer.type) return self.key_file - ssh_key = '{0}/{1}'.format(get_doctor_test_root_dir(), 'instack_key') + ssh_key = '{0}/{1}'.format(utils.get_doctor_test_root_dir(), + 'instack_key') client.scp(key_path, ssh_key, method='get') user = getpass.getuser() uid = pwd.getpwnam(user).pw_uid @@ -105,6 +126,10 @@ class BaseInstaller(object): os.chmod(ssh_key, stat.S_IREAD) return ssh_key + @abc.abstractmethod + def get_transport_url(self): + pass + def _run_cmd_remote(self, client, command): self.log.info('Run command=%s in %s installer......' % (command, self.conf.installer.type)) @@ -131,19 +156,36 @@ class BaseInstaller(object): ret = False return ret + @utils.run_async def _run_apply_patches(self, client, restart_cmd, script_names, python='python3'): installer_dir = os.path.dirname(os.path.realpath(__file__)) - if isinstance(script_names, list): for script_name in script_names: script_abs_path = '{0}/{1}/{2}'.format(installer_dir, 'common', script_name) - client.scp(script_abs_path, script_name) - cmd = 'sudo %s %s' % (python, script_name) - ret, output = client.ssh(cmd) + if self.conf.installer.type == "devstack": + script_name = "/opt/stack/%s" % script_name + try: + client.scp(script_abs_path, script_name) + except Exception: + client.scp(script_abs_path, script_name) + try: + if ".py" in script_name: + cmd = 'sudo %s %s' % (python, script_name) + else: + cmd = 'sudo chmod 700 %s;sudo ./%s' % (script_name, + script_name) + ret, output = client.ssh(cmd) + self.log.info('Command %s output %s' % (cmd, output)) + except Exception: + ret, output = client.ssh(cmd) + self.log.info('Command %s output %s' % (cmd, output)) if ret: raise Exception('Do the command in remote' ' node failed, ret=%s, cmd=%s, output=%s' % (ret, cmd, output)) + if 'nova' in restart_cmd or 'devstack@n-' in restart_cmd: + # Make sure scheduler has proper cpu_allocation_ratio + time.sleep(5) client.ssh(restart_cmd) diff --git a/doctor_tests/installer/common/restore_compute_config.py b/doctor_tests/installer/common/restore_compute_config.py index 0e9939fd..82e10a66 100644 --- a/doctor_tests/installer/common/restore_compute_config.py +++ b/doctor_tests/installer/common/restore_compute_config.py @@ -11,18 +11,16 @@ import shutil def restore_cpu_allocation_ratio(): - nova_base = "/var/lib/config-data/puppet-generated/nova" - if not os.path.isdir(nova_base): - nova_base = "" - nova_file = nova_base + '/etc/nova/nova.conf' - nova_file_bak = nova_base + '/etc/nova/nova.bak' - - if not os.path.isfile(nova_file_bak): - print('Bak_file:%s does not exist.' % nova_file_bak) - else: - print('restore: %s' % nova_file) - shutil.copyfile(nova_file_bak, nova_file) - os.remove(nova_file_bak) + for nova_file_bak in ["/var/lib/config-data/puppet-generated/nova_libvirt/etc/nova/nova.bak", # noqa + "/var/lib/config-data/puppet-generated/nova/etc/nova/nova.bak", # noqa + "/etc/nova/nova.bak"]: + if os.path.isfile(nova_file_bak): + nova_file = nova_file_bak.replace(".bak", ".conf") + print('restoring nova.bak.') + shutil.copyfile(nova_file_bak, nova_file) + os.remove(nova_file_bak) + return + print('nova.bak does not exist.') return restore_cpu_allocation_ratio() diff --git a/doctor_tests/installer/common/set_compute_config.py b/doctor_tests/installer/common/set_compute_config.py index 86266085..615f1895 100644 --- a/doctor_tests/installer/common/set_compute_config.py +++ b/doctor_tests/installer/common/set_compute_config.py @@ -10,37 +10,25 @@ import os import shutil -def make_initial_config(service, dest): - for mk in ["", "/etc", "/%s" % service]: - dest += mk - os.mkdir(dest) - src = "/etc/%s/%s.conf" % (service, service) - dest += "/%s.conf" % service - shutil.copyfile(src, dest) - - def set_cpu_allocation_ratio(): - docker_conf_base_dir = "/var/lib/config-data/puppet-generated" - if not os.path.isdir(docker_conf_base_dir): - nova_base = "" - else: - nova_base = "%s/nova" % docker_conf_base_dir - if not os.path.isdir(nova_base): - # nova.conf to be used might not exist - make_initial_config("nova", nova_base) - nova_file = nova_base + '/etc/nova/nova.conf' - nova_file_bak = nova_base + '/etc/nova/nova.bak' + nova_file_bak = None + for nova_file in ["/var/lib/config-data/puppet-generated/nova_libvirt/etc/nova/nova.conf", # noqa + "/var/lib/config-data/puppet-generated/nova/etc/nova/nova.conf", # noqa + "/etc/nova/nova.conf"]: + if os.path.isfile(nova_file): + nova_file_bak = nova_file.replace(".conf", ".bak") + break - if not os.path.isfile(nova_file): - raise Exception("File doesn't exist: %s." % nova_file) + if nova_file_bak is None: + raise Exception("Could not find nova.conf") # TODO (tojuvone): Unfortunately ConfigParser did not produce working conf fcheck = open(nova_file) found_list = ([ca for ca in fcheck.readlines() if "cpu_allocation_ratio" in ca]) fcheck.close() + change = False + found = False if found_list and len(found_list): - change = False - found = False for car in found_list: if car.startswith('#'): continue diff --git a/doctor_tests/installer/common/set_config.py b/doctor_tests/installer/common/set_config.py index 3dc6cd9a..e66d4c2c 100644 --- a/doctor_tests/installer/common/set_config.py +++ b/doctor_tests/installer/common/set_config.py @@ -125,6 +125,7 @@ def set_event_definitions(): 'reply_url': {'fields': 'payload.reply_url'}, 'actions_at': {'fields': 'payload.actions_at', 'type': 'datetime'}, + 'reply_at': {'fields': 'payload.reply_at', 'type': 'datetime'}, 'state': {'fields': 'payload.state'}, 'session_id': {'fields': 'payload.session_id'}, 'project_id': {'fields': 'payload.project_id'}, diff --git a/doctor_tests/installer/common/set_fenix.sh b/doctor_tests/installer/common/set_fenix.sh new file mode 100644 index 00000000..bd1eae47 --- /dev/null +++ b/doctor_tests/installer/common/set_fenix.sh @@ -0,0 +1,106 @@ +#!/usr/bin/env bash + +############################################################################## +# Copyright (c) 2019 Nokia Corporation and others. +# +# All rights reserved. This program and the accompanying materials +# are made available under the terms of the Apache License, Version 2.0 +# which accompanies this distribution, and is available at +# http://www.apache.org/licenses/LICENSE-2.0 +############################################################################## + +# Config files +docker -v >/dev/null || { +echo "Fenix needs docker to be installed..." +ver=`grep "UBUNTU_CODENAME" /etc/os-release | cut -d '=' -f 2` +curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo apt-key add - +add-apt-repository "deb [arch=amd64] https://download.docker.com/linux/ubuntu $ver stable" +apt install apt-transport-https ca-certificates curl software-properties-common +apt update +apt-cache policy docker-ce +apt-get install -y docker-ce docker-ce-cli containerd.io +dpkg -r --force-depends golang-docker-credential-helpers +} + +docker ps | grep fenix -q && { +REMOTE=`git ls-remote https://opendev.org/x/fenix HEAD | awk '{ print $1}'` +LOCAL=`docker exec -t fenix git rev-parse @` +if [[ "$LOCAL" =~ "$REMOTE" ]]; then + # Difference in above string ending marks, so cannot compare equal + echo "Fenix start: Already running latest $LOCAL equals $REMOTE" + exit 0 +else + echo "Fenix container needs to be recreated $LOCAL not $REMOTE" + # Remove previous container + for img in `docker image list | grep "^fenix" | awk '{print $1}'`; do + for dock in `docker ps --all -f "ancestor=$img" | grep "$img" | awk '{print $1}'`; do + docker stop $dock; docker rm $dock; + done; + docker image rm $img; + done +fi +} || echo "Fenix container needs to be created..." + +cp /root/keystonercv3 . + +transport=`grep -m1 "^transport" /etc/nova/nova.conf` +. keystonercv3 + +echo "[DEFAULT]" > fenix.conf +echo "port = 12347" >> fenix.conf +echo $transport >> fenix.conf + +echo "[database]" >> fenix.conf +MYSQLIP=`grep -m1 "^connection" /etc/nova/nova.conf | sed -e "s/.*@//;s/\/.*//"` +echo "connection = mysql+pymysql://fenix:fenix@$MYSQLIP/fenix" >> fenix.conf + +echo "[service_user]" >> fenix.conf +echo "os_auth_url = $OS_AUTH_URL" >> fenix.conf +echo "os_username = $OS_USERNAME" >> fenix.conf +echo "os_password = $OS_PASSWORD" >> fenix.conf +echo "os_user_domain_name = $OS_USER_DOMAIN_NAME" >> fenix.conf +echo "os_project_name = $OS_PROJECT_NAME" >> fenix.conf +echo "os_project_domain_name = $OS_PROJECT_DOMAIN_NAME" >> fenix.conf + +echo "[DEFAULT]" > fenix-api.conf +echo "port = 12347" >> fenix-api.conf +echo $transport >> fenix-api.conf + +echo "[keystone_authtoken]" >> fenix-api.conf +echo "auth_url = $OS_AUTH_URL" >> fenix-api.conf +echo "auth_type = password" >> fenix-api.conf +echo "project_domain_name = $OS_PROJECT_DOMAIN_NAME" >> fenix-api.conf +echo "project_name = $OS_PROJECT_NAME" >> fenix-api.conf +echo "user_domain_name = $OS_PROJECT_DOMAIN_NAME" >> fenix-api.conf +echo "password = $OS_PASSWORD" >> fenix-api.conf +echo "username = $OS_USERNAME" >> fenix-api.conf +echo "cafile = /opt/stack/data/ca-bundle.pem" >> fenix-api.conf + +openstack service list | grep -q maintenance || { +openstack service create --name fenix --enable maintenance +openstack endpoint create --region $OS_REGION_NAME --enable fenix public http://localhost:12347/v1 +} + +# Mysql pw +# MYSQLPW=`cat /var/lib/config-data/mysql/etc/puppet/hieradata/service_configs.json | grep mysql | grep root_password | awk -F": " '{print $2}' | awk -F"\"" '{print $2}'` +MYSQLPW=root + +# Fenix DB +[ `mysql -uroot -p$MYSQLPW -e "SELECT host, user FROM mysql.user;" | grep fenix | wc -l` -eq 0 ] && { + mysql -uroot -p$MYSQLPW -hlocalhost -e "CREATE USER 'fenix'@'localhost' IDENTIFIED BY 'fenix';" + mysql -uroot -p$MYSQLPW -hlocalhost -e "GRANT ALL PRIVILEGES ON fenix.* TO 'fenix'@'' identified by 'fenix';FLUSH PRIVILEGES;" +} +mysql -ufenix -pfenix -hlocalhost -e "DROP DATABASE IF EXISTS fenix;" +mysql -ufenix -pfenix -hlocalhost -e "CREATE DATABASE fenix CHARACTER SET utf8;" + +# Build Fenix container and run it +chmod 700 run +docker build --build-arg OPENSTACK=master --build-arg BRANCH=master --network host $PWD -t fenix | tail -1 +docker run --network host -d --name fenix -p 12347:12347 -ti fenix +if [ $? -eq 0 ]; then + echo "Fenix start: OK" +else + echo "Fenix start: FAILED" +fi +# To debug check log from fenix container +# docker exec -ti fenix tail -f /var/log/fenix-engine.log diff --git a/doctor_tests/installer/devstack.py b/doctor_tests/installer/devstack.py new file mode 100644 index 00000000..02f3601a --- /dev/null +++ b/doctor_tests/installer/devstack.py @@ -0,0 +1,151 @@ +############################################################################## +# Copyright (c) 2019 Nokia Corporation and others. +# +# All rights reserved. This program and the accompanying materials +# are made available under the terms of the Apache License, Version 2.0 +# which accompanies this distribution, and is available at +# http://www.apache.org/licenses/LICENSE-2.0 +############################################################################## +import os +import socket +import time + +from doctor_tests.common.utils import SSHClient +from doctor_tests.common.utils import LocalSSH +from doctor_tests.identity_auth import get_session +from doctor_tests.installer.base import BaseInstaller +from doctor_tests.os_clients import nova_client + + +class DevstackInstaller(BaseInstaller): + node_user_name = None + cm_set_script = 'set_config.py' + nc_set_compute_script = 'set_compute_config.py' + cm_restore_script = 'restore_config.py' + nc_restore_compute_script = 'restore_compute_config.py' + ac_restart_script = 'restart_aodh.py' + ac_restore_script = 'restore_aodh.py' + python = 'python' + + def __init__(self, conf, log): + super(DevstackInstaller, self).__init__(conf, log) + # Run Doctor under users home. sudo hides other env param to be used + home, self.node_user_name = (iter(os.environ.get('VIRTUAL_ENV') + .split('/', 3)[1:3])) + # Migration needs to work so ssh should have proper key defined + self.key_file = '/%s/%s/.ssh/id_rsa' % (home, self.node_user_name) + self.log.info('ssh uses: %s and %s' % (self.node_user_name, + self.key_file)) + self.controllers = ([ip for ip in + socket.gethostbyname_ex(socket.gethostname())[2] + if not ip.startswith('127.')] or + [[(s.connect(('8.8.8.8', 53)), + s.getsockname()[0], s.close()) + for s in [socket.socket(socket.AF_INET, + socket.SOCK_DGRAM)]][0][1]]) + conf.admin_tool.ip = self.controllers[0] + self.computes = list() + self.nova = nova_client(conf.nova_version, get_session()) + + def setup(self): + self.log.info('Setup Devstack installer start......') + self._get_devstack_conf() + self.create_flavor() + self.set_apply_patches() + + def cleanup(self): + self.restore_apply_patches() + + def get_ssh_key_from_installer(self): + return self.key_file + + def get_transport_url(self): + client = LocalSSH(self.log) + cmd = 'sudo grep -m1 "^transport_url" /etc/nova/nova.conf' + ret, url = client.ssh(cmd) + url = url.split("= ", 1)[1][:-1] + self.log.info('get_transport_url %s' % url) + return url + + def get_host_ip_from_hostname(self, hostname): + return [hvisor.__getattr__('host_ip') for hvisor in self.hvisors + if hvisor.__getattr__('hypervisor_hostname') == hostname][0] + + def _get_devstack_conf(self): + self.log.info('Get devstack config details for Devstack installer' + '......') + self.hvisors = self.nova.hypervisors.list(detailed=True) + self.log.info('checking hypervisors.......') + self.computes = [hvisor.__getattr__('host_ip') for hvisor in + self.hvisors] + self.use_containers = False + self.log.info('controller_ips:%s' % self.controllers) + self.log.info('compute_ips:%s' % self.computes) + self.log.info('use_containers:%s' % self.use_containers) + + def _set_docker_restart_cmd(self, service): + # There can be multiple instances running so need to restart all + cmd = "for container in `sudo docker ps | grep " + cmd += service + cmd += " | awk '{print $1}'`; do sudo docker restart $container; \ + done;" + return cmd + + def set_apply_patches(self): + self.log.info('Set apply patches start......') + + set_scripts = [self.cm_set_script] + + restart_cmd = 'sudo systemctl restart' \ + ' devstack@ceilometer-anotification.service' + + client = LocalSSH(self.log) + self._run_apply_patches(client, + restart_cmd, + set_scripts, + python=self.python) + time.sleep(7) + + self.log.info('Set apply patches start......') + + if self.conf.test_case != 'fault_management': + restart_cmd = 'sudo systemctl restart' \ + ' devstack@n-cpu.service' + for node_ip in self.computes: + client = SSHClient(node_ip, self.node_user_name, + key_filename=self.key_file) + self._run_apply_patches(client, + restart_cmd, + [self.nc_set_compute_script], + python=self.python) + time.sleep(7) + + def restore_apply_patches(self): + self.log.info('restore apply patches start......') + + restore_scripts = [self.cm_restore_script] + + restart_cmd = 'sudo systemctl restart' \ + ' devstack@ceilometer-anotification.service' + + if self.conf.test_case != 'fault_management': + restart_cmd += ' devstack@n-sch.service' + restore_scripts.append(self.nc_restore_compute_script) + + client = LocalSSH(self.log) + self._run_apply_patches(client, + restart_cmd, + restore_scripts, + python=self.python) + + if self.conf.test_case != 'fault_management': + + restart_cmd = 'sudo systemctl restart' \ + ' devstack@n-cpu.service' + for node_ip in self.computes: + client = SSHClient(node_ip, self.node_user_name, + key_filename=self.key_file) + self._run_apply_patches( + client, restart_cmd, + [self.nc_restore_compute_script], + python=self.python) diff --git a/doctor_tests/installer/local.py b/doctor_tests/installer/local.py deleted file mode 100644 index fee14f33..00000000 --- a/doctor_tests/installer/local.py +++ /dev/null @@ -1,118 +0,0 @@ -############################################################################## -# Copyright (c) 2017 ZTE Corporation and others. -# -# All rights reserved. This program and the accompanying materials -# are made available under the terms of the Apache License, Version 2.0 -# which accompanies this distribution, and is available at -# http://www.apache.org/licenses/LICENSE-2.0 -############################################################################## -import os -import shutil -import subprocess - -from doctor_tests.installer.base import BaseInstaller -from doctor_tests.installer.common.vitrage import \ - set_vitrage_host_down_template -from doctor_tests.common.constants import Inspector -from doctor_tests.common.utils import load_json_file -from doctor_tests.common.utils import write_json_file - - -class LocalInstaller(BaseInstaller): - node_user_name = 'root' - - nova_policy_file = '/etc/nova/policy.json' - nova_policy_file_backup = '%s%s' % (nova_policy_file, '.bak') - - def __init__(self, conf, log): - super(LocalInstaller, self).__init__(conf, log) - self.policy_modified = False - self.add_policy_file = False - - def setup(self): - self.get_ssh_key_from_installer() - self.set_apply_patches() - - def cleanup(self): - self.restore_apply_patches() - - def get_ssh_key_from_installer(self): - self.log.info('Assuming SSH keys already exchanged with computer' - 'for local installer type') - return None - - def get_host_ip_from_hostname(self, hostname): - self.log.info('Get host ip from host name in local installer......') - - cmd = "getent hosts %s | awk '{ print $1 }'" % (hostname) - server = subprocess.Popen(cmd, stdout=subprocess.PIPE, shell=True) - stdout, stderr = server.communicate() - host_ip = stdout.strip().decode("utf-8") - - self.log.info('Get host_ip:%s from host_name:%s in local installer' - % (host_ip, hostname)) - return host_ip - - def set_apply_patches(self): - self._set_nova_policy() - if self.conf.inspector.type == Inspector.VITRAGE: - set_vitrage_host_down_template() - os.system('sudo systemctl restart devstack@vitrage-graph.service') - - def restore_apply_patches(self): - self._restore_nova_policy() - - def _set_nova_policy(self): - host_status_policy = 'os_compute_api:servers:show:host_status' - host_status_rule = 'rule:admin_or_owner' - policy_data = { - 'context_is_admin': 'role:admin', - 'owner': 'user_id:%(user_id)s', - 'admin_or_owner': 'rule:context_is_admin or rule:owner', - host_status_policy: host_status_rule - } - - if os.path.isfile(self.nova_policy_file): - data = load_json_file(self.nova_policy_file) - if host_status_policy in data: - rule_origion = data[host_status_policy] - if host_status_rule == rule_origion: - self.log.info('Do not need to modify nova policy.') - self.policy_modified = False - else: - # update the host_status_policy - data[host_status_policy] = host_status_rule - self.policy_modified = True - else: - # add the host_status_policy, if the admin_or_owner is not - # defined, add it also - for policy, rule in policy_data.items(): - if policy not in data: - data[policy] = rule - self.policy_modified = True - if self.policy_modified: - self.log.info('Nova policy is Modified.') - shutil.copyfile(self.nova_policy_file, - self.nova_policy_file_backup) - else: - # file does not exit, create a new one and add the policy - self.log.info('Nova policy file not exist. Creating a new one') - data = policy_data - self.add_policy_file = True - - if self.policy_modified or self.add_policy_file: - write_json_file(self.nova_policy_file, data) - os.system('sudo systemctl restart devstack@n-api.service') - - def _restore_nova_policy(self): - if self.policy_modified: - shutil.copyfile(self.nova_policy_file_backup, - self.nova_policy_file) - os.remove(self.nova_policy_file_backup) - elif self.add_policy_file: - os.remove(self.nova_policy_file) - - if self.add_policy_file or self.policy_modified: - os.system('sudo systemctl restart devstack@n-api.service') - self.add_policy_file = False - self.policy_modified = False diff --git a/doctor_tests/installer/mcp.py b/doctor_tests/installer/mcp.py index 9cfff92d..7659c9e2 100644 --- a/doctor_tests/installer/mcp.py +++ b/doctor_tests/installer/mcp.py @@ -1,5 +1,5 @@ ############################################################################## -# Copyright (c) 2018 ZTE Corporation and others. +# Copyright (c) 2019 ZTE Corporation and others. # # All rights reserved. This program and the accompanying materials # are made available under the terms of the Apache License, Version 2.0 @@ -7,15 +7,26 @@ # http://www.apache.org/licenses/LICENSE-2.0 ############################################################################## from os.path import isfile +import re +import time +from doctor_tests.common.constants import is_fenix +from doctor_tests.common.utils import get_doctor_test_root_dir from doctor_tests.common.utils import SSHClient from doctor_tests.installer.base import BaseInstaller class McpInstaller(BaseInstaller): node_user_name = 'ubuntu' - cm_set_script = 'set_ceilometer.py' - cm_restore_script = 'restore_ceilometer.py' + + cm_set_script = 'set_config.py' + nc_set_compute_script = 'set_compute_config.py' + fe_set_script = 'set_fenix.sh' + cm_restore_script = 'restore_config.py' + nc_restore_compute_script = 'restore_compute_config.py' + ac_restart_script = 'restart_aodh.py' + ac_restore_script = 'restore_aodh.py' + python = 'python3' def __init__(self, conf, log): super(McpInstaller, self).__init__(conf, log) @@ -26,40 +37,87 @@ class McpInstaller(BaseInstaller): look_for_keys=True) self.controllers = list() self.controller_clients = list() + self.computes = list() def setup(self): self.log.info('Setup MCP installer start......') - - self.controllers = self.get_controller_ips() + self.get_node_ips() self.create_flavor() - self.set_apply_patches() + if is_fenix(self.conf): + self.set_apply_patches() self.setup_stunnel() def cleanup(self): - self.restore_apply_patches() + if is_fenix(self.conf): + self.restore_apply_patches() for server in self.servers: server.terminate() def get_ssh_key_from_installer(self): self.log.info('Get SSH keys from MCP......') - # Assuming mcp.rsa is already mapped to functest container - # if not, only the test runs on jumphost can get the ssh_key - # default in path /var/lib/opnfv/mcp.rsa + # Default in path /var/lib/opnfv/mcp.rsa ssh_key = '/root/.ssh/id_rsa' mcp_key = '/var/lib/opnfv/mcp.rsa' - return ssh_key if isfile(ssh_key) else mcp_key - - def get_controller_ips(self): - self.log.info('Get controller ips from Mcp installer......') - - command = "sudo salt --out yaml 'ctl*' " \ - "pillar.get _param:openstack_control_address |" \ - "awk '{print $2}'" - controllers = self._run_cmd_remote(self.client, command) - self.log.info('Get controller_ips:%s from Mcp installer' - % controllers) - return controllers + return mcp_key if isfile(mcp_key) else ssh_key + + def get_transport_url(self): + client = SSHClient(self.controllers[0], self.node_user_name, + key_filename=self.key_file) + try: + cmd = 'sudo grep -m1 "^transport_url" /etc/nova/nova.conf' + ret, url = client.ssh(cmd) + + if ret: + raise Exception('Exec command to get transport from ' + 'controller(%s) in MCP installer failed, ' + 'ret=%s, output=%s' + % (self.controllers[0], ret, url)) + elif self.controllers[0] not in url: + # need to use ip instead of hostname + url = (re.sub("@.*:", "@%s:" % self.controllers[0], + url[0].split("=", 1)[1])) + except Exception: + cmd = 'grep -i "^rabbit" /etc/nova/nova.conf' + ret, lines = client.ssh(cmd) + if ret: + raise Exception('Exec command to get transport from ' + 'controller(%s) in MCP installer failed, ' + 'ret=%s, output=%s' + % (self.controllers[0], ret, url)) + else: + for line in lines.split('\n'): + if line.startswith("rabbit_userid"): + rabbit_userid = line.split("=") + if line.startswith("rabbit_port"): + rabbit_port = line.split("=") + if line.startswith("rabbit_password"): + rabbit_password = line.split("=") + url = "rabbit://%s:%s@%s:%s/?ssl=0" % (rabbit_userid, + rabbit_password, + self.controllers[0], + rabbit_port) + self.log.info('get_transport_url %s' % url) + return url + + def _copy_overcloudrc_to_controllers(self): + for ip in self.controllers: + cmd = "scp overcloudrc %s@%s:" % (self.node_user_name, ip) + self._run_cmd_remote(self.client, cmd) + + def get_node_ips(self): + self.log.info('Get node ips from Mcp installer......') + + command = 'sudo salt "*" --out yaml pillar.get _param:single_address' + node_details = self._run_cmd_remote(self.client, command) + + self.controllers = [line.split()[1] for line in node_details + if line.startswith("ctl")] + self.computes = [line.split()[1] for line in node_details + if line.startswith("cmp")] + + self.log.info('controller_ips:%s' % self.controllers) + self.log.info('compute_ips:%s' % self.computes) def get_host_ip_from_hostname(self, hostname): command = "sudo salt --out yaml '%s*' " \ @@ -70,21 +128,80 @@ class McpInstaller(BaseInstaller): def set_apply_patches(self): self.log.info('Set apply patches start......') + fenix_files = None + set_scripts = [self.cm_set_script] + thrs = [] + + restart_cmd = 'sudo systemctl restart' \ + ' ceilometer-agent-notification.service' + + if self.conf.test_case != 'fault_management': + if is_fenix(self.conf): + set_scripts.append(self.fe_set_script) + testdir = get_doctor_test_root_dir() + fenix_files = ["Dockerfile", "run"] + restart_cmd += ' nova-scheduler.service' + set_scripts.append(self.nc_set_compute_script) - restart_cm_cmd = 'sudo service ceilometer-agent-notification restart' for node_ip in self.controllers: client = SSHClient(node_ip, self.node_user_name, key_filename=self.key_file) - self.controller_clients.append(client) - self._run_apply_patches(client, - restart_cm_cmd, - [self.cm_set_script]) + if fenix_files is not None: + for fenix_file in fenix_files: + src_file = '{0}/{1}/{2}'.format(testdir, + 'admin_tool/fenix', + fenix_file) + client.scp(src_file, fenix_file) + thrs.append(self._run_apply_patches(client, + restart_cmd, + set_scripts, + python=self.python)) + time.sleep(5) + + self.log.info('Set apply patches start......') + + if self.conf.test_case != 'fault_management': + restart_cmd = 'sudo systemctl restart nova-compute.service' + for node_ip in self.computes: + client = SSHClient(node_ip, self.node_user_name, + key_filename=self.key_file) + thrs.append(self._run_apply_patches( + client, + restart_cmd, + [self.nc_set_compute_script], + python=self.python)) + time.sleep(5) + # If Fenix container ir build, it needs to be ready before continue + for thr in thrs: + thr.join() def restore_apply_patches(self): self.log.info('restore apply patches start......') - restart_cm_cmd = 'sudo service ceilometer-agent-notification restart' - for client in self.controller_clients: + restore_scripts = [self.cm_restore_script] + + restore_scripts.append(self.ac_restore_script) + restart_cmd = 'sudo systemctl restart' \ + ' ceilometer-agent-notification.service' + + if self.conf.test_case != 'fault_management': + restart_cmd += ' nova-scheduler.service' + restore_scripts.append(self.nc_restore_compute_script) + + for node_ip in self.controllers: + client = SSHClient(node_ip, self.node_user_name, + key_filename=self.key_file) self._run_apply_patches(client, - restart_cm_cmd, - [self.cm_restore_script]) + restart_cmd, + restore_scripts, + python=self.python) + + if self.conf.test_case != 'fault_management': + restart_cmd = 'sudo systemctl restart nova-compute.service' + for node_ip in self.computes: + client = SSHClient(node_ip, self.node_user_name, + key_filename=self.key_file) + self._run_apply_patches( + client, restart_cmd, + [self.nc_restore_compute_script], + python=self.python) diff --git a/doctor_tests/main.py b/doctor_tests/main.py index 438d8324..7573faec 100644 --- a/doctor_tests/main.py +++ b/doctor_tests/main.py @@ -1,5 +1,5 @@ ############################################################################## -# Copyright (c) 2017 ZTE Corporation and others. +# Copyright (c) 2019 ZTE Corporation and others. # # All rights reserved. This program and the accompanying materials # are made available under the terms of the Apache License, Version 2.0 @@ -43,7 +43,6 @@ class DoctorTest(object): def setup(self): # prepare the cloud env self.installer.setup() - # preparing VM image... self.image.create() @@ -51,37 +50,50 @@ class DoctorTest(object): self.user.create() def test_fault_management(self): - try: - LOG.info('doctor fault management test starting.......') - - self.fault_management = \ - FaultManagement(self.conf, self.installer, self.user, LOG) - - # prepare test env - self.fault_management.setup() - - # wait for aodh alarms are updated in caches for event evaluator, - # sleep time should be larger than event_alarm_cache_ttl - # (default 60) - # (tojuvone) Fraser currently needs 120 - time.sleep(120) - - # injecting host failure... - # NOTE (umar) add INTERFACE_NAME logic to host injection - self.fault_management.start() - time.sleep(30) - - # verify the test results - # NOTE (umar) copy remote monitor.log file when monitor=collectd - self.fault_management.check_host_status('down') - self.fault_management.check_notification_time() - - except Exception as e: - LOG.error('doctor fault management test failed, ' - 'Exception=%s' % e) - sys.exit(1) - finally: - self.fault_management.cleanup() + retry = 2 + # Retry once if notified_time is None + while retry > 0: + try: + self.fault_management = None + LOG.info('doctor fault management test starting.......') + transport_url = self.installer.get_transport_url() + self.fault_management = \ + FaultManagement(self.conf, self.installer, self.user, LOG, + transport_url) + + # prepare test env + self.fault_management.setup() + + # wait for aodh alarms are updated in caches for event + # evaluator,sleep time should be larger than + # event_alarm_cache_ttl (default 60) + # (tojuvone) Fraser currently needs 120 + time.sleep(120) + + # injecting host failure... + # NOTE (umar) add INTERFACE_NAME logic to host injection + self.fault_management.start() + time.sleep(30) + + # verify the test results + # NOTE (umar) copy remote monitor.log file when + # monitor=collectd + self.fault_management.check_host_status('down') + self.fault_management.check_notification_time() + retry = 0 + + except Exception as e: + LOG.error('doctor fault management test failed, ' + 'Exception=%s' % e) + if 'notified_time=None' in str(e): + retry -= 1 + LOG.info('doctor fault management retry') + continue + LOG.error(format_exc()) + sys.exit(1) + finally: + if self.fault_management is not None: + self.fault_management.cleanup() def _amount_compute_nodes(self): services = self.nova.services.list(binary='nova-compute') @@ -94,11 +106,12 @@ class DoctorTest(object): LOG.info('not enough compute nodes, skipping doctor ' 'maintenance test') return - elif self.conf.installer.type != 'apex': + elif self.conf.installer.type not in ['apex', 'fuel', 'devstack']: LOG.info('not supported installer, skipping doctor ' 'maintenance test') return try: + maintenance = None LOG.info('doctor maintenance test starting.......') trasport_url = self.installer.get_transport_url() maintenance = Maintenance(trasport_url, self.conf, LOG) @@ -120,7 +133,8 @@ class DoctorTest(object): LOG.error(format_exc()) sys.exit(1) finally: - maintenance.cleanup_maintenance() + if maintenance is not None: + maintenance.cleanup_maintenance() def run(self): """run doctor tests""" @@ -143,6 +157,7 @@ class DoctorTest(object): % function) except Exception as e: LOG.error('doctor test failed, Exception=%s' % e) + LOG.error(format_exc()) sys.exit(1) finally: self.cleanup() diff --git a/doctor_tests/scenario/fault_management.py b/doctor_tests/scenario/fault_management.py index 869311bd..0271dffe 100644 --- a/doctor_tests/scenario/fault_management.py +++ b/doctor_tests/scenario/fault_management.py @@ -40,7 +40,7 @@ sleep 1 class FaultManagement(object): - def __init__(self, conf, installer, user, log): + def __init__(self, conf, installer, user, log, transport_url): self.conf = conf self.log = log self.user = user @@ -55,7 +55,7 @@ class FaultManagement(object): self.network = Network(self.conf, log) self.instance = Instance(self.conf, log) self.alarm = Alarm(self.conf, log) - self.inspector = get_inspector(self.conf, log) + self.inspector = get_inspector(self.conf, log, transport_url) self.monitor = get_monitor(self.conf, self.inspector.get_inspector_url(), log) @@ -111,7 +111,10 @@ class FaultManagement(object): server = servers.get(vm_name) if not server: raise Exception('Can not find instance: vm_name(%s)' % vm_name) - host_name = server.__dict__.get('OS-EXT-SRV-ATTR:hypervisor_hostname') + # use hostname without domain name which is mapped to the cell + hostname = \ + server.__dict__.get('OS-EXT-SRV-ATTR:hypervisor_hostname') + host_name = hostname.split('.')[0] host_ip = self.installer.get_host_ip_from_hostname(host_name) self.log.info('Get host info(name:%s, ip:%s) which vm(%s) launched at' @@ -209,6 +212,10 @@ class FaultManagement(object): detected = self.monitor.detected_time notified = self.consumer.notified_time + if None in [vmdown, hostdown, detected, notified]: + self.log.info('one of the time for profiler is None, return') + return + # TODO(yujunz) check the actual delay to verify time sync status # expected ~1s delay from $trigger to $linkdown relative_start = linkdown diff --git a/doctor_tests/scenario/maintenance.py b/doctor_tests/scenario/maintenance.py index 9fcd4128..e6cdcccd 100644 --- a/doctor_tests/scenario/maintenance.py +++ b/doctor_tests/scenario/maintenance.py @@ -1,5 +1,5 @@ ############################################################################## -# Copyright (c) 2018 Nokia Corporation and others. +# Copyright (c) 2019 Nokia Corporation and others. # # All rights reserved. This program and the accompanying materials # are made available under the terms of the Apache License, Version 2.0 @@ -28,15 +28,25 @@ class Maintenance(object): def __init__(self, trasport_url, conf, log): self.conf = conf self.log = log + self.admin_session = get_session() self.keystone = keystone_client( self.conf.keystone_version, get_session()) self.nova = nova_client(conf.nova_version, get_session()) auth = get_identity_auth(project=self.conf.doctor_project) self.neutron = neutron_client(get_session(auth=auth)) self.stack = Stack(self.conf, self.log) - self.admin_tool = get_admin_tool(trasport_url, self.conf, self.log) + if self.conf.installer.type == "devstack": + self.endpoint_ip = trasport_url.split("@", 1)[1].split(":", 1)[0] + else: + self.endpoint_ip = self.conf.admin_tool.ip + self.endpoint = "http://%s:12347/" % self.endpoint_ip + if self.conf.admin_tool.type == 'sample': + self.admin_tool = get_admin_tool(trasport_url, self.conf, self.log) + self.endpoint += 'maintenance' + else: + self.endpoint += 'v1/maintenance' self.app_manager = get_app_manager(self.stack, self.conf, self.log) - self.inspector = get_inspector(self.conf, self.log) + self.inspector = get_inspector(self.conf, self.log, trasport_url) def get_external_network(self): ext_net = None @@ -64,8 +74,16 @@ class Maintenance(object): raise Exception('not enough vcpus (%d) on %s' % (vcpus, hostname)) if vcpus_used > 0: - raise Exception('%d vcpus used on %s' - % (vcpus_used, hostname)) + if self.conf.test_case == 'all': + # VCPU might not yet be free after fault_management test + self.log.info('%d vcpus used on %s, retry...' + % (vcpus_used, hostname)) + time.sleep(15) + hvisor = self.nova.hypervisors.get(hvisor.id) + vcpus_used = hvisor.__getattr__('vcpus_used') + if vcpus_used > 0: + raise Exception('%d vcpus used on %s' + % (vcpus_used, hostname)) if prev_vcpus != 0 and prev_vcpus != vcpus: raise Exception('%d vcpus on %s does not match to' '%d on %s' @@ -110,9 +128,14 @@ class Maintenance(object): parameters=parameters, files=files) - self.admin_tool.start() - self.app_manager.start() + if self.conf.admin_tool.type == 'sample': + self.admin_tool.start() + else: + # TBD Now we expect Fenix is running in self.conf.admin_tool.port + pass + # Inspector before app_manager, as floating ip might come late self.inspector.start() + self.app_manager.start() def start_maintenance(self): self.log.info('start maintenance.......') @@ -121,22 +144,49 @@ class Maintenance(object): for hvisor in hvisors: hostname = hvisor.__getattr__('hypervisor_hostname') maintenance_hosts.append(hostname) - - url = 'http://0.0.0.0:%s/maintenance' % self.conf.admin_tool.port - # let's start maintenance 20sec from now, so projects will have - # time to ACK to it before that - maintenance_at = (datetime.datetime.utcnow() + - datetime.timedelta(seconds=20) - ).strftime('%Y-%m-%d %H:%M:%S') - data = {'hosts': maintenance_hosts, - 'state': 'MAINTENANCE', - 'maintenance_at': maintenance_at, - 'metadata': {'openstack_version': 'Pike'}} + url = self.endpoint headers = { 'Content-Type': 'application/json', 'Accept': 'application/json'} - - ret = requests.post(url, data=json.dumps(data), headers=headers) + if self.conf.admin_tool.type == 'fenix': + headers['X-Auth-Token'] = self.admin_session.get_token() + self.log.info('url %s headers %s' % (url, headers)) + retries = 12 + ret = None + while retries > 0: + # let's start maintenance 20sec from now, so projects will have + # time to ACK to it before that + maintenance_at = (datetime.datetime.utcnow() + + datetime.timedelta(seconds=30) + ).strftime('%Y-%m-%d %H:%M:%S') + + data = {'state': 'MAINTENANCE', + 'maintenance_at': maintenance_at, + 'metadata': {'openstack_version': 'Train'}} + + if self.conf.app_manager.type == 'vnfm': + data['workflow'] = 'vnf' + else: + data['workflow'] = 'default' + + if self.conf.admin_tool.type == 'sample': + data['hosts'] = maintenance_hosts + else: + data['hosts'] = [] + try: + ret = requests.post(url, data=json.dumps(data), + headers=headers) + except Exception: + if retries == 0: + raise Exception('admin tool did not respond in 120s') + else: + self.log.info('admin tool not ready, retry in 10s') + retries = retries - 1 + time.sleep(10) + continue + break + if not ret: + raise Exception("admin tool did not respond") if ret.status_code != 200: raise Exception(ret.text) return ret.json()['session_id'] @@ -144,48 +194,56 @@ class Maintenance(object): def remove_maintenance_session(self, session_id): self.log.info('remove maintenance session %s.......' % session_id) - url = 'http://0.0.0.0:%s/maintenance' % self.conf.admin_tool.port + url = ('%s/%s' % (self.endpoint, session_id)) - data = {'state': 'REMOVE_MAINTENANCE_SESSION', - 'session_id': session_id} headers = { 'Content-Type': 'application/json', 'Accept': 'application/json'} - ret = requests.post(url, data=json.dumps(data), headers=headers) + if self.conf.admin_tool.type == 'fenix': + headers['X-Auth-Token'] = self.admin_session.get_token() + + ret = requests.delete(url, data=None, headers=headers) if ret.status_code != 200: raise Exception(ret.text) def get_maintenance_state(self, session_id): - url = 'http://0.0.0.0:%s/maintenance' % self.conf.admin_tool.port - data = {'session_id': session_id} + + url = ('%s/%s' % (self.endpoint, session_id)) + headers = { 'Content-Type': 'application/json', 'Accept': 'application/json'} - ret = requests.get(url, data=json.dumps(data), headers=headers) + + if self.conf.admin_tool.type == 'fenix': + headers['X-Auth-Token'] = self.admin_session.get_token() + + ret = requests.get(url, data=None, headers=headers) if ret.status_code != 200: raise Exception(ret.text) return ret.json()['state'] def wait_maintenance_complete(self, session_id): - retries = 66 + retries = 90 state = None - time.sleep(540) - while state != 'MAINTENANCE_COMPLETE' and retries > 0: + time.sleep(300) + while (state not in ['MAINTENANCE_DONE', 'MAINTENANCE_FAILED'] and + retries > 0): time.sleep(10) state = self.get_maintenance_state(session_id) retries = retries - 1 - if retries == 0 and state != 'MAINTENANCE_COMPLETE': - raise Exception('maintenance %s not completed within 20min, status' - ' %s' % (session_id, state)) - elif state == 'MAINTENANCE_COMPLETE': - self.log.info('maintenance %s %s' % (session_id, state)) - self.remove_maintenance_session(session_id) - elif state == 'MAINTENANCE_FAILED': + self.remove_maintenance_session(session_id) + self.log.info('maintenance %s ended with state %s' % + (session_id, state)) + if state == 'MAINTENANCE_FAILED': raise Exception('maintenance %s failed' % session_id) + elif retries == 0: + raise Exception('maintenance %s not completed within 20min' % + session_id) def cleanup_maintenance(self): - self.admin_tool.stop() + if self.conf.admin_tool.type == 'sample': + self.admin_tool.stop() self.app_manager.stop() self.inspector.stop() self.log.info('stack delete start.......') diff --git a/doctor_tests/stack.py b/doctor_tests/stack.py index ee586fa8..8a921beb 100644 --- a/doctor_tests/stack.py +++ b/doctor_tests/stack.py @@ -94,7 +94,7 @@ class Stack(object): # It might not always work at first self.log.info('retry creating maintenance stack.......') self.delete() - time.sleep(3) + time.sleep(5) stack = self.heat.stacks.create(stack_name=self.stack_name, files=files, template=template, diff --git a/doctor_tests/user.py b/doctor_tests/user.py index 29aa004b..2cd9757f 100644 --- a/doctor_tests/user.py +++ b/doctor_tests/user.py @@ -129,7 +129,6 @@ class User(object): def _add_user_role_in_project(self, is_admin=False): """add test user with test role in test project""" - project = self.projects.get(self.conf.doctor_project) user_name = 'admin' if is_admin else self.conf.doctor_user |