From 282369b6fd58a78e6a7c91f21b331363d4ed0fb3 Mon Sep 17 00:00:00 2001 From: Umar Farooq Date: Thu, 13 Jul 2017 12:20:56 +0200 Subject: Add Collectd as a Monitor Type A plugin for collectd is added to use collectd on compute as a monitor type. Monitor files are updated accordingly. The inspector now listens on all interfaces instead of only localhost to enable it to communicate with compute node. JIRA: DOCTOR-86 JIRA: DOCTOR-101 Change-Id: Idc834d428152e4687020eff7d8db36a652b1bf86 Signed-off-by: Umar Farooq --- tests/inspector.py | 3 +- tests/lib/monitor | 31 +++++ tests/lib/monitors/collectd/collectd | 101 +++++++++++++++ tests/lib/monitors/collectd/collectd_plugin.py | 167 +++++++++++++++++++++++++ tests/lib/monitors/sample/monitor.py | 124 ++++++++++++++++++ tests/lib/monitors/sample/sample | 18 +++ tests/main.py | 2 + tests/monitor.py | 124 ------------------ tests/monitor/__init__.py | 5 +- tests/monitor/collectd.py | 145 +++++++++++++++++++++ tests/run.sh | 31 ++--- 11 files changed, 608 insertions(+), 143 deletions(-) create mode 100644 tests/lib/monitor create mode 100644 tests/lib/monitors/collectd/collectd create mode 100644 tests/lib/monitors/collectd/collectd_plugin.py create mode 100644 tests/lib/monitors/sample/monitor.py create mode 100644 tests/lib/monitors/sample/sample delete mode 100644 tests/monitor.py create mode 100644 tests/monitor/collectd.py (limited to 'tests') diff --git a/tests/inspector.py b/tests/inspector.py index a61051f1..82ffc338 100644 --- a/tests/inspector.py +++ b/tests/inspector.py @@ -116,8 +116,7 @@ def get_args(): def main(): args = get_args() - app.run(port=args.port) - + app.run(host='0.0.0.0', port=args.port) if __name__ == '__main__': main() diff --git a/tests/lib/monitor b/tests/lib/monitor new file mode 100644 index 00000000..6b804ec2 --- /dev/null +++ b/tests/lib/monitor @@ -0,0 +1,31 @@ +#!/bin/bash + +MONITOR_TYPE=${MONITOR_TYPE:-sample} + +function is_monitor_supported { + local monitor="$1" + [[ -f $TOP_DIR/lib/monitors/$monitor/$monitor ]] +} + +function is_monitor { + local monitor="$1" + [[ $monitor == $MONITOR_TYPE ]] +} + +function start_monitor { + start_monitor_$MONITOR_TYPE +} + +function stop_monitor { + stop_monitor_$MONITOR_TYPE +} + +function cleanup_monitor { + cleanup_monitor_$MONITOR_TYPE +} + +if ! is_monitor_supported $MONITOR_TYPE; then + die $LINENO "MONITOR_TYPE=$MONITOR_TYPE is not supported." +fi + +source $TOP_DIR/lib/monitors/$MONITOR_TYPE/$MONITOR_TYPE diff --git a/tests/lib/monitors/collectd/collectd b/tests/lib/monitors/collectd/collectd new file mode 100644 index 00000000..f5096658 --- /dev/null +++ b/tests/lib/monitors/collectd/collectd @@ -0,0 +1,101 @@ +#!/bin/bash + +function start_monitor_collectd { + ## CONTROL_IP is the IP of primary interface of control node i.e. + ## eth0, eno1. It is used by collectd monitor to communicate with + ## sample inspector. + ## @TODO (umar) see if mgmt IP of control is a better option. Also + ## primary interface may not be the right option + CONTROL_IP="$(ip a | sed -En 's/127.0.0.1//;s/.*inet (addr:)?(([0-9]*\.){3}[0-9]*).*/\2/p' | sed -n 1p)" + #CONTROL_IP=192.168.98.6 + + echo " +Hostname \"$COMPUTE_HOST\" +FQDNLookup false +Interval 1 +MaxReadInterval 2 + + + Globals true + +LoadPlugin ovs_events +LoadPlugin logfile + + + File \"/var/log/collectd.log\" + Timestamp true + LogLevel \"info\" + + + + ModulePath \"/home/$COMPUTE_USER\" + LogTraces true + Interactive false + Import \"collectd_plugin\" + + control_ip \"$CONTROL_IP\" + compute_ip \"$COMPUTE_IP\" + compute_host \"$COMPUTE_HOST\" + compute_user \"$COMPUTE_USER\" + inspector_type \"$INSPECTOR_TYPE\" + os_auth_url \"$OS_AUTH_URL\" + os_username \"$OS_USERNAME\" + os_password \"$OS_PASSWORD\" + os_project_name \"$OS_PROJECT_NAME\" + os_user_domain_name \"$OS_USER_DOMAIN_NAME\" + os_user_domain_id \"$OS_USER_DOMAIN_ID\" + os_project_domain_name \"$OS_PROJECT_DOMAIN_NAME\" + os_project_domain_id \"$OS_PROJECT_DOMAIN_ID\" + + + + + Port 6640 + Socket \"/var/run/openvswitch/db.sock\" + Interfaces \"@INTERFACE_NAME@\" + SendNotification true + DispatchValues false + + +" > $TOP_DIR/lib/monitors/collectd.conf + + scp $ssh_opts_cpu $TOP_DIR/lib/monitors/collectd.conf $COMPUTE_USER@$COMPUTE_IP: + ## @TODO (umar) Always assuming that the interface is assigned an IP if + ## interface name is not provided. See if there is a better approach + ssh $ssh_opts_cpu "$COMPUTE_USER@$COMPUTE_IP" " + if [ -n \"$INTERFACE_NAME\" ]; then + dev=$INTERFACE_NAME + else + dev=\$(sudo ip a | awk '/ $COMPUTE_IP\//{print \$NF}') + fi + sed -i -e \"s/@INTERFACE_NAME@/\$dev/\" collectd.conf + collectd_conf=/opt/collectd/etc/collectd.conf + if [ -e \$collectd_conf ]; then + sudo cp \$collectd_conf \${collectd_conf}-doctor-saved + else + sudo touch \${collectd_conf}-doctor-created + fi + sudo mv collectd.conf /opt/collectd/etc/collectd.conf" + + scp $ssh_opts_cpu $TOP_DIR/lib/monitors/collectd/collectd_plugin.py $COMPUTE_USER@$COMPUTE_IP:collectd_plugin.py + ssh $ssh_opts_cpu "$COMPUTE_USER@$COMPUTE_IP" "sudo pkill collectd + sudo /opt/collectd/sbin/collectd" +} + +function stop_monitor_collectd { + ssh $ssh_opts_cpu "$COMPUTE_USER@$COMPUTE_IP" 'sudo pkill collectd' +} + +function cleanup_monitor_collectd { + ssh $ssh_opts_cpu "$COMPUTE_USER@$COMPUTE_IP" " + collectd_conf=/opt/collectd/etc/collectd.conf + if [ -e \"\${collectd_conf}-doctor-created\" ]; then + sudo rm \"\${collectd_conf}-doctor-created\" + sudo rm \$collectd_conf + elif [ -e \"\${collectd_conf}-doctor-saved\" ]; then + sudo cp -f \"\${collectd_conf}-doctor-saved\" \$collectd_conf + sudo rm \"\${collectd_conf}-doctor-saved\" + fi" + + rm $TOP_DIR/lib/monitors/collectd.conf +} diff --git a/tests/lib/monitors/collectd/collectd_plugin.py b/tests/lib/monitors/collectd/collectd_plugin.py new file mode 100644 index 00000000..70fcf26e --- /dev/null +++ b/tests/lib/monitors/collectd/collectd_plugin.py @@ -0,0 +1,167 @@ +############################################################################## +# Copyright (c) 2017 NEC Corporation and others. +# +# All rights reserved. This program and the accompanying materials +# are made available under the terms of the Apache License, Version 2.0 +# which accompanies this distribution, and is available at +# http://www.apache.org/licenses/LICENSE-2.0 +############################################################################## + +import collectd +import sys +from netifaces import interfaces, ifaddresses, AF_INET +from datetime import datetime +import json +import requests +import time +from requests.exceptions import ConnectionError + +from keystoneauth1 import loading +from keystoneauth1 import session +from congressclient.v1 import client + + +def write_debug(str_write, write_type, compute_user): + file_name = ('/home/%s/monitor.log' % compute_user) + file_tmp = open(file_name, write_type) + file_tmp.write( "%s" % str_write) + file_tmp.close() + + +class DoctorMonitorCollectd(object): + def __init__(self): + self.control_ip = '' + self.compute_user = '' + self.compute_ip = '' + self.host_name = '' + self.inspector_type = '' + self.inspector_url = '' + self.os_auth_url = '' + self.os_username = '' + self.os_password = '' + self.os_project_name = '' + self.os_user_domain_name = '' + self.os_user_domain_id = '' + self.os_project_domain_name = '' + self.os_project_domain_id = '' + self.sess = '' + self.auth = '' + self.inspector_notified = 0 + self.start_notifications = 0 + self.monitor_type = 'sample' + + def config_func(self, config): + for node in config.children: + key = node.key.lower() + val = node.values[0] + + if key == 'compute_host': + self.host_name = val + elif key == 'control_ip': + self.control_ip = val + elif key == 'compute_ip': + self.compute_ip = val + elif key == 'compute_user': + self.compute_user = val + elif key == 'inspector_type': + self.inspector_type = val + elif key == 'os_auth_url': + self.os_auth_url = val + elif key == 'os_username': + self.os_username = val + elif key == 'os_password': + self.os_password = val + elif key == 'os_project_name': + self.os_project_name = val + elif key == 'os_user_domain_name': + self.os_user_domain_name = val + elif key == 'os_user_domain_id': + self.os_user_domain_id = val + elif key == 'os_project_domain_name': + self.os_project_domain_name = val + elif key == 'os_project_domain_id': + self.os_project_domain_id = val + else: + collectd.info('Unknown config key "%s"' % key) + + def init_collectd(self): + write_debug("Compute node collectd monitor start at %s\n\n" % datetime.now().isoformat(), "w", self.compute_user) + + if self.inspector_type == 'sample': + self.inspector_url = ('http://%s:12345/events' % self.control_ip) + elif self.inspector_type == 'congress': + loader = loading.get_plugin_loader('password') + self.auth = loader.load_from_options(auth_url=self.os_auth_url, + username=self.os_username, + password=self.os_password, + project_name=self.os_project_name, + user_domain_name=self.os_user_domain_name, + user_domain_id=self.os_user_domain_id, + project_domain_name=self.os_project_domain_name, + project_domain_id=self.os_project_domain_id) + self.sess=session.Session(auth=self.auth) + congress = client.Client(session=self.sess, service_type='policy') + ds = congress.list_datasources()['results'] + doctor_ds = next((item for item in ds if item['driver'] == 'doctor'), + None) + + congress_endpoint = congress.httpclient.get_endpoint(auth=self.auth) + self.inspector_url = ('%s/v1/data-sources/%s/tables/events/rows' % + (congress_endpoint, doctor_ds['id'])) + else: + sys.exit() + self.start_notifications = 1 + + + def notify_inspector(self): + event_type = "compute.host.down" + payload = [ + { + 'id': ("monitor_%s_id1" % self.monitor_type), + 'time': datetime.now().isoformat(), + 'type': event_type, + 'details': { + 'hostname': self.host_name, + 'status': 'down', + 'monitor': ("monitor_%s" % self.monitor_type), + 'monitor_event_id': ("monitor_%s_event1" % self.monitor_type) + }, + }, + ] + data = json.dumps(payload) + self.inspector_notified = 1 + + if self.inspector_type == 'sample': + headers = {'content-type': 'application/json'} + try: + requests.post(self.inspector_url, data=data, headers=headers) + except ConnectionError as err: + print err + elif self.inspector_type == 'congress': + # TODO(umar) enhance for token expiry case + headers = { + 'Content-Type': 'application/json', + 'Accept': 'application/json', + 'X-Auth-Token': self.sess.get_token() + } + requests.put(self.inspector_url, data=data, headers=headers) + + + def handle_notif(self, notification, data=None): + if (notification.severity == collectd.NOTIF_FAILURE or + notification.severity == collectd.NOTIF_WARNING): + if (self.start_notifications == 1 and self.inspector_notified == 0): + write_debug("Received down notification: doctor monitor detected at %s\n" % time.time(), "a", self.compute_user) + self.notify_inspector() + + elif notification.severity == collectd.NOTIF_OKAY: + collectd.info("Interface status: UP again %s\n" % time.time()) + else: + collectd.info("Unknown notification severity %s\n" % notification.severity) + + +monitor = DoctorMonitorCollectd() + +collectd.register_config(monitor.config_func) +collectd.register_init(monitor.init_collectd) +collectd.register_notification(monitor.handle_notif) diff --git a/tests/lib/monitors/sample/monitor.py b/tests/lib/monitors/sample/monitor.py new file mode 100644 index 00000000..7450c534 --- /dev/null +++ b/tests/lib/monitors/sample/monitor.py @@ -0,0 +1,124 @@ +############################################################################## +# Copyright (c) 2016 NEC Corporation and others. +# +# All rights reserved. This program and the accompanying materials +# are made available under the terms of the Apache License, Version 2.0 +# which accompanies this distribution, and is available at +# http://www.apache.org/licenses/LICENSE-2.0 +############################################################################## + +import argparse +from datetime import datetime +import json +import logger as doctor_log +import requests +import socket +import time + +from keystoneauth1 import session +from congressclient.v1 import client + +import identity_auth + +# NOTE: icmp message with all zero data (checksum = 0xf7ff) +# see https://tools.ietf.org/html/rfc792 +ICMP_ECHO_MESSAGE = '\x08\x00\xf7\xff\x00\x00\x00\x00' + +SUPPORTED_INSPECTOR_TYPES = ['sample', 'congress'] + +LOG = doctor_log.Logger('doctor_monitor').getLogger() + + +class DoctorMonitorSample(object): + + interval = 0.1 # second + timeout = 0.1 # second + event_type = "compute.host.down" + + def __init__(self, args): + if args.inspector_type not in SUPPORTED_INSPECTOR_TYPES: + raise Exception("Inspector type '%s' not supported", args.inspector_type) + + self.hostname = args.hostname + self.inspector_type = args.inspector_type + self.ip_addr = args.ip or socket.gethostbyname(self.hostname) + + if self.inspector_type == 'sample': + self.inspector_url = 'http://127.0.0.1:12345/events' + elif self.inspector_type == 'congress': + auth=identity_auth.get_identity_auth() + self.session=session.Session(auth=auth) + congress = client.Client(session=self.session, service_type='policy') + ds = congress.list_datasources()['results'] + doctor_ds = next((item for item in ds if item['driver'] == 'doctor'), + None) + + congress_endpoint = congress.httpclient.get_endpoint(auth=auth) + self.inspector_url = ('%s/v1/data-sources/%s/tables/events/rows' % + (congress_endpoint, doctor_ds['id'])) + + def start_loop(self): + LOG.debug("start ping to host %(h)s (ip=%(i)s)" % {'h': self.hostname, + 'i': self.ip_addr}) + sock = socket.socket(socket.AF_INET, socket.SOCK_RAW, + socket.IPPROTO_ICMP) + sock.settimeout(self.timeout) + while True: + try: + sock.sendto(ICMP_ECHO_MESSAGE, (self.ip_addr, 0)) + data = sock.recv(4096) + except socket.timeout: + LOG.info("doctor monitor detected at %s" % time.time()) + self.report_error() + LOG.info("ping timeout, quit monitoring...") + return + time.sleep(self.interval) + + def report_error(self): + payload = [ + { + 'id': 'monitor_sample_id1', + 'time': datetime.now().isoformat(), + 'type': self.event_type, + 'details': { + 'hostname': self.hostname, + 'status': 'down', + 'monitor': 'monitor_sample', + 'monitor_event_id': 'monitor_sample_event1' + }, + }, + ] + data = json.dumps(payload) + + if self.inspector_type == 'sample': + headers = {'content-type': 'application/json'} + requests.post(self.inspector_url, data=data, headers=headers) + elif self.inspector_type == 'congress': + headers = { + 'Content-Type': 'application/json', + 'Accept': 'application/json', + 'X-Auth-Token':self.session.get_token(), + } + requests.put(self.inspector_url, data=data, headers=headers) + + +def get_args(): + parser = argparse.ArgumentParser(description='Doctor Sample Monitor') + parser.add_argument('hostname', metavar='HOSTNAME', type=str, nargs='?', + help='a hostname to monitor connectivity') + parser.add_argument('ip', metavar='IP', type=str, nargs='?', + help='an IP address to monitor connectivity') + parser.add_argument('inspector_type', metavar='INSPECTOR_TYPE', type=str, nargs='?', + help='inspector to report', + default='sample') + return parser.parse_args() + + +def main(): + args = get_args() + monitor = DoctorMonitorSample(args) + monitor.start_loop() + + +if __name__ == '__main__': + main() diff --git a/tests/lib/monitors/sample/sample b/tests/lib/monitors/sample/sample new file mode 100644 index 00000000..1d310333 --- /dev/null +++ b/tests/lib/monitors/sample/sample @@ -0,0 +1,18 @@ +#!/bin/bash + +function start_monitor_sample { + cp $TOP_DIR/lib/monitors/sample/monitor.py $TOP_DIR/monitor.py + pgrep -f "python monitor.py" && return 0 + sudo -E python monitor.py "$COMPUTE_HOST" "$COMPUTE_IP" "$INSPECTOR_TYPE" \ + > monitor.log 2>&1 & +} + +function stop_monitor_sample { + pgrep -f "python monitor.py" || return 0 + sudo kill $(pgrep -f "python monitor.py") +} + +function cleanup_monitor_sample { + rm monitor.py + return +} diff --git a/tests/main.py b/tests/main.py index b59cd7a3..b360f124 100644 --- a/tests/main.py +++ b/tests/main.py @@ -74,8 +74,10 @@ class DoctorTest(object): self.setup() # injecting host failure... + # NOTE (umar) add INTERFACE_NAME logic to host injection # verify the test results + # NOTE (umar) copy remote monitor.log file when monitor=collectd except Exception as e: LOG.error('doctor test failed, Exception=%s' % e) diff --git a/tests/monitor.py b/tests/monitor.py deleted file mode 100644 index 7450c534..00000000 --- a/tests/monitor.py +++ /dev/null @@ -1,124 +0,0 @@ -############################################################################## -# Copyright (c) 2016 NEC Corporation and others. -# -# All rights reserved. This program and the accompanying materials -# are made available under the terms of the Apache License, Version 2.0 -# which accompanies this distribution, and is available at -# http://www.apache.org/licenses/LICENSE-2.0 -############################################################################## - -import argparse -from datetime import datetime -import json -import logger as doctor_log -import requests -import socket -import time - -from keystoneauth1 import session -from congressclient.v1 import client - -import identity_auth - -# NOTE: icmp message with all zero data (checksum = 0xf7ff) -# see https://tools.ietf.org/html/rfc792 -ICMP_ECHO_MESSAGE = '\x08\x00\xf7\xff\x00\x00\x00\x00' - -SUPPORTED_INSPECTOR_TYPES = ['sample', 'congress'] - -LOG = doctor_log.Logger('doctor_monitor').getLogger() - - -class DoctorMonitorSample(object): - - interval = 0.1 # second - timeout = 0.1 # second - event_type = "compute.host.down" - - def __init__(self, args): - if args.inspector_type not in SUPPORTED_INSPECTOR_TYPES: - raise Exception("Inspector type '%s' not supported", args.inspector_type) - - self.hostname = args.hostname - self.inspector_type = args.inspector_type - self.ip_addr = args.ip or socket.gethostbyname(self.hostname) - - if self.inspector_type == 'sample': - self.inspector_url = 'http://127.0.0.1:12345/events' - elif self.inspector_type == 'congress': - auth=identity_auth.get_identity_auth() - self.session=session.Session(auth=auth) - congress = client.Client(session=self.session, service_type='policy') - ds = congress.list_datasources()['results'] - doctor_ds = next((item for item in ds if item['driver'] == 'doctor'), - None) - - congress_endpoint = congress.httpclient.get_endpoint(auth=auth) - self.inspector_url = ('%s/v1/data-sources/%s/tables/events/rows' % - (congress_endpoint, doctor_ds['id'])) - - def start_loop(self): - LOG.debug("start ping to host %(h)s (ip=%(i)s)" % {'h': self.hostname, - 'i': self.ip_addr}) - sock = socket.socket(socket.AF_INET, socket.SOCK_RAW, - socket.IPPROTO_ICMP) - sock.settimeout(self.timeout) - while True: - try: - sock.sendto(ICMP_ECHO_MESSAGE, (self.ip_addr, 0)) - data = sock.recv(4096) - except socket.timeout: - LOG.info("doctor monitor detected at %s" % time.time()) - self.report_error() - LOG.info("ping timeout, quit monitoring...") - return - time.sleep(self.interval) - - def report_error(self): - payload = [ - { - 'id': 'monitor_sample_id1', - 'time': datetime.now().isoformat(), - 'type': self.event_type, - 'details': { - 'hostname': self.hostname, - 'status': 'down', - 'monitor': 'monitor_sample', - 'monitor_event_id': 'monitor_sample_event1' - }, - }, - ] - data = json.dumps(payload) - - if self.inspector_type == 'sample': - headers = {'content-type': 'application/json'} - requests.post(self.inspector_url, data=data, headers=headers) - elif self.inspector_type == 'congress': - headers = { - 'Content-Type': 'application/json', - 'Accept': 'application/json', - 'X-Auth-Token':self.session.get_token(), - } - requests.put(self.inspector_url, data=data, headers=headers) - - -def get_args(): - parser = argparse.ArgumentParser(description='Doctor Sample Monitor') - parser.add_argument('hostname', metavar='HOSTNAME', type=str, nargs='?', - help='a hostname to monitor connectivity') - parser.add_argument('ip', metavar='IP', type=str, nargs='?', - help='an IP address to monitor connectivity') - parser.add_argument('inspector_type', metavar='INSPECTOR_TYPE', type=str, nargs='?', - help='inspector to report', - default='sample') - return parser.parse_args() - - -def main(): - args = get_args() - monitor = DoctorMonitorSample(args) - monitor.start_loop() - - -if __name__ == '__main__': - main() diff --git a/tests/monitor/__init__.py b/tests/monitor/__init__.py index 51a6a65d..e268907f 100644 --- a/tests/monitor/__init__.py +++ b/tests/monitor/__init__.py @@ -12,14 +12,15 @@ from oslo_utils import importutils OPTS = [ cfg.StrOpt('type', default='sample', - choices=['sample'], + choices=['sample', 'collectd'], help='the type of doctor monitor component', required=True), ] _monitor_name_class_mapping = { - 'sample': 'monitor.sample.SampleMonitor' + 'sample': 'monitor.sample.SampleMonitor', + 'collectd': 'monitor.collectd.CollectdMonitor' } def get_monitor(conf, inspector_url, log): diff --git a/tests/monitor/collectd.py b/tests/monitor/collectd.py new file mode 100644 index 00000000..f7a4f442 --- /dev/null +++ b/tests/monitor/collectd.py @@ -0,0 +1,145 @@ +############################################################################## +# Copyright (c) 2017 NEC Corporation and others. +# +# All rights reserved. This program and the accompanying materials +# are made available under the terms of the Apache License, Version 2.0 +# which accompanies this distribution, and is available at +# http://www.apache.org/licenses/LICENSE-2.0 +############################################################################## + +import os +import socket +import getpass +import sys + +from identity_auth import get_session +from os_clients import nova_client +from monitor.base import BaseMonitor + + +class CollectdMonitor(BaseMonitor): + def __init__(self, conf, inspector_url, log): + super(CollectdMonitor, self).__init__(conf, inspector_url, log) + self.top_dir = os.path.dirname(sys.path[0]) + self.session = get_session() + self.nova = nova_client(conf.nova_version, self.session) + self.compute_hosts = self.nova.hypervisors.list(detailed=True) + for host in self.compute_hosts: + host_dict = host.__dict__ + self.compute_host = host_dict['hypervisor_hostname'] + self.compute_ip = host_dict['host_ip'] + tmp_sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) + tmp_sock.connect(("8.8.8.8", 80)) + + ## control_ip is the IP of primary interface of control node i.e. + ## eth0, eno1. It is used by collectd monitor to communicate with + ## sample inspector. + ## TODO (umar) see if mgmt IP of control is a better option. Also + ## primary interface may not be the right option + self.control_ip = tmp_sock.getsockname()[0] + self.compute_user = getpass.getuser() + self.interface_name = os.environ.get('INTERFACE_NAME') or '' + self.inspector_type = os.environ.get('INSPECTOR_TYPE', 'sample') + self.auth_url = os.environ.get('OS_AUTH_URL') + self.username = os.environ.get('OS_USERNAME') + self.password = os.environ.get('OS_PASSWORD') + self.project_name = os.environ.get('OS_PROJECT_NAME') + self.user_domain_name = os.environ.get('OS_USER_DOMAIN_NAME') or 'default' + self.user_domain_id = os.environ.get('OS_USER_DOMAIN_ID') + self.project_domain_name = os.environ.get('OS_PROJECT_DOMAIN_NAME') or 'default' + self.project_domain_id = os.environ.get('OS_PROJECT_DOMAIN_ID') + self.ssh_opts_cpu = '-o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no' + + def start(self): + self.log.info("Collectd monitor start.........") + f = open("%s/tests/collectd.conf" % self.top_dir, 'w') + collectd_conf_file = """ +Hostname %s +FQDNLookup false +Interval 1 +MaxReadInterval 2 + + +Globals true + +LoadPlugin ovs_events +LoadPlugin logfile + + + File \"/var/log/collectd.log\" + Timestamp true + LogLevel \"info\" + + + + ModulePath \"/home/%s\" + LogTraces true + Interactive false + Import \"collectd_plugin\" + + control_ip \"%s\" + compute_ip \"%s\" + compute_host \"%s\" + compute_user \"%s\" + inspector_type \"%s\" + os_auth_url \"%s\" + os_username \"%s\" + os_password \"%s\" + os_project_name \"%s\" + os_user_domain_name \"%s\" + os_user_domain_id \"%s\" + os_project_domain_name \"%s\" + os_project_domain_id \"%s\" + + + + + Port 6640 + Socket \"/var/run/openvswitch/db.sock\" + Interfaces \"@INTERFACE_NAME@\" + SendNotification true + DispatchValues false + + """ % (self.compute_host, self.compute_user, self.control_ip, self.compute_ip, self.compute_host, self.compute_user, + self.inspector_type, self.auth_url, self.username, self.password, self.project_name, self.user_domain_name, + self.user_domain_id, self.project_domain_name, self.project_domain_id) + f.write(collectd_conf_file) + f.close() + + os.system(" scp %s %s/tests/collectd.conf %s@%s: " % (self.ssh_opts_cpu, self.top_dir, self.compute_user, self.compute_ip)) + self.log.info("after first scp") + ## @TODO (umar) Always assuming that the interface is assigned an IP if + ## interface name is not provided. See if there is a better approach + os.system(""" ssh %s %s@%s \"if [ -n \"%s\" ]; then + dev=%s + else + dev=\$(sudo ip a | awk '/ %s\//{print \$NF}') + fi + sed -i -e \"s/@INTERFACE_NAME@/\$dev/\" collectd.conf + collectd_conf=/opt/collectd/etc/collectd.conf + if [ -e \$collectd_conf ]; then + sudo cp \$collectd_conf \${collectd_conf}-doctor-saved + else + sudo touch \${collectd_conf}-doctor-created + fi + sudo mv collectd.conf /opt/collectd/etc/collectd.conf\" """ % (self.ssh_opts_cpu, self.compute_user, self.compute_ip, self.interface_name, self.interface_name, self.compute_ip)) + self.log.info("after first ssh") + os.system(" scp %s %s/tests/lib/monitors/collectd/collectd_plugin.py %s@%s:collectd_plugin.py " % (self.ssh_opts_cpu, self.top_dir, self.compute_user, self.compute_ip)) + self.log.info("after sec scp") + os.system(" ssh %s %s@%s \"sudo pkill collectd; sudo /opt/collectd/sbin/collectd\" " % (self.ssh_opts_cpu, self.compute_user, self.compute_ip)) + self.log.info("after sec ssh") + + def stop(self): + os.system(" ssh %s %s@%s \"sudo pkill collectd\" " % (self.ssh_opts_cpu, self.compute_user, self.compute_ip)) + + def cleanup(self): + os.system(""" ssh %s %s@%s \" + collectd_conf=/opt/collectd/etc/collectd.conf + if [ -e \"\${collectd_conf}-doctor-created\" ]; then + sudo rm \"\${collectd_conf}-doctor-created\" + sudo rm \$collectd_conf + elif [ -e \"\${collectd_conf}-doctor-saved\" ]; then + sudo cp -f \"\${collectd_conf}-doctor-saved\" \$collectd_conf + sudo rm \"\${collectd_conf}-doctor-saved\" + fi\" """ % (self.ssh_opts_cpu, self.compute_user, self.compute_ip)) + os.remove("%s/tests/collectd.conf" % self.top_dir) diff --git a/tests/run.sh b/tests/run.sh index abdad58c..69f18b20 100755 --- a/tests/run.sh +++ b/tests/run.sh @@ -212,17 +212,6 @@ create_alarm() { done } -start_monitor() { - pgrep -f "python monitor.py" && return 0 - sudo -E python monitor.py "$COMPUTE_HOST" "$COMPUTE_IP" "$INSPECTOR_TYPE" \ - > monitor.log 2>&1 & -} - -stop_monitor() { - pgrep -f "python monitor.py" || return 0 - sudo kill $(pgrep -f "python monitor.py") -} - start_consumer() { pgrep -f "python consumer.py" && return 0 python consumer.py "$CONSUMER_PORT" > consumer.log 2>&1 & @@ -294,8 +283,12 @@ inject_failure() { echo "disabling network of compute host [$COMPUTE_HOST] for 3 mins..." cat > disable_network.sh << 'END_TXT' #!/bin/bash -x -dev=$(sudo ip a | awk '/ @COMPUTE_IP@\//{print $NF}') sleep 1 +if [ -n "@INTERFACE_NAME@" ]; then + dev=@INTERFACE_NAME@ +else + dev=$(sudo ip a | awk '/ @COMPUTE_IP@\//{print $NF}') +fi sudo ip link set $dev down echo "doctor set link down at" $(date "+%s.%N") sleep 180 @@ -303,6 +296,7 @@ sudo ip link set $dev up sleep 1 END_TXT sed -i -e "s/@COMPUTE_IP@/$COMPUTE_IP/" disable_network.sh + sed -i -e "s/@INTERFACE_NAME@/$INTERFACE_NAME/" disable_network.sh chmod +x disable_network.sh scp $ssh_opts_cpu disable_network.sh "$COMPUTE_USER@$COMPUTE_IP:" ssh $ssh_opts_cpu "$COMPUTE_USER@$COMPUTE_IP" 'nohup ./disable_network.sh > disable_network.log 2>&1 &' @@ -327,8 +321,11 @@ calculate_notification_time() { wait_consumer 60 #keep 'at' as the last keyword just before the value, and #use regex to get value instead of the fixed column + if [ ! -f monitor.log ]; then + scp $ssh_opts_cpu "$COMPUTE_USER@$COMPUTE_IP:monitor.log" . + fi detected=$(grep "doctor monitor detected at" monitor.log |\ - sed -e "s/^.* at //") + sed -e "s/^.* at //" | tail -1) notified=$(grep "doctor consumer notified at" consumer.log |\ sed -e "s/^.* at //" | tail -1) @@ -431,11 +428,11 @@ run_profiler() { cleanup() { set +e echo "cleanup..." - stop_monitor stop_inspector stop_consumer unset_forced_down_hosts + stop_monitor collect_logs vms=$(openstack $as_doctor_user server list) @@ -467,6 +464,7 @@ cleanup() { cleanup_installer cleanup_inspector + cleanup_monitor # NOTE: Temporal log printer. for f in $(find . -name '*.log') @@ -511,6 +509,9 @@ setup_python_packages source $TOP_DIR/functions-common source $TOP_DIR/lib/installer source $TOP_DIR/lib/inspector +source $TOP_DIR/lib/monitor + +rm -f *.log setup_installer @@ -540,8 +541,8 @@ echo "injecting host failure..." inject_failure check_host_status "(DOWN|UNKNOWN)" 60 -calculate_notification_time unset_forced_down_hosts +calculate_notification_time collect_logs run_profiler -- cgit 1.2.3-korg