diff options
author | Ross Brattain <ross.b.brattain@intel.com> | 2017-09-28 00:10:43 -0700 |
---|---|---|
committer | Maciej Skrocki <maciej.skrocki@intel.com> | 2017-10-03 21:54:33 +0000 |
commit | e228c2a3ac5b0173792fa7b11f9540ecec3a0029 (patch) | |
tree | 4d4b8c940c90bc5f6827475eaa45a131c6a3d246 /yardstick/network_services/nfvi | |
parent | e8cf6a76c346806b53fa0c802374327ddc4956d1 (diff) |
NSB PROX test hang fixes
The PROX tests were hanging in the duration
runner.
These are fixes for various errors:
raise error in collect_kpi if VNF is down
move prox dpdk_rebind after collectd stop
fix dpdk nicbind rebind to group by drivers
prox: raise error in collect_kpi if the VNF is down
prox: add VNF_TYPE for consistency
sample_vnf: debug and fix kill_vnf
pkill is not matching some executable names,
add some debug process dumps and try switching
back to killall until we can find the issue
sample_vnf: add default timeout, so we can override
default 3600 SSH timeout
collect_kpi is the point at which we check
the VNFs and TGs for failures or exits
queues are the problem make sure we aren't silently blocking on
non-empty queues by canceling join thread in subprocess
fixup duration runner to close queues
and other attempt to stop duration runner
from hanging
VnfdHelper: memoize port_num
resource: fail if ssh can't connect
at the end of 3600 second test our ssh connection
is dead, so we can't actually stop collectd
unless we reconnect
fix stop() logic to ignore ssh errors
Change-Id: I6c8e682a80cb9d00362e2fef4a46df080f304e55
Signed-off-by: Ross Brattain <ross.b.brattain@intel.com>
Diffstat (limited to 'yardstick/network_services/nfvi')
-rw-r--r-- | yardstick/network_services/nfvi/resource.py | 41 |
1 files changed, 27 insertions, 14 deletions
diff --git a/yardstick/network_services/nfvi/resource.py b/yardstick/network_services/nfvi/resource.py index 7e8334c73..e3d0e3bca 100644 --- a/yardstick/network_services/nfvi/resource.py +++ b/yardstick/network_services/nfvi/resource.py @@ -19,6 +19,7 @@ from __future__ import print_function import logging from itertools import chain +import errno import jinja2 import os import os.path @@ -72,7 +73,6 @@ class ResourceProfile(object): self.timeout = timeout self.enable = True - self.cores = validate_non_string_sequence(cores, default=[]) self._queue = multiprocessing.Queue() self.amqp_client = None self.port_names = validate_non_string_sequence(port_names, default=[]) @@ -83,8 +83,16 @@ class ResourceProfile(object): def check_if_sa_running(self, process): """ verify if system agent is running """ - status, pid, _ = self.connection.execute("pgrep -f %s" % process) - return status == 0, pid + try: + err, pid, _ = self.connection.execute("pgrep -f %s" % process) + # strip whitespace + return err, pid.strip() + except OSError as e: + if e.errno in {errno.ECONNRESET}: + # if we can't connect to check, then we won't be able to connect to stop it + LOG.exception("can't connect to host to check collectd status") + return 1, None + raise def run_collectd_amqp(self): """ run amqp consumer to collect the NFVi data """ @@ -137,7 +145,7 @@ class ResourceProfile(object): def parse_intel_pmu_stats(cls, key, value): return {''.join(str(v) for v in key): value.split(":")[1]} - def parse_collectd_result(self, metrics, core_list): + def parse_collectd_result(self, metrics): """ convert collectd data into json""" result = { "cpu": {}, @@ -161,8 +169,7 @@ class ResourceProfile(object): if "cpu" in res_key0 or "intel_rdt" in res_key0: cpu_key, name, metric, testcase = \ self.get_cpu_data(res_key0, res_key1, value) - if cpu_key in core_list: - result["cpu"].setdefault(cpu_key, {}).update({name: metric}) + result["cpu"].setdefault(cpu_key, {}).update({name: metric}) elif "memory" in res_key0: result["memory"].update({res_key1: value.split(":")[0]}) @@ -189,8 +196,9 @@ class ResourceProfile(object): def amqp_process_for_nfvi_kpi(self): """ amqp collect and return nfvi kpis """ if self.amqp_client is None and self.enable: - self.amqp_client = \ - multiprocessing.Process(target=self.run_collectd_amqp) + self.amqp_client = multiprocessing.Process( + name="AmqpClient-{}-{}".format(self.mgmt['ip'], os.getpid()), + target=self.run_collectd_amqp) self.amqp_client.start() def amqp_collect_nfvi_kpi(self): @@ -201,7 +209,7 @@ class ResourceProfile(object): metric = {} while not self._queue.empty(): metric.update(self._queue.get()) - msg = self.parse_collectd_result(metric, self.cores) + msg = self.parse_collectd_result(metric) return msg def _provide_config_file(self, config_file_path, nfvi_cfg, template_kwargs): @@ -265,8 +273,8 @@ class ResourceProfile(object): connection.execute("sudo rabbitmqctl authenticate_user admin admin") connection.execute("sudo rabbitmqctl set_permissions -p / admin '.*' '.*' '.*'") - LOG.debug("Start collectd service.....") - connection.execute("sudo %s" % collectd_path) + LOG.debug("Start collectd service..... %s second timeout", self.timeout) + connection.execute("sudo %s" % collectd_path, timeout=self.timeout) LOG.debug("Done") def initiate_systemagent(self, bin_path): @@ -292,13 +300,18 @@ class ResourceProfile(object): LOG.debug("Stop resource monitor...") if self.amqp_client is not None: + # we proper and try to join first + self.amqp_client.join(3) self.amqp_client.terminate() + LOG.debug("Check if %s is running", agent) status, pid = self.check_if_sa_running(agent) - if status == 0: + LOG.debug("status %s pid %s", status, pid) + if status != 0: return - self.connection.execute('sudo kill -9 %s' % pid) - self.connection.execute('sudo pkill -9 %s' % agent) + if pid: + self.connection.execute('sudo kill -9 "%s"' % pid) + self.connection.execute('sudo pkill -9 "%s"' % agent) self.connection.execute('sudo service rabbitmq-server stop') self.connection.execute("sudo rabbitmqctl stop_app") |