aboutsummaryrefslogtreecommitdiffstats
path: root/yardstick/network_services/nfvi
diff options
context:
space:
mode:
authorRoss Brattain <ross.b.brattain@intel.com>2017-09-28 00:10:43 -0700
committerMaciej Skrocki <maciej.skrocki@intel.com>2017-10-03 21:54:33 +0000
commite228c2a3ac5b0173792fa7b11f9540ecec3a0029 (patch)
tree4d4b8c940c90bc5f6827475eaa45a131c6a3d246 /yardstick/network_services/nfvi
parente8cf6a76c346806b53fa0c802374327ddc4956d1 (diff)
NSB PROX test hang fixes
The PROX tests were hanging in the duration runner. These are fixes for various errors: raise error in collect_kpi if VNF is down move prox dpdk_rebind after collectd stop fix dpdk nicbind rebind to group by drivers prox: raise error in collect_kpi if the VNF is down prox: add VNF_TYPE for consistency sample_vnf: debug and fix kill_vnf pkill is not matching some executable names, add some debug process dumps and try switching back to killall until we can find the issue sample_vnf: add default timeout, so we can override default 3600 SSH timeout collect_kpi is the point at which we check the VNFs and TGs for failures or exits queues are the problem make sure we aren't silently blocking on non-empty queues by canceling join thread in subprocess fixup duration runner to close queues and other attempt to stop duration runner from hanging VnfdHelper: memoize port_num resource: fail if ssh can't connect at the end of 3600 second test our ssh connection is dead, so we can't actually stop collectd unless we reconnect fix stop() logic to ignore ssh errors Change-Id: I6c8e682a80cb9d00362e2fef4a46df080f304e55 Signed-off-by: Ross Brattain <ross.b.brattain@intel.com>
Diffstat (limited to 'yardstick/network_services/nfvi')
-rw-r--r--yardstick/network_services/nfvi/resource.py41
1 files changed, 27 insertions, 14 deletions
diff --git a/yardstick/network_services/nfvi/resource.py b/yardstick/network_services/nfvi/resource.py
index 7e8334c73..e3d0e3bca 100644
--- a/yardstick/network_services/nfvi/resource.py
+++ b/yardstick/network_services/nfvi/resource.py
@@ -19,6 +19,7 @@ from __future__ import print_function
import logging
from itertools import chain
+import errno
import jinja2
import os
import os.path
@@ -72,7 +73,6 @@ class ResourceProfile(object):
self.timeout = timeout
self.enable = True
- self.cores = validate_non_string_sequence(cores, default=[])
self._queue = multiprocessing.Queue()
self.amqp_client = None
self.port_names = validate_non_string_sequence(port_names, default=[])
@@ -83,8 +83,16 @@ class ResourceProfile(object):
def check_if_sa_running(self, process):
""" verify if system agent is running """
- status, pid, _ = self.connection.execute("pgrep -f %s" % process)
- return status == 0, pid
+ try:
+ err, pid, _ = self.connection.execute("pgrep -f %s" % process)
+ # strip whitespace
+ return err, pid.strip()
+ except OSError as e:
+ if e.errno in {errno.ECONNRESET}:
+ # if we can't connect to check, then we won't be able to connect to stop it
+ LOG.exception("can't connect to host to check collectd status")
+ return 1, None
+ raise
def run_collectd_amqp(self):
""" run amqp consumer to collect the NFVi data """
@@ -137,7 +145,7 @@ class ResourceProfile(object):
def parse_intel_pmu_stats(cls, key, value):
return {''.join(str(v) for v in key): value.split(":")[1]}
- def parse_collectd_result(self, metrics, core_list):
+ def parse_collectd_result(self, metrics):
""" convert collectd data into json"""
result = {
"cpu": {},
@@ -161,8 +169,7 @@ class ResourceProfile(object):
if "cpu" in res_key0 or "intel_rdt" in res_key0:
cpu_key, name, metric, testcase = \
self.get_cpu_data(res_key0, res_key1, value)
- if cpu_key in core_list:
- result["cpu"].setdefault(cpu_key, {}).update({name: metric})
+ result["cpu"].setdefault(cpu_key, {}).update({name: metric})
elif "memory" in res_key0:
result["memory"].update({res_key1: value.split(":")[0]})
@@ -189,8 +196,9 @@ class ResourceProfile(object):
def amqp_process_for_nfvi_kpi(self):
""" amqp collect and return nfvi kpis """
if self.amqp_client is None and self.enable:
- self.amqp_client = \
- multiprocessing.Process(target=self.run_collectd_amqp)
+ self.amqp_client = multiprocessing.Process(
+ name="AmqpClient-{}-{}".format(self.mgmt['ip'], os.getpid()),
+ target=self.run_collectd_amqp)
self.amqp_client.start()
def amqp_collect_nfvi_kpi(self):
@@ -201,7 +209,7 @@ class ResourceProfile(object):
metric = {}
while not self._queue.empty():
metric.update(self._queue.get())
- msg = self.parse_collectd_result(metric, self.cores)
+ msg = self.parse_collectd_result(metric)
return msg
def _provide_config_file(self, config_file_path, nfvi_cfg, template_kwargs):
@@ -265,8 +273,8 @@ class ResourceProfile(object):
connection.execute("sudo rabbitmqctl authenticate_user admin admin")
connection.execute("sudo rabbitmqctl set_permissions -p / admin '.*' '.*' '.*'")
- LOG.debug("Start collectd service.....")
- connection.execute("sudo %s" % collectd_path)
+ LOG.debug("Start collectd service..... %s second timeout", self.timeout)
+ connection.execute("sudo %s" % collectd_path, timeout=self.timeout)
LOG.debug("Done")
def initiate_systemagent(self, bin_path):
@@ -292,13 +300,18 @@ class ResourceProfile(object):
LOG.debug("Stop resource monitor...")
if self.amqp_client is not None:
+ # we proper and try to join first
+ self.amqp_client.join(3)
self.amqp_client.terminate()
+ LOG.debug("Check if %s is running", agent)
status, pid = self.check_if_sa_running(agent)
- if status == 0:
+ LOG.debug("status %s pid %s", status, pid)
+ if status != 0:
return
- self.connection.execute('sudo kill -9 %s' % pid)
- self.connection.execute('sudo pkill -9 %s' % agent)
+ if pid:
+ self.connection.execute('sudo kill -9 "%s"' % pid)
+ self.connection.execute('sudo pkill -9 "%s"' % agent)
self.connection.execute('sudo service rabbitmq-server stop')
self.connection.execute("sudo rabbitmqctl stop_app")