aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorPierrick Louin <pierrick.louin@orange.com>2020-10-14 00:14:08 +0200
committerFran�ois-R�gis Menguy <francoisregis.menguy@orange.com>2020-11-05 08:58:25 +0000
commitd41631be9142a50284cfbfd252a53e97d7ebd8c7 (patch)
tree77220439a71cf8ab7fd4c982e62b65f602161067
parentc00d9292739dd7ae072c0d6befcd7e87dc7d0566 (diff)
NFVBENCH-189: Add a fix to work around the i40e_VF port initialization issue
Change-Id: I170292e0871b1ff00aa5c7e1136b3b14b143bee1 Signed-off-by: Pierrick Louin <pierrick.louin@orange.com>
-rw-r--r--nfvbench/traffic_gen/trex_gen.py100
1 files changed, 96 insertions, 4 deletions
diff --git a/nfvbench/traffic_gen/trex_gen.py b/nfvbench/traffic_gen/trex_gen.py
index 85331b2..4e20f73 100644
--- a/nfvbench/traffic_gen/trex_gen.py
+++ b/nfvbench/traffic_gen/trex_gen.py
@@ -15,6 +15,7 @@
import math
import os
+import sys
import random
import time
import traceback
@@ -674,9 +675,78 @@ class TRex(AbstractTrafficGenerator):
def __connect(self, client):
client.connect()
+ def __local_server_status(self):
+ """ The TRex server may have started but failed initializing... and stopped.
+ This piece of code is especially designed to address
+ the case when a fatal failure occurs on a DPDK init call.
+ The TRex algorihm should be revised to include some missing timeouts (?)
+ status returned:
+ 0: no error detected
+ 1: fatal error detected - should lead to exiting the run
+ 2: error detected that could be solved by starting again
+ The diagnostic is based on parsing the local trex log file (improvable)
+ """
+ status = 0
+ message = None
+ failure = None
+ exited = None
+ cause = None
+ error = None
+ before = None
+ after = None
+ last = None
+ try:
+ with open('/tmp/trex.log', 'r') as trex_log:
+ for _line in trex_log:
+ line = _line.strip()
+ if line.startswith('Usage:'):
+ break
+ if 'ports are bound' in line:
+ continue
+ if 'please wait' in line:
+ continue
+ if 'exit' in line.lower():
+ exited = line
+ elif 'cause' in line.lower():
+ cause = line
+ elif 'fail' in line.lower():
+ failure = line
+ elif 'msg' in line.lower():
+ message = line
+ elif (error is not None) and line:
+ after = line
+ elif line.startswith('Error:') or line.startswith('ERROR'):
+ error = line
+ before = last
+ last = line
+ except FileNotFoundError:
+ pass
+ if exited is not None:
+ status = 1
+ LOG.info("\x1b[1m%s\x1b[0m %s", 'TRex failed initializing:', exited)
+ if cause is not None:
+ LOG.info("TRex [cont'd] %s", cause)
+ if failure is not None:
+ LOG.info("TRex [cont'd] %s", failure)
+ if message is not None:
+ LOG.info("TRex [cont'd] %s", message)
+ if 'not supported yet' in message.lower():
+ LOG.info("TRex [cont'd] Try starting again!")
+ status = 2
+ elif error is not None:
+ status = 1
+ LOG.info("\x1b[1m%s\x1b[0m %s", 'TRex failed initializing:', error)
+ if after is not None:
+ LOG.info("TRex [cont'd] %s", after)
+ elif before is not None:
+ LOG.info("TRex [cont'd] %s", before)
+ return status
+
def __connect_after_start(self):
# after start, Trex may take a bit of time to initialize
# so we need to retry a few times
+ # we try to capture recoverable error cases (checking status)
+ status = 0
for it in range(self.config.generic_retry_count):
try:
time.sleep(1)
@@ -685,10 +755,23 @@ class TRex(AbstractTrafficGenerator):
except Exception as ex:
if it == (self.config.generic_retry_count - 1):
raise
+ status = self.__local_server_status()
+ if status > 0:
+ # No need to wait anymore, something went wrong and TRex exited
+ if status == 1:
+ LOG.info("\x1b[1m%s\x1b[0m", 'TRex failed starting!')
+ print("More information? Try the command: "
+ + "\x1b[1mnfvbench --show-trex-log\x1b[0m")
+ sys.exit(0)
+ if status == 2:
+ # a new start will follow
+ return status
LOG.info("Retrying connection to TRex (%s)...", ex.msg)
+ return status
def connect(self):
"""Connect to the TRex server."""
+ status = 0
server_ip = self.generator_config.ip
LOG.info("Connecting to TRex (%s)...", server_ip)
@@ -700,13 +783,20 @@ class TRex(AbstractTrafficGenerator):
if server_ip == '127.0.0.1':
config_updated = self.__check_config()
if config_updated or self.config.restart:
- self.__restart()
+ status = self.__restart()
except (TimeoutError, STLError) as e:
if server_ip == '127.0.0.1':
- self.__start_local_server()
+ status = self.__start_local_server()
else:
raise TrafficGeneratorException(e.message) from e
+ if status == 2:
+ # Workaround in case of a failed TRex server initialization
+ # we try to start it again (twice maximum)
+ # which may allow low level initialization to complete.
+ if self.__start_local_server() == 2:
+ self.__start_local_server()
+
ports = list(self.generator_config.ports)
self.port_handle = ports
# Prepare the ports
@@ -742,7 +832,7 @@ class TRex(AbstractTrafficGenerator):
try:
LOG.info("Starting TRex ...")
self.__start_server()
- self.__connect_after_start()
+ status = self.__connect_after_start()
except (TimeoutError, STLError) as e:
LOG.error('Cannot connect to TRex')
LOG.error(traceback.format_exc())
@@ -762,6 +852,7 @@ class TRex(AbstractTrafficGenerator):
else:
message = e.message
raise TrafficGeneratorException(message) from e
+ return status
def __start_server(self):
server = TRexTrafficServer()
@@ -780,7 +871,8 @@ class TRex(AbstractTrafficGenerator):
if not self.client.is_connected():
LOG.info("TRex is stopped...")
break
- self.__start_local_server()
+ # Start and report a possible failure
+ return self.__start_local_server()
def __stop_server(self):
if self.generator_config.ip == '127.0.0.1':