From eac3701e474623103ac10d8238b5e406355a0724 Mon Sep 17 00:00:00 2001 From: Yujun Zhang Date: Sat, 21 Jan 2017 22:30:05 +0800 Subject: Fix bugs in profiler - collect disable_network.log - use Python oneliner instead of `bc` - rename T00 from hostdown to linkdown - add step collect_logs - move profiler to a dedicated step - reorder function definitions to elimate warning on unresolved variable - redirect also stderr to log JIRA: DOCTOR-94 Change-Id: Ic63e5163945c5b21fd1d1c0bf99be353c11c2ada Signed-off-by: Yujun Zhang --- tests/profiler-poc.py | 6 ++--- tests/run.sh | 65 +++++++++++++++++++++++++++++++++------------------ 2 files changed, 45 insertions(+), 26 deletions(-) diff --git a/tests/profiler-poc.py b/tests/profiler-poc.py index 71034781..408cb09a 100644 --- a/tests/profiler-poc.py +++ b/tests/profiler-poc.py @@ -21,7 +21,7 @@ See also: https://goo.gl/98Osig import json import os -LOGFILE = 'performance-profile' +OUTPUT = 'doctor_profiling_output' PREFIX = 'DOCTOR_PROFILER' TOTAL_CHECK_POINTS = 10 MODULE_CHECK_POINTS = ['T00', 'T01', 'T04', 'T05', 'T06', 'T09'] @@ -33,7 +33,7 @@ Total time cost: {total}(ms) |Monitor|Inspector |Controller|Notifier|Evaluator | |{M00} |{M01} |{M02} |{M03} |{M04} | | | | | | | | | | | -host down:{T00}| | | | | | | | | +link down:{T00}| | | | | | | | | raw failure:{T01}| | | | | | | | found affected:{T02}| | | | | | | set VM error:{T03}| | | | | | @@ -78,7 +78,7 @@ def main(): profile = TEMPLATE.format(**tags) - logfile = open('{}.json'.format(LOGFILE), 'w') + logfile = open('{}.json'.format(OUTPUT), 'w') logfile.write(json.dumps(tags)) print profile diff --git a/tests/run.sh b/tests/run.sh index e7afb518..a26fafee 100755 --- a/tests/run.sh +++ b/tests/run.sh @@ -238,7 +238,7 @@ inject_failure() { dev=$(sudo ip a | awk '/ @COMPUTE_IP@\//{print $7}') sleep 1 sudo ip link set $dev down -echo "doctor set host down at" $(date "+%s.%N") +echo "doctor set link down at" $(date "+%s.%N") sleep 180 sudo ip link set $dev up sleep 1 @@ -247,24 +247,8 @@ END_TXT chmod +x disable_network.sh scp $ssh_opts_cpu disable_network.sh "$COMPUTE_USER@$COMPUTE_IP:" ssh $ssh_opts_cpu "$COMPUTE_USER@$COMPUTE_IP" 'nohup ./disable_network.sh > disable_network.log 2>&1 &' -} - -profile_performance_poc() { - triggered=$(grep "^doctor set host down at" disable_network.log |\ - sed -e "s/^.* at //") - vmdown=$(grep "doctor mark vm.* error at" inspector.log |tail -n 1 |\ - sed -e "s/^.* at //") - hostdown=$(grep "doctor mark host.* down at" inspector.log |\ - sed -e "s/^.* at //") - - #calculate the relative interval to triggered(T00) - export DOCTOR_PROFILER_T00=0 - export DOCTOR_PROFILER_T01=$(echo "($detected-$triggered)*1000/1" |bc) - export DOCTOR_PROFILER_T03=$(echo "($vmdown-$triggered)*1000/1" |bc) - export DOCTOR_PROFILER_T04=$(echo "($hostdown-$triggered)*1000/1" |bc) - export DOCTOR_PROFILER_T09=$(echo "($notified-$triggered)*1000/1" |bc) - - python profiler-poc.py + # use host time to get rid of potential time sync deviation between nodes + triggered=$(date "+%s.%N") } calculate_notification_time() { @@ -279,10 +263,6 @@ calculate_notification_time() { notified=$(grep "doctor consumer notified at" consumer.log |\ sed -e "s/^.* at //") - if [[ "$PROFILER_TYPE" == "poc" ]]; then - profile_performance_poc - fi - echo "$notified $detected" | \ awk '{ d = $1 - $2; @@ -317,6 +297,43 @@ unset_forced_down_hosts() { -f value -c State | grep -q down' 240 5 } +collect_logs() { + unset_forced_down_hosts + # TODO: We need to make sure the target compute host is back to IP + # reachable. wait_ping() will be added by tojuvone . + sleep 110 + scp $ssh_opts_cpu "$COMPUTE_USER@$COMPUTE_IP:disable_network.log" . + + # TODO(yujunz) collect other logs, e.g. nova, aodh +} + +run_profiler() { + if [[ "$PROFILER_TYPE" == "poc" ]]; then + linkdown=$(grep "doctor set link down at " disable_network.log |\ + sed -e "s/^.* at //") + vmdown=$(grep "doctor mark vm.* error at" inspector.log |tail -n 1 |\ + sed -e "s/^.* at //") + hostdown=$(grep "doctor mark host.* down at" inspector.log |\ + sed -e "s/^.* at //") + + # TODO(yujunz) check the actual delay to verify time sync status + # expected ~1s delay from $trigger to $linkdown + relative_start=${linkdown} + export DOCTOR_PROFILER_T00=$(python -c \ + "print(int(($linkdown-$relative_start)*1000))") + export DOCTOR_PROFILER_T01=$(python -c \ + "print(int(($detected-$relative_start)*1000))") + export DOCTOR_PROFILER_T03=$(python -c \ + "print(int(($vmdown-$relative_start)*1000))") + export DOCTOR_PROFILER_T04=$(python -c \ + "print(int(($hostdown-$relative_start)*1000))") + export DOCTOR_PROFILER_T09=$(python -c \ + "print(int(($notified-$relative_start)*1000))") + + python profiler-poc.py >doctor_profiler.log 2>&1 + fi +} + cleanup() { set +e echo "cleanup..." @@ -396,5 +413,7 @@ sleep 60 check_host_status "(DOWN|UNKNOWN)" calculate_notification_time +collect_logs +run_profiler echo "done" -- cgit 1.2.3-korg