From 1cee98670e33ef88e6cbdb69e19ca574462b019e Mon Sep 17 00:00:00 2001 From: SerenaFeng Date: Thu, 21 Sep 2017 11:16:44 +0800 Subject: bugfix: autoupdate failed due to mingle with watchdog 1. In the original design, if the auto-job starts soon after the watchdog starts, the auto update might be mingled with connectivity check, which may lead to update failure. so instead, the connectivity is checked first, if failed and module is not in deploying, restart module. 2. only automate-docker-deploy job will impact the container status during auto-job, so substitute auto-job status check with automate-docker-deploy's 3. the watchdog is not only for testapi, but also for reporting, and all docker container server deployed in testresults.opnfv.org, so move it under utils/test/opts, which is leveraged to store tool-sets employed for testing tools. Change-Id: I766f3a534a3d510ce7509d4e742150150ccd8f54 Signed-off-by: SerenaFeng --- utils/test/opts/watchdog.sh | 162 +++++++++++++++++++++ utils/test/testapi/tools/watchdog/docker_watch.sh | 165 ---------------------- 2 files changed, 162 insertions(+), 165 deletions(-) create mode 100644 utils/test/opts/watchdog.sh delete mode 100644 utils/test/testapi/tools/watchdog/docker_watch.sh (limited to 'utils') diff --git a/utils/test/opts/watchdog.sh b/utils/test/opts/watchdog.sh new file mode 100644 index 000000000..51868d709 --- /dev/null +++ b/utils/test/opts/watchdog.sh @@ -0,0 +1,162 @@ +# * +# http://www.apache.org/licenses/LICENSE-2.0 * +# * +# Unless required by applicable law or agreed to in writing, * +# software distributed under the License is distributed on an * +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY * +# KIND, either express or implied. See the License for the * +# specific language governing permissions and limitations * +# under the License. * + +# This script checks if deployments are working and then +# starts the specified containers in case one of the containers +# crashes. 
# The only solution is restarting docker as of now.

# NOTE: the interpreter line below is kept where the script had it, after the
# header comment. A shebang is only honoured on the first line of a file, so
# here it is an ordinary comment and the script has to be started with bash.
#!/bin/bash

## List of modules
modules=(testapi reporting)

## Ports of the modules
declare -A ports=(["testapi"]="8082" ["reporting"]="8084")

## Urls to check if the modules are deployed or not
declare -A urls=(
    ["testapi"]="http://testresults.opnfv.org/test/"
    ["reporting"]="http://testresults.opnfv.org/reporting/index.html"
)

### Functions related to checking.

# Tell whether the automate-docker-deploy job of a module is running.
# Arguments: $1 - module name, used to build the Jenkins job name
# Returns:   1 only when the job's <building> element reads "false";
#            0 in every other case, including an unreachable or unparsable
#            reply, so an unknown state is treated as "deploying".
function is_deploying() {
    local xml building
    xml=$(curl -m10 "https://build.opnfv.org/ci/job/${1}-automate-docker-deploy-master/lastBuild/api/xml?depth=1")
    # The look-behind has to name the element: with nothing inside it the
    # pattern picks up arbitrary text instead of the flag's value.
    # head keeps a single value if the element occurs more than once.
    building=$(grep -oPm1 "(?<=<building>)[^<]+" <<< "$xml" | head -n 1)
    [[ "$building" != "false" ]]
}

# Report whether the docker service is active.
# Outputs: "Docker status: <state>" on stdout
# Returns: 0 when the state is "active", 1 otherwise (also when no state
#          could be read)
function get_docker_status() {
    local status
    # Select the "Active:" row by its label rather than by line number, so
    # additional rows in the status output do not shift the result.
    status=$(service docker status | awk '$1 == "Active:" { print $2; exit }')
    echo -e "Docker status: $status"
    [[ "$status" == "active" ]]
}

# Probe a module URL.
# Arguments: $1 - module name (only used in the log line)
#            $2 - URL to request
# Returns:   0 when the response status code is 200, 1 otherwise
function check_connectivity() {
    echo "Checking $1 connection : $2"
    # Look at the status code on the status line instead of the literal
    # "200 OK"; HTTP/2 status lines carry no reason phrase.
    curl --head -m10 --request GET "$2" | grep -Eq '^HTTP/[0-9.]+ 200'
}

# Check every module and collect the ones that need fixing.
# Globals:  modules, urls (read); failed_modules (written)
# Returns:  1 when at least one module is unreachable and not being deployed
# NOTE:     when no module needs fixing this function ends the whole script
#           with "exit 0"; the main flow depends on that to stop as soon as
#           a check succeeds.
function check_modules() {
    local module
    echo -e "Checking modules"
    failed_modules=()
    for module in "${modules[@]}"; do
        if ! check_connectivity "$module" "${urls[$module]}"; then
            if ! is_deploying "$module"; then
                echo -e "$module failed"
                failed_modules+=("$module")
            fi
        fi
    done
    if (( ${#failed_modules[@]} > 0 )); then
        # [*] lists every failed module, not just the first array element.
        echo -e "Failed Modules: ${failed_modules[*]}"
        false
    else
        echo -e "All modules working good"
        exit 0
    fi
}

### Functions related fixes.

# Restart the docker service, then restart the container of every module.
# Globals: modules (read)
function restart_docker_fix() {
    echo -e "Running restart_docker_fix"
    service docker restart
    start_containers_fix "${modules[@]}"
}

# Kill whatever listens on the port of each given module, then restart that
# module's container. Modules without a listener are left untouched.
# Globals:   ports (read)
# Arguments: module names
function docker_proxy_fix() {
    local module
    local -a pids
    echo -e "Running docker_proxy_fix"
    for module in "$@"; do
        echo -e "Kill docker proxy and restart containers"
        # Match the port at the end of the local-address column only, and
        # keep numeric owners only: netstat prints "-" when it cannot see
        # the owning process, and that must not reach kill.
        mapfile -t pids < <(
            netstat -nlp \
                | awk -v port="${ports[$module]}" '
                    $4 ~ (":" port "$") {
                        split($7, owner, "/")
                        if (owner[1] ~ /^[0-9]+$/) print owner[1]
                    }' \
                | sort -u
        )
        echo "${pids[*]}"
        if (( ${#pids[@]} > 0 )); then
            kill "${pids[@]}"
            start_container_fix "$module"
        fi
    done
}

# Restart the container of every given module.
# Arguments: module names
function start_containers_fix() {
    local module
    for module in "$@"; do
        start_container_fix "$module"
    done
}

# Restart one module's container; when the module is still unreachable
# afterwards, restart its "<module>_old" container instead.
# Globals:   urls (read)
# Arguments: $1 - module name
function start_container_fix() {
    # Take the name from the argument rather than from the caller's loop
    # variable.
    local module=$1
    echo -e "Starting a container $module"
    sudo docker restart "$module"
    sleep 5
    if ! check_connectivity "$module" "${urls[$module]}"; then
        # Braces are required: $module_old would expand a different,
        # unset variable.
        echo -e "Starting an old container ${module}_old"
        sudo docker restart "${module}_old"
        sleep 5
    fi
}

### Main Flow

echo
echo -e "WatchDog Started"
echo
date "+%Y-%m-%d %H:%M:%S.%N"
echo

## If the problem is related to docker daemon

if ! get_docker_status; then
    restart_docker_fix
    if ! check_modules; then
        echo -e "Watchdog failed while restart_docker_fix"
    fi
    exit
fi

## If the problem is related to docker proxy

if ! check_modules; then
    docker_proxy_fix "${failed_modules[@]}"
fi

## If any other problem : restart docker

if ! check_modules; then
    restart_docker_fix
fi

## If nothing works out

if ! check_modules; then
    echo -e "Watchdog failed"
fi

sudo docker ps
sudo docker images