From 793d26c27c1e24d0e15e1e882f68841446b095ac Mon Sep 17 00:00:00 2001 From: Martin Klozik Date: Thu, 6 Dec 2018 08:24:51 +0100 Subject: ONAP installation improvements Several modifications of the installation process were made to improve stability during CI runs. New features: * increased robustness * increased number of K8S slaves * even VM dispatching among computes to avoid overcommitting of compute nodes, which led to K8S slave crashes * report installation status JIRA: AUTO-79 Change-Id: I6eca0a7203dce0256dc914028989d3fb21d532e7 Signed-off-by: Martin Klozik --- ci/deploy-onap-fuel.sh | 49 +++++++++++++++++++--- ci/deploy-onap.sh | 109 ++++++++++++++++++++++++------------------------- 2 files changed, 96 insertions(+), 62 deletions(-) diff --git a/ci/deploy-onap-fuel.sh b/ci/deploy-onap-fuel.sh index cc4dfb1..2e8c3ad 100755 --- a/ci/deploy-onap-fuel.sh +++ b/ci/deploy-onap-fuel.sh @@ -33,7 +33,7 @@ CMP_MIN_MEM=${CMP_MIN_MEM:-64000} # MB RAM of the weakest compute node CMP_MIN_CPUS=${CMP_MIN_CPUS:-36} # CPU count of the weakest compute node # size of storage for instances CMP_STORAGE_TOTAL=${CMP_STORAGE_TOTAL:-$((80*$CMP_COUNT))} -VM_COUNT=${VM_COUNT:-2} # number of VMs available for k8s cluster +VM_COUNT=${VM_COUNT:-6} # number of VMs available for k8s cluster # # Functions @@ -163,27 +163,64 @@ openstack security group rule create --remote-ip $PUBLIC_NET --proto tcp \ openstack security group rule create --remote-ip $PUBLIC_NET --proto udp \ --dst-port 1:65535 onap_security_group +# Get list of hypervisors and their zone +HOST_ZONE=$(openstack host list -f value | grep compute | head -n1 | cut -d' ' -f3) +HOST_NAME=($(openstack host list -f value | grep compute | cut -d' ' -f1)) +HOST_COUNT=$(echo ${HOST_NAME[@]} | wc -w) # Create VMs and assign floating IPs to them VM_ITER=1 +HOST_ITER=0 while [ $VM_ITER -le $VM_COUNT ] ; do openstack floating ip create floating_net VM_NAME[$VM_ITER]="onap_vm${VM_ITER}" VM_IP[$VM_ITER]=$(openstack floating ip list -c "Floating
IP Address" \ -c "Port" -f value | grep None | cut -f1 -d " " | head -n1) + # dispatch new VMs among compute nodes in round robin fashion openstack server create --flavor onap.large --image xenial \ --nic net-id=onap_private_network --security-group onap_security_group \ - --key-name onap_key ${VM_NAME[$VM_ITER]} + --key-name onap_key ${VM_NAME[$VM_ITER]} \ + --availability-zone ${HOST_ZONE}:${HOST_NAME[$HOST_ITER]} sleep 5 # wait for VM init before floating IP can be assigned openstack server add floating ip ${VM_NAME[$VM_ITER]} ${VM_IP[$VM_ITER]} VM_ITER=$(($VM_ITER+1)) + HOST_ITER=$(($HOST_ITER+1)) + [ $HOST_ITER -ge $HOST_COUNT ] && HOST_ITER=0 done -openstack server list +echo "Waiting for VMs to start up for 2m at $(date)" +sleep 2m + +openstack server list -c ID -c Name -c Status -c Networks -c Host --long + +# check that SSH to all VMs is working +SSH_OPTIONS="-i $SSH_IDENTITY -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no" +COUNTER=1 +while [ $COUNTER -le 10 ] ; do + VM_UP=0 + VM_ITER=1 + while [ $VM_ITER -le $VM_COUNT ] ; do + if ssh $SSH_OPTIONS -l $SSH_USER ${VM_IP[$VM_ITER]} exit &>/dev/null ; then + VM_UP=$(($VM_UP+1)) + echo "${VM_NAME[$VM_ITER]} ${VM_IP[$VM_ITER]}: up" + else + echo "${VM_NAME[$VM_ITER]} ${VM_IP[$VM_ITER]}: down" + fi + VM_ITER=$(($VM_ITER+1)) + done + COUNTER=$(($COUNTER+1)) + if [ $VM_UP -eq $VM_COUNT ] ; then + break + fi + echo "Waiting for VMs to be accessible via ssh for 2m at $(date)" + sleep 2m +done -echo "Waiting for VMs to start up for 5 minutes at $(date)" -sleep 5m +openstack server list -c ID -c Name -c Status -c Networks -c Host --long -openstack server list +if [ $VM_UP -ne $VM_COUNT ] ; then + echo "Only $VM_UP from $VM_COUNT VMs are accessible via ssh. Installation will be terminated." 
+ exit 1 +fi # Start ONAP installation DATE_START=$(date) diff --git a/ci/deploy-onap.sh b/ci/deploy-onap.sh index 4d7e3b3..c34eb56 100755 --- a/ci/deploy-onap.sh +++ b/ci/deploy-onap.sh @@ -61,6 +61,19 @@ fi # # Installation # + +# use standalone K8S master if there are enough VMs available for the K8S cluster +SERVERS_COUNT=$(echo $SERVERS | wc -w) +if [ $SERVERS_COUNT -gt 2 ] ; then + RANCHER_SLAVES=$SLAVES +else + RANCHER_SLAVES=$SERVERS +fi + +echo "INSTALLATION TOPOLOGY:" +echo "Rancher Master: $MASTER" +echo "Rancher Slaves: $RANCHER_SLAVES" +echo echo "INSTALLING DOCKER ON ALL MACHINES" echo "$SERVERS" @@ -223,9 +236,9 @@ HOSTREGTOKEN=$(ssh $SSH_OPTIONS $SSH_USER@"$MASTER" cat /tmp/rancher_register_ho echo "$HOSTREGTOKEN" echo "REGISTERING HOSTS WITH RANCHER ENVIRONMENT '$ENVIRON'" -echo "$SERVERS" +echo "$RANCHER_SLAVES" -for MACHINE in $SERVERS; +for MACHINE in $RANCHER_SLAVES; do ssh $SSH_OPTIONS $SSH_USER@"$MACHINE" "bash -s" < \$TMP_POD_LIST - return \$(cat \$TMP_POD_LIST | wc -l) + kubectl get pods --namespace $ENVIRON > $TMP_POD_LIST + return \$(cat $TMP_POD_LIST | wc -l) } -FAILED_PODS_LIMIT=1 # maximal number of falied ONAP PODs -ALL_PODS_LIMIT=20 # minimum ONAP PODs to be up & running -MAX_WAIT_PERIODS=500 # over 2 hours +FAILED_PODS_LIMIT=1 # maximal number of failed ONAP PODs +ALL_PODS_LIMIT=20 # minimum ONAP PODs to be up & running +WAIT_PERIOD=60 # wait period in seconds +MAX_WAIT_TIME=\$((3600*3)) # max wait time in seconds +MAX_WAIT_PERIODS=\$((\$MAX_WAIT_TIME/\$WAIT_PERIOD)) COUNTER=0 get_onap_pods ALL_PODS=\$? -PENDING=\$(grep -E '0/|1/2' \$TMP_POD_LIST | wc -l) +PENDING=\$(grep -E '0/|1/2' $TMP_POD_LIST | wc -l) while [ \$PENDING -gt \$FAILED_PODS_LIMIT -o \$ALL_PODS -lt \$ALL_PODS_LIMIT ]; do - # print header every 20th lines + # print header every 20th line if [ \$COUNTER -eq \$((\$COUNTER/20*20)) ] ; then printf "%-3s %-29s %-3s/%s\n" "Nr." 
"Datetime of check" "Err" "Total PODs" fi COUNTER=\$((\$COUNTER+1)) printf "%3s %-29s %3s/%-3s\n" \$COUNTER "\$(date)" \$PENDING \$ALL_PODS - sleep 15 + sleep \$WAIT_PERIOD if [ "\$MAX_WAIT_PERIODS" -eq \$COUNTER ]; then FAILED_PODS_LIMIT=800 ALL_PODS_LIMIT=0 fi get_onap_pods ALL_PODS=\$? - PENDING=\$(grep -E '0/|1/2' \$TMP_POD_LIST | wc -l) + PENDING=\$(grep -E '0/|1/2' $TMP_POD_LIST | wc -l) done -echo "Report on non-running containers" get_onap_pods -grep -E '0/|1/2' \$TMP_POD_LIST +cp $TMP_POD_LIST ~/onap_all_pods.txt echo - -echo "sleep 5 min - to allow rest frameworks to finish at \$(date)" -sleep 5m -echo "run healthcheck 2 times to warm caches and frameworks"\ - "so rest endpoints report properly - see OOM-447" - -echo "curl with aai cert to cloud-region PUT" -curl -X PUT https://127.0.0.1:30233/aai/v11/cloud-infrastructure/\ -cloud-regions/cloud-region/CloudOwner/RegionOne \ ---data "@aai-cloud-region-put.json" \ --H "authorization: Basic TW9kZWxMb2FkZXI6TW9kZWxMb2FkZXI=" \ --H "X-TransactionId:jimmy-postman" \ --H "X-FromAppId:AAI" \ --H "Content-Type:application/json" \ --H "Accept:application/json" \ ---cacert aaiapisimpledemoopenecomporg_20171003.crt -k - -echo "get the cloud region back" -curl -X GET https://127.0.0.1:30233/aai/v11/cloud-infrastructure/\ -cloud-regions/ \ --H "authorization: Basic TW9kZWxMb2FkZXI6TW9kZWxMb2FkZXI=" \ --H "X-TransactionId:jimmy-postman" \ --H "X-FromAppId:AAI" \ --H "Content-Type:application/json" \ --H "Accept:application/json" \ ---cacert aaiapisimpledemoopenecomporg_20171003.crt -k - -# OOM-484 - robot scripts moved +echo "========================" +echo "ONAP INSTALLATION REPORT" +echo "========================" +echo +echo "List of Failed PODs" +echo "-------------------" +grep -E '0/|1/2' $TMP_POD_LIST | tee ~/onap_failed_pods.txt +echo +echo "Summary:" +echo "--------" +echo " PODs Failed: \$(cat ~/onap_failed_pods.txt | wc -l)" +echo " PODs Total: \$(cat ~/onap_all_pods.txt | wc -l)" +echo +echo "ONAP health TC 
results" +echo "----------------------" cd oom/kubernetes/robot -echo -e "\nrun healthcheck prep 1" -# OOM-722 adds namespace parameter -./ete-k8s.sh $ENVIRON health > ~/health1.out -echo "sleep 5 min at \$(date)" -sleep 5m - -echo "run healthcheck prep 2" -./ete-k8s.sh $ENVIRON health > ~/health2.out - -echo "run healthcheck for real - wait a further 5 min at \$(date)" -sleep 5m -./ete-k8s.sh $ENVIRON health +./ete-k8s.sh $ENVIRON health | tee ~/onap_health.txt +echo "===============================" +echo "END OF ONAP INSTALLATION REPORT" +echo "===============================" OOMDEPLOY echo "Finished install, ruturned from Master at $(date)" -- cgit 1.2.3-korg