path: root/tools/kubernetes/k8s-cluster.sh
author    Bryan Sullivan <bryan.sullivan@att.com>    2017-10-17 09:55:26 -0700
committer Bryan Sullivan <bryan.sullivan@att.com>    2017-10-17 09:55:26 -0700
commit b2c252cd8913ef15a00d63a391da1c8a8a17d739 (patch)
tree   ec09ee2ae95c995ed7a363c1dce7ee40c31a9f99 /tools/kubernetes/k8s-cluster.sh
parent 7ecbfa707b3b1b1ac65924b097e4ff9b72ad981f (diff)
Merge AT&T WIP on modeled cloud-native stacks into Models
Change-Id: I646825bf7d1a9c1be9c00475028084f920c9d399
Signed-off-by: Bryan Sullivan <bryan.sullivan@att.com>
Diffstat (limited to 'tools/kubernetes/k8s-cluster.sh')
-rw-r--r--  tools/kubernetes/k8s-cluster.sh  438
1 file changed, 438 insertions, 0 deletions
diff --git a/tools/kubernetes/k8s-cluster.sh b/tools/kubernetes/k8s-cluster.sh
new file mode 100644
index 0000000..6a91cdb
--- /dev/null
+++ b/tools/kubernetes/k8s-cluster.sh
@@ -0,0 +1,438 @@
+#!/bin/bash
+# Copyright 2017 AT&T Intellectual Property, Inc
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+#. What this is: script to set up a kubernetes cluster with calico as the CNI (pod network)
+#. Prerequisites:
+#. - Ubuntu xenial server for master and agent nodes
+#. - key-based auth setup for ssh/scp between master and agent nodes
+#. - 192.168.0.0/16 should not be used on your server network interface subnets
+#. Usage:
+#. $ git clone https://gerrit.opnfv.org/gerrit/models ~/models
+#. $ cd ~/models/tools/kubernetes
+#. $ bash k8s-cluster.sh master
+#. $ bash k8s-cluster.sh agents "<nodes>"
+#. nodes: space-separated list of agent node IPs
+#. $ bash k8s-cluster.sh ceph "<nodes>" <cluster-net> <public-net> [ceph_dev]
+#. nodes: space-separated list of ceph node IPs
+#. cluster-net: CIDR of ceph cluster network e.g. 10.0.0.1/24
+#. public-net: CIDR of public network
+#. ceph_dev: disk to use for ceph. ***MUST NOT BE USED FOR ANY OTHER PURPOSE***
+#. if not provided, ceph data will be stored on osd nodes in /ceph
+#. $ bash k8s-cluster.sh helm
+#. Set up helm as the kubernetes app orchestration tool
+#. $ bash k8s-cluster.sh demo <chart>
+#. chart: name of the helm chart to install, e.g. dokuwiki or mediawiki
+#. $ bash k8s-cluster.sh all "<nodes>" <cluster-net> <public-net> [ceph_dev]
+#. Runs all the steps above
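+#.   Example with hypothetical node IPs, networks, and disk (adjust to your environment):
+#.   $ bash k8s-cluster.sh all "10.0.0.11 10.0.0.12" 10.1.0.0/24 10.0.0.0/24 /dev/sdb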
+#.
+#. Status: work in progress, incomplete
+#
+
+function setup_prereqs() {
+ echo "${FUNCNAME[0]}: Create prerequisite setup script"
+ cat <<'EOG' >/tmp/prereqs.sh
+#!/bin/bash
+# Basic server pre-reqs
+sudo apt-get -y remove kubectl kubelet kubeadm
+sudo apt-get update
+sudo apt-get upgrade -y
+# Set hostname on agent nodes
+if [[ "$1" == "agent" ]]; then
+ echo $(ip route get 8.8.8.8 | awk '{print $NF; exit}') $HOSTNAME | sudo tee -a /etc/hosts
+fi
+# Install docker 1.12 (default for xenial is 1.12.6)
+sudo apt-get install -y docker.io
+sudo service docker start
+export KUBE_VERSION=1.7.5
+# per https://kubernetes.io/docs/setup/independent/create-cluster-kubeadm/
+# Install kubelet, kubeadm, kubectl per https://kubernetes.io/docs/setup/independent/install-kubeadm/
+sudo apt-get update && sudo apt-get install -y apt-transport-https
+curl -s https://packages.cloud.google.com/apt/doc/apt-key.gpg | sudo apt-key add -
+cat <<EOF | sudo tee /etc/apt/sources.list.d/kubernetes.list
+deb http://apt.kubernetes.io/ kubernetes-xenial main
+EOF
+sudo apt-get update
+# The next command works around a bug where pods remain in Pending with "PersistentVolumeClaim is not bound" at startup
+# TODO: reverify if this is still an issue in the final working script
+sudo apt-get -y install ceph-common
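+# Pin kubectl/kubelet/kubeadm to KUBE_VERSION so master and agent nodes install the same release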
+sudo apt-get -y install --allow-downgrades kubectl=${KUBE_VERSION}-00 kubelet=${KUBE_VERSION}-00 kubeadm=${KUBE_VERSION}-00
+EOG
+}
+
+function setup_k8s_master() {
+ echo "${FUNCNAME[0]}: Setting up kubernetes master"
+ setup_prereqs
+
+ # Install master
+ bash /tmp/prereqs.sh master
+ # per https://kubernetes.io/docs/setup/independent/create-cluster-kubeadm/
+ # If the following command fails, run "kubeadm reset" before trying again
+ # --pod-network-cidr=192.168.0.0/16 is required for calico; this should not conflict with your server network interface subnets
+ sudo kubeadm init --pod-network-cidr=192.168.0.0/16 >>/tmp/kubeadm.out
+ cat /tmp/kubeadm.out
+ export k8s_joincmd=$(grep "kubeadm join" /tmp/kubeadm.out)
+ echo "${FUNCNAME[0]}: Cluster join command for manual use if needed: $k8s_joincmd"
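+  # For reference, the captured join command typically looks like (token and IP are examples):
+  #   kubeadm join --token <token> <master-ip>:6443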
+
+ # Start cluster
+ echo "${FUNCNAME[0]}: Start the cluster"
+ mkdir -p $HOME/.kube
+ sudo cp -f /etc/kubernetes/admin.conf $HOME/.kube/config
+ sudo chown $(id -u):$(id -g) $HOME/.kube/config
+ # Deploy pod network
+ echo "${FUNCNAME[0]}: Deploy calico as CNI"
+ sudo kubectl apply -f http://docs.projectcalico.org/v2.4/getting-started/kubernetes/installation/hosted/kubeadm/1.6/calico.yaml
+}
+
+function setup_k8s_agents() {
+ agents="$1"
+ export k8s_joincmd=$(grep "kubeadm join" /tmp/kubeadm.out)
+ echo "${FUNCNAME[0]}: Installing agents at $1 with joincmd: $k8s_joincmd"
+
+ setup_prereqs
+
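+  # 'kubectl get pods --all-namespaces' prints NAMESPACE NAME READY STATUS RESTARTS AGE,
+  # so awk '{print $4}' extracts the STATUS column for the kube-dns pod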
+ kubedns=$(kubectl get pods --all-namespaces | grep kube-dns | awk '{print $4}')
+ while [[ "$kubedns" != "Running" ]]; do
+ echo "${FUNCNAME[0]}: kube-dns status is $kubedns. Waiting 60 seconds for it to be 'Running'"
+ sleep 60
+ kubedns=$(kubectl get pods --all-namespaces | grep kube-dns | awk '{print $4}')
+ done
+ echo "${FUNCNAME[0]}: kube-dns status is $kubedns"
+
+ for agent in $agents; do
+ echo "${FUNCNAME[0]}: Install agent at $agent"
+ scp -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no /tmp/prereqs.sh ubuntu@$agent:/tmp/prereqs.sh
+ ssh -x -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no ubuntu@$agent bash /tmp/prereqs.sh agent
+ # Workaround for "[preflight] Some fatal errors occurred: /var/lib/kubelet is not empty" per https://github.com/kubernetes/kubeadm/issues/1
+ ssh -x -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no ubuntu@$agent sudo kubeadm reset
+ ssh -x -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no ubuntu@$agent sudo $k8s_joincmd
+ done
+
+ echo "${FUNCNAME[0]}: Cluster is ready when all nodes in the output of 'kubectl get nodes' show as 'Ready'."
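+  # e.g. run 'kubectl get nodes' (or 'watch kubectl get nodes') on the master to verify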
+}
+
+function setup_ceph() {
+ node_ips=$1
+ cluster_net=$2
+ public_net=$3
+ ceph_dev=$4
+ echo "${FUNCNAME[0]}: Deploying ceph-mon on localhost $HOSTNAME"
+ echo "${FUNCNAME[0]}: Deploying ceph-osd on nodes $node_ips"
+ echo "${FUNCNAME[0]}: Setting cluster-network=$cluster_net and public-network=$public_net"
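+  # The local IP used as the source address to reach 8.8.8.8 is taken as this host's primary IP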
+ mon_ip=$(ip route get 8.8.8.8 | awk '{print $NF; exit}')
+ all_nodes="$mon_ip $node_ips"
+ # Also caches the server fingerprints so ceph-deploy does not prompt the user
+ # Note this loop may be partially redundant with the ceph-deploy steps below
+ for node_ip in $all_nodes; do
+ echo "${FUNCNAME[0]}: Install ntp and ceph on $node_ip"
+ ssh -x -o StrictHostKeyChecking=no ubuntu@$node_ip <<EOF
+sudo timedatectl set-ntp no
+wget -q -O- 'https://download.ceph.com/keys/release.asc' | sudo apt-key add -
+echo deb https://download.ceph.com/debian/ $(lsb_release -sc) main | sudo tee /etc/apt/sources.list.d/ceph.list
+sudo apt update
+sudo apt-get install -y ntp ceph ceph-deploy
+EOF
+ done
+
+ # per http://docs.ceph.com/docs/master/start/quick-ceph-deploy/
+  # also https://upcommons.upc.edu/bitstream/handle/2117/101816/Degree_Thesis_Nabil_El_Alami.pdf
+ echo "${FUNCNAME[0]}: Create ceph config folder ~/ceph-cluster"
+ mkdir ~/ceph-cluster
+ cd ~/ceph-cluster
+
+ echo "${FUNCNAME[0]}: Create new cluster with $HOSTNAME as initial ceph-mon node"
+ ceph-deploy new --cluster-network $cluster_net --public-network $public_net --no-ssh-copykey $HOSTNAME
+ # Update conf per recommendations of http://docs.ceph.com/docs/jewel/rados/configuration/filesystem-recommendations/
+ cat <<EOF >>ceph.conf
+osd max object name len = 256
+osd max object namespace len = 64
+EOF
+ cat ceph.conf
+
+ echo "${FUNCNAME[0]}: Deploy ceph packages on other nodes"
+ ceph-deploy install $mon_ip $node_ips
+
+ echo "${FUNCNAME[0]}: Deploy the initial monitor and gather the keys"
+ ceph-deploy mon create-initial
+
+ if [[ "x$ceph_dev" == "x" ]]; then
+ n=1
+ for node_ip in $node_ips; do
+ echo "${FUNCNAME[0]}: Prepare ceph OSD on node $node_ip"
+ echo "$node_ip ceph-osd$n" | sudo tee -a /etc/hosts
+ # Using ceph-osd$n here avoids need for manual acceptance of the new server hash
+ ssh -x -o StrictHostKeyChecking=no ubuntu@ceph-osd$n <<EOF
+echo "$node_ip ceph-osd$n" | sudo tee -a /etc/hosts
+sudo mkdir /ceph && sudo chown -R ceph:ceph /ceph
+EOF
+ ceph-deploy osd prepare ceph-osd$n:/ceph
+ ceph-deploy osd activate ceph-osd$n:/ceph
+ ((n++))
+ done
+ else
+ echo "${FUNCNAME[0]}: Deploy OSDs"
+ for node_ip in $node_ips; do
+ echo "${FUNCNAME[0]}: Create ceph osd on $node_ip using $ceph_dev"
+ ceph-deploy osd create $node_ip:$ceph_dev
+ done
+ fi
+
+ echo "${FUNCNAME[0]}: Copy the config file and admin key to the admin node and OSD nodes"
+ ceph-deploy admin $mon_ip $node_ips
+
+ echo "${FUNCNAME[0]}: Check the cluster health"
+ sudo ceph health
+ sudo ceph -s
+
+ # per https://crondev.com/kubernetes-persistent-storage-ceph/ and https://github.com/kubernetes/kubernetes/issues/38923
+ # rbd is not included in default kube-controller-manager... use attcomdev version
+ sudo sed -i -- 's~gcr.io/google_containers/kube-controller-manager-amd64:.*~quay.io/attcomdev/kube-controller-manager:v1.7.3~' /etc/kubernetes/manifests/kube-controller-manager.yaml
+ if [[ $(sudo grep -c attcomdev/kube-controller-manager /etc/kubernetes/manifests/kube-controller-manager.yaml) == 0 ]]; then
+ echo "${FUNCNAME[0]}: Problem patching /etc/kubernetes/manifests/kube-controller-manager.yaml... script update needed"
+ exit 1
+ fi
+ mgr=$(kubectl get pods --all-namespaces | grep kube-controller-manager | awk '{print $4}')
+ while [[ "$mgr" != "Running" ]]; do
+ echo "${FUNCNAME[0]}: kube-controller-manager status is $mgr. Waiting 60 seconds for it to be 'Running'"
+ sleep 60
+ mgr=$(kubectl get pods --all-namespaces | grep kube-controller-manager | awk '{print $4}')
+ done
+ echo "${FUNCNAME[0]}: kube-controller-manager status is $mgr"
+
+ echo "${FUNCNAME[0]}: Create Ceph admin secret"
+ admin_key=$(sudo ceph auth get-key client.admin)
+ kubectl create secret generic ceph-secret-admin --from-literal=key="$admin_key" --namespace=kube-system --type=kubernetes.io/rbd
+
+  echo "${FUNCNAME[0]}: Create rbd storageClass 'slow'"
+ cat <<EOF >/tmp/ceph-sc.yaml
+apiVersion: storage.k8s.io/v1
+kind: StorageClass
+metadata:
+ name: slow
+provisioner: kubernetes.io/rbd
+parameters:
+ monitors: $mon_ip:6789
+ adminId: admin
+ adminSecretName: ceph-secret-admin
+ adminSecretNamespace: "kube-system"
+ pool: kube
+ userId: kube
+ userSecretName: ceph-secret-user
+EOF
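+  # The storageClass references the ceph monitor, the admin secret created above, and the
+  # 'kube' pool and user secret created in the steps below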
+  # TODO: find out where in the steps above the ~/.kube folder became owned by root
+ sudo chown -R ubuntu:ubuntu ~/.kube/*
+ kubectl create -f /tmp/ceph-sc.yaml
+
+ echo "${FUNCNAME[0]}: Create storage pool 'kube'"
+ # https://github.com/kubernetes/examples/blob/master/staging/persistent-volume-provisioning/README.md method
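+  # 32 32 = pg_num and pgp_num (placement group counts) for the new pool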
+ sudo ceph osd pool create kube 32 32
+
+ echo "${FUNCNAME[0]}: Authorize client 'kube' access to pool 'kube'"
+ sudo ceph auth get-or-create client.kube mon 'allow r' osd 'allow rwx pool=kube'
+
+ echo "${FUNCNAME[0]}: Create ceph-secret-user secret in namespace 'default'"
+ kube_key=$(sudo ceph auth get-key client.kube)
+ kubectl create secret generic ceph-secret-user --from-literal=key="$kube_key" --namespace=default --type=kubernetes.io/rbd
+ # A similar secret must be created in other namespaces that intend to access the ceph pool
+
+ # Per https://github.com/kubernetes/examples/blob/master/staging/persistent-volume-provisioning/README.md
+
+  echo "${FUNCNAME[0]}: Create and test a persistentVolumeClaim"
+ cat <<EOF >/tmp/ceph-pvc.yaml
+{
+ "kind": "PersistentVolumeClaim",
+ "apiVersion": "v1",
+ "metadata": {
+ "name": "claim1",
+ "annotations": {
+ "volume.beta.kubernetes.io/storage-class": "slow"
+ }
+ },
+ "spec": {
+ "accessModes": [
+ "ReadWriteOnce"
+ ],
+ "resources": {
+ "requests": {
+ "storage": "3Gi"
+ }
+ }
+ }
+}
+EOF
+ kubectl create -f /tmp/ceph-pvc.yaml
+ while [[ "x$(kubectl get pvc -o jsonpath='{.status.phase}' claim1)" != "xBound" ]]; do
+ echo "${FUNCNAME[0]}: Waiting for pvc claim1 to be 'Bound'"
+ kubectl describe pvc
+ sleep 10
+ done
+ echo "${FUNCNAME[0]}: pvc claim1 successfully bound to $(kubectl get pvc -o jsonpath='{.spec.volumeName}' claim1)"
+ kubectl get pvc
+ kubectl delete pvc claim1
+ kubectl describe pods
+}
+
+function wait_for_service() {
+ echo "${FUNCNAME[0]}: Waiting for service $1 to be available"
+ pod=$(kubectl get pods --namespace default | awk "/$1/ { print \$1 }")
+ echo "${FUNCNAME[0]}: Service $1 is at pod $pod"
+ ready=$(kubectl get pods --namespace default -o jsonpath='{.status.containerStatuses[0].ready}' $pod)
+ while [[ "$ready" != "true" ]]; do
+ echo "${FUNCNAME[0]}: $1 container is not yet ready... waiting 10 seconds"
+ sleep 10
+ # TODO: figure out why transient pods sometimes mess up this logic, thus need to re-get the pods
+ pod=$(kubectl get pods --namespace default | awk "/$1/ { print \$1 }")
+ ready=$(kubectl get pods --namespace default -o jsonpath='{.status.containerStatuses[0].ready}' $pod)
+ done
+ echo "${FUNCNAME[0]}: pod $pod container status is $ready"
+ host_ip=$(kubectl get pods --namespace default -o jsonpath='{.status.hostIP}' $pod)
+ port=$(kubectl get --namespace default -o jsonpath="{.spec.ports[0].nodePort}" services $1)
+ echo "${FUNCNAME[0]}: pod $pod container is at host $host_ip and port $port"
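+  # A NodePort service is reachable on any node's IP at the allocated nodePort, so poll the hosting node directly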
+ while ! curl http://$host_ip:$port ; do
+ echo "${FUNCNAME[0]}: $1 service is not yet responding... waiting 10 seconds"
+ sleep 10
+ done
+ echo "${FUNCNAME[0]}: $1 is available at http://$host_ip:$port"
+}
+
+function demo_chart() {
+ cd ~
+ rm -rf charts
+ git clone https://github.com/kubernetes/charts.git
+ cd charts/stable
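+  # Each demo below patches the upstream chart to use NodePort (no LoadBalancer on bare metal)
+  # and the 'slow' rbd storageClass created in setup_ceph, then installs it via helm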
+ case "$1" in
+ mediawiki)
+ # NOT YET WORKING
+ # mariadb: Readiness probe failed: mysqladmin: connect to server at 'localhost' failed
+ mkdir ./mediawiki/charts
+ cp -r ./mariadb ./mediawiki/charts
+ # LoadBalancer is N/A for baremetal (public cloud only) - use NodePort
+ sed -i -- 's/LoadBalancer/NodePort/g' ./mediawiki/values.yaml
+ # Select the storageClass created in the ceph setup step
+ sed -i -- 's/# storageClass:/storageClass: "slow"/g' ./mediawiki/values.yaml
+ sed -i -- 's/# storageClass: "-"/storageClass: "slow"/g' ./mediawiki/charts/mariadb/values.yaml
+ helm install --name mw -f ./mediawiki/values.yaml ./mediawiki
+ wait_for_service mw-mediawiki
+ ;;
+ dokuwiki)
+ sed -i -- 's/# storageClass:/storageClass: "slow"/g' ./dokuwiki/values.yaml
+ sed -i -- 's/LoadBalancer/NodePort/g' ./dokuwiki/values.yaml
+ helm install --name dw -f ./dokuwiki/values.yaml ./dokuwiki
+ wait_for_service dw-dokuwiki
+ ;;
+ wordpress)
+ # NOT YET WORKING
+ # mariadb: Readiness probe failed: mysqladmin: connect to server at 'localhost' failed
+ mkdir ./wordpress/charts
+ cp -r ./mariadb ./wordpress/charts
+ sed -i -- 's/LoadBalancer/NodePort/g' ./wordpress/values.yaml
+ sed -i -- 's/# storageClass: "-"/storageClass: "slow"/g' ./wordpress/values.yaml
+ sed -i -- 's/# storageClass: "-"/storageClass: "slow"/g' ./wordpress/charts/mariadb/values.yaml
+ helm install --name wp -f ./wordpress/values.yaml ./wordpress
+ wait_for_service wp-wordpress
+ ;;
+ redmine)
+ # NOT YET WORKING
+ # mariadb: Readiness probe failed: mysqladmin: connect to server at 'localhost' failed
+ mkdir ./redmine/charts
+ cp -r ./mariadb ./redmine/charts
+ cp -r ./postgresql ./redmine/charts
+ sed -i -- 's/LoadBalancer/NodePort/g' ./redmine/values.yaml
+ sed -i -- 's/# storageClass: "-"/storageClass: "slow"/g' ./redmine/values.yaml
+ sed -i -- 's/# storageClass: "-"/storageClass: "slow"/g' ./redmine/charts/mariadb/values.yaml
+ sed -i -- 's/# storageClass: "-"/storageClass: "slow"/g' ./redmine/charts/postgresql/values.yaml
+ helm install --name rdm -f ./redmine/values.yaml ./redmine
+ wait_for_service rdm-redmine
+ ;;
+ owncloud)
+ # NOT YET WORKING: needs resolvable hostname for service
+ mkdir ./owncloud/charts
+ cp -r ./mariadb ./owncloud/charts
+ sed -i -- 's/LoadBalancer/NodePort/g' ./owncloud/values.yaml
+ sed -i -- 's/# storageClass: "-"/storageClass: "slow"/g' ./owncloud/values.yaml
+ sed -i -- 's/# storageClass: "-"/storageClass: "slow"/g' ./owncloud/charts/mariadb/values.yaml
+ helm install --name oc -f ./owncloud/values.yaml ./owncloud
+ wait_for_service oc-owncloud
+ ;;
+ *)
+ echo "${FUNCNAME[0]}: demo not implemented for $1"
+ esac
+# extra useful commands
+# kubectl describe pvc
+# kubectl get pvc
+# kubectl describe pods
+# kubectl get pods --namespace default
+# kubectl get pods --all-namespaces
+# kubectl get svc --namespace default dw-dokuwiki
+# kubectl describe svc --namespace default dw-dokuwiki
+# kubectl describe pods --namespace default dw-dokuwiki
+}
+
+function setup_helm() {
+ echo "${FUNCNAME[0]}: Setup helm"
+ # Install Helm
+ cd ~
+ curl https://raw.githubusercontent.com/kubernetes/helm/master/scripts/get > get_helm.sh
+ chmod 700 get_helm.sh
+ ./get_helm.sh
+ helm init
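+  # 'helm init' deploys the tiller server component into the kube-system namespace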
+ helm repo update
+ # TODO: Workaround for bug https://github.com/kubernetes/helm/issues/2224
+ # For testing use only!
+ kubectl create clusterrolebinding permissive-binding --clusterrole=cluster-admin --user=admin --user=kubelet --group=system:serviceaccounts;
+ # TODO: workaround for tiller FailedScheduling (No nodes are available that match all of the following predicates:: PodToleratesNodeTaints (1).)
+ # kubectl taint nodes $HOSTNAME node-role.kubernetes.io/master:NoSchedule-
+ # Wait till tiller is running
+ tiller_deploy=$(kubectl get pods --all-namespaces | grep tiller-deploy | awk '{print $4}')
+ while [[ "$tiller_deploy" != "Running" ]]; do
+ echo "${FUNCNAME[0]}: tiller-deploy status is $tiller_deploy. Waiting 60 seconds for it to be 'Running'"
+ sleep 60
+ tiller_deploy=$(kubectl get pods --all-namespaces | grep tiller-deploy | awk '{print $4}')
+ done
+ echo "${FUNCNAME[0]}: tiller-deploy status is $tiller_deploy"
+
+ # Install services via helm charts from https://kubeapps.com/charts
+ # e.g. helm install stable/dokuwiki
+}
+
+export WORK_DIR=$(pwd)
+case "$1" in
+ master)
+ setup_k8s_master
+ ;;
+ agents)
+ setup_k8s_agents "$2"
+ ;;
+ ceph)
+ setup_ceph "$2" $3 $4 $5
+ ;;
+ helm)
+ setup_helm
+ ;;
+ demo)
+ demo_chart $2
+ ;;
+ all)
+ setup_k8s_master
+ setup_k8s_agents "$2"
+ setup_ceph "$2" $3 $4 $5
+ setup_helm
+ demo_chart dokuwiki
+ ;;
+ clean)
+ # TODO
+ ;;
+ *)
+ if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then grep '#. ' $0; fi
+esac