path: root/tools/kubernetes/k8s-cluster.sh
author    Bryan Sullivan <bryan.sullivan@att.com>    2017-10-17 09:55:26 -0700
committer Bryan Sullivan <bryan.sullivan@att.com>    2017-10-17 09:55:26 -0700
commit b2c252cd8913ef15a00d63a391da1c8a8a17d739 (patch)
tree   ec09ee2ae95c995ed7a363c1dce7ee40c31a9f99 /tools/kubernetes/k8s-cluster.sh
parent 7ecbfa707b3b1b1ac65924b097e4ff9b72ad981f (diff)
Merge AT&T WIP on modeled cloud-native stacks into Models
Change-Id: I646825bf7d1a9c1be9c00475028084f920c9d399
Signed-off-by: Bryan Sullivan <bryan.sullivan@att.com>
Diffstat (limited to 'tools/kubernetes/k8s-cluster.sh')
-rw-r--r--  tools/kubernetes/k8s-cluster.sh  438
1 file changed, 438 insertions, 0 deletions
diff --git a/tools/kubernetes/k8s-cluster.sh b/tools/kubernetes/k8s-cluster.sh
new file mode 100644
index 0000000..6a91cdb
--- /dev/null
+++ b/tools/kubernetes/k8s-cluster.sh
@@ -0,0 +1,438 @@
+#!/bin/bash
+# Copyright 2017 AT&T Intellectual Property, Inc
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+#. What this is: script to set up a kubernetes cluster with calico as the CNI (pod network)
+#. Prerequisites:
+#. - Ubuntu xenial server for master and agent nodes
+#. - key-based auth setup for ssh/scp between master and agent nodes
+#. - 192.168.0.0/16 should not be used on your server network interface subnets
+#. Usage:
+#. $ git clone https://gerrit.opnfv.org/gerrit/models ~/models
+#. $ cd ~/models/tools/kubernetes
+#. $ bash k8s-cluster.sh master
+#. $ bash k8s-cluster.sh agents "<nodes>"
+#. nodes: space-separated list of agent node IPs
+#. $ bash k8s-cluster.sh ceph "<nodes>" <cluster-net> <public-net> [ceph_dev]
+#. nodes: space-separated list of ceph node IPs
+#. cluster-net: CIDR of ceph cluster network e.g. 10.0.0.1/24
+#. public-net: CIDR of public network
+#. ceph_dev: disk to use for ceph. ***MUST NOT BE USED FOR ANY OTHER PURPOSE***
+#. if not provided, ceph data will be stored on osd nodes in /ceph
+#. $ bash k8s-cluster.sh helm
+#. Set up helm as the kubernetes app orchestration tool
+#. $ bash k8s-cluster.sh demo <chart>
+#. chart: name of the helm chart to install, e.g. dokuwiki or mediawiki
+#. $ bash k8s-cluster.sh all "<nodes>" <cluster-net> <public-net> [ceph_dev]
+#. Runs all the steps above
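+#.   Example with hypothetical node IPs, networks, and disk (adjust to your environment):
+#.   $ bash k8s-cluster.sh all "10.0.0.11 10.0.0.12" 10.1.0.0/24 10.0.0.0/24 /dev/sdb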
+#.
+#. Status: work in progress, incomplete
+#
+
+function setup_prereqs() {
+ echo "${FUNCNAME[0]}: Create prerequisite setup script"
+ cat <<'EOG' >/tmp/prereqs.sh
+#!/bin/bash
+# Basic server pre-reqs
+sudo apt-get -y remove kubectl kubelet kubeadm
+sudo apt-get update
+sudo apt-get upgrade -y
+# Set hostname on agent nodes
+if [[ "$1" == "agent" ]]; then
+ echo $(ip route get 8.8.8.8 | awk '{print $NF; exit}') $HOSTNAME | sudo tee -a /etc/hosts
+fi
+# Install docker 1.12 (default for xenial is 1.12.6)
+sudo apt-get install -y docker.io
+sudo service docker start
+export KUBE_VERSION=1.7.5
+# per https://kubernetes.io/docs/setup/independent/create-cluster-kubeadm/
+# Install kubelet, kubeadm, kubectl per https://kubernetes.io/docs/setup/independent/install-kubeadm/
+sudo apt-get update && sudo apt-get install -y apt-transport-https
+curl -s https://packages.cloud.google.com/apt/doc/apt-key.gpg | sudo apt-key add -
+cat <<EOF | sudo tee /etc/apt/sources.list.d/kubernetes.list
+deb http://apt.kubernetes.io/ kubernetes-xenial main
+EOF
+sudo apt-get update
+# The next command works around a bug where pods remain in Pending with "PersistentVolumeClaim is not bound" at startup
+# TODO: reverify if this is still an issue in the final working script
+sudo apt-get -y install ceph-common
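+# Pin kubectl/kubelet/kubeadm to KUBE_VERSION so master and agent nodes install the same release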
+sudo apt-get -y install --allow-downgrades kubectl=${KUBE_VERSION}-00 kubelet=${KUBE_VERSION}-00 kubeadm=${KUBE_VERSION}-00
+EOG
+}
+
+function setup_k8s_master() {
+ echo "${FUNCNAME[0]}: Setting up kubernetes master"
+ setup_prereqs
+
+ # Install master
+ bash /tmp/prereqs.sh master
+ # per https://kubernetes.io/docs/setup/independent/create-cluster-kubeadm/
+ # If the following command fails, run "kubeadm reset" before trying again
+ # --pod-network-cidr=192.168.0.0/16 is required for calico; this should not conflict with your server network interface subnets
+ sudo kubeadm init --pod-network-cidr=192.168.0.0/16 >>/tmp/kubeadm.out
+ cat /tmp/kubeadm.out
+ export k8s_joincmd=$(grep "kubeadm join" /tmp/kubeadm.out)
+ echo "${FUNCNAME[0]}: Cluster join command for manual use if needed: $k8s_joincmd"
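+  # For reference, the captured join command typically looks like (token and IP are examples):
+  #   kubeadm join --token <token> <master-ip>:6443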
+
+ # Start cluster
+ echo "${FUNCNAME[0]}: Start the cluster"
+ mkdir -p $HOME/.kube
+ sudo cp -f /etc/kubernetes/admin.conf $HOME/.kube/config
+ sudo chown $(id -u):$(id -g) $HOME/.kube/config
+ # Deploy pod network
+ echo "${FUNCNAME[0]}: Deploy calico as CNI"
+ sudo kubectl apply -f http://docs.projectcalico.org/v2.4/getting-started/kubernetes/installation/hosted/kubeadm/1.6/calico.yaml
+}
+
+function setup_k8s_agents() {
+ agents="$1"
+ export k8s_joincmd=$(grep "kubeadm join" /tmp/kubeadm.out)
+ echo "${FUNCNAME[0]}: Installing agents at $1 with joincmd: $k8s_joincmd"
+
+ setup_prereqs
+
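+  # 'kubectl get pods --all-namespaces' prints NAMESPACE NAME READY STATUS RESTARTS AGE,
+  # so awk '{print $4}' extracts the STATUS column for the kube-dns pod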
+ kubedns=$(kubectl get pods --all-namespaces | grep kube-dns | awk '{print $4}')
+ while [[ "$kubedns" != "Running" ]]; do
+ echo "${FUNCNAME[0]}: kube-dns status is $kubedns. Waiting 60 seconds for it to be 'Running'"
+ sleep 60
+ kubedns=$(kubectl get pods --all-namespaces | grep kube-dns | awk '{print $4}')
+ done
+ echo "${FUNCNAME[0]}: kube-dns status is $kubedns"
+
+ for agent in $agents; do
+ echo "${FUNCNAME[0]}: Install agent at $agent"
+ scp -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no /tmp/prereqs.sh ubuntu@$agent:/tmp/prereqs.sh
+ ssh -x -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no ubuntu@$agent bash /tmp/prereqs.sh agent
+ # Workaround for "[preflight] Some fatal errors occurred: /var/lib/kubelet is not empty" per https://github.com/kubernetes/kubeadm/issues/1
+ ssh -x -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no ubuntu@$agent sudo kubeadm reset
+ ssh -x -o UserKnownHostsFile=/dev/null -o StrictHostKeyChecking=no ubuntu@$agent sudo $k8s_joincmd
+ done
+
+ echo "${FUNCNAME[0]}: Cluster is ready when all nodes in the output of 'kubectl get nodes' show as 'Ready'."
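+  # e.g. run 'kubectl get nodes' (or 'watch kubectl get nodes') on the master to verify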
+}
+
+function setup_ceph() {
+ node_ips=$1
+ cluster_net=$2
+ public_net=$3
+ ceph_dev=$4
+ echo "${FUNCNAME[0]}: Deploying ceph-mon on localhost $HOSTNAME"
+ echo "${FUNCNAME[0]}: Deploying ceph-osd on nodes $node_ips"
+ echo "${FUNCNAME[0]}: Setting cluster-network=$cluster_net and public-network=$public_net"
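+  # The local IP used as the source address to reach 8.8.8.8 is taken as this host's primary IP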
+ mon_ip=$(ip route get 8.8.8.8 | awk '{print $NF; exit}')
+ all_nodes="$mon_ip $node_ips"
+ # Also caches the server fingerprints so ceph-deploy does not prompt the user
+ # Note this loop may be partially redundant with the ceph-deploy steps below
+ for node_ip in $all_nodes; do
+ echo "${FUNCNAME[0]}: Install ntp and ceph on $node_ip"
+ ssh -x -o StrictHostKeyChecking=no ubuntu@$node_ip <<EOF
+sudo timedatectl set-ntp no
+wget -q -O- 'https://download.ceph.com/keys/release.asc' | sudo apt-key add -
+echo deb https://download.ceph.com/debian/ $(lsb_release -sc) main | sudo tee /etc/apt/sources.list.d/ceph.list
+sudo apt update
+sudo apt-get install -y ntp ceph ceph-deploy
+EOF
+ done
+
+ # per http://docs.ceph.com/docs/master/start/quick-ceph-deploy/
+  # also https://upcommons.upc.edu/bitstream/handle/2117/101816/Degree_Thesis_Nabil_El_Alami.pdf
+ echo "${FUNCNAME[0]}: Create ceph config folder ~/ceph-cluster"
+ mkdir ~/ceph-cluster
+ cd ~/ceph-cluster
+
+ echo "${FUNCNAME[0]}: Create new cluster with $HOSTNAME as initial ceph-mon node"
+ ceph-deploy new --cluster-network $cluster_net --public-network $public_net --no-ssh-copykey $HOSTNAME
+ # Update conf per recommendations of http://docs.ceph.com/docs/jewel/rados/configuration/filesystem-recommendations/
+ cat <<EOF >>ceph.conf
+osd max object name len = 256
+osd max object namespace len = 64
+EOF
+ cat ceph.conf
+
+ echo "${FUNCNAME[0]}: Deploy ceph packages on other nodes"
+ ceph-deploy install $mon_ip $node_ips
+
+ echo "${FUNCNAME[0]}: Deploy the initial monitor and gather the keys"
+ ceph-deploy mon create-initial
+
+ if [[ "x$ceph_dev" == "x" ]]; then
+ n=1
+ for node_ip in $node_ips; do
+ echo "${FUNCNAME[0]}: Prepare ceph OSD on node $node_ip"
+ echo "$node_ip ceph-osd$n" | sudo tee -a /etc/hosts
+ # Using ceph-osd$n here avoids need for manual acceptance of the new server hash
+ ssh -x -o StrictHostKeyChecking=no ubuntu@ceph-osd$n <<EOF
+echo "$node_ip ceph-osd$n" | sudo tee -a /etc/hosts
+sudo mkdir /ceph && sudo chown -R ceph:ceph /ceph
+EOF
+ ceph-deploy osd prepare ceph-osd$n:/ceph
+ ceph-deploy osd activate ceph-osd$n:/ceph
+ ((n++))
+ done
+ else
+ echo "${FUNCNAME[0]}: Deploy OSDs"
+ for node_ip in $node_ips; do
+ echo "${FUNCNAME[0]}: Create ceph osd on $node_ip using $ceph_dev"
+ ceph-deploy osd create $node_ip:$ceph_dev
+ done
+ fi
+
+ echo "${FUNCNAME[0]}: Copy the config file and admin key to the admin node and OSD nodes"
+ ceph-deploy admin $mon_ip $node_ips
+
+ echo "${FUNCNAME[0]}: Check the cluster health"
+ sudo ceph health
+ sudo ceph -s
+
+ # per https://crondev.com/kubernetes-persistent-storage-ceph/ and https://github.com/kubernetes/kubernetes/issues/38923
+ # rbd is not included in default kube-controller-manager... use attcomdev version
+ sudo sed -i -- 's~gcr.io/google_containers/kube-controller-manager-amd64:.*~quay.io/attcomdev/kube-controller-manager:v1.7.3~' /etc/kubernetes/manifests/kube-controller-manager.yaml
+ if [[ $(sudo grep -c attcomdev/kube-controller-manager /etc/kubernetes/manifests/kube-controller-manager.yaml) == 0 ]]; then
+ echo "${FUNCNAME[0]}: Problem patching /etc/kubernetes/manifests/kube-controller-manager.yaml... script update needed"
+ exit 1
+ fi
+ mgr=$(kubectl get pods --all-namespaces | grep kube-controller-manager | awk '{print $4}')
+ while [[ "$mgr" != "Running" ]]; do
+ echo "${FUNCNAME[0]}: kube-controller-manager status is $mgr. Waiting 60 seconds for it to be 'Running'"
+ sleep 60
+ mgr=$(kubectl get pods --all-namespaces | grep kube-controller-manager | awk '{print $4}')
+ done
+ echo "${FUNCNAME[0]}: kube-controller-manager status is $mgr"
+
+ echo "${FUNCNAME[0]}: Create Ceph admin secret"
+ admin_key=$(sudo ceph auth get-key client.admin)
+ kubectl create secret generic ceph-secret-admin --from-literal=key="$admin_key" --namespace=kube-system --type=kubernetes.io/rbd
+
+  echo "${FUNCNAME[0]}: Create rbd storageClass 'slow'"
+ cat <<EOF >/tmp/ceph-sc.yaml
+apiVersion: storage.k8s.io/v1
+kind: StorageClass
+metadata:
+ name: slow
+provisioner: kubernetes.io/rbd
+parameters:
+ monitors: $mon_ip:6789
+ adminId: admin
+ adminSecretName: ceph-secret-admin
+ adminSecretNamespace: "kube-system"
+ pool: kube
+ userId: kube
+ userSecretName: ceph-secret-user
+EOF
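+  # The storageClass references the ceph monitor, the admin secret created above, and the
+  # 'kube' pool and user secret created in the steps below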
+  # TODO: find out where in the steps above the ~/.kube folder became owned by root
+ sudo chown -R ubuntu:ubuntu ~/.kube/*
+ kubectl create -f /tmp/ceph-sc.yaml
+
+ echo "${FUNCNAME[0]}: Create storage pool 'kube'"
+ # https://github.com/kubernetes/examples/blob/master/staging/persistent-volume-provisioning/README.md method
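+  # 32 32 = pg_num and pgp_num (placement group counts) for the new pool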
+ sudo ceph osd pool create kube 32 32
+
+ echo "${FUNCNAME[0]}: Authorize client 'kube' access to pool 'kube'"
+ sudo ceph auth get-or-create client.kube mon 'allow r' osd 'allow rwx pool=kube'
+
+ echo "${FUNCNAME[0]}: Create ceph-secret-user secret in namespace 'default'"
+ kube_key=$(sudo ceph auth get-key client.kube)
+ kubectl create secret generic ceph-secret-user --from-literal=key="$kube_key" --namespace=default --type=kubernetes.io/rbd
+ # A similar secret must be created in other namespaces that intend to access the ceph pool
+
+ # Per https://github.com/kubernetes/examples/blob/master/staging/persistent-volume-provisioning/README.md
+
+  echo "${FUNCNAME[0]}: Create and test a persistentVolumeClaim"
+ cat <<EOF >/tmp/ceph-pvc.yaml
+{
+ "kind": "PersistentVolumeClaim",
+ "apiVersion": "v1",
+ "metadata": {
+ "name": "claim1",
+ "annotations": {
+ "volume.beta.kubernetes.io/storage-class": "slow"
+ }
+ },
+ "spec": {
+ "accessModes": [
+ "ReadWriteOnce"
+ ],
+ "resources": {
+ "requests": {
+ "storage": "3Gi"
+ }
+ }
+ }
+}
+EOF
+ kubectl create -f /tmp/ceph-pvc.yaml
+ while [[ "x$(kubectl get pvc -o jsonpath='{.status.phase}' claim1)" != "xBound" ]]; do
+ echo "${FUNCNAME[0]}: Waiting for pvc claim1 to be 'Bound'"
+ kubectl describe pvc
+ sleep 10
+ done
+ echo "${FUNCNAME[0]}: pvc claim1 successfully bound to $(kubectl get pvc -o jsonpath='{.spec.volumeName}' claim1)"
+ kubectl get pvc
+ kubectl delete pvc claim1
+ kubectl describe pods
+}
+
+function wait_for_service() {
+ echo "${FUNCNAME[0]}: Waiting for service $1 to be available"
+ pod=$(kubectl get pods --namespace default | awk "/$1/ { print \$1 }")
+ echo "${FUNCNAME[0]}: Service $1 is at pod $pod"
+ ready=$(kubectl get pods --namespace default -o jsonpath='{.status.containerStatuses[0].ready}' $pod)
+ while [[ "$ready" != "true" ]]; do
+ echo "${FUNCNAME[0]}: $1 container is not yet ready... waiting 10 seconds"
+ sleep 10
+ # TODO: figure out why transient pods sometimes mess up this logic, thus need to re-get the pods
+ pod=$(kubectl get pods --namespace default | awk "/$1/ { print \$1 }")
+ ready=$(kubectl get pods --namespace default -o jsonpath='{.status.containerStatuses[0].ready}' $pod)
+ done
+ echo "${FUNCNAME[0]}: pod $pod container status is $ready"
+ host_ip=$(kubectl get pods --namespace default -o jsonpath='{.status.hostIP}' $pod)
+ port=$(kubectl get --namespace default -o jsonpath="{.spec.ports[0].nodePort}" services $1)
+ echo "${FUNCNAME[0]}: pod $pod container is at host $host_ip and port $port"
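+  # A NodePort service is reachable on any node's IP at the allocated nodePort, so poll the hosting node directly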
+ while ! curl http://$host_ip:$port ; do
+ echo "${FUNCNAME[0]}: $1 service is not yet responding... waiting 10 seconds"
+ sleep 10
+ done
+ echo "${FUNCNAME[0]}: $1 is available at http://$host_ip:$port"
+}
+
+function demo_chart() {
+ cd ~
+ rm -rf charts
+ git clone https://github.com/kubernetes/charts.git
+ cd charts/stable
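+  # Each demo below patches the upstream chart to use NodePort (no LoadBalancer on bare metal)
+  # and the 'slow' rbd storageClass created in setup_ceph, then installs it via helm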
+ case "$1" in
+ mediawiki)
+ # NOT YET WORKING
+ # mariadb: Readiness probe failed: mysqladmin: connect to server at 'localhost' failed
+ mkdir ./mediawiki/charts
+ cp -r ./mariadb ./mediawiki/charts
+ # LoadBalancer is N/A for baremetal (public cloud only) - use NodePort
+ sed -i -- 's/LoadBalancer/NodePort/g' ./mediawiki/values.yaml
+ # Select the storageClass created in the ceph setup step
+ sed -i -- 's/# storageClass:/storageClass: "slow"/g' ./mediawiki/values.yaml
+ sed -i -- 's/# storageClass: "-"/storageClass: "slow"/g' ./mediawiki/charts/mariadb/values.yaml
+ helm install --name mw -f ./mediawiki/values.yaml ./mediawiki
+ wait_for_service mw-mediawiki
+ ;;
+ dokuwiki)
+ sed -i -- 's/# storageClass:/storageClass: "slow"/g' ./dokuwiki/values.yaml
+ sed -i -- 's/LoadBalancer/NodePort/g' ./dokuwiki/values.yaml
+ helm install --name dw -f ./dokuwiki/values.yaml ./dokuwiki
+ wait_for_service dw-dokuwiki
+ ;;
+ wordpress)
+ # NOT YET WORKING
+ # mariadb: Readiness probe failed: mysqladmin: connect to server at 'localhost' failed
+ mkdir ./wordpress/charts
+ cp -r ./mariadb ./wordpress/charts
+ sed -i -- 's/LoadBalancer/NodePort/g' ./wordpress/values.yaml
+ sed -i -- 's/# storageClass: "-"/storageClass: "slow"/g' ./wordpress/values.yaml
+ sed -i -- 's/# storageClass: "-"/storageClass: "slow"/g' ./wordpress/charts/mariadb/values.yaml
+ helm install --name wp -f ./wordpress/values.yaml ./wordpress
+ wait_for_service wp-wordpress
+ ;;
+ redmine)
+ # NOT YET WORKING
+ # mariadb: Readiness probe failed: mysqladmin: connect to server at 'localhost' failed
+ mkdir ./redmine/charts
+ cp -r ./mariadb ./redmine/charts
+ cp -r ./postgresql ./redmine/charts
+ sed -i -- 's/LoadBalancer/NodePort/g' ./redmine/values.yaml
+ sed -i -- 's/# storageClass: "-"/storageClass: "slow"/g' ./redmine/values.yaml
+ sed -i -- 's/# storageClass: "-"/storageClass: "slow"/g' ./redmine/charts/mariadb/values.yaml
+ sed -i -- 's/# storageClass: "-"/storageClass: "slow"/g' ./redmine/charts/postgresql/values.yaml
+ helm install --name rdm -f ./redmine/values.yaml ./redmine
+ wait_for_service rdm-redmine
+ ;;
+ owncloud)
+ # NOT YET WORKING: needs resolvable hostname for service
+ mkdir ./owncloud/charts
+ cp -r ./mariadb ./owncloud/charts
+ sed -i -- 's/LoadBalancer/NodePort/g' ./owncloud/values.yaml
+ sed -i -- 's/# storageClass: "-"/storageClass: "slow"/g' ./owncloud/values.yaml
+ sed -i -- 's/# storageClass: "-"/storageClass: "slow"/g' ./owncloud/charts/mariadb/values.yaml
+ helm install --name oc -f ./owncloud/values.yaml ./owncloud
+ wait_for_service oc-owncloud
+ ;;
+ *)
+ echo "${FUNCNAME[0]}: demo not implemented for $1"
+ esac
+# extra useful commands
+# kubectl describe pvc
+# kubectl get pvc
+# kubectl describe pods
+# kubectl get pods --namespace default
+# kubectl get pods --all-namespaces
+# kubectl get svc --namespace default dw-dokuwiki
+# kubectl describe svc --namespace default dw-dokuwiki
+# kubectl describe pods --namespace default dw-dokuwiki
+}
+
+function setup_helm() {
+ echo "${FUNCNAME[0]}: Setup helm"
+ # Install Helm
+ cd ~
+ curl https://raw.githubusercontent.com/kubernetes/helm/master/scripts/get > get_helm.sh
+ chmod 700 get_helm.sh
+ ./get_helm.sh
+ helm init
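+  # 'helm init' deploys the tiller server component into the kube-system namespace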
+ helm repo update
+ # TODO: Workaround for bug https://github.com/kubernetes/helm/issues/2224
+ # For testing use only!
+ kubectl create clusterrolebinding permissive-binding --clusterrole=cluster-admin --user=admin --user=kubelet --group=system:serviceaccounts;
+ # TODO: workaround for tiller FailedScheduling (No nodes are available that match all of the following predicates:: PodToleratesNodeTaints (1).)
+ # kubectl taint nodes $HOSTNAME node-role.kubernetes.io/master:NoSchedule-
+ # Wait till tiller is running
+ tiller_deploy=$(kubectl get pods --all-namespaces | grep tiller-deploy | awk '{print $4}')
+ while [[ "$tiller_deploy" != "Running" ]]; do
+ echo "${FUNCNAME[0]}: tiller-deploy status is $tiller_deploy. Waiting 60 seconds for it to be 'Running'"
+ sleep 60
+ tiller_deploy=$(kubectl get pods --all-namespaces | grep tiller-deploy | awk '{print $4}')
+ done
+ echo "${FUNCNAME[0]}: tiller-deploy status is $tiller_deploy"
+
+ # Install services via helm charts from https://kubeapps.com/charts
+ # e.g. helm install stable/dokuwiki
+}
+
+export WORK_DIR=$(pwd)
+case "$1" in
+ master)
+ setup_k8s_master
+ ;;
+ agents)
+ setup_k8s_agents "$2"
+ ;;
+ ceph)
+ setup_ceph "$2" $3 $4 $5
+ ;;
+ helm)
+ setup_helm
+ ;;
+ demo)
+ demo_chart $2
+ ;;
+ all)
+ setup_k8s_master
+ setup_k8s_agents "$2"
+ setup_ceph "$2" $3 $4 $5
+ setup_helm
+ demo_chart dokuwiki
+ ;;
+ clean)
+ # TODO
+ ;;
+ *)
+ if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then grep '#. ' $0; fi
+esac