aboutsummaryrefslogtreecommitdiffstats
path: root/tools/lma/ansible-server/roles
diff options
context:
space:
mode:
authorAditya Srivastava <adityasrivastava301199@gmail.com>2020-08-24 01:53:02 +0530
committerAditya Srivastava <adityasrivastava301199@gmail.com>2020-09-17 16:57:18 +0530
commit38a2852c84bb9ce692a79d3f1ab941b9f11106a4 (patch)
treecd23f1aa69f5e85914d8618401c57335412e92e4 /tools/lma/ansible-server/roles
parente5eef0ffdf2d281fecf12597041fd8af23d65e42 (diff)
Tools: Add K8s monitoring cluster
This patch adds k8s monitoring cluster deployment using ansible for both client and server side. Also adds scripts (ansible roles) to clean (remove) the K8S cluster completely. Signed-off-by: Aditya Srivastava <adityasrivastava301199@gmail.com> Change-Id: I1115869c0a3e72a20047b31994f3d27e5fdae6c6
Diffstat (limited to 'tools/lma/ansible-server/roles')
-rw-r--r--tools/lma/ansible-server/roles/clean-k8s-cluster/tasks/main.yml34
-rw-r--r--tools/lma/ansible-server/roles/clean-k8s-pre/tasks/main.yml65
-rw-r--r--tools/lma/ansible-server/roles/clean-k8s-worker-reset/tasks/main.yml26
-rw-r--r--tools/lma/ansible-server/roles/clean-monitoring/tasks/main.yml48
-rw-r--r--tools/lma/ansible-server/roles/clean-nfs/tasks/main.yml44
-rw-r--r--tools/lma/ansible-server/roles/monitoring/files/alertmanager/alertmanager-config.yaml37
-rw-r--r--tools/lma/ansible-server/roles/monitoring/files/alertmanager/alertmanager-deployment.yaml62
-rw-r--r--tools/lma/ansible-server/roles/monitoring/files/alertmanager/alertmanager-service.yaml41
-rw-r--r--tools/lma/ansible-server/roles/monitoring/files/alertmanager/alertmanager1-deployment.yaml62
-rw-r--r--tools/lma/ansible-server/roles/monitoring/files/alertmanager/alertmanager1-service.yaml42
-rw-r--r--tools/lma/ansible-server/roles/monitoring/files/cadvisor/cadvisor-deamonset.yaml79
-rw-r--r--tools/lma/ansible-server/roles/monitoring/files/cadvisor/cadvisor-service.yaml30
-rw-r--r--tools/lma/ansible-server/roles/monitoring/files/collectd-exporter/collectd-exporter-deployment.yaml51
-rw-r--r--tools/lma/ansible-server/roles/monitoring/files/collectd-exporter/collectd-exporter-service.yaml35
-rw-r--r--tools/lma/ansible-server/roles/monitoring/files/grafana/grafana-datasource-config.yaml35
-rw-r--r--tools/lma/ansible-server/roles/monitoring/files/grafana/grafana-deployment.yaml68
-rw-r--r--tools/lma/ansible-server/roles/monitoring/files/grafana/grafana-pv.yaml31
-rw-r--r--tools/lma/ansible-server/roles/monitoring/files/grafana/grafana-pvc.yaml33
-rw-r--r--tools/lma/ansible-server/roles/monitoring/files/grafana/grafana-service.yaml36
-rw-r--r--tools/lma/ansible-server/roles/monitoring/files/kube-state-metrics/kube-state-metrics-deployment.yaml36
-rw-r--r--tools/lma/ansible-server/roles/monitoring/files/kube-state-metrics/kube-state-metrics-service.yaml26
-rw-r--r--tools/lma/ansible-server/roles/monitoring/files/monitoring-namespace.yaml18
-rw-r--r--tools/lma/ansible-server/roles/monitoring/files/node-exporter/nodeexporter-daemonset.yaml80
-rw-r--r--tools/lma/ansible-server/roles/monitoring/files/node-exporter/nodeexporter-service.yaml33
-rw-r--r--tools/lma/ansible-server/roles/monitoring/files/prometheus/main-prometheus-service.yaml35
-rw-r--r--tools/lma/ansible-server/roles/monitoring/files/prometheus/prometheus-config.yaml609
-rw-r--r--tools/lma/ansible-server/roles/monitoring/files/prometheus/prometheus-deployment.yaml73
-rw-r--r--tools/lma/ansible-server/roles/monitoring/files/prometheus/prometheus-pv.yaml30
-rw-r--r--tools/lma/ansible-server/roles/monitoring/files/prometheus/prometheus-pvc.yaml33
-rw-r--r--tools/lma/ansible-server/roles/monitoring/files/prometheus/prometheus-service.yaml34
-rw-r--r--tools/lma/ansible-server/roles/monitoring/files/prometheus/prometheus1-deployment.yaml73
-rw-r--r--tools/lma/ansible-server/roles/monitoring/files/prometheus/prometheus1-service.yaml35
-rw-r--r--tools/lma/ansible-server/roles/monitoring/tasks/main.yml273
33 files changed, 2247 insertions, 0 deletions
diff --git a/tools/lma/ansible-server/roles/clean-k8s-cluster/tasks/main.yml b/tools/lma/ansible-server/roles/clean-k8s-cluster/tasks/main.yml
new file mode 100644
index 00000000..83ac086d
--- /dev/null
+++ b/tools/lma/ansible-server/roles/clean-k8s-cluster/tasks/main.yml
@@ -0,0 +1,34 @@
+# Copyright 2020 Adarsh yadav, Aditya Srivastava
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+# Probe whether the kubelet service is running (read-only, never "changed").
+# systemctl status exits 0 when the unit is active and 3 when it is installed
+# but not running; a missing unit is recognised by its stderr message.
+# Neither "stopped" nor "not installed" is an error for a cleanup role: the
+# reset task below is simply skipped unless rc == 0.
+- name: check for kubelet
+  command: systemctl status kubelet
+  register: _svc_kubelet
+  changed_when: false
+  failed_when: _svc_kubelet.rc not in [0, 3] and ("could not be found" not in _svc_kubelet.stderr)
+
+# Runs only when the kubelet probe above returned rc 0 (service active).
+# Drains and deletes the three nodes named by the vm1..vm3 variables, resets
+# kubeadm on this host and removes the local kubeconfig.
+# NOTE(review): vm1/vm2/vm3 are presumably defined in the inventory or
+# playbook vars; they are not set in this role.
+- name: reset k8s
+  shell: |
+    kubectl drain {{vm3}} --delete-local-data --force --ignore-daemonsets
+    kubectl drain {{vm2}} --delete-local-data --force --ignore-daemonsets
+    kubectl drain {{vm1}} --delete-local-data --force --ignore-daemonsets
+    kubectl delete node {{vm3}}
+    kubectl delete node {{vm2}}
+    kubectl delete node {{vm1}}
+    sudo kubeadm reset -f
+    # The last command decides the task result: -f keeps a re-run from
+    # failing when the kubeconfig has already been removed.
+    sudo rm -f $HOME/.kube/config
+  when: "_svc_kubelet.rc == 0"
+
diff --git a/tools/lma/ansible-server/roles/clean-k8s-pre/tasks/main.yml b/tools/lma/ansible-server/roles/clean-k8s-pre/tasks/main.yml
new file mode 100644
index 00000000..6d12bd5f
--- /dev/null
+++ b/tools/lma/ansible-server/roles/clean-k8s-pre/tasks/main.yml
@@ -0,0 +1,65 @@
+# Copyright 2020 Adarsh yadav, Aditya Srivastava
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+
+# Remove the Kubernetes tooling and the Docker engine packages.
+- name: Uninstalling K8s
+  yum:
+    name:
+      - kubeadm
+      - kubectl
+      - kubelet
+      - docker-ce
+    state: absent
+
+# Turn every swap device known to the system back on; a failure here is
+# tolerated so the rest of the cleanup still runs.
+- name: Enabling Swap on all nodes
+  shell: swapon -a
+  ignore_errors: true
+
+# Re-activate swap entries in /etc/fstab that were commented out earlier.
+# The capture group starts at the leading "/" so the restored line keeps its
+# absolute device path, and "\s*" accepts both "# /dev/..." and "#/dev/...".
+- name: Uncommenting Swap entries in /etc/fstab
+  replace:
+    path: /etc/fstab
+    regexp: '^#\s*(/.*swap.*)'
+    replace: '\1'
+
+
+# Bring firewalld back up now and enable it for subsequent boots.
+- name: Starting firewall
+  service:
+    name: firewalld
+    state: started
+    enabled: true
+
+# Switch SELinux back to enforcing: immediately via setenforce, and
+# persistently by rewriting the mode in /etc/selinux/config.
+# NOTE(review): the sed expression only matches a line that is exactly
+# "SELINUX=permissive"; any other configured mode is left untouched.
+- name: Enabling SELinux on all nodes
+ shell: |
+ setenforce 1
+ sudo sed -i 's/^SELINUX=permissive$/SELINUX=enforcing/' /etc/selinux/config
+
+# Disable the docker-ce-stable yum repository.
+# Note: the repository is disabled, not deleted, despite the task name.
+- name: removing Docker repo
+ command: yum-config-manager --disable docker-ce-stable
+
+# Remove the Ansible-managed [kubernetes] block from the yum repo file.
+# With state: absent, blockinfile deletes the marker-delimited block it
+# manages in the file.
+# NOTE(review): presumably the block was inserted by the matching install
+# role with the default markers - confirm against that role.
+- name: removing repository details in Kubernetes repo file.
+ blockinfile:
+ path: /etc/yum.repos.d/kubernetes.repo
+ state: absent
+ block: |
+ [kubernetes]
+ name=Kubernetes
+ baseurl=https://packages.cloud.google.com/yum/repos/kubernetes-el7-x86_64
+ enabled=1
+ gpgcheck=1
+ repo_gpgcheck=1
+ gpgkey=https://packages.cloud.google.com/yum/doc/yum-key.gpg
+ https://packages.cloud.google.com/yum/doc/rpm-package-key.gpg
diff --git a/tools/lma/ansible-server/roles/clean-k8s-worker-reset/tasks/main.yml b/tools/lma/ansible-server/roles/clean-k8s-worker-reset/tasks/main.yml
new file mode 100644
index 00000000..3ba9c9ea
--- /dev/null
+++ b/tools/lma/ansible-server/roles/clean-k8s-worker-reset/tasks/main.yml
@@ -0,0 +1,26 @@
+# Copyright 2020 Adarsh yadav, Aditya Srivastava
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+# Probe whether the kubelet service is running (read-only, never "changed").
+# systemctl status exits 0 when the unit is active and 3 when it is installed
+# but not running; a missing unit is recognised by its stderr message.
+# A stopped or absent kubelet is not an error here: the reset task below is
+# simply skipped unless rc == 0.
+- name: check for kubelet
+  command: systemctl status kubelet
+  register: _svc_kubelet
+  changed_when: false
+  failed_when: _svc_kubelet.rc not in [0, 3] and ("could not be found" not in _svc_kubelet.stderr)
+
+# Reset kubeadm state on this node, but only when the kubelet probe
+# registered in _svc_kubelet reported an active service (rc 0).
+- name: reset k8s
+  command: kubeadm reset -f
+  when: _svc_kubelet.rc == 0
+
diff --git a/tools/lma/ansible-server/roles/clean-monitoring/tasks/main.yml b/tools/lma/ansible-server/roles/clean-monitoring/tasks/main.yml
new file mode 100644
index 00000000..49943ec0
--- /dev/null
+++ b/tools/lma/ansible-server/roles/clean-monitoring/tasks/main.yml
@@ -0,0 +1,48 @@
+# Copyright 2020 Aditya Srivastava.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+---
+#Deleting PAG setup from k8s cluster
+
+# Probe whether the kubelet service is running (read-only, never "changed").
+# systemctl status exits 0 when the unit is active and 3 when it is installed
+# but not running; a missing unit is recognised by its stderr message.
+# A stopped or absent kubelet is not an error here: the namespace deletion
+# further down is simply skipped unless rc == 0.
+- name: check for kubelet
+  command: systemctl status kubelet
+  register: _svc_kubelet
+  changed_when: false
+  failed_when: _svc_kubelet.rc not in [0, 3] and ("could not be found" not in _svc_kubelet.stderr)
+
+#***********************************************************************************************************
+# Copy the monitoring namespace manifest to /tmp on the target host.
+# The source is the manifest shipped with the sibling "monitoring" role.
+#***********************************************************************************************************
+- name: copy namespace yaml to /tmp/files/
+ copy:
+ src: ../../monitoring/files/monitoring-namespace.yaml
+ dest: /tmp/monitoring-namespace.yaml
+
+#***********************************************************************************************************
+# Delete the monitoring namespace described by the copied manifest.
+# Skipped unless the kubelet probe (_svc_kubelet) returned rc 0.
+#***********************************************************************************************************
+- name: Deleting Namespace
+ k8s:
+ state: absent
+ src: /tmp/monitoring-namespace.yaml
+ namespace: monitoring
+ when: "_svc_kubelet.rc == 0"
+
+#***********************************************************************************************************
+# Remove the temporary manifest copy, /tmp/monitoring-namespace.yaml.
+#***********************************************************************************************************
+- name: Removing /tmp/monitoring-namespace.yaml
+ file:
+ path: "/tmp/monitoring-namespace.yaml"
+ state: absent
diff --git a/tools/lma/ansible-server/roles/clean-nfs/tasks/main.yml b/tools/lma/ansible-server/roles/clean-nfs/tasks/main.yml
new file mode 100644
index 00000000..157db849
--- /dev/null
+++ b/tools/lma/ansible-server/roles/clean-nfs/tasks/main.yml
@@ -0,0 +1,44 @@
+# Copyright 2020 Adarsh yadav, Aditya Srivastava
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+# Drop the three NFS export entries from /etc/exports; each listed line is
+# removed when an identical line is present.
+- name: Edit /etc/export file for NFS
+  lineinfile:
+    path: /etc/exports
+    line: "{{ item }}"
+    state: absent
+  with_items:
+    - "/srv/nfs/master *(rw,sync,no_root_squash,no_subtree_check)"
+    - "/srv/nfs/data *(rw,sync,no_root_squash,no_subtree_check)"
+    - "/usr/share/monitoring_data/grafana *(rw,sync,no_root_squash,no_subtree_check)"
+
+# Uninstall the nfs-utils package from the host.
+- name: Uninstalling NFS server utils
+ yum:
+ name: nfs-utils
+ state: absent
+
+# Delete the Elasticsearch directories /srv/nfs/data and /srv/nfs/master.
+- name: Removing Directory for elasticsearch
+  file:
+    path: "/srv/nfs/{{item}}"
+    state: absent
+  with_items:
+    - data
+    - master
+
+# Delete the Grafana data directory, the same path that the exports cleanup
+# removes from /etc/exports.
+- name: Removing Directory for grafana
+ file:
+ path: "/usr/share/monitoring_data/grafana"
+ state: absent
diff --git a/tools/lma/ansible-server/roles/monitoring/files/alertmanager/alertmanager-config.yaml b/tools/lma/ansible-server/roles/monitoring/files/alertmanager/alertmanager-config.yaml
new file mode 100644
index 00000000..7b9abc47
--- /dev/null
+++ b/tools/lma/ansible-server/roles/monitoring/files/alertmanager/alertmanager-config.yaml
@@ -0,0 +1,37 @@
+# Copyright 2020 Aditya Srivastava.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# ConfigMap holding the Alertmanager configuration as the key config.yml.
+# The embedded config routes alerts to a single "webhook" receiver, grouped
+# by alertname and priority, and also sends resolved notifications.
+# NOTE(review): the webhook URL is a hard-coded IP address and the
+# group/repeat intervals are only a few seconds - confirm both are intended.
+kind: ConfigMap
+apiVersion: v1
+metadata:
+ name: alertmanager-config
+ namespace: monitoring
+data:
+ config.yml: |-
+ global:
+ route:
+ receiver: "webhook"
+ group_by: ['alertname', 'priority']
+ group_wait: 1s
+ group_interval: 5s
+ repeat_interval: 5s
+ routes:
+ - match:
+ severity: critical
+
+ receivers:
+ - name: "webhook"
+ webhook_configs:
+ - url: 'http://10.10.120.20/alertmanager'
+ send_resolved: true
diff --git a/tools/lma/ansible-server/roles/monitoring/files/alertmanager/alertmanager-deployment.yaml b/tools/lma/ansible-server/roles/monitoring/files/alertmanager/alertmanager-deployment.yaml
new file mode 100644
index 00000000..f1c3d78e
--- /dev/null
+++ b/tools/lma/ansible-server/roles/monitoring/files/alertmanager/alertmanager-deployment.yaml
@@ -0,0 +1,62 @@
+# Copyright 2020 Aditya Srivastava.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Deployment for the first Alertmanager instance (one replica, Recreate).
+# It reads config.yml from the alertmanager-config ConfigMap, stores state in
+# an emptyDir, and peers with the "alertmanager1" instance on port 6783.
+# NOTE(review): the image has no tag and the container runs as UID 0.
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+ labels:
+ app: alertmanager
+ adi10hero.monitoring: alertmanager
+ name: alertmanager
+ namespace: monitoring
+spec:
+ replicas: 1
+ selector:
+ matchLabels:
+ app: alertmanager
+ adi10hero.monitoring: alertmanager
+ strategy:
+ type: Recreate
+ template:
+ metadata:
+ name: alertmanager
+ labels:
+ app: alertmanager
+ adi10hero.monitoring: alertmanager
+ spec:
+ containers:
+ - name: alertmanager
+ image: prom/alertmanager
+ args:
+ - --config.file=/etc/alertmanager/config.yml
+ - --storage.path=/alertmanager
+ - --cluster.peer=alertmanager1:6783
+ - --cluster.listen-address=0.0.0.0:6783
+ ports:
+ - containerPort: 9093
+ - containerPort: 6783
+ securityContext:
+ runAsUser: 0
+ volumeMounts:
+ - name: config-volume
+ mountPath: /etc/alertmanager
+ - name: alertmanager
+ mountPath: /alertmanager
+ restartPolicy: Always
+ volumes:
+ - name: config-volume
+ configMap:
+ name: alertmanager-config
+ - name: alertmanager
+ emptyDir: {}
diff --git a/tools/lma/ansible-server/roles/monitoring/files/alertmanager/alertmanager-service.yaml b/tools/lma/ansible-server/roles/monitoring/files/alertmanager/alertmanager-service.yaml
new file mode 100644
index 00000000..c67517d3
--- /dev/null
+++ b/tools/lma/ansible-server/roles/monitoring/files/alertmanager/alertmanager-service.yaml
@@ -0,0 +1,41 @@
+# Copyright 2020 Aditya Srivastava.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# NodePort Service for the first Alertmanager instance.
+# 9093 is the container's web port and 6783 the port passed to
+# --cluster.listen-address in the matching Deployment.
+apiVersion: v1
+kind: Service
+metadata:
+  labels:
+    adi10hero.monitoring: alertmanager
+    app: alertmanager
+  name: alertmanager
+  namespace: monitoring
+  annotations:
+    prometheus.io/scrape: 'true'
+    # Point the scrape at the port this Service actually exposes:
+    # Alertmanager serves its metrics on the web port under /metrics,
+    # and nothing behind this Service listens on 8080.
+    prometheus.io/path: /metrics
+    prometheus.io/port: '9093'
+
+spec:
+  selector:
+    app: alertmanager
+    adi10hero.monitoring: alertmanager
+  type: NodePort
+  ports:
+    - name: "9093"
+      port: 9093
+      targetPort: 9093
+      nodePort: 30930
+    - name: "6783"
+      port: 6783
+      targetPort: 6783
+      nodePort: 30679
diff --git a/tools/lma/ansible-server/roles/monitoring/files/alertmanager/alertmanager1-deployment.yaml b/tools/lma/ansible-server/roles/monitoring/files/alertmanager/alertmanager1-deployment.yaml
new file mode 100644
index 00000000..18b76456
--- /dev/null
+++ b/tools/lma/ansible-server/roles/monitoring/files/alertmanager/alertmanager1-deployment.yaml
@@ -0,0 +1,62 @@
+# Copyright 2020 Aditya Srivastava.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Deployment for the second Alertmanager instance (one replica, Recreate).
+# It uses the same alertmanager-config ConfigMap as the first instance and
+# peers with "alertmanager" on port 6783.
+# NOTE(review): the image has no tag and the container runs as UID 0.
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+ labels:
+ app: alertmanager1
+ adi10hero.monitoring: alertmanager1
+ name: alertmanager1
+ namespace: monitoring
+spec:
+ replicas: 1
+ selector:
+ matchLabels:
+ app: alertmanager1
+ adi10hero.monitoring: alertmanager1
+ strategy:
+ type: Recreate
+ template:
+ metadata:
+ name: alertmanager1
+ labels:
+ app: alertmanager1
+ adi10hero.monitoring: alertmanager1
+ spec:
+ containers:
+ - name: alertmanager1
+ image: prom/alertmanager
+ args:
+ - --config.file=/etc/alertmanager/config.yml
+ - --storage.path=/alertmanager
+ - --cluster.peer=alertmanager:6783
+ - --cluster.listen-address=0.0.0.0:6783
+ ports:
+ - containerPort: 9093
+ - containerPort: 6783
+ securityContext:
+ runAsUser: 0
+ volumeMounts:
+ - name: config-volume
+ mountPath: /etc/alertmanager
+ - name: alertmanager
+ mountPath: /alertmanager
+ restartPolicy: Always
+ volumes:
+ - name: config-volume
+ configMap:
+ name: alertmanager-config
+ - name: alertmanager
+ emptyDir: {}
diff --git a/tools/lma/ansible-server/roles/monitoring/files/alertmanager/alertmanager1-service.yaml b/tools/lma/ansible-server/roles/monitoring/files/alertmanager/alertmanager1-service.yaml
new file mode 100644
index 00000000..66d0d2b1
--- /dev/null
+++ b/tools/lma/ansible-server/roles/monitoring/files/alertmanager/alertmanager1-service.yaml
@@ -0,0 +1,42 @@
+# Copyright 2020 Aditya Srivastava.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# NodePort Service for the second Alertmanager instance.
+# 9093 is the container's web port and 6783 the port passed to
+# --cluster.listen-address in the matching Deployment.
+apiVersion: v1
+kind: Service
+metadata:
+  labels:
+    adi10hero.monitoring: alertmanager1
+    app: alertmanager1
+  name: alertmanager1
+  namespace: monitoring
+  annotations:
+    prometheus.io/scrape: 'true'
+    # Point the scrape at the port this Service actually exposes:
+    # Alertmanager serves its metrics on the web port under /metrics,
+    # and nothing behind this Service listens on 8080.
+    prometheus.io/path: /metrics
+    prometheus.io/port: '9093'
+
+spec:
+  selector:
+    app: alertmanager1
+    adi10hero.monitoring: alertmanager1
+  type: NodePort
+  ports:
+    - name: "9093"
+      port: 9093
+      targetPort: 9093
+      nodePort: 30931
+    - name: "6783"
+      port: 6783
+      targetPort: 6783
+      nodePort: 30678
+
diff --git a/tools/lma/ansible-server/roles/monitoring/files/cadvisor/cadvisor-deamonset.yaml b/tools/lma/ansible-server/roles/monitoring/files/cadvisor/cadvisor-deamonset.yaml
new file mode 100644
index 00000000..6a62985e
--- /dev/null
+++ b/tools/lma/ansible-server/roles/monitoring/files/cadvisor/cadvisor-deamonset.yaml
@@ -0,0 +1,79 @@
+# Copyright 2020 Aditya Srivastava.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# DaemonSet running one cAdvisor pod per node, listening on port 8080.
+# The container runs as UID 0 and sees the host through hostPath mounts:
+# the root filesystem, /var/run, /sys, the cgroup hierarchy, /dev/disk and
+# the Docker data directory.
+# NOTE(review): the image has no tag.
+apiVersion: apps/v1
+kind: DaemonSet
+metadata:
+  name: cadvisor
+  namespace: monitoring
+  labels:
+    adi10hero.monitoring: cadvisor
+    app: cadvisor
+spec:
+  selector:
+    matchLabels:
+      app: cadvisor
+      adi10hero.monitoring: cadvisor
+  template:
+    metadata:
+      name: cadvisor
+      labels:
+        adi10hero.monitoring: cadvisor
+        app: cadvisor
+    spec:
+      containers:
+        - image: gcr.io/google-containers/cadvisor
+          name: cadvisor
+          ports:
+            - containerPort: 8080
+          securityContext:
+            runAsUser: 0
+          volumeMounts:
+            - mountPath: /rootfs
+              name: cadvisor-hostpath0
+              readOnly: true
+            - mountPath: /var/run
+              name: cadvisor-hostpath1
+            - mountPath: /sys
+              name: cadvisor-hostpath2
+              readOnly: true
+            - mountPath: /sys/fs/cgroup
+              name: cadvisor-hostpath3
+              readOnly: true
+            - mountPath: /dev/disk
+              name: cadvisor-hostpath4
+              readOnly: true
+            - mountPath: /var/lib/docker
+              name: cadvisor-hostpath5
+              readOnly: true
+      restartPolicy: Always
+      volumes:
+        - hostPath:
+            path: /
+          name: cadvisor-hostpath0
+        - hostPath:
+            path: /var/run
+          name: cadvisor-hostpath1
+        - hostPath:
+            path: /sys
+          name: cadvisor-hostpath2
+        # Source the cgroup mount from the host's real cgroup hierarchy.
+        # A host-side /cgroup does not exist on systemd hosts and would
+        # shadow /sys/fs/cgroup inside the container with an empty directory.
+        - hostPath:
+            path: /sys/fs/cgroup
+          name: cadvisor-hostpath3
+        - hostPath:
+            path: /dev/disk/
+          name: cadvisor-hostpath4
+        - hostPath:
+            path: /var/lib/docker/
+          name: cadvisor-hostpath5
diff --git a/tools/lma/ansible-server/roles/monitoring/files/cadvisor/cadvisor-service.yaml b/tools/lma/ansible-server/roles/monitoring/files/cadvisor/cadvisor-service.yaml
new file mode 100644
index 00000000..734240b8
--- /dev/null
+++ b/tools/lma/ansible-server/roles/monitoring/files/cadvisor/cadvisor-service.yaml
@@ -0,0 +1,30 @@
+# Copyright 2020 Aditya Srivastava.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Service exposing the cAdvisor pods on port 8080.
+# No "type" is set, so the Kubernetes default (ClusterIP) applies.
+apiVersion: v1
+kind: Service
+metadata:
+ labels:
+ app: cadvisor
+ adi10hero.monitoring: cadvisor
+ name: cadvisor
+ namespace: monitoring
+spec:
+ ports:
+ - name: "8080"
+ port: 8080
+ targetPort: 8080
+ selector:
+ app: cadvisor
+ adi10hero.monitoring: cadvisor
diff --git a/tools/lma/ansible-server/roles/monitoring/files/collectd-exporter/collectd-exporter-deployment.yaml b/tools/lma/ansible-server/roles/monitoring/files/collectd-exporter/collectd-exporter-deployment.yaml
new file mode 100644
index 00000000..b6bfe0b6
--- /dev/null
+++ b/tools/lma/ansible-server/roles/monitoring/files/collectd-exporter/collectd-exporter-deployment.yaml
@@ -0,0 +1,51 @@
+# Copyright 2020 Aditya Srivastava.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Deployment for the Prometheus collectd exporter (one replica, Recreate).
+# The exporter is told to listen for collectd traffic on 0.0.0.0:25826 (the
+# UDP container port); 9103 is the other declared container port.
+# NOTE(review): the image has no tag, the container runs as UID 0, and
+# "volumes: null" declares no volumes.
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+ name: collectd-exporter
+ namespace: monitoring
+ labels:
+ app: collectd-exporter
+ adi10hero.monitoring: collectd-exporter
+spec:
+ replicas: 1
+ selector:
+ matchLabels:
+ app: collectd-exporter
+ adi10hero.monitoring: collectd-exporter
+ strategy:
+ type: Recreate
+ template:
+ metadata:
+ name: collectd-exporter
+ labels:
+ app: collectd-exporter
+ adi10hero.monitoring: collectd-exporter
+ spec:
+ containers:
+ - args:
+ - --collectd.listen-address=0.0.0.0:25826
+ image: prom/collectd-exporter
+ name: collectd-exporter
+ ports:
+ - containerPort: 9103
+ - containerPort: 25826
+ protocol: UDP
+ securityContext:
+ runAsUser: 0
+ restartPolicy: Always
+ volumes: null
+
diff --git a/tools/lma/ansible-server/roles/monitoring/files/collectd-exporter/collectd-exporter-service.yaml b/tools/lma/ansible-server/roles/monitoring/files/collectd-exporter/collectd-exporter-service.yaml
new file mode 100644
index 00000000..5609d04a
--- /dev/null
+++ b/tools/lma/ansible-server/roles/monitoring/files/collectd-exporter/collectd-exporter-service.yaml
@@ -0,0 +1,35 @@
+# Copyright 2020 Aditya Srivastava.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# NodePort Service for the collectd exporter: port 9103 on node port 30103
+# and UDP port 25826 on node port 30826.
+# No targetPort is given, so each targetPort defaults to its "port" value.
+apiVersion: v1
+kind: Service
+metadata:
+ name: collectd-exporter
+ namespace: monitoring
+ labels:
+ app: collectd-exporter
+ adi10hero.monitoring: collectd-exporter
+spec:
+ ports:
+ - name: "9103"
+ port: 9103
+ nodePort: 30103
+ - name: "25826"
+ port: 25826
+ protocol: UDP
+ nodePort: 30826
+ selector:
+ app: collectd-exporter
+ adi10hero.monitoring: collectd-exporter
+ type: NodePort
diff --git a/tools/lma/ansible-server/roles/monitoring/files/grafana/grafana-datasource-config.yaml b/tools/lma/ansible-server/roles/monitoring/files/grafana/grafana-datasource-config.yaml
new file mode 100644
index 00000000..e2b8c9fa
--- /dev/null
+++ b/tools/lma/ansible-server/roles/monitoring/files/grafana/grafana-datasource-config.yaml
@@ -0,0 +1,35 @@
+# Copyright 2020 Aditya Srivastava.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# ConfigMap with the Grafana datasource provisioning file prometheus.yaml.
+# The payload is written as JSON and defines one editable, proxied
+# "prometheus" datasource pointing at http://prometheus-main:9090.
+apiVersion: v1
+kind: ConfigMap
+metadata:
+ name: grafana-datasources
+ namespace: monitoring
+data:
+ prometheus.yaml: |-
+ {
+ "apiVersion": 1,
+ "datasources": [
+ {
+ "access":"proxy",
+ "editable": true,
+ "name": "prometheus",
+ "orgId": 1,
+ "type": "prometheus",
+ "url": "http://prometheus-main:9090",
+ "version": 1
+ }
+ ]
+ }
diff --git a/tools/lma/ansible-server/roles/monitoring/files/grafana/grafana-deployment.yaml b/tools/lma/ansible-server/roles/monitoring/files/grafana/grafana-deployment.yaml
new file mode 100644
index 00000000..afb00948
--- /dev/null
+++ b/tools/lma/ansible-server/roles/monitoring/files/grafana/grafana-deployment.yaml
@@ -0,0 +1,68 @@
+# Copyright 2020 Aditya Srivastava.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Deployment for Grafana (one replica, Recreate) listening on port 3000.
+# Data lives on the grafana-pvc claim mounted at /var/lib/grafana, and
+# datasources are provisioned from the grafana-datasources ConfigMap.
+# NOTE(review): the admin user and password are set to "admin" in plain
+# text, the image has no tag and the container runs as UID 0.
+# NOTE(review): GF_SERVER_ROOT_URL has an empty port after the colon
+# ("...%(domain)s:/metrics") - confirm this is the intended URL.
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+ labels:
+ adi10hero.monitoring: grafana
+ app: grafana
+ name: grafana
+ namespace: monitoring
+spec:
+ replicas: 1
+ selector:
+ matchLabels:
+ adi10hero.monitoring: grafana
+ app: grafana
+ strategy:
+ type: Recreate
+ template:
+ metadata:
+ name: grafana
+ labels:
+ adi10hero.monitoring: grafana
+ app: grafana
+ spec:
+ containers:
+ - name: grafana
+ image: grafana/grafana
+ ports:
+ - containerPort: 3000
+ env:
+ - name: GF_SECURITY_ADMIN_PASSWORD
+ value: admin
+ - name: GF_SECURITY_ADMIN_USER
+ value: admin
+ - name: GF_SERVER_DOMAIN
+ value: 10.10.120.20
+ - name: GF_SERVER_ROOT_URL
+ value: "%(protocol)s://%(domain)s:/metrics"
+ securityContext:
+ runAsUser: 0
+ volumeMounts:
+ - mountPath: /var/lib/grafana
+ name: grafana-storage
+ - mountPath: /etc/grafana/provisioning/datasources
+ name: grafana-datasources
+ readOnly: false
+ restartPolicy: Always
+ volumes:
+ - name: grafana-storage
+ persistentVolumeClaim:
+ claimName: grafana-pvc
+ - name: grafana-datasources
+ configMap:
+ defaultMode: 420
+ name: grafana-datasources
diff --git a/tools/lma/ansible-server/roles/monitoring/files/grafana/grafana-pv.yaml b/tools/lma/ansible-server/roles/monitoring/files/grafana/grafana-pv.yaml
new file mode 100644
index 00000000..06bcc31b
--- /dev/null
+++ b/tools/lma/ansible-server/roles/monitoring/files/grafana/grafana-pv.yaml
@@ -0,0 +1,31 @@
+# Copyright 2020 Aditya Srivastava.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# NFS-backed PersistentVolume (5Gi, ReadWriteMany) for Grafana data.
+# The labels are what the grafana-pvc claim's selector matches on.
+# PersistentVolumes are cluster-scoped, so no metadata.namespace is set.
+# NOTE(review): the NFS server address is hard-coded.
+apiVersion: v1
+kind: PersistentVolume
+metadata:
+  name: grafana-pv
+  labels:
+    app: grafana-pv
+    adi10hero.monitoring: grafana-pv
+spec:
+  storageClassName: monitoring
+  capacity:
+    storage: 5Gi
+  accessModes:
+    - ReadWriteMany
+  nfs:
+    server: 10.10.120.211
+    path: "/usr/share/monitoring_data/grafana"
diff --git a/tools/lma/ansible-server/roles/monitoring/files/grafana/grafana-pvc.yaml b/tools/lma/ansible-server/roles/monitoring/files/grafana/grafana-pvc.yaml
new file mode 100644
index 00000000..2c2955c8
--- /dev/null
+++ b/tools/lma/ansible-server/roles/monitoring/files/grafana/grafana-pvc.yaml
@@ -0,0 +1,33 @@
+# Copyright 2020 Aditya Srivastava.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+apiVersion: v1
+kind: PersistentVolumeClaim  # storage claim mounted by the Grafana pod as grafana-storage
+metadata:
+  name: grafana-pvc
+  namespace: monitoring
+  labels:
+    adi10hero.monitoring: grafana-pvc
+    app: grafana-pvc
+spec:
+  storageClassName: monitoring
+  accessModes:
+    - ReadWriteMany
+  selector:  # only bind to a volume that carries both grafana-pv labels
+    matchLabels:
+      adi10hero.monitoring: grafana-pv
+      app: grafana-pv
+  resources:
+    requests:
+      storage: 4Gi
diff --git a/tools/lma/ansible-server/roles/monitoring/files/grafana/grafana-service.yaml b/tools/lma/ansible-server/roles/monitoring/files/grafana/grafana-service.yaml
new file mode 100644
index 00000000..d1c9c9cc
--- /dev/null
+++ b/tools/lma/ansible-server/roles/monitoring/files/grafana/grafana-service.yaml
@@ -0,0 +1,36 @@
+# Copyright 2020 Aditya Srivastava.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+apiVersion: v1
+kind: Service  # exposes Grafana's port 3000 on node port 30000
+metadata:
+  name: grafana
+  namespace: monitoring
+  annotations:
+    prometheus.io/port: "3000"
+    prometheus.io/scrape: "true"
+  labels:
+    adi10hero.monitoring: grafana
+    app: grafana
+spec:
+  type: NodePort
+  selector:
+    adi10hero.monitoring: grafana
+    app: grafana
+  ports:
+    - name: "3000"
+      nodePort: 30000
+      port: 3000
+      targetPort: 3000
+
diff --git a/tools/lma/ansible-server/roles/monitoring/files/kube-state-metrics/kube-state-metrics-deployment.yaml b/tools/lma/ansible-server/roles/monitoring/files/kube-state-metrics/kube-state-metrics-deployment.yaml
new file mode 100644
index 00000000..af3c5469
--- /dev/null
+++ b/tools/lma/ansible-server/roles/monitoring/files/kube-state-metrics/kube-state-metrics-deployment.yaml
@@ -0,0 +1,36 @@
+# Copyright 2020 Aditya Srivastava.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+apiVersion: apps/v1
+kind: Deployment  # single-replica kube-state-metrics in kube-system
+metadata:
+  name: kube-state-metrics
+  namespace: kube-system
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: kube-state-metrics
+  template:
+    metadata:
+      labels:
+        app: kube-state-metrics
+    spec:
+      # serviceAccountName: prometheus  (disabled; NOTE(review): pod then uses the namespace default service account, confirm its RBAC)
+      containers:
+        - name: kube-state-metrics
+          image: quay.io/coreos/kube-state-metrics:v1.2.0
+          ports:
+            - name: monitoring
+              containerPort: 8080
diff --git a/tools/lma/ansible-server/roles/monitoring/files/kube-state-metrics/kube-state-metrics-service.yaml b/tools/lma/ansible-server/roles/monitoring/files/kube-state-metrics/kube-state-metrics-service.yaml
new file mode 100644
index 00000000..8d294391
--- /dev/null
+++ b/tools/lma/ansible-server/roles/monitoring/files/kube-state-metrics/kube-state-metrics-service.yaml
@@ -0,0 +1,26 @@
+# Copyright 2020 Aditya Srivastava.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+apiVersion: v1
+kind: Service  # in-cluster endpoint for the kube-state-metrics pods on port 8080
+metadata:
+  name: kube-state-metrics
+  namespace: kube-system
+spec:
+  selector:
+    app: kube-state-metrics
+  ports:
+    - port: 8080
+      targetPort: 8080
+      protocol: TCP
diff --git a/tools/lma/ansible-server/roles/monitoring/files/monitoring-namespace.yaml b/tools/lma/ansible-server/roles/monitoring/files/monitoring-namespace.yaml
new file mode 100644
index 00000000..f1c9b889
--- /dev/null
+++ b/tools/lma/ansible-server/roles/monitoring/files/monitoring-namespace.yaml
@@ -0,0 +1,18 @@
+# Copyright 2020 Aditya Srivastava.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+apiVersion: v1
+kind: Namespace
+metadata:
+  name: "monitoring"  # namespace referenced by the Grafana, node-exporter and Prometheus manifests
diff --git a/tools/lma/ansible-server/roles/monitoring/files/node-exporter/nodeexporter-daemonset.yaml b/tools/lma/ansible-server/roles/monitoring/files/node-exporter/nodeexporter-daemonset.yaml
new file mode 100644
index 00000000..9334b2f4
--- /dev/null
+++ b/tools/lma/ansible-server/roles/monitoring/files/node-exporter/nodeexporter-daemonset.yaml
@@ -0,0 +1,80 @@
+# Copyright 2020 Aditya Srivastava.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+apiVersion: apps/v1
+kind: DaemonSet  # runs one node-exporter pod per node, serving metrics on host port 9100
+metadata:
+  name: node-exporter-daemonset
+  namespace: monitoring
+  labels:
+    app: node-exporter
+    adi10hero.monitoring: node-exporter
+spec:
+  selector:
+    matchLabels:
+      app: node-exporter
+      adi10hero.monitoring: node-exporter
+  template:
+    metadata:
+      labels:
+        app: node-exporter
+        adi10hero.monitoring: node-exporter
+      annotations:
+        prometheus.io/scrape: "true"
+        prometheus.io/port: "9100"
+    spec:
+      hostPID: true
+      hostIPC: true
+      hostNetwork: true
+      containers:
+        - name: node-exporter
+          image: prom/node-exporter:v0.15.2  # NOTE(review): alert rules use >=0.16 metric names (e.g. node_cpu_seconds_total); confirm tag
+          args:
+            - --path.procfs
+            - /host/proc
+            - --path.sysfs
+            - /host/sys
+            - --collector.filesystem.ignored-mount-points
+            - '^/(sys|proc|dev|host|etc)($|/)'  # no inner double quotes: args bypass the shell, so quotes would become part of the regex
+          ports:
+            - containerPort: 9100
+              protocol: TCP
+          resources:
+            requests:
+              cpu: 0.15
+          securityContext:
+            runAsUser: 0
+            privileged: true
+          volumeMounts:  # host paths mounted into the container under /host/* and /rootfs
+            - name: dev
+              mountPath: /host/dev
+            - name: proc
+              mountPath: /host/proc
+            - name: sys
+              mountPath: /host/sys
+            - name: rootfs
+              mountPath: /rootfs
+      volumes:
+        - name: proc
+          hostPath:
+            path: /proc
+        - name: dev
+          hostPath:
+            path: /dev
+        - name: sys
+          hostPath:
+            path: /sys
+        - name: rootfs
+          hostPath:
+            path: /
diff --git a/tools/lma/ansible-server/roles/monitoring/files/node-exporter/nodeexporter-service.yaml b/tools/lma/ansible-server/roles/monitoring/files/node-exporter/nodeexporter-service.yaml
new file mode 100644
index 00000000..dd0aea4d
--- /dev/null
+++ b/tools/lma/ansible-server/roles/monitoring/files/node-exporter/nodeexporter-service.yaml
@@ -0,0 +1,33 @@
+# Copyright 2020 Aditya Srivastava.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+apiVersion: v1
+kind: Service  # in-cluster endpoint for the node-exporter pods on port 9100
+metadata:
+  name: node-exporter
+  namespace: monitoring
+  labels:
+    app: node-exporter
+    adi10hero.monitoring: node-exporter
+  annotations:
+    prometheus.io/port: '9100'
+    prometheus.io/scrape: 'true'
+spec:
+  selector:
+    app: node-exporter
+    adi10hero.monitoring: node-exporter
+  ports:
+    - name: node-exporter
+      port: 9100
+      targetPort: 9100
diff --git a/tools/lma/ansible-server/roles/monitoring/files/prometheus/main-prometheus-service.yaml b/tools/lma/ansible-server/roles/monitoring/files/prometheus/main-prometheus-service.yaml
new file mode 100644
index 00000000..58b220a8
--- /dev/null
+++ b/tools/lma/ansible-server/roles/monitoring/files/prometheus/main-prometheus-service.yaml
@@ -0,0 +1,35 @@
+# Copyright 2020 Aditya Srivastava.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+apiVersion: v1
+kind: Service  # exposes Prometheus port 9090 on node port 30902
+metadata:
+  name: prometheus-main
+  namespace: monitoring
+  labels:
+    app: prometheus-main
+    adi10hero.monitoring: prometheus-main
+  annotations:
+    prometheus.io/port: "9090"
+    prometheus.io/scrape: "true"
+spec:
+  type: NodePort
+  selector:  # NOTE(review): differs from this Service's own labels; confirm it matches the Prometheus pod labels
+    app: prometheus
+    adi10hero.monitoring: prometheus1
+  ports:
+    - name: prometheus-main
+      protocol: TCP
+      port: 9090
+      nodePort: 30902
diff --git a/tools/lma/ansible-server/roles/monitoring/files/prometheus/prometheus-config.yaml b/tools/lma/ansible-server/roles/monitoring/files/prometheus/prometheus-config.yaml
new file mode 100644
index 00000000..917f978f
--- /dev/null
+++ b/tools/lma/ansible-server/roles/monitoring/files/prometheus/prometheus-config.yaml
@@ -0,0 +1,609 @@
+# Copyright 2020 Aditya Srivastava.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+apiVersion: v1
+kind: ConfigMap
+metadata:
+ name: prometheus-config
+ namespace: monitoring
+data:
+ alert.rules: |-
+ groups:
+ - name: targets
+ rules:
+ - alert: MonitorServiceDown
+ expr: up == 0
+ for: 30s
+ labels:
+ severity: critical
+ annotations:
+ summary: "Monitor service non-operational"
+ description: "Service {{ $labels.instance }} is down."
+ - alert: HighCpuLoad
+ expr: node_load1 > 1.9
+ for: 15s
+ labels:
+ severity: critical
+ annotations:
+ summary: "Service under high load"
+ description: "Docker host is under high load, the avg load 1m is at {{ $value}}. Reported by instance {{ $labels.instance }} of job {{ $labels.job }}."
+
+ - name: host and hardware
+ rules:
+ - alert: HostHighCpuLoad
+ expr: 100 - (avg by(instance) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
+ for: 5m
+ labels:
+ severity: warning
+ annotations:
+ summary: "Host high CPU load (instance {{ $labels.instance }})"
+ description: "CPU load is > 80%\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
+
+ - alert: HostSwapIsFillingUp
+ expr: (1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80
+ for: 5m
+ labels:
+ severity: warning
+ annotations:
+ summary: "Host swap is filling up (instance {{ $labels.instance }})"
+ description: "Swap is filling up (>80%)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
+
+ - alert: HighMemoryLoad
+ expr: (sum(node_memory_MemTotal_bytes) - sum(node_memory_MemFree_bytes + node_memory_Buffers_bytes + node_memory_Cached_bytes) ) / sum(node_memory_MemTotal_bytes) * 100 > 85
+ for: 30s
+ labels:
+ severity: warning
+ annotations:
+ summary: "Server memory is almost full"
+ description: "Docker host memory usage is {{ humanize $value}}%. Reported by instance {{ $labels.instance }} of job {{ $labels.job }}."
+
+ - alert: HighStorageLoad
+ expr: (node_filesystem_size_bytes{fstype="aufs"} - node_filesystem_free_bytes{fstype="aufs"}) / node_filesystem_size_bytes{fstype="aufs"} * 100 > 85
+ for: 30s
+ labels:
+ severity: warning
+ annotations:
+ summary: "Server storage is almost full"
+ description: "Docker host storage usage is {{ humanize $value}}%. Reported by instance {{ $labels.instance }} of job {{ $labels.job }}."
+
+ - alert: HostNetworkTransmitErrors
+ expr: increase(node_network_transmit_errs_total[5m]) > 0
+ for: 5m
+ labels:
+ severity: warning
+ annotations:
+ summary: "Host Network Transmit Errors (instance {{ $labels.instance }})"
+ description: "{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf \"%.0f\" $value }} transmit errors in the last five minutes.\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
+
+ - alert: HostOutOfMemory
+ expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10
+ for: 5m
+ labels:
+ severity: warning
+ annotations:
+ summary: "Host out of memory (instance {{ $labels.instance }})"
+ description: "Node memory is filling up (< 10% left)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
+
+ - alert: HostMemoryUnderMemoryPressure
+ expr: rate(node_vmstat_pgmajfault[1m]) > 1000
+ for: 5m
+ labels:
+ severity: warning
+ annotations:
+ summary: "Host memory under memory pressure (instance {{ $labels.instance }})"
+ description: "The node is under heavy memory pressure. High rate of major page faults\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
+
+ - alert: HostUnusualNetworkThroughputIn
+ expr: sum by (instance) (irate(node_network_receive_bytes_total[2m])) / 1024 / 1024 > 100
+ for: 5m
+ labels:
+ severity: warning
+ annotations:
+ summary: "Host unusual network throughput in (instance {{ $labels.instance }})"
+ description: "Host network interfaces are probably receiving too much data (> 100 MB/s)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
+
+ - alert: HostUnusualNetworkThroughputOut
+ expr: sum by (instance) (irate(node_network_transmit_bytes_total[2m])) / 1024 / 1024 > 100
+ for: 5m
+ labels:
+ severity: warning
+ annotations:
+ summary: "Host unusual network throughput out (instance {{ $labels.instance }})"
+ description: "Host network interfaces are probably sending too much data (> 100 MB/s)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
+
+ - alert: HostUnusualDiskRateRead
+ expr: sum by (instance) (irate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50
+ for: 5m
+ labels:
+ severity: warning
+ annotations:
+ summary: "Host unusual disk read rate (instance {{ $labels.instance }})"
+ description: "Disk is probably reading too much data (> 50 MB/s)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
+
+ - alert: HostUnusualDiskRateWrite
+ expr: sum by (instance) (irate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50
+ for: 5m
+ labels:
+ severity: warning
+ annotations:
+ summary: "Host unusual disk write rate (instance {{ $labels.instance }})"
+ description: "Disk is probably writing too much data (> 50 MB/s)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
+
+ - alert: HostOutOfDiskSpace
+ expr: (node_filesystem_avail_bytes{mountpoint="/rootfs"} * 100) / node_filesystem_size_bytes{mountpoint="/rootfs"} < 10
+ for: 5m
+ labels:
+ severity: warning
+ annotations:
+ summary: "Host out of disk space (instance {{ $labels.instance }})"
+ description: "Disk is almost full (< 10% left)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
+
+ - alert: HostDiskWillFillIn4Hours
+ expr: predict_linear(node_filesystem_free_bytes{fstype!~"tmpfs"}[1h], 4 * 3600) < 0
+ for: 5m
+ labels:
+ severity: warning
+ annotations:
+ summary: "Host disk will fill in 4 hours (instance {{ $labels.instance }})"
+ description: "Disk will fill in 4 hours at current write rate\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
+
+ - alert: HostPhysicalComponentTooHot
+ expr: node_hwmon_temp_celsius > 75
+ for: 5m
+ labels:
+ severity: warning
+ annotations:
+ summary: "Host physical component too hot (instance {{ $labels.instance }})"
+ description: "Physical hardware component too hot\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
+
+ - alert: HostNodeOvertemperatureAlarm
+ expr: node_hwmon_temp_alarm == 1
+ for: 5m
+ labels:
+ severity: critical
+ annotations:
+ summary: "Host node overtemperature alarm (instance {{ $labels.instance }})"
+ description: "Physical node temperature alarm triggered\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
+
+ - alert: HostKernelVersionDeviations
+ expr: count(sum(label_replace(node_uname_info, "kernel", "$1", "release", "([0-9]+.[0-9]+.[0-9]+).*")) by (kernel)) > 1
+ for: 5m
+ labels:
+ severity: warning
+ annotations:
+ summary: "Host kernel version deviations (instance {{ $labels.instance }})"
+ description: "Different kernel versions are running\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
+
+ - alert: HostOomKillDetected
+ expr: increase(node_vmstat_oom_kill[5m]) > 0
+ for: 5m
+ labels:
+ severity: warning
+ annotations:
+ summary: "Host OOM kill detected (instance {{ $labels.instance }})"
+ description: "OOM kill detected\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
+
+ - alert: HostEdacCorrectableErrorsDetected
+ expr: increase(node_edac_correctable_errors_total[5m]) > 0
+ for: 5m
+ labels:
+ severity: info
+ annotations:
+ summary: "Host EDAC Correctable Errors detected (instance {{ $labels.instance }})"
+ description: "{{ $labels.instance }} has had {{ printf \"%.0f\" $value }} correctable memory errors reported by EDAC in the last 5 minutes.\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
+
+ - alert: HostEdacUncorrectableErrorsDetected
+ expr: node_edac_uncorrectable_errors_total > 0
+ for: 5m
+ labels:
+ severity: warning
+ annotations:
+ summary: "Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }})"
+ description: "{{ $labels.instance }} has had {{ printf \"%.0f\" $value }} uncorrectable memory errors reported by EDAC in the last 5 minutes.\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
+
+ - alert: HostNetworkReceiveErrors
+ expr: increase(node_network_receive_errs_total[5m]) > 0
+ for: 5m
+ labels:
+ severity: warning
+ annotations:
+ summary: "Host Network Receive Errors (instance {{ $labels.instance }})"
+ description: "{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf \"%.0f\" $value }} receive errors in the last five minutes.\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
+
+ - alert: HostNetworkTransmitErrors
+ expr: increase(node_network_transmit_errs_total[5m]) > 0
+ for: 5m
+ labels:
+ severity: warning
+ annotations:
+ summary: "Host Network Transmit Errors (instance {{ $labels.instance }})"
+ description: "{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf \"%.0f\" $value }} transmit errors in the last five minutes.\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
+
+ - name: container
+ rules:
+ - alert: ContainerKilled
+ expr: time() - container_last_seen > 60
+ for: 5m
+ labels:
+ severity: warning
+ annotations:
+ summary: "Container killed (instance {{ $labels.instance }})"
+ description: "A container has disappeared\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
+
+ - alert: ContainerCpuUsage
+ expr: sum by(instance, name) (rate(container_cpu_usage_seconds_total[3m]) * 100 > 80)
+ for: 5m
+ labels:
+ severity: warning
+ annotations:
+ summary: "Container CPU usage (instance {{ $labels.instance }})"
+ description: "Container CPU usage is above 80%\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
+
+ - alert: ContainerMemoryUsage
+   expr: (sum(container_memory_usage_bytes) BY (instance, name) / sum(container_spec_memory_limit_bytes > 0) BY (instance, name) * 100) > 125  # usage as a percentage of the memory limit
+   for: 5m
+   labels:
+     severity: warning
+   annotations:
+     summary: "Container Memory usage (instance {{ $labels.instance }})"
+     description: "Container Memory usage is above 125%\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
+
+ - alert: ContainerVolumeUsage
+   expr: (1 - (sum(container_fs_inodes_free) BY (instance) / sum(container_fs_inodes_total) BY (instance))) * 100 > 80  # used fraction is scaled to a percentage before comparing
+   for: 5m
+   labels:
+     severity: warning
+   annotations:
+     summary: "Container Volume usage (instance {{ $labels.instance }})"
+     description: "Container Volume usage is above 80%\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
+
+ - alert: ContainerVolumeIoUsage
+ expr: (sum(container_fs_io_current) BY (instance, name) * 100) > 80
+ for: 5m
+ labels:
+ severity: warning
+ annotations:
+ summary: "Container Volume IO usage (instance {{ $labels.instance }})"
+ description: "Container Volume IO usage is above 80%\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
+
+ - alert: ContainerHighThrottleRate
+ expr: rate(container_cpu_cfs_throttled_seconds_total[3m]) > 1
+ for: 5m
+ labels:
+ severity: warning
+ annotations:
+ summary: "Container high throttle rate (instance {{ $labels.instance }})"
+ description: "Container is being throttled\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
+
+ - name: kubernetes
+ rules:
+ - alert: KubernetesNodeReady
+ expr: kube_node_status_condition{condition="Ready",status="true"} == 0
+ for: 5m
+ labels:
+ severity: critical
+ annotations:
+ summary: "Kubernetes Node ready (instance {{ $labels.instance }})"
+ description: "Node {{ $labels.node }} has been unready for a long time\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
+
+ - alert: KubernetesMemoryPressure
+ expr: kube_node_status_condition{condition="MemoryPressure",status="true"} == 1
+ for: 5m
+ labels:
+ severity: critical
+ annotations:
+ summary: "Kubernetes memory pressure (instance {{ $labels.instance }})"
+ description: "{{ $labels.node }} has MemoryPressure condition\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
+
+ - alert: KubernetesDiskPressure
+ expr: kube_node_status_condition{condition="DiskPressure",status="true"} == 1
+ for: 5m
+ labels:
+ severity: critical
+ annotations:
+ summary: "Kubernetes disk pressure (instance {{ $labels.instance }})"
+ description: "{{ $labels.node }} has DiskPressure condition\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
+
+ - alert: KubernetesOutOfDisk
+ expr: kube_node_status_condition{condition="OutOfDisk",status="true"} == 1
+ for: 5m
+ labels:
+ severity: critical
+ annotations:
+ summary: "Kubernetes out of disk (instance {{ $labels.instance }})"
+ description: "{{ $labels.node }} has OutOfDisk condition\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
+
+ - alert: KubernetesJobFailed
+ expr: kube_job_status_failed > 0
+ for: 5m
+ labels:
+ severity: warning
+ annotations:
+ summary: "Kubernetes Job failed (instance {{ $labels.instance }})"
+ description: "Job {{$labels.namespace}}/{{$labels.exported_job}} failed to complete\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
+
+ - alert: KubernetesCronjobSuspended
+ expr: kube_cronjob_spec_suspend != 0
+ for: 5m
+ labels:
+ severity: warning
+ annotations:
+ summary: "Kubernetes CronJob suspended (instance {{ $labels.instance }})"
+ description: "CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} is suspended\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
+
+ - alert: KubernetesPersistentvolumeclaimPending
+ expr: kube_persistentvolumeclaim_status_phase{phase="Pending"} == 1
+ for: 5m
+ labels:
+ severity: warning
+ annotations:
+ summary: "Kubernetes PersistentVolumeClaim pending (instance {{ $labels.instance }})"
+ description: "PersistentVolumeClaim {{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} is pending\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
+
+ - alert: KubernetesVolumeOutOfDiskSpace
+ expr: kubelet_volume_stats_available_bytes / kubelet_volume_stats_capacity_bytes * 100 < 10
+ for: 5m
+ labels:
+ severity: warning
+ annotations:
+ summary: "Kubernetes Volume out of disk space (instance {{ $labels.instance }})"
+ description: "Volume is almost full (< 10% left)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
+
+ - alert: KubernetesVolumeFullInFourDays
+ expr: predict_linear(kubelet_volume_stats_available_bytes[6h], 4 * 24 * 3600) < 0
+ for: 5m
+ labels:
+ severity: critical
+ annotations:
+ summary: "Kubernetes Volume full in four days (instance {{ $labels.instance }})"
+ description: "{{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} is expected to fill up within four days. Currently {{ $value | humanize }}% is available.\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
+
+ - alert: KubernetesPersistentvolumeError
+ expr: kube_persistentvolume_status_phase{phase=~"Failed|Pending",job="kube-state-metrics"} > 0
+ for: 5m
+ labels:
+ severity: critical
+ annotations:
+ summary: "Kubernetes PersistentVolume error (instance {{ $labels.instance }})"
+ description: "Persistent volume is in bad state\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
+
+ - alert: KubernetesStatefulsetDown
+ expr: (kube_statefulset_status_replicas_ready / kube_statefulset_status_replicas_current) != 1
+ for: 5m
+ labels:
+ severity: critical
+ annotations:
+ summary: "Kubernetes StatefulSet down (instance {{ $labels.instance }})"
+ description: "A StatefulSet went down\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
+
+ - alert: KubernetesHpaScalingAbility
+   expr: kube_hpa_status_condition{condition="AbleToScale", status="false"} == 1  # condition holds the condition type, status its true/false value
+   for: 5m
+   labels:
+     severity: warning
+   annotations:
+     summary: "Kubernetes HPA scaling ability (instance {{ $labels.instance }})"
+     description: "Pod is unable to scale\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
+
+ - alert: KubernetesHpaMetricAvailability
+   expr: kube_hpa_status_condition{condition="ScalingActive", status="false"} == 1  # condition holds the condition type, status its true/false value
+   for: 5m
+   labels:
+     severity: warning
+   annotations:
+     summary: "Kubernetes HPA metric availability (instance {{ $labels.instance }})"
+     description: "HPA is not able to collect metrics\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
+
+ - alert: KubernetesHpaScaleCapability
+ expr: kube_hpa_status_desired_replicas >= kube_hpa_spec_max_replicas
+ for: 5m
+ labels:
+ severity: warning
+ annotations:
+ summary: "Kubernetes HPA scale capability (instance {{ $labels.instance }})"
+ description: "The maximum number of desired Pods has been hit\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
+
+ - alert: KubernetesPodNotHealthy
+ expr: min_over_time(sum by (namespace, pod) (kube_pod_status_phase{phase=~"Pending|Unknown|Failed"})[1h:]) > 0
+ for: 5m
+ labels:
+ severity: critical
+ annotations:
+ summary: "Kubernetes Pod not healthy (instance {{ $labels.instance }})"
+ description: "Pod has been in a non-ready state for longer than an hour.\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
+
+ - alert: KubernetesPodCrashLooping
+ expr: rate(kube_pod_container_status_restarts_total[15m]) * 60 * 5 > 5
+ for: 5m
+ labels:
+ severity: warning
+ annotations:
+ summary: "Kubernetes pod crash looping (instance {{ $labels.instance }})"
+ description: "Pod {{ $labels.pod }} is crash looping\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
+
+ - alert: KubernetesReplicassetMismatch  # alert name left unchanged so existing routing or silences still match
+   expr: kube_replicaset_spec_replicas != kube_replicaset_status_ready_replicas
+   for: 5m
+   labels:
+     severity: warning
+   annotations:
+     summary: "Kubernetes ReplicaSet mismatch (instance {{ $labels.instance }})"
+     description: "ReplicaSet replicas mismatch\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
+
+ - alert: KubernetesDeploymentReplicasMismatch
+ expr: kube_deployment_spec_replicas != kube_deployment_status_replicas_available
+ for: 5m
+ labels:
+ severity: warning
+ annotations:
+ summary: "Kubernetes Deployment replicas mismatch (instance {{ $labels.instance }})"
+ description: "Deployment Replicas mismatch\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
+
+ - alert: KubernetesStatefulsetReplicasMismatch
+   expr: kube_statefulset_status_replicas_ready != kube_statefulset_status_replicas
+   for: 5m  # the duration quoted in the description below must track this value
+   labels:
+     severity: warning
+   annotations:
+     summary: "Kubernetes StatefulSet replicas mismatch (instance {{ $labels.instance }})"
+     description: "A StatefulSet has not matched the expected number of replicas for longer than 5 minutes.\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
+
+ - alert: KubernetesDeploymentGenerationMismatch
+ expr: kube_deployment_status_observed_generation != kube_deployment_metadata_generation
+ for: 5m
+ labels:
+ severity: critical
+ annotations:
+ summary: "Kubernetes Deployment generation mismatch (instance {{ $labels.instance }})"
+ description: "A Deployment has failed but has not been rolled back.\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
+
+ - alert: KubernetesStatefulsetGenerationMismatch
+ expr: kube_statefulset_status_observed_generation != kube_statefulset_metadata_generation
+ for: 5m
+ labels:
+ severity: critical
+ annotations:
+ summary: "Kubernetes StatefulSet generation mismatch (instance {{ $labels.instance }})"
+ description: "A StatefulSet has failed but has not been rolled back.\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
+
+ - alert: KubernetesStatefulsetUpdateNotRolledOut
+ expr: max without (revision) (kube_statefulset_status_current_revision unless kube_statefulset_status_update_revision) * (kube_statefulset_replicas != kube_statefulset_status_replicas_updated)
+ for: 5m
+ labels:
+ severity: critical
+ annotations:
+ summary: "Kubernetes StatefulSet update not rolled out (instance {{ $labels.instance }})"
+ description: "StatefulSet update has not been rolled out.\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
+
+ - alert: KubernetesDaemonsetRolloutStuck
+ expr: kube_daemonset_status_number_ready / kube_daemonset_status_desired_number_scheduled * 100 < 100 or kube_daemonset_status_desired_number_scheduled - kube_daemonset_status_current_number_scheduled > 0
+ for: 5m
+ labels:
+ severity: critical
+ annotations:
+ summary: "Kubernetes DaemonSet rollout stuck (instance {{ $labels.instance }})"
+ description: "Some Pods of DaemonSet are not scheduled or not ready\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
+
+ - alert: KubernetesDaemonsetMisscheduled
+ expr: kube_daemonset_status_number_misscheduled > 0
+ for: 5m
+ labels:
+ severity: critical
+ annotations:
+ summary: "Kubernetes DaemonSet misscheduled (instance {{ $labels.instance }})"
+ description: "Some DaemonSet Pods are running where they are not supposed to run\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
+
+ - alert: KubernetesCronjobTooLong
+ expr: time() - kube_cronjob_next_schedule_time > 3600
+ for: 5m
+ labels:
+ severity: warning
+ annotations:
+ summary: "Kubernetes CronJob too long (instance {{ $labels.instance }})"
+ description: "CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} is taking more than 1h to complete.\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
+
+ # Critical alert, raised once either condition has held for 5m:
+ # - the requested completions exceed the succeeded count, or
+ # - the failed count is above zero.
+ - alert: KubernetesJobCompletion
+ expr: kube_job_spec_completions - kube_job_status_succeeded > 0 or kube_job_status_failed > 0
+ for: 5m
+ labels:
+ severity: critical
+ annotations:
+ summary: "Kubernetes job completion (instance {{ $labels.instance }})"
+ description: "Kubernetes Job failed to complete\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
+
+ # Critical alert, raised once more than 3% of API server requests (2m rate) have returned a
+ # 5xx code for 5m.
+ # Uses apiserver_request_total: the former apiserver_request_count name was deprecated in
+ # Kubernetes 1.14 and removed in 1.17, so a rule built on it can never fire on current clusters.
+ # Both sum() calls aggregate away every label, so {{ $labels.instance }} in the summary
+ # renders empty for this rule.
+ - alert: KubernetesApiServerErrors
+ expr: sum(rate(apiserver_request_total{job="apiserver",code=~"^(?:5..)$"}[2m])) / sum(rate(apiserver_request_total{job="apiserver"}[2m])) * 100 > 3
+ for: 5m
+ labels:
+ severity: critical
+ annotations:
+ summary: "Kubernetes API server errors (instance {{ $labels.instance }})"
+ description: "Kubernetes API server is experiencing high error rate\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
+
+ # Critical alert, raised once, per (instance, job), more than 1% of rest_client_requests_total
+ # (2m rate) have carried a 4xx or 5xx code for 5m.
+ - alert: KubernetesApiClientErrors
+ expr: (sum(rate(rest_client_requests_total{code=~"(4|5).."}[2m])) by (instance, job) / sum(rate(rest_client_requests_total[2m])) by (instance, job)) * 100 > 1
+ for: 5m
+ labels:
+ severity: critical
+ annotations:
+ summary: "Kubernetes API client errors (instance {{ $labels.instance }})"
+ description: "Kubernetes API client is experiencing high error rate\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
+
+ # Warning alert, raised once the 0.01 quantile of the client-certificate expiration histogram
+ # (5m rate, grouped by job) has been below 7*24*60*60 seconds (7 days) for 5m.
+ # The `_count > 0` guard limits the rule to apiserver series that have observations.
+ - alert: KubernetesClientCertificateExpiresNextWeek
+ expr: apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 7*24*60*60
+ for: 5m
+ labels:
+ severity: warning
+ annotations:
+ summary: "Kubernetes client certificate expires next week (instance {{ $labels.instance }})"
+ description: "A client certificate used to authenticate to the apiserver is expiring next week.\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
+
+ # Critical alert, raised once the 0.01 quantile of the client-certificate expiration histogram
+ # (5m rate, grouped by job) has been below 24*60*60 seconds (24 hours) for 5m.
+ # Same expression as KubernetesClientCertificateExpiresNextWeek with a tighter threshold.
+ - alert: KubernetesClientCertificateExpiresSoon
+ expr: apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 24*60*60
+ for: 5m
+ labels:
+ severity: critical
+ annotations:
+ summary: "Kubernetes client certificate expires soon (instance {{ $labels.instance }})"
+ description: "A client certificate used to authenticate to the apiserver is expiring in less than 24.0 hours.\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
+
+ # Warning alert, raised once the 99th percentile API server request latency has been above
+ # 1 second for 5m; long-running verbs (CONNECT, WATCHLIST, WATCH, PROXY) are excluded.
+ # - Uses apiserver_request_duration_seconds (already in seconds); the microsecond-based
+ #   apiserver_request_latencies was deprecated in Kubernetes 1.14 and removed in 1.17.
+ # - The buckets are wrapped in rate(): histogram_quantile over raw cumulative counters would
+ #   describe the whole process lifetime instead of the recent window.
+ # - `instance` and `resource` are aggregated away, so the description only names the verb.
+ - alert: KubernetesApiServerLatency
+ expr: histogram_quantile(0.99, sum(rate(apiserver_request_duration_seconds_bucket{verb!~"CONNECT|WATCHLIST|WATCH|PROXY"}[5m])) without (instance, resource)) > 1
+ for: 5m
+ labels:
+ severity: warning
+ annotations:
+ summary: "Kubernetes API server latency (instance {{ $labels.instance }})"
+ description: "Kubernetes API server has a 99th percentile latency of {{ $value }} seconds for {{ $labels.verb }}.\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
+
+
+ # Main Prometheus configuration; the Prometheus containers are started with
+ # --config.file=/etc/prometheus/prometheus.yml.
+ # NOTE(review): none of the scrape jobs below is named "apiserver", yet several alert rules
+ # filter on job="apiserver" - confirm whether an API server scrape job is meant to exist.
+ prometheus.yml: |-
+ global:
+ # Defaults; every job below overrides scrape_interval.
+ scrape_interval: 15s
+ evaluation_interval: 15s
+
+ # Alert rules are loaded from this path.
+ rule_files:
+ - "/etc/prometheus/alert.rules"
+
+ # All targets are static host:port pairs.
+ scrape_configs:
+ - job_name: 'collectd-exporter'
+ scrape_interval: 5s
+ static_configs:
+ - targets: ['collectd-exporter:9103']
+
+ - job_name: 'cadvisor'
+ scrape_interval: 5s
+ static_configs:
+ - targets: ['cadvisor:8080']
+
+ - job_name: 'node-exporter'
+ scrape_interval: 5s
+ static_configs:
+ - targets: ['node-exporter:9100']
+
+ # Prometheus scraping itself.
+ - job_name: 'prometheus'
+ scrape_interval: 10s
+ static_configs:
+ - targets: ['localhost:9090']
+
+ # Addressed by its fully qualified name in the kube-system namespace.
+ - job_name: 'kube-state-metrics'
+ scrape_interval: 10s
+ static_configs:
+ - targets: ['kube-state-metrics.kube-system.svc.cluster.local:8080']
+
+ # Alerts are sent over plain http to both Alertmanager targets.
+ alerting:
+ alertmanagers:
+ - scheme: http
+ static_configs:
+ - targets: ['alertmanager:9093', 'alertmanager1:9093']
diff --git a/tools/lma/ansible-server/roles/monitoring/files/prometheus/prometheus-deployment.yaml b/tools/lma/ansible-server/roles/monitoring/files/prometheus/prometheus-deployment.yaml
new file mode 100644
index 00000000..5b98b154
--- /dev/null
+++ b/tools/lma/ansible-server/roles/monitoring/files/prometheus/prometheus-deployment.yaml
@@ -0,0 +1,73 @@
+# Copyright 2020 Aditya Srivastava.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Deployment of the first Prometheus instance (single replica, pinned to node vm2).
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  namespace: monitoring
+  name: prometheus-deployment
+  labels:
+    adi10hero.monitoring: prometheus
+    app: prometheus
+spec:
+  replicas: 1
+  strategy:
+    type: Recreate
+  selector:
+    matchLabels:
+      app: prometheus
+      adi10hero.monitoring: prometheus
+  template:
+    metadata:
+      labels:
+        app: prometheus
+        adi10hero.monitoring: prometheus
+    spec:
+      restartPolicy: Always
+      # Hard scheduling requirement: only the node whose hostname label is vm2.
+      affinity:
+        nodeAffinity:
+          requiredDuringSchedulingIgnoredDuringExecution:
+            nodeSelectorTerms:
+              - matchExpressions:
+                  - key: kubernetes.io/hostname
+                    operator: In
+                    values:
+                      - vm2
+      volumes:
+        # Configuration comes from the prometheus-config ConfigMap.
+        - name: prometheus-config-volume
+          configMap:
+            name: prometheus-config
+            # 420 decimal is 0644 octal.
+            defaultMode: 420
+        # TSDB data lives on the prometheus-pvc claim.
+        - name: prometheus-storage-volume
+          persistentVolumeClaim:
+            claimName: prometheus-pvc
+      containers:
+        - name: prometheus
+          # NOTE(review): the image carries no tag, so the deployed version is whatever
+          # the registry currently serves - consider pinning a release.
+          image: prom/prometheus
+          # The container runs as root (UID 0).
+          securityContext:
+            runAsUser: 0
+          ports:
+            - containerPort: 9090
+          volumeMounts:
+            - name: prometheus-config-volume
+              mountPath: /etc/prometheus/
+            - name: prometheus-storage-volume
+              mountPath: /prometheus/
+          args:
+            - --config.file=/etc/prometheus/prometheus.yml
+            - --storage.tsdb.path=/prometheus
+            # Retention is bounded by size (3GB) and by age (30d).
+            - --storage.tsdb.retention.size=3GB
+            - --storage.tsdb.retention.time=30d
+            - --web.console.libraries=/etc/prometheus/console_libraries
+            - --web.console.templates=/etc/prometheus/consoles
diff --git a/tools/lma/ansible-server/roles/monitoring/files/prometheus/prometheus-pv.yaml b/tools/lma/ansible-server/roles/monitoring/files/prometheus/prometheus-pv.yaml
new file mode 100644
index 00000000..f10cd073
--- /dev/null
+++ b/tools/lma/ansible-server/roles/monitoring/files/prometheus/prometheus-pv.yaml
@@ -0,0 +1,30 @@
+# Copyright 2020 Aditya Srivastava.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# hostPath-backed PersistentVolume that provides storage for Prometheus.
+# PersistentVolumes are cluster-scoped, so metadata carries no namespace.
+apiVersion: v1
+kind: PersistentVolume
+metadata:
+  name: prometheus-pv
+  labels:
+    # The prometheus-pvc claim selects this volume through these two labels.
+    app: prometheus-pv
+    adi10hero.monitoring: prometheus-pv
+spec:
+  storageClassName: monitoring
+  capacity:
+    storage: 6Gi
+  accessModes:
+    - ReadWriteMany
+  # Data is stored in this directory on the node where the pod runs.
+  hostPath:
+    path: "/usr/share/monitoring_data/prometheus"
diff --git a/tools/lma/ansible-server/roles/monitoring/files/prometheus/prometheus-pvc.yaml b/tools/lma/ansible-server/roles/monitoring/files/prometheus/prometheus-pvc.yaml
new file mode 100644
index 00000000..812fcc73
--- /dev/null
+++ b/tools/lma/ansible-server/roles/monitoring/files/prometheus/prometheus-pvc.yaml
@@ -0,0 +1,33 @@
+# Copyright 2020 Aditya Srivastava.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Claim for Prometheus storage in the monitoring namespace.
+apiVersion: v1
+kind: PersistentVolumeClaim
+metadata:
+  namespace: monitoring
+  name: prometheus-pvc
+  labels:
+    adi10hero.monitoring: prometheus-pvc
+    app: prometheus-pvc
+spec:
+  storageClassName: monitoring
+  accessModes:
+    - ReadWriteMany
+  # Only a volume carrying both prometheus-pv labels may satisfy this claim.
+  selector:
+    matchLabels:
+      adi10hero.monitoring: prometheus-pv
+      app: prometheus-pv
+  resources:
+    requests:
+      storage: 3Gi
diff --git a/tools/lma/ansible-server/roles/monitoring/files/prometheus/prometheus-service.yaml b/tools/lma/ansible-server/roles/monitoring/files/prometheus/prometheus-service.yaml
new file mode 100644
index 00000000..5be76d3e
--- /dev/null
+++ b/tools/lma/ansible-server/roles/monitoring/files/prometheus/prometheus-service.yaml
@@ -0,0 +1,34 @@
+# Copyright 2020 Aditya Srivastava.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# NodePort Service exposing the first Prometheus instance on node port 30900.
+apiVersion: v1
+kind: Service
+metadata:
+  name: prometheus
+  namespace: monitoring
+  labels:
+    adi10hero.monitoring: prometheus
+    app: prometheus
+  annotations:
+    prometheus.io/scrape: 'true'
+    prometheus.io/port: '9090'
+spec:
+  type: NodePort
+  ports:
+    - name: prometheus
+      protocol: TCP
+      port: 9090
+      nodePort: 30900
+  # Select on both labels, matching the selector of prometheus-deployment and the
+  # two-label selector used by the prometheus1 Service.
+  selector:
+    adi10hero.monitoring: prometheus
+    app: prometheus
diff --git a/tools/lma/ansible-server/roles/monitoring/files/prometheus/prometheus1-deployment.yaml b/tools/lma/ansible-server/roles/monitoring/files/prometheus/prometheus1-deployment.yaml
new file mode 100644
index 00000000..149bea84
--- /dev/null
+++ b/tools/lma/ansible-server/roles/monitoring/files/prometheus/prometheus1-deployment.yaml
@@ -0,0 +1,73 @@
+# Copyright 2020 Aditya Srivastava.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Deployment of the second Prometheus instance (single replica, pinned to node vm3).
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  namespace: monitoring
+  name: prometheus1-deployment
+  labels:
+    adi10hero.monitoring: prometheus1
+    app: prometheus1
+spec:
+  replicas: 1
+  strategy:
+    type: Recreate
+  selector:
+    matchLabels:
+      app: prometheus1
+      adi10hero.monitoring: prometheus1
+  template:
+    metadata:
+      labels:
+        app: prometheus1
+        adi10hero.monitoring: prometheus1
+    spec:
+      restartPolicy: Always
+      # Hard scheduling requirement: only the node whose hostname label is vm3.
+      affinity:
+        nodeAffinity:
+          requiredDuringSchedulingIgnoredDuringExecution:
+            nodeSelectorTerms:
+              - matchExpressions:
+                  - key: kubernetes.io/hostname
+                    operator: In
+                    values:
+                      - vm3
+      volumes:
+        # Same prometheus-config ConfigMap as the first instance.
+        - name: prometheus-config-volume
+          configMap:
+            name: prometheus-config
+            # 420 decimal is 0644 octal.
+            defaultMode: 420
+        # Same prometheus-pvc claim name as the first instance.
+        - name: prometheus-storage-volume
+          persistentVolumeClaim:
+            claimName: prometheus-pvc
+      containers:
+        - name: prometheus
+          # NOTE(review): the image carries no tag, so the deployed version is whatever
+          # the registry currently serves - consider pinning a release.
+          image: prom/prometheus
+          # The container runs as root (UID 0).
+          securityContext:
+            runAsUser: 0
+          ports:
+            - containerPort: 9090
+          volumeMounts:
+            - name: prometheus-config-volume
+              mountPath: /etc/prometheus/
+            - name: prometheus-storage-volume
+              mountPath: /prometheus/
+          args:
+            - --config.file=/etc/prometheus/prometheus.yml
+            - --storage.tsdb.path=/prometheus
+            # Retention is bounded by size (3GB) and by age (30d).
+            - --storage.tsdb.retention.size=3GB
+            - --storage.tsdb.retention.time=30d
+            - --web.console.libraries=/etc/prometheus/console_libraries
+            - --web.console.templates=/etc/prometheus/consoles
diff --git a/tools/lma/ansible-server/roles/monitoring/files/prometheus/prometheus1-service.yaml b/tools/lma/ansible-server/roles/monitoring/files/prometheus/prometheus1-service.yaml
new file mode 100644
index 00000000..439deec1
--- /dev/null
+++ b/tools/lma/ansible-server/roles/monitoring/files/prometheus/prometheus1-service.yaml
@@ -0,0 +1,35 @@
+# Copyright 2020 Aditya Srivastava.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# NodePort Service exposing the second Prometheus instance on node port 30901.
+apiVersion: v1
+kind: Service
+metadata:
+  namespace: monitoring
+  name: prometheus1
+  annotations:
+    prometheus.io/port: '9090'
+    prometheus.io/scrape: 'true'
+  labels:
+    app: prometheus1
+    adi10hero.monitoring: prometheus1
+spec:
+  type: NodePort
+  # Traffic goes to pods that carry both prometheus1 labels.
+  selector:
+    app: prometheus1
+    adi10hero.monitoring: prometheus1
+  ports:
+    - name: prometheus1
+      port: 9090
+      nodePort: 30901
+      protocol: TCP
diff --git a/tools/lma/ansible-server/roles/monitoring/tasks/main.yml b/tools/lma/ansible-server/roles/monitoring/tasks/main.yml
new file mode 100644
index 00000000..cd4e6aca
--- /dev/null
+++ b/tools/lma/ansible-server/roles/monitoring/tasks/main.yml
@@ -0,0 +1,273 @@
+# Copyright 2020 Aditya Srivastava.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+---
+# PAG setup in the k8s cluster.
+#
+# The role's manifests are copied to /tmp/files and then applied one at a
+# time with the k8s module, in dependency order (namespace, storage,
+# exporters, Alertmanager, Prometheus, Grafana).
+#
+# All work happens inside a block so that the `always` section removes
+# /tmp/files even when one of the tasks fails part-way through.
+
+- name: Deploy the monitoring stack
+  block:
+    # ----------------------------------------------------------------------
+    # Stage the manifests
+    # ----------------------------------------------------------------------
+    - name: copy all yaml to /tmp/files/
+      copy:
+        src: ../files/
+        dest: /tmp/files/
+
+    # ----------------------------------------------------------------------
+    # Namespace
+    # ----------------------------------------------------------------------
+    - name: Creating Monitoring Namespace
+      k8s:
+        state: present
+        src: /tmp/files/monitoring-namespace.yaml
+        namespace: monitoring
+
+    # ----------------------------------------------------------------------
+    # Persistent Volumes
+    # ----------------------------------------------------------------------
+    - name: creating Persistent Volume for Prometheus
+      k8s:
+        state: present
+        src: /tmp/files/prometheus/prometheus-pv.yaml
+        namespace: monitoring
+
+    - name: creating Persistent Volume for Grafana
+      k8s:
+        state: present
+        src: /tmp/files/grafana/grafana-pv.yaml
+        namespace: monitoring
+
+    # ----------------------------------------------------------------------
+    # Persistent Volume Claims
+    # ----------------------------------------------------------------------
+    - name: creating Persistent Volume Claim for Prometheus
+      k8s:
+        state: present
+        src: /tmp/files/prometheus/prometheus-pvc.yaml
+        namespace: monitoring
+
+    - name: creating Persistent Volume Claim for Grafana
+      k8s:
+        state: present
+        src: /tmp/files/grafana/grafana-pvc.yaml
+        namespace: monitoring
+
+    # ----------------------------------------------------------------------
+    # cAdvisor
+    # ----------------------------------------------------------------------
+    # The manifest file name really is spelled "deamonset"; only the task
+    # name uses the correct spelling.
+    - name: Creating cAdvisor daemonset
+      k8s:
+        state: present
+        src: /tmp/files/cadvisor/cadvisor-deamonset.yaml
+        namespace: monitoring
+
+    - name: Starting cAdvisor service
+      k8s:
+        state: present
+        src: /tmp/files/cadvisor/cadvisor-service.yaml
+        namespace: monitoring
+
+    # ----------------------------------------------------------------------
+    # kube-state-metrics (deployed into kube-system, not monitoring)
+    # ----------------------------------------------------------------------
+    - name: Deploying kube-state-metrics
+      k8s:
+        state: present
+        src: /tmp/files/kube-state-metrics/kube-state-metrics-deployment.yaml
+        namespace: kube-system
+
+    - name: Starting kube-state-metrics service
+      k8s:
+        state: present
+        src: /tmp/files/kube-state-metrics/kube-state-metrics-service.yaml
+        namespace: kube-system
+
+    # ----------------------------------------------------------------------
+    # NodeExporter
+    # ----------------------------------------------------------------------
+    - name: Creating NodeExporter daemonset
+      k8s:
+        state: present
+        src: /tmp/files/node-exporter/nodeexporter-daemonset.yaml
+        namespace: monitoring
+
+    - name: Starting NodeExporter service
+      k8s:
+        state: present
+        src: /tmp/files/node-exporter/nodeexporter-service.yaml
+        namespace: monitoring
+
+    # ----------------------------------------------------------------------
+    # collectd-exporter (a Deployment, as the manifest file name says)
+    # ----------------------------------------------------------------------
+    - name: Creating collectd-exporter deployment
+      k8s:
+        state: present
+        src: /tmp/files/collectd-exporter/collectd-exporter-deployment.yaml
+        namespace: monitoring
+
+    - name: Creating collectd-exporter service
+      k8s:
+        state: present
+        src: /tmp/files/collectd-exporter/collectd-exporter-service.yaml
+        namespace: monitoring
+
+    # ----------------------------------------------------------------------
+    # Webhook goes here
+    # ----------------------------------------------------------------------
+
+    # ----------------------------------------------------------------------
+    # Alertmanager: shared config map, then two deployment/service pairs
+    # ----------------------------------------------------------------------
+    - name: Creating config map for Alertmanagers
+      k8s:
+        state: present
+        src: /tmp/files/alertmanager/alertmanager-config.yaml
+        namespace: monitoring
+
+    # Disabled: separate config map for the second Alertmanager.
+    # - name: Creating config map for Alertmanagers
+    #   k8s:
+    #     state: present
+    #     src: /tmp/files/alertmanager1-config.yaml
+    #     namespace: monitoring
+
+    - name: Creating 1st alertmanager deployment
+      k8s:
+        state: present
+        src: /tmp/files/alertmanager/alertmanager-deployment.yaml
+        namespace: monitoring
+
+    - name: Creating 1st alertmanager service
+      k8s:
+        state: present
+        src: /tmp/files/alertmanager/alertmanager-service.yaml
+        namespace: monitoring
+
+    - name: Creating 2nd alertmanager deployment
+      k8s:
+        state: present
+        src: /tmp/files/alertmanager/alertmanager1-deployment.yaml
+        namespace: monitoring
+
+    - name: Creating 2nd alertmanager service
+      k8s:
+        state: present
+        src: /tmp/files/alertmanager/alertmanager1-service.yaml
+        namespace: monitoring
+
+    # ----------------------------------------------------------------------
+    # Prometheus: shared config map, two deployments, three services
+    # ----------------------------------------------------------------------
+    - name: Creating 1st Prometheus Config
+      k8s:
+        state: present
+        src: /tmp/files/prometheus/prometheus-config.yaml
+        namespace: monitoring
+
+    # Disabled: separate config map for the second Prometheus.
+    # - name: Creating 2nd Prometheus Config
+    #   k8s:
+    #     state: present
+    #     src: /tmp/files/prometheus1-config.yaml
+    #     namespace: monitoring
+
+    - name: Starting Prometheus 1
+      k8s:
+        state: present
+        src: /tmp/files/prometheus/prometheus-deployment.yaml
+        namespace: monitoring
+
+    - name: Starting Prometheus 2
+      k8s:
+        state: present
+        src: /tmp/files/prometheus/prometheus1-deployment.yaml
+        namespace: monitoring
+
+    - name: Starting Prometheus 1 Service
+      k8s:
+        state: present
+        src: /tmp/files/prometheus/prometheus-service.yaml
+        namespace: monitoring
+
+    - name: Starting Prometheus 2 Service
+      k8s:
+        state: present
+        src: /tmp/files/prometheus/prometheus1-service.yaml
+        namespace: monitoring
+
+    - name: Starting Main Prometheus Service
+      k8s:
+        state: present
+        src: /tmp/files/prometheus/main-prometheus-service.yaml
+        namespace: monitoring
+
+    # ----------------------------------------------------------------------
+    # Grafana
+    # ----------------------------------------------------------------------
+    - name: Creating Grafana Datasource Config
+      k8s:
+        state: present
+        src: /tmp/files/grafana/grafana-datasource-config.yaml
+        namespace: monitoring
+
+    - name: Starting Grafana
+      k8s:
+        state: present
+        src: /tmp/files/grafana/grafana-deployment.yaml
+        namespace: monitoring
+
+    - name: Starting Grafana Service
+      k8s:
+        state: present
+        src: /tmp/files/grafana/grafana-service.yaml
+        namespace: monitoring
+
+  always:
+    # ----------------------------------------------------------------------
+    # Remove the staged manifests, whether or not the deployment succeeded
+    # ----------------------------------------------------------------------
+    - name: Removing /tmp/files
+      file:
+        path: "/tmp/files"
+        state: absent