From 38a2852c84bb9ce692a79d3f1ab941b9f11106a4 Mon Sep 17 00:00:00 2001 From: Aditya Srivastava Date: Mon, 24 Aug 2020 01:53:02 +0530 Subject: Tools: Add K8s monitoring cluster This patch adds k8s monitoring cluster deployment using ansible for both client and server side. Also adds scripts (ansible roles) to clean (remove) the K8S cluster completely. Signed-off-by: Aditya Srivastava Change-Id: I1115869c0a3e72a20047b31994f3d27e5fdae6c6 --- tools/lma/ansible-client/ansible.cfg | 17 + tools/lma/ansible-client/hosts | 2 + tools/lma/ansible-client/playbooks/clean.yaml | 25 + .../ansible-client/roles/clean-collectd/main.yml | 44 ++ .../roles/collectd/files/collectd.conf.j2 | 44 ++ .../ansible-client/roles/collectd/tasks/main.yml | 60 ++ tools/lma/ansible-server/ansible.cfg | 17 + tools/lma/ansible-server/group_vars/all.yml | 27 + tools/lma/ansible-server/hosts | 12 + tools/lma/ansible-server/playbooks/clean.yaml | 52 ++ .../roles/clean-k8s-cluster/tasks/main.yml | 34 + .../roles/clean-k8s-pre/tasks/main.yml | 65 ++ .../roles/clean-k8s-worker-reset/tasks/main.yml | 26 + .../roles/clean-monitoring/tasks/main.yml | 48 ++ .../ansible-server/roles/clean-nfs/tasks/main.yml | 44 ++ .../files/alertmanager/alertmanager-config.yaml | 37 + .../alertmanager/alertmanager-deployment.yaml | 62 ++ .../files/alertmanager/alertmanager-service.yaml | 41 + .../alertmanager/alertmanager1-deployment.yaml | 62 ++ .../files/alertmanager/alertmanager1-service.yaml | 42 + .../files/cadvisor/cadvisor-deamonset.yaml | 79 ++ .../files/cadvisor/cadvisor-service.yaml | 30 + .../collectd-exporter-deployment.yaml | 51 ++ .../collectd-exporter-service.yaml | 35 + .../files/grafana/grafana-datasource-config.yaml | 35 + .../files/grafana/grafana-deployment.yaml | 68 ++ .../roles/monitoring/files/grafana/grafana-pv.yaml | 31 + .../monitoring/files/grafana/grafana-pvc.yaml | 33 + .../monitoring/files/grafana/grafana-service.yaml | 36 + .../kube-state-metrics-deployment.yaml | 36 + 
.../kube-state-metrics-service.yaml | 26 + .../monitoring/files/monitoring-namespace.yaml | 18 + .../node-exporter/nodeexporter-daemonset.yaml | 80 ++ .../files/node-exporter/nodeexporter-service.yaml | 33 + .../files/prometheus/main-prometheus-service.yaml | 35 + .../files/prometheus/prometheus-config.yaml | 609 +++++++++++++++ .../files/prometheus/prometheus-deployment.yaml | 73 ++ .../monitoring/files/prometheus/prometheus-pv.yaml | 30 + .../files/prometheus/prometheus-pvc.yaml | 33 + .../files/prometheus/prometheus-service.yaml | 34 + .../files/prometheus/prometheus1-deployment.yaml | 73 ++ .../files/prometheus/prometheus1-service.yaml | 35 + .../ansible-server/roles/monitoring/tasks/main.yml | 273 +++++++ tools/lma/metrics/dashboard/cpu_usage_using.json | 750 ++++++++++++++++++ tools/lma/metrics/dashboard/memory_using.json | 337 ++++++++ tools/lma/metrics/dashboard/ovs_stats_using.json | 854 +++++++++++++++++++++ tools/lma/metrics/dashboard/rdt_using.json | 833 ++++++++++++++++++++ 47 files changed, 5321 insertions(+) create mode 100644 tools/lma/ansible-client/ansible.cfg create mode 100644 tools/lma/ansible-client/hosts create mode 100644 tools/lma/ansible-client/playbooks/clean.yaml create mode 100644 tools/lma/ansible-client/roles/clean-collectd/main.yml create mode 100644 tools/lma/ansible-client/roles/collectd/files/collectd.conf.j2 create mode 100644 tools/lma/ansible-client/roles/collectd/tasks/main.yml create mode 100644 tools/lma/ansible-server/ansible.cfg create mode 100644 tools/lma/ansible-server/group_vars/all.yml create mode 100644 tools/lma/ansible-server/hosts create mode 100644 tools/lma/ansible-server/playbooks/clean.yaml create mode 100644 tools/lma/ansible-server/roles/clean-k8s-cluster/tasks/main.yml create mode 100644 tools/lma/ansible-server/roles/clean-k8s-pre/tasks/main.yml create mode 100644 tools/lma/ansible-server/roles/clean-k8s-worker-reset/tasks/main.yml create mode 100644 
tools/lma/ansible-server/roles/clean-monitoring/tasks/main.yml create mode 100644 tools/lma/ansible-server/roles/clean-nfs/tasks/main.yml create mode 100644 tools/lma/ansible-server/roles/monitoring/files/alertmanager/alertmanager-config.yaml create mode 100644 tools/lma/ansible-server/roles/monitoring/files/alertmanager/alertmanager-deployment.yaml create mode 100644 tools/lma/ansible-server/roles/monitoring/files/alertmanager/alertmanager-service.yaml create mode 100644 tools/lma/ansible-server/roles/monitoring/files/alertmanager/alertmanager1-deployment.yaml create mode 100644 tools/lma/ansible-server/roles/monitoring/files/alertmanager/alertmanager1-service.yaml create mode 100644 tools/lma/ansible-server/roles/monitoring/files/cadvisor/cadvisor-deamonset.yaml create mode 100644 tools/lma/ansible-server/roles/monitoring/files/cadvisor/cadvisor-service.yaml create mode 100644 tools/lma/ansible-server/roles/monitoring/files/collectd-exporter/collectd-exporter-deployment.yaml create mode 100644 tools/lma/ansible-server/roles/monitoring/files/collectd-exporter/collectd-exporter-service.yaml create mode 100644 tools/lma/ansible-server/roles/monitoring/files/grafana/grafana-datasource-config.yaml create mode 100644 tools/lma/ansible-server/roles/monitoring/files/grafana/grafana-deployment.yaml create mode 100644 tools/lma/ansible-server/roles/monitoring/files/grafana/grafana-pv.yaml create mode 100644 tools/lma/ansible-server/roles/monitoring/files/grafana/grafana-pvc.yaml create mode 100644 tools/lma/ansible-server/roles/monitoring/files/grafana/grafana-service.yaml create mode 100644 tools/lma/ansible-server/roles/monitoring/files/kube-state-metrics/kube-state-metrics-deployment.yaml create mode 100644 tools/lma/ansible-server/roles/monitoring/files/kube-state-metrics/kube-state-metrics-service.yaml create mode 100644 tools/lma/ansible-server/roles/monitoring/files/monitoring-namespace.yaml create mode 100644 
tools/lma/ansible-server/roles/monitoring/files/node-exporter/nodeexporter-daemonset.yaml
 create mode 100644 tools/lma/ansible-server/roles/monitoring/files/node-exporter/nodeexporter-service.yaml
 create mode 100644 tools/lma/ansible-server/roles/monitoring/files/prometheus/main-prometheus-service.yaml
 create mode 100644 tools/lma/ansible-server/roles/monitoring/files/prometheus/prometheus-config.yaml
 create mode 100644 tools/lma/ansible-server/roles/monitoring/files/prometheus/prometheus-deployment.yaml
 create mode 100644 tools/lma/ansible-server/roles/monitoring/files/prometheus/prometheus-pv.yaml
 create mode 100644 tools/lma/ansible-server/roles/monitoring/files/prometheus/prometheus-pvc.yaml
 create mode 100644 tools/lma/ansible-server/roles/monitoring/files/prometheus/prometheus-service.yaml
 create mode 100644 tools/lma/ansible-server/roles/monitoring/files/prometheus/prometheus1-deployment.yaml
 create mode 100644 tools/lma/ansible-server/roles/monitoring/files/prometheus/prometheus1-service.yaml
 create mode 100644 tools/lma/ansible-server/roles/monitoring/tasks/main.yml
 create mode 100644 tools/lma/metrics/dashboard/cpu_usage_using.json
 create mode 100644 tools/lma/metrics/dashboard/memory_using.json
 create mode 100644 tools/lma/metrics/dashboard/ovs_stats_using.json
 create mode 100644 tools/lma/metrics/dashboard/rdt_using.json

diff --git a/tools/lma/ansible-client/ansible.cfg b/tools/lma/ansible-client/ansible.cfg
new file mode 100644
index 00000000..307ef457
--- /dev/null
+++ b/tools/lma/ansible-client/ansible.cfg
@@ -0,0 +1,17 @@
+[defaults]
+inventory = ./hosts
+host_key_checking = false
+
+# Extra directory searched for roles (in addition to Ansible's defaults).
+roles_path = roles
+
+# Write a run log next to this file.
+log_path = ./ansible.log
+
+[privilege_escalation]
+become = True
+become_method = sudo
+become_user = root
+
+[ssh_connection]
+pipelining = True
diff --git a/tools/lma/ansible-client/hosts b/tools/lma/ansible-client/hosts
new file mode 100644
index 00000000..eba586ce
--- /dev/null
+++
b/tools/lma/ansible-client/hosts
@@ -0,0 +1,2 @@
+[all]
+127.0.0.1 ansible_connection=local
diff --git a/tools/lma/ansible-client/playbooks/clean.yaml b/tools/lma/ansible-client/playbooks/clean.yaml
new file mode 100644
index 00000000..4f77b062
--- /dev/null
+++ b/tools/lma/ansible-client/playbooks/clean.yaml
@@ -0,0 +1,25 @@
+# Copyright 2020 Adarsh yadav, Aditya Srivastava
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Play 1: run the clean-td-agent role on every inventory host.
+- name: clean td-agent
+  hosts: all
+  roles:
+    - clean-td-agent  # NOTE(review): this patch adds no clean-td-agent role - confirm it exists elsewhere
+
+# Play 2: run the clean-collectd role on every inventory host.
+- name: clean collectd
+  hosts: all
+  roles:
+    - clean-collectd
diff --git a/tools/lma/ansible-client/roles/clean-collectd/main.yml b/tools/lma/ansible-client/roles/clean-collectd/main.yml
new file mode 100644
index 00000000..97100cad
--- /dev/null
+++ b/tools/lma/ansible-client/roles/clean-collectd/main.yml
@@ -0,0 +1,44 @@
+# Copyright 2020 Aditya Srivastava
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+---
+- hosts: localhost
+  # NOTE(review): this file is a play stored at roles/clean-collectd/main.yml, not tasks/main.yml - confirm how it is run.
+  tasks:
+    - name: Check and install dependencies
+      yum:
+        name: docker
+        state: present
+
+    - name: Install python sdk
+      yum:
+        name: python-docker-py
+        state: present
+
+    # A separate "state: stopped" task is intentionally not used here:
+    # docker_container asserts the container exists for that state, so the
+    # clean-up would fail whenever the container had already been removed.
+    # "state: absent" stops a running container before removing it and is a
+    # no-op when no container with this name exists.
+    - name: Removing collectd container
+      docker_container:
+        name: collectd
+        state: absent
+
+    # Removes the image (not recommended)
+    # - name: Remove image
+    #   docker_image:
+    #     state: absent
+    #     name: opnfv/barometer-collectd
+    #     tag: latest
diff --git a/tools/lma/ansible-client/roles/collectd/files/collectd.conf.j2 b/tools/lma/ansible-client/roles/collectd/files/collectd.conf.j2
new file mode 100644
index 00000000..ba953e3a
--- /dev/null
+++ b/tools/lma/ansible-client/roles/collectd/files/collectd.conf.j2
@@ -0,0 +1,44 @@
+Hostname "{{ host_name }}"
+Interval 10
+LoadPlugin intel_rdt
+LoadPlugin processes
+LoadPlugin interface
+LoadPlugin network
+LoadPlugin ovs_stats
+LoadPlugin cpu
+LoadPlugin memory
+#LoadPlugin csv
+#LoadPlugin write_http
+#LoadPlugin dpdkstat
+##############################################################################
+#                            Plugin configuration                            #
+##############################################################################
+<Plugin processes>
+  ProcessMatch "ovs-vswitchd" "ovs-vswitchd"
+  ProcessMatch "ovsdb-server" "ovsdb-server"
+  ProcessMatch "collectd" "collectd"
+</Plugin>
+
+<Plugin cpu>
+  ReportByCpu true
+  ReportByState true
+  ValuesPercentage true
+  ReportNumCpu true
+  ReportGuestState false
+  SubtractGuestState false
+</Plugin>
+
+<Plugin network>
+  Server "10.10.120.211" "30826"
+</Plugin>
+
+<Plugin ovs_stats>
+  Port "6640"
+  Address "127.0.0.1"
+  Socket "/usr/local/var/run/openvswitch/db.sock"
+  Bridges "vsperf-br0"
+</Plugin>
+
+<Plugin intel_rdt>
+  Cores "2" "4-5" "6-7" "8" "9" "22" "23" "24" "25" "26" "27"
+</Plugin>
diff --git a/tools/lma/ansible-client/roles/collectd/tasks/main.yml b/tools/lma/ansible-client/roles/collectd/tasks/main.yml
new file mode 100644
index 00000000..0befb22b
--- /dev/null
+++ b/tools/lma/ansible-client/roles/collectd/tasks/main.yml
@@ -0,0 +1,60 @@
+# Copyright 2020 Aditya Srivastava
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+---
+
+# Install docker and the Python docker bindings from yum.
+- name: Check and install dependencies
+  yum:
+    name: ['docker', 'python-docker-py']
+    state: present
+
+- name: Install pip
+  yum:
+    name: python-pip
+    state: present
+
+- name: install docker-py
+  pip: name=docker-py
+
+- name: Cloning barometer
+  git:
+    repo: https://gerrit.opnfv.org/gerrit/barometer
+    dest: /tmp/barometer
+
+- name: Create Folder
+  file:
+    path: /tmp/barometer/docker/src/collectd_sample_configs
+    state: directory
+
+# Build the collectd image from the barometer checkout under /tmp.
+- name: Download and Build Image
+  command: docker build -t opnfv/barometer-collectd -f barometer/docker/barometer-collectd/Dockerfile barometer/docker/barometer-collectd
+  args:
+    chdir: /tmp/
+  become: true
+
+# Render the collectd configuration into the directory mounted into the container below.
+- name: Ensure collectd is configured
+  template:
+    src: ../files/collectd.conf.j2
+    dest: /tmp/barometer/docker/src/collectd_sample_configs/collectd.conf
+
+# Start the collectd container. The inspect guard keeps the role re-runnable:
+# "docker run --name collectd" fails when a container with that name already exists.
+- name: Running collectd
+  shell: docker container inspect collectd >/dev/null 2>&1 || docker run -tid --name collectd --net=host -v /tmp/barometer/docker/src/collectd_sample_configs:/opt/collectd/etc/collectd.conf.d -v /var/run:/var/run -v /tmp:/tmp --privileged opnfv/barometer-collectd /run_collectd.sh
+  args:
+    chdir: /tmp/
+  become: true
diff --git a/tools/lma/ansible-server/ansible.cfg b/tools/lma/ansible-server/ansible.cfg
new file mode 100644
index 00000000..307ef457
--- /dev/null
+++ b/tools/lma/ansible-server/ansible.cfg
@@ -0,0 +1,17 @@
+[defaults]
+inventory = ./hosts
+host_key_checking = false
+
+# Extra directory searched for roles (in addition to Ansible's defaults).
+roles_path = roles
+
+# Write a run log next to this file.
+log_path = ./ansible.log
+
+[privilege_escalation]
+become = True
+become_method = sudo
+become_user = root
+
+[ssh_connection]
+pipelining = True
diff --git a/tools/lma/ansible-server/group_vars/all.yml b/tools/lma/ansible-server/group_vars/all.yml
new file mode 100644
index 00000000..b0725ff5
--- /dev/null
+++ b/tools/lma/ansible-server/group_vars/all.yml
@@ -0,0 +1,27 @@
+# Copyright 2020 Adarsh yadav, Aditya Srivastava
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# API server advertise address
+ad_addr: 10.10.120.211
+
+# Pod network CIDR
+pod_cidr: 192.168.0.0/16
+
+# Name of the file holding the join token generated by the master (presumably; confirm against the k8s roles)
+token_file: join_token
+
+# Node names; the clean-k8s-cluster role passes them to `kubectl drain` and `kubectl delete node`
+vm3: 'vm3'
+vm2: 'vm2'
+vm1: 'vm1'
diff --git a/tools/lma/ansible-server/hosts b/tools/lma/ansible-server/hosts
new file mode 100644
index 00000000..0a13d754
--- /dev/null
+++ b/tools/lma/ansible-server/hosts
@@ -0,0 +1,12 @@
+[master]
+10.10.120.211
+[worker-nodes]
+10.10.120.203
+10.10.120.204
+
+# Connection settings shared by every node. Plaintext lab credentials: move to ansible-vault or --ask-pass before reuse.
+[all:vars]
+ansible_connection=ssh
+ansible_user=root
+ansible_password=P@ssw0rd
+ansible_become_password=P@ssw0rd
\ No newline at end of file
diff --git a/tools/lma/ansible-server/playbooks/clean.yaml b/tools/lma/ansible-server/playbooks/clean.yaml
new file mode 100644
index 00000000..b4da66da
--- /dev/null
+++ b/tools/lma/ansible-server/playbooks/clean.yaml
@@ -0,0 +1,52 @@
+# Copyright 2020 Adarsh yadav, Aditya Srivastava
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Play 1: remove the monitoring stack (master only).
+- name: Clean PAG setup
+  hosts: master
+  roles:
+    - clean-monitoring
+
+# Play 2: remove the logging stack (master only).
+- name: Clean EFK setup
+  hosts: master
+  roles:
+    - clean-logging  # NOTE(review): this patch adds no clean-logging role - confirm it exists elsewhere
+
+# Play 3: tear down the k8s cluster from the master.
+# The role itself only resets when it finds kubelet running.
+- name: Clean k8s cluster
+  hosts: master
+  roles:
+    - clean-k8s-cluster
+
+# Play 4: reset kubeadm state on the worker nodes.
+- name: Reset worker-nodes
+  hosts: worker-nodes
+  roles:
+    - clean-k8s-worker-reset
+
+# Play 5: uninstall the k8s pre-requisites on every node.
+- name: unistall pre-requisites for k8s
+  hosts: all
+  roles:
+    - clean-k8s-pre
+
+#*************************************************************************************************************
+# DISABLED ON PURPOSE: clean-nfs deletes the Elasticsearch and Grafana data directories.
+#*************************************************************************************************************
+# - name: Clean nfs server
+#   hosts: all
+#   roles:
+#     - clean-nfs
diff --git a/tools/lma/ansible-server/roles/clean-k8s-cluster/tasks/main.yml b/tools/lma/ansible-server/roles/clean-k8s-cluster/tasks/main.yml
new file mode 100644
index 00000000..83ac086d
--- /dev/null
+++ b/tools/lma/ansible-server/roles/clean-k8s-cluster/tasks/main.yml
@@ -0,0 +1,34 @@
+# Copyright 2020 Adarsh yadav, Aditya Srivastava
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+# Probe kubelet. rc 0 = running; rc 3 (not running) and rc 4 (no such unit) mean "nothing to reset", not an error.
+- name: check for kubelet
+  command: systemctl status kubelet
+  register: _svc_kubelet
+  failed_when: _svc_kubelet.rc not in [0, 3, 4] and ("could not be found" not in _svc_kubelet.stderr)
+  changed_when: false
+
+# Only when kubelet is running: drain and delete every node, then reset kubeadm and drop the kubeconfig.
+- name: reset k8s
+  shell: |
+    kubectl drain {{ vm3 }} --delete-local-data --force --ignore-daemonsets
+    kubectl drain {{ vm2 }} --delete-local-data --force --ignore-daemonsets
+    kubectl drain {{ vm1 }} --delete-local-data --force --ignore-daemonsets
+    kubectl delete node {{ vm3 }}
+    kubectl delete node {{ vm2 }}
+    kubectl delete node {{ vm1 }}
+    sudo kubeadm reset -f
+    sudo rm -f $HOME/.kube/config
+  when: "_svc_kubelet.rc == 0"
+
diff --git a/tools/lma/ansible-server/roles/clean-k8s-pre/tasks/main.yml b/tools/lma/ansible-server/roles/clean-k8s-pre/tasks/main.yml
new file mode 100644
index 00000000..6d12bd5f
--- /dev/null
+++ b/tools/lma/ansible-server/roles/clean-k8s-pre/tasks/main.yml
@@ -0,0 +1,65 @@
+# Copyright 2020 Adarsh yadav, Aditya Srivastava
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+
+# Remove the Kubernetes packages and docker-ce.
+- name: Uninstalling K8s
+  yum:
+    name: ['kubeadm', 'kubectl', 'kubelet', 'docker-ce']
+    state: absent
+
+# Re-enable swap; best effort, errors are ignored.
+- name: Enabling Swap on all nodes
+  shell: swapon -a
+  ignore_errors: yes
+
+# Uncomment swap entries. The capture group includes the leading "/" so the device path is restored intact.
+- name: Uncommenting Swap entries in /etc/fstab
+  replace:
+    path: /etc/fstab
+    regexp: '^#\s*(/.*swap.*)'
+    replace: '\1'
+
+
+# Start firewalld now and enable it at boot.
+- name: 'Starting firewall'
+  service:
+    name: firewalld
+    state: started
+    enabled: yes
+
+# Switch SELinux to enforcing, both at runtime and in the persistent config.
+- name: Enabling SELinux on all nodes
+  shell: |
+    setenforce 1
+    sudo sed -i 's/^SELINUX=permissive$/SELINUX=enforcing/' /etc/selinux/config
+
+# Disable (not delete) the docker-ce-stable yum repo.
+- name: removing Docker repo
+  command: yum-config-manager --disable docker-ce-stable
+
+# Remove the Kubernetes repo definition block from its repo file.
+- name: removing repository details in Kubernetes repo file.
+  blockinfile:
+    path: /etc/yum.repos.d/kubernetes.repo
+    state: absent
+    block: |
+      [kubernetes]
+      name=Kubernetes
+      baseurl=https://packages.cloud.google.com/yum/repos/kubernetes-el7-x86_64
+      enabled=1
+      gpgcheck=1
+      repo_gpgcheck=1
+      gpgkey=https://packages.cloud.google.com/yum/doc/yum-key.gpg
+        https://packages.cloud.google.com/yum/doc/rpm-package-key.gpg
diff --git a/tools/lma/ansible-server/roles/clean-k8s-worker-reset/tasks/main.yml b/tools/lma/ansible-server/roles/clean-k8s-worker-reset/tasks/main.yml
new file mode 100644
index 00000000..3ba9c9ea
--- /dev/null
+++ b/tools/lma/ansible-server/roles/clean-k8s-worker-reset/tasks/main.yml
@@ -0,0 +1,26 @@
+# Copyright 2020 Adarsh yadav, Aditya Srivastava
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+# Probe kubelet. rc 0 = running; rc 3 (not running) and rc 4 (no such unit) mean "nothing to reset", not an error.
+- name: check for kubelet
+  command: systemctl status kubelet
+  register: _svc_kubelet
+  failed_when: _svc_kubelet.rc not in [0, 3, 4] and ("could not be found" not in _svc_kubelet.stderr)
+  changed_when: false
+
+# Only when kubelet is running: reset the kubeadm state on this worker.
+- name: reset k8s
+  command: kubeadm reset -f
+  when: "_svc_kubelet.rc == 0"
+
diff --git a/tools/lma/ansible-server/roles/clean-monitoring/tasks/main.yml b/tools/lma/ansible-server/roles/clean-monitoring/tasks/main.yml
new file mode 100644
index 00000000..49943ec0
--- /dev/null
+++ b/tools/lma/ansible-server/roles/clean-monitoring/tasks/main.yml
@@ -0,0 +1,48 @@
+# Copyright 2020 Aditya Srivastava.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+---
+# Delete the PAG setup from the k8s cluster by removing its namespace.
+# Probe kubelet. rc 0 = running; rc 3 (not running) and rc 4 (no such unit) mean "nothing to delete", not an error.
+- name: check for kubelet
+  command: systemctl status kubelet
+  register: _svc_kubelet
+  failed_when: _svc_kubelet.rc not in [0, 3, 4] and ("could not be found" not in _svc_kubelet.stderr)
+  changed_when: false
+
+#***********************************************************************************************************
+# Copy the namespace manifest from the monitoring role to /tmp on the target
+#***********************************************************************************************************
+- name: copy namespace yaml to /tmp/files/
+  copy:
+    src: ../../monitoring/files/monitoring-namespace.yaml
+    dest: /tmp/monitoring-namespace.yaml
+
+#***********************************************************************************************************
+# Delete the namespace (only when kubelet is running)
+#***********************************************************************************************************
+- name: Deleting Namespace
+  k8s:
+    state: absent
+    src: /tmp/monitoring-namespace.yaml
+    namespace: monitoring
+  when: "_svc_kubelet.rc == 0"
+
+#***********************************************************************************************************
+# Remove the temporary copy of the manifest
+#***********************************************************************************************************
+- name: Removing /tmp/monitoring-namespace.yaml
+  file:
+    path: "/tmp/monitoring-namespace.yaml"
+    state: absent
diff --git a/tools/lma/ansible-server/roles/clean-nfs/tasks/main.yml b/tools/lma/ansible-server/roles/clean-nfs/tasks/main.yml
new file mode 100644
index 00000000..157db849
--- /dev/null
+++ b/tools/lma/ansible-server/roles/clean-nfs/tasks/main.yml
@@ -0,0 +1,44 @@
+# Copyright 2020 Adarsh yadav, Aditya Srivastava
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+# Drop the NFS export entries from /etc/exports (lines must match exactly).
+- name: Edit /etc/export file for NFS
+  lineinfile:
+    path: /etc/exports
+    line: "{{ item }}"
+    state: absent
+  with_items:
+    - "/srv/nfs/master *(rw,sync,no_root_squash,no_subtree_check)"
+    - "/srv/nfs/data *(rw,sync,no_root_squash,no_subtree_check)"
+    - "/usr/share/monitoring_data/grafana *(rw,sync,no_root_squash,no_subtree_check)"
+
+# Uninstall the NFS utilities package.
+- name: Uninstalling NFS server utils
+  yum:
+    name: nfs-utils
+    state: absent
+
+# Delete the Elasticsearch directories under /srv/nfs.
+- name: Removing Directory for elasticsearch
+  file:
+    path: "/srv/nfs/{{ item }}"
+    state: absent
+  with_items:
+    - ['data', 'master']
+
+# Delete the Grafana data directory.
+- name: Removing Directory for grafana
+  file:
+    path: "/usr/share/monitoring_data/grafana"
+    state: absent
diff --git a/tools/lma/ansible-server/roles/monitoring/files/alertmanager/alertmanager-config.yaml b/tools/lma/ansible-server/roles/monitoring/files/alertmanager/alertmanager-config.yaml
new file mode 100644
index 00000000..7b9abc47
--- /dev/null
+++ b/tools/lma/ansible-server/roles/monitoring/files/alertmanager/alertmanager-config.yaml
@@ -0,0 +1,37 @@
+# Copyright 2020 Aditya Srivastava.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: alertmanager-config  # mounted at /etc/alertmanager by both alertmanager deployments
+  namespace: monitoring
+data:
+  config.yml: |-
+    global:
+    route:
+      receiver: "webhook"
+      group_by: ['alertname', 'priority']
+      group_wait: 1s
+      group_interval: 5s
+      repeat_interval: 5s
+      routes:
+        - match:
+            severity: critical
+
+    receivers:
+      - name: "webhook"
+        webhook_configs:
+          - url: 'http://10.10.120.20/alertmanager'
+            send_resolved: true
diff --git a/tools/lma/ansible-server/roles/monitoring/files/alertmanager/alertmanager-deployment.yaml b/tools/lma/ansible-server/roles/monitoring/files/alertmanager/alertmanager-deployment.yaml
new file mode 100644
index 00000000..f1c3d78e
--- /dev/null
+++ b/tools/lma/ansible-server/roles/monitoring/files/alertmanager/alertmanager-deployment.yaml
@@ -0,0 +1,62 @@
+# Copyright 2020 Aditya Srivastava.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: alertmanager
+  namespace: monitoring
+  labels:
+    app: alertmanager
+    adi10hero.monitoring: alertmanager
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: alertmanager
+      adi10hero.monitoring: alertmanager
+  strategy:
+    type: Recreate
+  template:
+    metadata:
+      name: alertmanager
+      labels:
+        app: alertmanager
+        adi10hero.monitoring: alertmanager
+    spec:
+      containers:
+        - name: alertmanager
+          image: prom/alertmanager
+          args:
+            - --config.file=/etc/alertmanager/config.yml
+            - --storage.path=/alertmanager
+            - --cluster.peer=alertmanager1:6783  # peer is the alertmanager1 service
+            - --cluster.listen-address=0.0.0.0:6783
+          ports:
+            - containerPort: 9093
+            - containerPort: 6783
+          securityContext:
+            runAsUser: 0
+          volumeMounts:
+            - name: config-volume
+              mountPath: /etc/alertmanager
+            - name: alertmanager
+              mountPath: /alertmanager
+      restartPolicy: Always
+      volumes:
+        - name: config-volume  # config.yml comes from the alertmanager-config ConfigMap
+          configMap:
+            name: alertmanager-config
+        - name: alertmanager
+          emptyDir: {}
diff --git a/tools/lma/ansible-server/roles/monitoring/files/alertmanager/alertmanager-service.yaml b/tools/lma/ansible-server/roles/monitoring/files/alertmanager/alertmanager-service.yaml
new file mode 100644
index 00000000..c67517d3
--- /dev/null
+++ b/tools/lma/ansible-server/roles/monitoring/files/alertmanager/alertmanager-service.yaml
@@ -0,0 +1,41 @@
+# Copyright 2020 Aditya Srivastava.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+apiVersion: v1
+kind: Service
+metadata:
+  name: alertmanager
+  namespace: monitoring
+  labels:
+    adi10hero.monitoring: alertmanager
+    app: alertmanager
+  annotations:
+    prometheus.io/scrape: 'true'
+    prometheus.io/path: /metrics  # was "/"; NOTE(review): confirm the Prometheus config honours these annotations
+    prometheus.io/port: '9093'  # was '8080', which no alertmanager container port serves
+
+spec:
+  selector:
+    app: alertmanager
+    adi10hero.monitoring: alertmanager
+  type: NodePort
+  ports:
+    - name: "9093"
+      port: 9093
+      targetPort: 9093
+      nodePort: 30930
+    - name: "6783"
+      port: 6783
+      targetPort: 6783
+      nodePort: 30679
diff --git a/tools/lma/ansible-server/roles/monitoring/files/alertmanager/alertmanager1-deployment.yaml b/tools/lma/ansible-server/roles/monitoring/files/alertmanager/alertmanager1-deployment.yaml
new file mode 100644
index 00000000..18b76456
--- /dev/null
+++ b/tools/lma/ansible-server/roles/monitoring/files/alertmanager/alertmanager1-deployment.yaml
@@ -0,0 +1,62 @@
+# Copyright 2020 Aditya Srivastava.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: alertmanager1
+  namespace: monitoring
+  labels:
+    app: alertmanager1
+    adi10hero.monitoring: alertmanager1
+spec:
+  replicas: 1
+  strategy:
+    type: Recreate
+  selector:
+    matchLabels:
+      app: alertmanager1
+      adi10hero.monitoring: alertmanager1
+  template:
+    metadata:
+      name: alertmanager1
+      labels:
+        app: alertmanager1
+        adi10hero.monitoring: alertmanager1
+    spec:
+      restartPolicy: Always
+      containers:
+        - name: alertmanager1
+          image: prom/alertmanager  # NOTE(review): no image tag is pinned
+          args:
+            - --config.file=/etc/alertmanager/config.yml
+            - --storage.path=/alertmanager
+            - --cluster.peer=alertmanager:6783  # first instance, reached by its name
+            - --cluster.listen-address=0.0.0.0:6783
+          ports:
+            - containerPort: 9093
+            - containerPort: 6783
+          securityContext:
+            runAsUser: 0
+          volumeMounts:
+            - name: config-volume
+              mountPath: /etc/alertmanager
+            - name: alertmanager
+              mountPath: /alertmanager
+      volumes:
+        - name: config-volume
+          configMap:
+            name: alertmanager-config
+        - name: alertmanager
+          emptyDir: {}
diff --git a/tools/lma/ansible-server/roles/monitoring/files/alertmanager/alertmanager1-service.yaml b/tools/lma/ansible-server/roles/monitoring/files/alertmanager/alertmanager1-service.yaml
new file mode 100644
index 00000000..66d0d2b1
--- /dev/null
+++ b/tools/lma/ansible-server/roles/monitoring/files/alertmanager/alertmanager1-service.yaml
@@ -0,0 +1,42 @@
+# Copyright 2020 Aditya Srivastava.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+apiVersion: v1
+kind: Service
+metadata:
+  labels:
+    adi10hero.monitoring: alertmanager1
+    app: alertmanager1
+  name: alertmanager1
+  namespace: monitoring
+  annotations:
+    prometheus.io/scrape: 'true'
+    prometheus.io/path: /metrics  # Alertmanager exposes its own metrics under /metrics
+    prometheus.io/port: '9093'  # must be a port this Service exposes (was 8080, which nothing here serves)
+
+spec:
+  selector:
+    app: alertmanager1
+    adi10hero.monitoring: alertmanager1
+  type: NodePort
+  ports:
+    - name: "9093"  # web UI / API
+      port: 9093
+      targetPort: 9093
+      nodePort: 30931
+    - name: "6783"  # cluster (peer) port used by --cluster.listen-address
+      port: 6783
+      targetPort: 6783
+      nodePort: 30678
+
diff --git a/tools/lma/ansible-server/roles/monitoring/files/cadvisor/cadvisor-deamonset.yaml b/tools/lma/ansible-server/roles/monitoring/files/cadvisor/cadvisor-deamonset.yaml
new file mode 100644
index 00000000..6a62985e
--- /dev/null
+++ b/tools/lma/ansible-server/roles/monitoring/files/cadvisor/cadvisor-deamonset.yaml
@@ -0,0 +1,79 @@
+# Copyright 2020 Aditya Srivastava.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+apiVersion: apps/v1
+kind: DaemonSet
+metadata:
+  name: cadvisor
+  namespace: monitoring
+  labels:
+    adi10hero.monitoring: cadvisor
+    app: cadvisor
+spec:
+  selector:
+    matchLabels:
+      app: cadvisor
+      adi10hero.monitoring: cadvisor
+  template:
+    metadata:
+      name: cadvisor
+      labels:
+        adi10hero.monitoring: cadvisor
+        app: cadvisor
+    spec:
+      containers:
+        - image: gcr.io/google-containers/cadvisor  # NOTE(review): no image tag is pinned
+          name: cadvisor
+          ports:
+            - containerPort: 8080
+          securityContext:
+            runAsUser: 0
+          volumeMounts:
+            - mountPath: /rootfs
+              name: cadvisor-hostpath0
+              readOnly: true
+            - mountPath: /var/run
+              name: cadvisor-hostpath1
+            - mountPath: /sys
+              name: cadvisor-hostpath2
+              readOnly: true
+            - mountPath: /sys/fs/cgroup
+              name: cadvisor-hostpath3
+              readOnly: true
+            - mountPath: /dev/disk
+              name: cadvisor-hostpath4
+              readOnly: true
+            - mountPath: /var/lib/docker
+              name: cadvisor-hostpath5
+              readOnly: true
+      restartPolicy: Always
+      volumes:
+        - hostPath:
+            path: /
+          name: cadvisor-hostpath0
+        - hostPath:
+            path: /var/run
+          name: cadvisor-hostpath1
+        - hostPath:
+            path: /sys
+          name: cadvisor-hostpath2
+        - hostPath:
+            path: /sys/fs/cgroup  # was /cgroup, which would mask the cgroup tree provided by the /sys mount
+          name: cadvisor-hostpath3
+        - hostPath:
+            path: /dev/disk/
+          name: cadvisor-hostpath4
+        - hostPath:
+            path: /var/lib/docker/
+          name: cadvisor-hostpath5
diff --git a/tools/lma/ansible-server/roles/monitoring/files/cadvisor/cadvisor-service.yaml b/tools/lma/ansible-server/roles/monitoring/files/cadvisor/cadvisor-service.yaml
new file mode 100644
index 00000000..734240b8
--- /dev/null
+++ b/tools/lma/ansible-server/roles/monitoring/files/cadvisor/cadvisor-service.yaml
@@ -0,0 +1,30 @@
+# Copyright 2020 Aditya Srivastava.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+apiVersion: v1
+kind: Service
+metadata:
+  name: cadvisor
+  namespace: monitoring
+  labels:
+    app: cadvisor
+    adi10hero.monitoring: cadvisor
+spec:
+  selector:
+    app: cadvisor
+    adi10hero.monitoring: cadvisor
+  ports:
+    - name: "8080"  # same port the cadvisor container declares
+      port: 8080
+      targetPort: 8080
diff --git a/tools/lma/ansible-server/roles/monitoring/files/collectd-exporter/collectd-exporter-deployment.yaml b/tools/lma/ansible-server/roles/monitoring/files/collectd-exporter/collectd-exporter-deployment.yaml
new file mode 100644
index 00000000..b6bfe0b6
--- /dev/null
+++ b/tools/lma/ansible-server/roles/monitoring/files/collectd-exporter/collectd-exporter-deployment.yaml
@@ -0,0 +1,51 @@
+# Copyright 2020 Aditya Srivastava.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: collectd-exporter
+  namespace: monitoring
+  labels:
+    app: collectd-exporter
+    adi10hero.monitoring: collectd-exporter
+spec:
+  replicas: 1
+  strategy:
+    type: Recreate
+  selector:
+    matchLabels:
+      app: collectd-exporter
+      adi10hero.monitoring: collectd-exporter
+  template:
+    metadata:
+      name: collectd-exporter
+      labels:
+        app: collectd-exporter
+        adi10hero.monitoring: collectd-exporter
+    spec:
+      restartPolicy: Always
+      containers:
+        - name: collectd-exporter
+          image: prom/collectd-exporter  # NOTE(review): no image tag is pinned
+          args:
+            - --collectd.listen-address=0.0.0.0:25826
+          ports:
+            - containerPort: 9103
+            - containerPort: 25826  # matches --collectd.listen-address above
+              protocol: UDP
+          securityContext:
+            runAsUser: 0
+      volumes: null
+
diff --git a/tools/lma/ansible-server/roles/monitoring/files/collectd-exporter/collectd-exporter-service.yaml b/tools/lma/ansible-server/roles/monitoring/files/collectd-exporter/collectd-exporter-service.yaml
new file mode 100644
index 00000000..5609d04a
--- /dev/null
+++ b/tools/lma/ansible-server/roles/monitoring/files/collectd-exporter/collectd-exporter-service.yaml
@@ -0,0 +1,35 @@
+# Copyright 2020 Aditya Srivastava.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+apiVersion: v1
+kind: Service
+metadata:
+  name: collectd-exporter
+  namespace: monitoring
+  labels:
+    app: collectd-exporter
+    adi10hero.monitoring: collectd-exporter
+spec:
+  type: NodePort
+  selector:
+    app: collectd-exporter
+    adi10hero.monitoring: collectd-exporter
+  ports:
+    - name: "9103"
+      port: 9103
+      nodePort: 30103
+    - name: "25826"  # UDP port the exporter listens on for collectd traffic
+      port: 25826
+      protocol: UDP
+      nodePort: 30826
diff --git a/tools/lma/ansible-server/roles/monitoring/files/grafana/grafana-datasource-config.yaml b/tools/lma/ansible-server/roles/monitoring/files/grafana/grafana-datasource-config.yaml
new file mode 100644
index 00000000..e2b8c9fa
--- /dev/null
+++ b/tools/lma/ansible-server/roles/monitoring/files/grafana/grafana-datasource-config.yaml
@@ -0,0 +1,35 @@
+# Copyright 2020 Aditya Srivastava.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: grafana-datasources
+  namespace: monitoring
+data:
+  prometheus.yaml: |-  # single datasource definition pointing at the prometheus-main Service
+    {
+        "apiVersion": 1,
+        "datasources": [
+            {
+               "access":"proxy",
+                "editable": true,
+                "name": "prometheus",
+                "orgId": 1,
+                "type": "prometheus",
+                "url": "http://prometheus-main:9090",
+                "version": 1
+            }
+        ]
+    }
diff --git a/tools/lma/ansible-server/roles/monitoring/files/grafana/grafana-deployment.yaml b/tools/lma/ansible-server/roles/monitoring/files/grafana/grafana-deployment.yaml
new file mode 100644
index 00000000..afb00948
--- /dev/null
+++ b/tools/lma/ansible-server/roles/monitoring/files/grafana/grafana-deployment.yaml
@@ -0,0 +1,68 @@
+# Copyright 2020 Aditya Srivastava.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: grafana
+  namespace: monitoring
+  labels:
+    adi10hero.monitoring: grafana
+    app: grafana
+spec:
+  replicas: 1
+  strategy:
+    type: Recreate
+  selector:
+    matchLabels:
+      adi10hero.monitoring: grafana
+      app: grafana
+  template:
+    metadata:
+      name: grafana
+      labels:
+        adi10hero.monitoring: grafana
+        app: grafana
+    spec:
+      restartPolicy: Always
+      containers:
+        - name: grafana
+          image: grafana/grafana  # NOTE(review): no image tag is pinned
+          ports:
+            - containerPort: 3000
+          env:
+            - name: GF_SECURITY_ADMIN_PASSWORD
+              value: admin  # NOTE(review): admin credentials are committed in plain text
+            - name: GF_SECURITY_ADMIN_USER
+              value: admin
+            - name: GF_SERVER_DOMAIN
+              value: 10.10.120.20  # NOTE(review): site-specific address is hard-coded
+            - name: GF_SERVER_ROOT_URL
+              value: "%(protocol)s://%(domain)s:/metrics"  # NOTE(review): empty port and /metrics suffix - confirm intent
+          securityContext:
+            runAsUser: 0
+          volumeMounts:
+            - mountPath: /var/lib/grafana
+              name: grafana-storage
+            - mountPath: /etc/grafana/provisioning/datasources
+              name: grafana-datasources
+              readOnly: false
+      volumes:
+        - name: grafana-storage
+          persistentVolumeClaim:
+            claimName: grafana-pvc
+        - name: grafana-datasources
+          configMap:
+            defaultMode: 420
+            name: grafana-datasources
diff --git a/tools/lma/ansible-server/roles/monitoring/files/grafana/grafana-pv.yaml b/tools/lma/ansible-server/roles/monitoring/files/grafana/grafana-pv.yaml
new file mode 100644
index 00000000..06bcc31b
--- /dev/null
+++ b/tools/lma/ansible-server/roles/monitoring/files/grafana/grafana-pv.yaml
@@ -0,0 +1,31 @@
+# Copyright 2020 Aditya Srivastava.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+apiVersion: v1
+kind: PersistentVolume
+metadata:
+  name: grafana-pv
+  namespace: monitoring  # NOTE(review): PersistentVolumes are cluster-scoped; confirm this field is needed
+  labels:
+    app: grafana-pv
+    adi10hero.monitoring: grafana-pv
+spec:
+  capacity:
+    storage: 5Gi
+  accessModes:
+    - ReadWriteMany
+  storageClassName: monitoring
+  nfs:
+    server: 10.10.120.211  # NOTE(review): site-specific NFS server address is hard-coded
+    path: "/usr/share/monitoring_data/grafana"
diff --git a/tools/lma/ansible-server/roles/monitoring/files/grafana/grafana-pvc.yaml b/tools/lma/ansible-server/roles/monitoring/files/grafana/grafana-pvc.yaml
new file mode 100644
index 00000000..2c2955c8
--- /dev/null
+++ b/tools/lma/ansible-server/roles/monitoring/files/grafana/grafana-pvc.yaml
@@ -0,0 +1,33 @@
+# Copyright 2020 Aditya Srivastava.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+apiVersion: v1
+kind: PersistentVolumeClaim
+metadata:
+  name: grafana-pvc
+  namespace: monitoring
+  labels:
+    app: grafana-pvc
+    adi10hero.monitoring: grafana-pvc
+spec:
+  storageClassName: monitoring
+  accessModes:
+    - ReadWriteMany
+  resources:
+    requests:
+      storage: 4Gi
+  selector:
+    matchLabels:  # these are the labels carried by the grafana-pv volume
+      app: grafana-pv
+      adi10hero.monitoring: grafana-pv
diff --git a/tools/lma/ansible-server/roles/monitoring/files/grafana/grafana-service.yaml b/tools/lma/ansible-server/roles/monitoring/files/grafana/grafana-service.yaml
new file mode 100644
index 00000000..d1c9c9cc
--- /dev/null
+++ b/tools/lma/ansible-server/roles/monitoring/files/grafana/grafana-service.yaml
@@ -0,0 +1,36 @@
+# Copyright 2020 Aditya Srivastava.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+apiVersion: v1
+kind: Service
+metadata:
+  name: grafana
+  namespace: monitoring
+  labels:
+    app: grafana
+    adi10hero.monitoring: grafana
+  annotations:
+    prometheus.io/scrape: 'true'
+    prometheus.io/port: '3000'
+spec:
+  type: NodePort
+  selector:
+    app: grafana
+    adi10hero.monitoring: grafana
+  ports:
+    - name: "3000"  # same port the grafana container declares
+      port: 3000
+      targetPort: 3000
+      nodePort: 30000
+
diff --git a/tools/lma/ansible-server/roles/monitoring/files/kube-state-metrics/kube-state-metrics-deployment.yaml b/tools/lma/ansible-server/roles/monitoring/files/kube-state-metrics/kube-state-metrics-deployment.yaml
new file mode 100644
index 00000000..af3c5469
--- /dev/null
+++ b/tools/lma/ansible-server/roles/monitoring/files/kube-state-metrics/kube-state-metrics-deployment.yaml
@@ -0,0 +1,36 @@
+# Copyright 2020 Aditya Srivastava.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: kube-state-metrics
+  namespace: kube-system  # unlike the other manifests in this role, not in "monitoring"
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: kube-state-metrics
+  template:
+    metadata:
+      labels:
+        app: kube-state-metrics
+    spec:
+      #serviceAccountName: prometheus
+      containers:
+        - name: kube-state-metrics
+          image: quay.io/coreos/kube-state-metrics:v1.2.0
+          ports:
+            - containerPort: 8080
+              name: monitoring  # this is the port's name, not a namespace
diff --git a/tools/lma/ansible-server/roles/monitoring/files/kube-state-metrics/kube-state-metrics-service.yaml b/tools/lma/ansible-server/roles/monitoring/files/kube-state-metrics/kube-state-metrics-service.yaml
new file mode 100644
index 00000000..8d294391
--- /dev/null
+++ b/tools/lma/ansible-server/roles/monitoring/files/kube-state-metrics/kube-state-metrics-service.yaml
@@ -0,0 +1,26 @@
+# Copyright 2020 Aditya Srivastava.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+apiVersion: v1
+kind: Service
+metadata:
+  name: kube-state-metrics
+  namespace: kube-system
+spec:
+  selector:
+    app: kube-state-metrics
+  ports:
+    - port: 8080
+      targetPort: 8080
+      protocol: TCP
diff --git a/tools/lma/ansible-server/roles/monitoring/files/monitoring-namespace.yaml b/tools/lma/ansible-server/roles/monitoring/files/monitoring-namespace.yaml
new file mode 100644
index 00000000..f1c9b889
--- /dev/null
+++ b/tools/lma/ansible-server/roles/monitoring/files/monitoring-namespace.yaml
@@ -0,0 +1,18 @@
+# Copyright 2020 Aditya Srivastava.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+apiVersion: v1
+kind: Namespace
+metadata:
+  name: monitoring  # namespace referenced by the manifests under roles/monitoring/files
diff --git a/tools/lma/ansible-server/roles/monitoring/files/node-exporter/nodeexporter-daemonset.yaml b/tools/lma/ansible-server/roles/monitoring/files/node-exporter/nodeexporter-daemonset.yaml
new file mode 100644
index 00000000..9334b2f4
--- /dev/null
+++ b/tools/lma/ansible-server/roles/monitoring/files/node-exporter/nodeexporter-daemonset.yaml
@@ -0,0 +1,80 @@
+# Copyright 2020 Aditya Srivastava.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+apiVersion: apps/v1
+kind: DaemonSet
+metadata:
+  name: node-exporter-daemonset
+  namespace: monitoring
+  labels:
+    app: node-exporter
+    adi10hero.monitoring: node-exporter
+spec:
+  selector:
+    matchLabels:
+      app: node-exporter
+      adi10hero.monitoring: node-exporter
+  template:
+    metadata:
+      labels:
+        app: node-exporter
+        adi10hero.monitoring: node-exporter
+      annotations:
+        prometheus.io/scrape: "true"
+        prometheus.io/port: "9100"
+    spec:
+      hostPID: true
+      hostIPC: true
+      hostNetwork: true
+      containers:
+        - ports:
+            - containerPort: 9100
+              protocol: TCP
+          resources:
+            requests:
+              cpu: 0.15
+          securityContext:
+            runAsUser: 0
+            privileged: true
+          image: prom/node-exporter:v0.15.2
+          args:
+            - --path.procfs
+            - /host/proc  # host /proc, mounted below
+            - --path.sysfs
+            - /host/sys  # host /sys, mounted below
+            - --collector.filesystem.ignored-mount-points
+            - '^/(sys|proc|dev|host|etc)($|/)'  # no embedded double quotes: they were passed to the regex literally, so it never matched
+          name: node-exporter
+          volumeMounts:
+            - name: dev
+              mountPath: /host/dev
+            - name: proc
+              mountPath: /host/proc
+            - name: sys
+              mountPath: /host/sys
+            - name: rootfs
+              mountPath: /rootfs
+      volumes:
+        - name: proc
+          hostPath:
+            path: /proc
+        - name: dev
+          hostPath:
+            path: /dev
+        - name: sys
+          hostPath:
+            path: /sys
+        - name: rootfs
+          hostPath:
+            path: /
diff --git a/tools/lma/ansible-server/roles/monitoring/files/node-exporter/nodeexporter-service.yaml b/tools/lma/ansible-server/roles/monitoring/files/node-exporter/nodeexporter-service.yaml
new file mode 100644
index 00000000..dd0aea4d
--- /dev/null
+++ b/tools/lma/ansible-server/roles/monitoring/files/node-exporter/nodeexporter-service.yaml
@@ -0,0 +1,33 @@
+# Copyright 2020 Aditya Srivastava.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+apiVersion: v1
+kind: Service
+metadata:
+  name: node-exporter
+  namespace: monitoring
+  labels:
+    adi10hero.monitoring: node-exporter
+    app: node-exporter
+  annotations:
+    prometheus.io/scrape: "true"
+    prometheus.io/port: "9100"
+spec:
+  selector:
+    adi10hero.monitoring: node-exporter
+    app: node-exporter
+  ports:
+    - name: "node-exporter"  # same port the node-exporter container declares
+      port: 9100
+      targetPort: 9100
diff --git a/tools/lma/ansible-server/roles/monitoring/files/prometheus/main-prometheus-service.yaml b/tools/lma/ansible-server/roles/monitoring/files/prometheus/main-prometheus-service.yaml
new file mode 100644
index 00000000..58b220a8
--- /dev/null
+++ b/tools/lma/ansible-server/roles/monitoring/files/prometheus/main-prometheus-service.yaml
@@ -0,0 +1,35 @@
+# Copyright 2020 Aditya Srivastava.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+apiVersion: v1
+kind: Service
+metadata:
+  name: prometheus-main
+  namespace: monitoring
+  labels:
+    adi10hero.monitoring: prometheus-main
+    app: prometheus-main
+  annotations:
+    prometheus.io/scrape: 'true'
+    prometheus.io/port: '9090'
+spec:
+  type: NodePort
+  selector:  # NOTE(review): selects labels prometheus1/prometheus - verify against the prometheus deployments
+    adi10hero.monitoring: prometheus1
+    app: prometheus
+  ports:
+    - name: prometheus-main
+      protocol: TCP
+      port: 9090
+      nodePort: 30902
diff --git a/tools/lma/ansible-server/roles/monitoring/files/prometheus/prometheus-config.yaml b/tools/lma/ansible-server/roles/monitoring/files/prometheus/prometheus-config.yaml
new file mode 100644
index 00000000..917f978f
--- /dev/null
+++ b/tools/lma/ansible-server/roles/monitoring/files/prometheus/prometheus-config.yaml
@@ -0,0 +1,609 @@
+# Copyright 2020 Aditya Srivastava.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+apiVersion: v1
+kind: ConfigMap
+metadata:
+  name: prometheus-config
+  namespace: monitoring
+data:
+  alert.rules: |-
+    groups:
+    - name: targets
+      rules:
+      - alert: MonitorServiceDown
+        expr: up == 0
+        for: 30s
+        labels:
+          severity: critical
+        annotations:
+          summary: "Monitor service non-operational"
+          description: "Service {{ $labels.instance }} is down."
+      - alert: HighCpuLoad
+        expr: node_load1 > 1.9
+        for: 15s
+        labels:
+          severity: critical
+        annotations:
+          summary: "Service under high load"
+          description: "Docker host is under high load, the avg load 1m is at {{ $value}}. Reported by instance {{ $labels.instance }} of job {{ $labels.job }}."
+
+    - name: host and hardware
+      rules:
+      - alert: HostHighCpuLoad
+        expr: 100 - (avg by(instance) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Host high CPU load (instance {{ $labels.instance }})"
+          description: "CPU load is > 80%\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
+
+      - alert: HostSwapIsFillingUp
+        expr: (1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Host swap is filling up (instance {{ $labels.instance }})"
+          description: "Swap is filling up (>80%)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
+
+      - alert: HighMemoryLoad
+        expr: (sum(node_memory_MemTotal_bytes) - sum(node_memory_MemFree_bytes + node_memory_Buffers_bytes + node_memory_Cached_bytes) ) / sum(node_memory_MemTotal_bytes) * 100 > 85
+        for: 30s
+        labels:
+          severity: warning
+        annotations:
+          summary: "Server memory is almost full"
+          description: "Docker host memory usage is {{ humanize $value}}%. Reported by instance {{ $labels.instance }} of job {{ $labels.job }}."
+
+      - alert: HighStorageLoad
+        expr: (node_filesystem_size_bytes{fstype="aufs"} - node_filesystem_free_bytes{fstype="aufs"}) / node_filesystem_size_bytes{fstype="aufs"} * 100 > 85
+        for: 30s
+        labels:
+          severity: warning
+        annotations:
+          summary: "Server storage is almost full"
+          description: "Docker host storage usage is {{ humanize $value}}%. Reported by instance {{ $labels.instance }} of job {{ $labels.job }}."
+ + - alert: HostNetworkTransmitErrors + expr: increase(node_network_transmit_errs_total[5m]) > 0 + for: 5m + labels: + severity: warning + annotations: + summary: "Host Network Transmit Errors (instance {{ $labels.instance }})" + description: "{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf \"%.0f\" $value }} transmit errors in the last five minutes.\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" + + - alert: HostOutOfMemory + expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10 + for: 5m + labels: + severity: warning + annotations: + summary: "Host out of memory (instance {{ $labels.instance }})" + description: "Node memory is filling up (< 10% left)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" + + - alert: HostMemoryUnderMemoryPressure + expr: rate(node_vmstat_pgmajfault[1m]) > 1000 + for: 5m + labels: + severity: warning + annotations: + summary: "Host memory under memory pressure (instance {{ $labels.instance }})" + description: "The node is under heavy memory pressure. 
High rate of major page faults\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" + + - alert: HostUnusualNetworkThroughputIn + expr: sum by (instance) (irate(node_network_receive_bytes_total[2m])) / 1024 / 1024 > 100 + for: 5m + labels: + severity: warning + annotations: + summary: "Host unusual network throughput in (instance {{ $labels.instance }})" + description: "Host network interfaces are probably receiving too much data (> 100 MB/s)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" + + - alert: HostUnusualNetworkThroughputOut + expr: sum by (instance) (irate(node_network_transmit_bytes_total[2m])) / 1024 / 1024 > 100 + for: 5m + labels: + severity: warning + annotations: + summary: "Host unusual network throughput out (instance {{ $labels.instance }})" + description: "Host network interfaces are probably sending too much data (> 100 MB/s)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" + + - alert: HostUnusualDiskRateRead + expr: sum by (instance) (irate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50 + for: 5m + labels: + severity: warning + annotations: + summary: "Host unusual disk read rate (instance {{ $labels.instance }})" + description: "Disk is probably reading too much data (> 50 MB/s)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" + + - alert: HostUnusualDiskRateWrite + expr: sum by (instance) (irate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50 + for: 5m + labels: + severity: warning + annotations: + summary: "Host unusual disk write rate (instance {{ $labels.instance }})" + description: "Disk is probably writing too much data (> 50 MB/s)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" + + - alert: HostOutOfDiskSpace + expr: (node_filesystem_avail_bytes{mountpoint="/rootfs"} * 100) / node_filesystem_size_bytes{mountpoint="/rootfs"} < 10 + for: 5m + labels: + severity: warning + annotations: + summary: "Host out of disk space (instance {{ $labels.instance }})" + description: "Disk is almost full (< 10% left)\n VALUE = {{ $value }}\n 
LABELS: {{ $labels }}" + + - alert: HostDiskWillFillIn4Hours + expr: predict_linear(node_filesystem_free_bytes{fstype!~"tmpfs"}[1h], 4 * 3600) < 0 + for: 5m + labels: + severity: warning + annotations: + summary: "Host disk will fill in 4 hours (instance {{ $labels.instance }})" + description: "Disk will fill in 4 hours at current write rate\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" + + - alert: HostPhysicalComponentTooHot + expr: node_hwmon_temp_celsius > 75 + for: 5m + labels: + severity: warning + annotations: + summary: "Host physical component too hot (instance {{ $labels.instance }})" + description: "Physical hardware component too hot\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" + + - alert: HostNodeOvertemperatureAlarm + expr: node_hwmon_temp_alarm == 1 + for: 5m + labels: + severity: critical + annotations: + summary: "Host node overtemperature alarm (instance {{ $labels.instance }})" + description: "Physical node temperature alarm triggered\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" + + - alert: HostKernelVersionDeviations + expr: count(sum(label_replace(node_uname_info, "kernel", "$1", "release", "([0-9]+.[0-9]+.[0-9]+).*")) by (kernel)) > 1 + for: 5m + labels: + severity: warning + annotations: + summary: "Host kernel version deviations (instance {{ $labels.instance }})" + description: "Different kernel versions are running\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" + + - alert: HostOomKillDetected + expr: increase(node_vmstat_oom_kill[5m]) > 0 + for: 5m + labels: + severity: warning + annotations: + summary: "Host OOM kill detected (instance {{ $labels.instance }})" + description: "OOM kill detected\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" + + - alert: HostEdacCorrectableErrorsDetected + expr: increase(node_edac_correctable_errors_total[5m]) > 0 + for: 5m + labels: + severity: info + annotations: + summary: "Host EDAC Correctable Errors detected (instance {{ $labels.instance }})" + description: "{{ $labels.instance }} has had 
{{ printf \"%.0f\" $value }} correctable memory errors reported by EDAC in the last 5 minutes.\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" + + - alert: HostEdacUncorrectableErrorsDetected + expr: node_edac_uncorrectable_errors_total > 0 + for: 5m + labels: + severity: warning + annotations: + summary: "Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }})" + description: "{{ $labels.instance }} has had {{ printf \"%.0f\" $value }} uncorrectable memory errors reported by EDAC in the last 5 minutes.\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" + + - alert: HostNetworkReceiveErrors + expr: increase(node_network_receive_errs_total[5m]) > 0 + for: 5m + labels: + severity: warning + annotations: + summary: "Host Network Receive Errors (instance {{ $labels.instance }})" + description: "{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf \"%.0f\" $value }} receive errors in the last five minutes.\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" + + - alert: HostNetworkTransmitErrors + expr: increase(node_network_transmit_errs_total[5m]) > 0 + for: 5m + labels: + severity: warning + annotations: + summary: "Host Network Transmit Errors (instance {{ $labels.instance }})" + description: "{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf \"%.0f\" $value }} transmit errors in the last five minutes.\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" + + - name: container + rules: + - alert: ContainerKilled + expr: time() - container_last_seen > 60 + for: 5m + labels: + severity: warning + annotations: + summary: "Container killed (instance {{ $labels.instance }})" + description: "A container has disappeared\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" + + - alert: ContainerCpuUsage + expr: sum by(instance, name) (rate(container_cpu_usage_seconds_total[3m]) * 100 > 80) + for: 5m + labels: + severity: warning + annotations: + summary: "Container CPU usage (instance {{ $labels.instance }})" + 
description: "Container CPU usage is above 80%\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
+
+      - alert: ContainerMemoryUsage
+        # Usage relative to the configured memory limit; the alert threshold is 125%,
+        # so the description below states the same figure as the expression.
+        expr: (sum(container_memory_usage_bytes) BY (instance, name) / sum(container_spec_memory_limit_bytes > 0) BY (instance, name) * 100) > 125
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Container Memory usage (instance {{ $labels.instance }})"
+          description: "Container Memory usage is above 125%\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
+
+      - alert: ContainerVolumeUsage
+        # The free/total ratio must be subtracted from 1 BEFORE scaling to a percentage.
+        # Without the extra parentheses PromQL evaluates 1 - (ratio * 100), which can
+        # never exceed 1, so the "> 80" comparison would never fire.
+        expr: (1 - (sum(container_fs_inodes_free) BY (instance) / sum(container_fs_inodes_total) BY (instance))) * 100 > 80
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Container Volume usage (instance {{ $labels.instance }})"
+          description: "Container Volume usage is above 80%\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
+
+      - alert: ContainerVolumeIoUsage
+        expr: (sum(container_fs_io_current) BY (instance, name) * 100) > 80
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Container Volume IO usage (instance {{ $labels.instance }})"
+          description: "Container Volume IO usage is above 80%\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
+
+      - alert: ContainerHighThrottleRate
+        expr: rate(container_cpu_cfs_throttled_seconds_total[3m]) > 1
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Container high throttle rate (instance {{ $labels.instance }})"
+          description: "Container is being throttled\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
+
+    - name: kubernetes
+      rules:
+      - alert: KubernetesNodeReady
+        expr: kube_node_status_condition{condition="Ready",status="true"} == 0
+        for: 5m
+        labels:
+          severity: critical
+        annotations:
+          summary: "Kubernetes Node ready (instance {{ $labels.instance }})"
+          description: "Node {{ $labels.node }} has been unready for a long time\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
+
+      - alert: KubernetesMemoryPressure
+        expr: 
kube_node_status_condition{condition="MemoryPressure",status="true"} == 1 + for: 5m + labels: + severity: critical + annotations: + summary: "Kubernetes memory pressure (instance {{ $labels.instance }})" + description: "{{ $labels.node }} has MemoryPressure condition\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" + + - alert: KubernetesDiskPressure + expr: kube_node_status_condition{condition="DiskPressure",status="true"} == 1 + for: 5m + labels: + severity: critical + annotations: + summary: "Kubernetes disk pressure (instance {{ $labels.instance }})" + description: "{{ $labels.node }} has DiskPressure condition\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" + + - alert: KubernetesOutOfDisk + expr: kube_node_status_condition{condition="OutOfDisk",status="true"} == 1 + for: 5m + labels: + severity: critical + annotations: + summary: "Kubernetes out of disk (instance {{ $labels.instance }})" + description: "{{ $labels.node }} has OutOfDisk condition\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" + + - alert: KubernetesJobFailed + expr: kube_job_status_failed > 0 + for: 5m + labels: + severity: warning + annotations: + summary: "Kubernetes Job failed (instance {{ $labels.instance }})" + description: "Job {{$labels.namespace}}/{{$labels.exported_job}} failed to complete\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" + + - alert: KubernetesCronjobSuspended + expr: kube_cronjob_spec_suspend != 0 + for: 5m + labels: + severity: warning + annotations: + summary: "Kubernetes CronJob suspended (instance {{ $labels.instance }})" + description: "CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} is suspended\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" + + - alert: KubernetesPersistentvolumeclaimPending + expr: kube_persistentvolumeclaim_status_phase{phase="Pending"} == 1 + for: 5m + labels: + severity: warning + annotations: + summary: "Kubernetes PersistentVolumeClaim pending (instance {{ $labels.instance }})" + description: "PersistentVolumeClaim {{ 
$labels.namespace }}/{{ $labels.persistentvolumeclaim }} is pending\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
+
+      - alert: KubernetesVolumeOutOfDiskSpace
+        expr: kubelet_volume_stats_available_bytes / kubelet_volume_stats_capacity_bytes * 100 < 10
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Kubernetes Volume out of disk space (instance {{ $labels.instance }})"
+          description: "Volume is almost full (< 10% left)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
+
+      - alert: KubernetesVolumeFullInFourDays
+        expr: predict_linear(kubelet_volume_stats_available_bytes[6h], 4 * 24 * 3600) < 0
+        for: 5m
+        labels:
+          severity: critical
+        annotations:
+          summary: "Kubernetes Volume full in four days (instance {{ $labels.instance }})"
+          description: "{{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} is expected to fill up within four days. Currently {{ $value | humanize }}% is available.\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
+
+      - alert: KubernetesPersistentvolumeError
+        expr: kube_persistentvolume_status_phase{phase=~"Failed|Pending",job="kube-state-metrics"} > 0
+        for: 5m
+        labels:
+          severity: critical
+        annotations:
+          summary: "Kubernetes PersistentVolume error (instance {{ $labels.instance }})"
+          description: "Persistent volume is in bad state\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
+
+      - alert: KubernetesStatefulsetDown
+        expr: (kube_statefulset_status_replicas_ready / kube_statefulset_status_replicas_current) != 1
+        for: 5m
+        labels:
+          severity: critical
+        annotations:
+          summary: "Kubernetes StatefulSet down (instance {{ $labels.instance }})"
+          description: "A StatefulSet went down\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
+
+      - alert: KubernetesHpaScalingAbility
+        # kube-state-metrics puts the condition type (AbleToScale, ScalingActive, ...)
+        # in the "condition" label and its truth value in "status". With the two
+        # swapped the selector matches no series and the alert can never fire.
+        expr: kube_hpa_status_condition{status="false", condition="AbleToScale"} == 1
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Kubernetes HPA scaling ability (instance {{ $labels.instance }})"
+          description: "Pod is unable to scale\n 
VALUE = {{ $value }}\n LABELS: {{ $labels }}"
+
+      - alert: KubernetesHpaMetricAvailability
+        # kube-state-metrics puts the condition type in the "condition" label and its
+        # truth value in "status"; swapping them yields a selector that matches nothing.
+        expr: kube_hpa_status_condition{status="false", condition="ScalingActive"} == 1
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Kubernetes HPA metric availability (instance {{ $labels.instance }})"
+          description: "HPA is not able to collect metrics\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
+
+      - alert: KubernetesHpaScaleCapability
+        expr: kube_hpa_status_desired_replicas >= kube_hpa_spec_max_replicas
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Kubernetes HPA scale capability (instance {{ $labels.instance }})"
+          description: "The maximum number of desired Pods has been hit\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
+
+      - alert: KubernetesPodNotHealthy
+        expr: min_over_time(sum by (namespace, pod) (kube_pod_status_phase{phase=~"Pending|Unknown|Failed"})[1h:]) > 0
+        for: 5m
+        labels:
+          severity: critical
+        annotations:
+          summary: "Kubernetes Pod not healthy (instance {{ $labels.instance }})"
+          description: "Pod has been in a non-ready state for longer than an hour.\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
+
+      - alert: KubernetesPodCrashLooping
+        expr: rate(kube_pod_container_status_restarts_total[15m]) * 60 * 5 > 5
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Kubernetes pod crash looping (instance {{ $labels.instance }})"
+          description: "Pod {{ $labels.pod }} is crash looping\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
+
+      - alert: KubernetesReplicassetMismatch
+        # Compares desired vs. ready replicas of a ReplicaSet (not a Deployment);
+        # the annotations name the resource the expression actually inspects.
+        expr: kube_replicaset_spec_replicas != kube_replicaset_status_ready_replicas
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: "Kubernetes ReplicaSet mismatch (instance {{ $labels.instance }})"
+          description: "ReplicaSet replicas mismatch\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
+
+      - alert: KubernetesDeploymentReplicasMismatch
+        expr: kube_deployment_spec_replicas != kube_deployment_status_replicas_available
+ for: 5m + labels: + severity: warning + annotations: + summary: "Kubernetes Deployment replicas mismatch (instance {{ $labels.instance }})" + description: "Deployment Replicas mismatch\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" + + - alert: KubernetesStatefulsetReplicasMismatch + expr: kube_statefulset_status_replicas_ready != kube_statefulset_status_replicas + for: 5m + labels: + severity: warning + annotations: + summary: "Kubernetes StatefulSet replicas mismatch (instance {{ $labels.instance }})" + description: "A StatefulSet has not matched the expected number of replicas for longer than 15 minutes.\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" + + - alert: KubernetesDeploymentGenerationMismatch + expr: kube_deployment_status_observed_generation != kube_deployment_metadata_generation + for: 5m + labels: + severity: critical + annotations: + summary: "Kubernetes Deployment generation mismatch (instance {{ $labels.instance }})" + description: "A Deployment has failed but has not been rolled back.\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" + + - alert: KubernetesStatefulsetGenerationMismatch + expr: kube_statefulset_status_observed_generation != kube_statefulset_metadata_generation + for: 5m + labels: + severity: critical + annotations: + summary: "Kubernetes StatefulSet generation mismatch (instance {{ $labels.instance }})" + description: "A StatefulSet has failed but has not been rolled back.\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" + + - alert: KubernetesStatefulsetUpdateNotRolledOut + expr: max without (revision) (kube_statefulset_status_current_revision unless kube_statefulset_status_update_revision) * (kube_statefulset_replicas != kube_statefulset_status_replicas_updated) + for: 5m + labels: + severity: critical + annotations: + summary: "Kubernetes StatefulSet update not rolled out (instance {{ $labels.instance }})" + description: "StatefulSet update has not been rolled out.\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" + + - alert: 
KubernetesDaemonsetRolloutStuck + expr: kube_daemonset_status_number_ready / kube_daemonset_status_desired_number_scheduled * 100 < 100 or kube_daemonset_status_desired_number_scheduled - kube_daemonset_status_current_number_scheduled > 0 + for: 5m + labels: + severity: critical + annotations: + summary: "Kubernetes DaemonSet rollout stuck (instance {{ $labels.instance }})" + description: "Some Pods of DaemonSet are not scheduled or not ready\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" + + - alert: KubernetesDaemonsetMisscheduled + expr: kube_daemonset_status_number_misscheduled > 0 + for: 5m + labels: + severity: critical + annotations: + summary: "Kubernetes DaemonSet misscheduled (instance {{ $labels.instance }})" + description: "Some DaemonSet Pods are running where they are not supposed to run\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" + + - alert: KubernetesCronjobTooLong + expr: time() - kube_cronjob_next_schedule_time > 3600 + for: 5m + labels: + severity: warning + annotations: + summary: "Kubernetes CronJob too long (instance {{ $labels.instance }})" + description: "CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} is taking more than 1h to complete.\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" + + - alert: KubernetesJobCompletion + expr: kube_job_spec_completions - kube_job_status_succeeded > 0 or kube_job_status_failed > 0 + for: 5m + labels: + severity: critical + annotations: + summary: "Kubernetes job completion (instance {{ $labels.instance }})" + description: "Kubernetes Job failed to complete\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" + + - alert: KubernetesApiServerErrors + expr: sum(rate(apiserver_request_count{job="apiserver",code=~"^(?:5..)$"}[2m])) / sum(rate(apiserver_request_count{job="apiserver"}[2m])) * 100 > 3 + for: 5m + labels: + severity: critical + annotations: + summary: "Kubernetes API server errors (instance {{ $labels.instance }})" + description: "Kubernetes API server is experiencing high error rate\n VALUE 
= {{ $value }}\n LABELS: {{ $labels }}" + + - alert: KubernetesApiClientErrors + expr: (sum(rate(rest_client_requests_total{code=~"(4|5).."}[2m])) by (instance, job) / sum(rate(rest_client_requests_total[2m])) by (instance, job)) * 100 > 1 + for: 5m + labels: + severity: critical + annotations: + summary: "Kubernetes API client errors (instance {{ $labels.instance }})" + description: "Kubernetes API client is experiencing high error rate\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" + + - alert: KubernetesClientCertificateExpiresNextWeek + expr: apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 7*24*60*60 + for: 5m + labels: + severity: warning + annotations: + summary: "Kubernetes client certificate expires next week (instance {{ $labels.instance }})" + description: "A client certificate used to authenticate to the apiserver is expiring next week.\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" + + - alert: KubernetesClientCertificateExpiresSoon + expr: apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 24*60*60 + for: 5m + labels: + severity: critical + annotations: + summary: "Kubernetes client certificate expires soon (instance {{ $labels.instance }})" + description: "A client certificate used to authenticate to the apiserver is expiring in less than 24.0 hours.\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" + + - alert: KubernetesApiServerLatency + expr: histogram_quantile(0.99, sum(apiserver_request_latencies_bucket{verb!~"CONNECT|WATCHLIST|WATCH|PROXY"}) WITHOUT (instance, resource)) / 1e+06 > 1 + for: 5m + labels: + severity: warning + annotations: + summary: "Kubernetes API server latency (instance {{ $labels.instance }})" + description: 
"Kubernetes API server has a 99th percentile latency of {{ $value }} seconds for {{ $labels.verb }} {{ $labels.resource }}.\n VALUE = {{ $value }}\n LABELS: {{ $labels }}" + + + prometheus.yml: |- + global: + scrape_interval: 15s + evaluation_interval: 15s + + rule_files: + - "/etc/prometheus/alert.rules" + + scrape_configs: + - job_name: 'collectd-exporter' + scrape_interval: 5s + static_configs: + - targets: ['collectd-exporter:9103'] + + - job_name: 'cadvisor' + scrape_interval: 5s + static_configs: + - targets: ['cadvisor:8080'] + + - job_name: 'node-exporter' + scrape_interval: 5s + static_configs: + - targets: ['node-exporter:9100'] + + - job_name: 'prometheus' + scrape_interval: 10s + static_configs: + - targets: ['localhost:9090'] + + - job_name: 'kube-state-metrics' + scrape_interval: 10s + static_configs: + - targets: ['kube-state-metrics.kube-system.svc.cluster.local:8080'] + + alerting: + alertmanagers: + - scheme: http + static_configs: + - targets: ['alertmanager:9093', 'alertmanager1:9093'] diff --git a/tools/lma/ansible-server/roles/monitoring/files/prometheus/prometheus-deployment.yaml b/tools/lma/ansible-server/roles/monitoring/files/prometheus/prometheus-deployment.yaml new file mode 100644 index 00000000..5b98b154 --- /dev/null +++ b/tools/lma/ansible-server/roles/monitoring/files/prometheus/prometheus-deployment.yaml @@ -0,0 +1,73 @@ +# Copyright 2020 Aditya Srivastava. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: prometheus-deployment + namespace: monitoring + labels: + app: prometheus + adi10hero.monitoring: prometheus +spec: + replicas: 1 + selector: + matchLabels: + adi10hero.monitoring: prometheus + app: prometheus + strategy: + type: Recreate + template: + metadata: + labels: + adi10hero.monitoring: prometheus + app: prometheus + spec: + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: kubernetes.io/hostname + operator: In + values: + - vm2 + containers: + - name: prometheus + image: prom/prometheus + args: + - --config.file=/etc/prometheus/prometheus.yml + - --storage.tsdb.path=/prometheus + - --storage.tsdb.retention.size=3GB + - --storage.tsdb.retention.time=30d + - --web.console.libraries=/etc/prometheus/console_libraries + - --web.console.templates=/etc/prometheus/consoles + ports: + - containerPort: 9090 + securityContext: + runAsUser: 0 + volumeMounts: + - name: prometheus-config-volume + mountPath: /etc/prometheus/ + - name: prometheus-storage-volume + mountPath: /prometheus/ + restartPolicy: Always + volumes: + - name: prometheus-config-volume + configMap: + defaultMode: 420 + name: prometheus-config + - name: prometheus-storage-volume + persistentVolumeClaim: + claimName: prometheus-pvc diff --git a/tools/lma/ansible-server/roles/monitoring/files/prometheus/prometheus-pv.yaml b/tools/lma/ansible-server/roles/monitoring/files/prometheus/prometheus-pv.yaml new file mode 100644 index 00000000..f10cd073 --- /dev/null +++ b/tools/lma/ansible-server/roles/monitoring/files/prometheus/prometheus-pv.yaml @@ -0,0 +1,30 @@ +# Copyright 2020 Aditya Srivastava. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +apiVersion: v1 +kind: PersistentVolume +metadata: + name: prometheus-pv + namespace: monitoring + labels: + app: prometheus-pv + adi10hero.monitoring: prometheus-pv +spec: + storageClassName: monitoring + capacity: + storage: 6Gi + accessModes: + - ReadWriteMany + hostPath: + path: "/usr/share/monitoring_data/prometheus" diff --git a/tools/lma/ansible-server/roles/monitoring/files/prometheus/prometheus-pvc.yaml b/tools/lma/ansible-server/roles/monitoring/files/prometheus/prometheus-pvc.yaml new file mode 100644 index 00000000..812fcc73 --- /dev/null +++ b/tools/lma/ansible-server/roles/monitoring/files/prometheus/prometheus-pvc.yaml @@ -0,0 +1,33 @@ +# Copyright 2020 Aditya Srivastava. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: prometheus-pvc + namespace: monitoring + labels: + app: prometheus-pvc + adi10hero.monitoring: prometheus-pvc +spec: + accessModes: + - ReadWriteMany + storageClassName: monitoring + resources: + requests: + storage: 3Gi + selector: + matchLabels: + app: prometheus-pv + adi10hero.monitoring: prometheus-pv diff --git a/tools/lma/ansible-server/roles/monitoring/files/prometheus/prometheus-service.yaml b/tools/lma/ansible-server/roles/monitoring/files/prometheus/prometheus-service.yaml new file mode 100644 index 00000000..5be76d3e --- /dev/null +++ b/tools/lma/ansible-server/roles/monitoring/files/prometheus/prometheus-service.yaml @@ -0,0 +1,34 @@ +# Copyright 2020 Aditya Srivastava. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +apiVersion: v1 +kind: Service +metadata: + labels: + adi10hero.monitoring: prometheus + app: prometheus + name: prometheus + namespace: monitoring + annotations: + prometheus.io/scrape: 'true' + prometheus.io/port: '9090' +spec: + type: NodePort + ports: + - name: prometheus + protocol: TCP + port: 9090 + nodePort: 30900 + selector: + adi10hero.monitoring: prometheus diff --git a/tools/lma/ansible-server/roles/monitoring/files/prometheus/prometheus1-deployment.yaml b/tools/lma/ansible-server/roles/monitoring/files/prometheus/prometheus1-deployment.yaml new file mode 100644 index 00000000..149bea84 --- /dev/null +++ b/tools/lma/ansible-server/roles/monitoring/files/prometheus/prometheus1-deployment.yaml @@ -0,0 +1,73 @@ +# Copyright 2020 Aditya Srivastava. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: prometheus1-deployment + namespace: monitoring + labels: + app: prometheus1 + adi10hero.monitoring: prometheus1 +spec: + replicas: 1 + selector: + matchLabels: + adi10hero.monitoring: prometheus1 + app: prometheus1 + strategy: + type: Recreate + template: + metadata: + labels: + adi10hero.monitoring: prometheus1 + app: prometheus1 + spec: + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: kubernetes.io/hostname + operator: In + values: + - vm3 + containers: + - name: prometheus + image: prom/prometheus + args: + - --config.file=/etc/prometheus/prometheus.yml + - --storage.tsdb.path=/prometheus + - --storage.tsdb.retention.size=3GB + - --storage.tsdb.retention.time=30d + - --web.console.libraries=/etc/prometheus/console_libraries + - --web.console.templates=/etc/prometheus/consoles + ports: + - containerPort: 9090 + securityContext: + runAsUser: 0 + volumeMounts: + - name: prometheus-config-volume + mountPath: /etc/prometheus/ + - name: prometheus-storage-volume + mountPath: /prometheus/ + restartPolicy: Always + volumes: + - name: prometheus-config-volume + configMap: + defaultMode: 420 + name: prometheus-config + - name: prometheus-storage-volume + persistentVolumeClaim: + claimName: prometheus-pvc diff --git a/tools/lma/ansible-server/roles/monitoring/files/prometheus/prometheus1-service.yaml b/tools/lma/ansible-server/roles/monitoring/files/prometheus/prometheus1-service.yaml new file mode 100644 index 00000000..439deec1 --- /dev/null +++ b/tools/lma/ansible-server/roles/monitoring/files/prometheus/prometheus1-service.yaml @@ -0,0 +1,35 @@ +# Copyright 2020 Aditya Srivastava. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +apiVersion: v1 +kind: Service +metadata: + labels: + adi10hero.monitoring: prometheus1 + app: prometheus1 + name: prometheus1 + namespace: monitoring + annotations: + prometheus.io/scrape: 'true' + prometheus.io/port: '9090' +spec: + type: NodePort + ports: + - name: prometheus1 + protocol: TCP + port: 9090 + nodePort: 30901 + selector: + adi10hero.monitoring: prometheus1 + app: prometheus1 diff --git a/tools/lma/ansible-server/roles/monitoring/tasks/main.yml b/tools/lma/ansible-server/roles/monitoring/tasks/main.yml new file mode 100644 index 00000000..cd4e6aca --- /dev/null +++ b/tools/lma/ansible-server/roles/monitoring/tasks/main.yml @@ -0,0 +1,273 @@ +# Copyright 2020 Aditya Srivastava. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +--- +#PAG setup in k8s cluster + +#*********************************************************************************************************** +#copy all yaml to /tmp/files/ +#*********************************************************************************************************** +- name: copy all yaml to /tmp/files/ + copy: + src: ../files/ + dest: /tmp/files/ + +#*********************************************************************************************************** +#Creating Namespace +#*********************************************************************************************************** +- name: Creating Monitoring Namespace + k8s: + state: present + src: /tmp/files/monitoring-namespace.yaml + namespace: monitoring + +#*********************************************************************************************************** +#creating Persistent Volume +#*********************************************************************************************************** +- name: creating Persistent Volume for Prometheus + k8s: + state: present + src: /tmp/files/prometheus/prometheus-pv.yaml + namespace: monitoring + +#*********************************************************************************************************** +#creating Persistent Volume +#*********************************************************************************************************** +- name: creating Persistent Volume for Grafana + k8s: + state: present + src: /tmp/files/grafana/grafana-pv.yaml + namespace: monitoring + +#*********************************************************************************************************** +#creating Persistent Volume Claim +#*********************************************************************************************************** +- name: creating Persistent Volume Claim for Prometheus + k8s: + state: present + src: /tmp/files/prometheus/prometheus-pvc.yaml + namespace: monitoring + 
+#***********************************************************************************************************
+#creating Persistent Volume Claim
+#***********************************************************************************************************
+- name: creating Persistent Volume Claim for Grafana
+  k8s:
+    state: present
+    src: /tmp/files/grafana/grafana-pvc.yaml
+    namespace: monitoring
+
+#***********************************************************************************************************
+#Making the cAdvisor daemonset
+#(the manifest file name is spelled "deamonset" in this role; the path below must match it)
+#***********************************************************************************************************
+- name: Creating cAdvisor daemonset
+  k8s:
+    state: present
+    src: /tmp/files/cadvisor/cadvisor-deamonset.yaml
+    namespace: monitoring
+
+#***********************************************************************************************************
+#Starting the cAdvisor service
+#***********************************************************************************************************
+- name: Starting cAdvisor service
+  k8s:
+    state: present
+    src: /tmp/files/cadvisor/cadvisor-service.yaml
+    namespace: monitoring
+
+#***********************************************************************************************************
+#Deploying and Starting the kube-state-metrics service
+#(applied to the kube-system namespace, unlike the other monitoring components)
+#***********************************************************************************************************
+- name: Deploying kube-state-metrics
+  k8s:
+    state: present
+    src: /tmp/files/kube-state-metrics/kube-state-metrics-deployment.yaml
+    namespace: kube-system
+
+- name: Starting kube-state-metrics service
+  k8s:
+    state: present
+    src: /tmp/files/kube-state-metrics/kube-state-metrics-service.yaml
+    namespace: kube-system
+
+#***********************************************************************************************************
+#Making the NodeExporter daemonset
+#***********************************************************************************************************
+- name: Creating NodeExporter daemonset
+  k8s:
+    state: present
+    src: /tmp/files/node-exporter/nodeexporter-daemonset.yaml
+    namespace: monitoring
+
+#***********************************************************************************************************
+#Starting the NodeExporter service
+#***********************************************************************************************************
+- name: Starting NodeExporter service
+  k8s:
+    state: present
+    src: /tmp/files/node-exporter/nodeexporter-service.yaml
+    namespace: monitoring
+
+#***********************************************************************************************************
+#Making the collectd-exporter deployment
+#***********************************************************************************************************
+# The manifest applied here is a Deployment, so the task name says so.
+- name: Creating collectd-exporter deployment
+  k8s:
+    state: present
+    src: /tmp/files/collectd-exporter/collectd-exporter-deployment.yaml
+    namespace: monitoring
+
+#***********************************************************************************************************
+#Making the collectd-exporter service
+#***********************************************************************************************************
+- name: Creating collectd-exporter service
+  k8s:
+    state: present
+    src: /tmp/files/collectd-exporter/collectd-exporter-service.yaml
+    namespace: monitoring
+
+#***********************************************************************************************************
+#Webhook goes here
+#***********************************************************************************************************
+
+#***********************************************************************************************************
+#Making the config file for Alertmanagers
+#*********************************************************************************************************** +- name: Creating config map for Alertmanagers + k8s: + state: present + src: /tmp/files/alertmanager/alertmanager-config.yaml + namespace: monitoring + +# - name: Creating config map for Alertmanagers +# k8s: +# state: present +# src: /tmp/files/alertmanager1-config.yaml +# namespace: monitoring + +#*********************************************************************************************************** +#Making the 1st alertmanager deployment +#*********************************************************************************************************** +- name: Creating 1st alertmanager deployment + k8s: + state: present + src: /tmp/files/alertmanager/alertmanager-deployment.yaml + namespace: monitoring + +#*********************************************************************************************************** +#Making the 1st alertmanager service +#*********************************************************************************************************** +- name: Creating 1st alertmanager service + k8s: + state: present + src: /tmp/files/alertmanager/alertmanager-service.yaml + namespace: monitoring + +#*********************************************************************************************************** +#Making the 2nd alertmanager deployment +#*********************************************************************************************************** +- name: Creating 2nd alertmanager deployment + k8s: + state: present + src: /tmp/files/alertmanager/alertmanager1-deployment.yaml + namespace: monitoring + +#*********************************************************************************************************** +#Making the 2nd alertmanager service +#*********************************************************************************************************** +- name: Creating 2nd alertmanager service + k8s: + state: present + src: 
/tmp/files/alertmanager/alertmanager1-service.yaml + namespace: monitoring + +#*********************************************************************************************************** +#Making the config file for Prometheus +#*********************************************************************************************************** +- name: Creating 1st Prometheus Config + k8s: + state: present + src: /tmp/files/prometheus/prometheus-config.yaml + namespace: monitoring + +# - name: Creating 2nd Prometheus Config +# k8s: +# state: present +# src: /tmp/files/prometheus1-config.yaml +# namespace: monitoring + +#*********************************************************************************************************** +#Starting Prometheus +#*********************************************************************************************************** +- name: Starting Prometheus 1 + k8s: + state: present + src: /tmp/files/prometheus/prometheus-deployment.yaml + namespace: monitoring + +- name: Starting Prometheus 2 + k8s: + state: present + src: /tmp/files/prometheus/prometheus1-deployment.yaml + namespace: monitoring + +#*********************************************************************************************************** +#Starting Prometheus Service +#*********************************************************************************************************** +- name: Starting Prometheus 1 Service + k8s: + state: present + src: /tmp/files/prometheus/prometheus-service.yaml + namespace: monitoring + +- name: Starting Prometheus 2 Service + k8s: + state: present + src: /tmp/files/prometheus/prometheus1-service.yaml + namespace: monitoring + +- name: Starting Main Prometheus Service + k8s: + state: present + src: /tmp/files/prometheus/main-prometheus-service.yaml + namespace: monitoring + +#*********************************************************************************************************** +#Starting Grafana 
+#*********************************************************************************************************** +- name: Creating Grafana Datasource Config + k8s: + state: present + src: /tmp/files/grafana/grafana-datasource-config.yaml + namespace: monitoring + +- name: Starting Grafana + k8s: + state: present + src: /tmp/files/grafana/grafana-deployment.yaml + namespace: monitoring + +- name: Starting Grafana Service + k8s: + state: present + src: /tmp/files/grafana/grafana-service.yaml + namespace: monitoring + +#*********************************************************************************************************** +#removing /tmp/files +#*********************************************************************************************************** +- name: Removing /tmp/files + file: + path: "/tmp/files" + state: absent diff --git a/tools/lma/metrics/dashboard/cpu_usage_using.json b/tools/lma/metrics/dashboard/cpu_usage_using.json new file mode 100644 index 00000000..85f7f122 --- /dev/null +++ b/tools/lma/metrics/dashboard/cpu_usage_using.json @@ -0,0 +1,750 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "prometheus", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "limit": 100, + "name": "Monitoring", + "showIn": 0, + "type": "dashboard" + } + ] + }, + "editable": true, + "gnetId": null, + "graphTooltip": 0, + "id": 4, + "iteration": 1596637894836, + "links": [], + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "grid": {}, + "gridPos": { + "h": 7, + "w": 24, + "x": 0, + "y": 0 + }, + "hiddenSeries": false, + "id": 3, + "legend": { + "alignAsTable": true, + "avg": true, + "current": false, + "hideZero": true, + "max": true, + "min": true, + "rightSide": true, + "show": true, + "total": 
false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "connected", + "percentage": false, + "pluginVersion": "7.1.1", + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "collectd_cpu_percent{exported_instance='$host'}", + "hide": false, + "interval": "", + "legendFormat": "", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "CPU Usage", + "tooltip": { + "msResolution": true, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 24, + "x": 0, + "y": 7 + }, + "hiddenSeries": false, + "id": 4, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "rightSide": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pluginVersion": "7.1.1", + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "collectd_cpu_percent{cpu='$core', exported_instance='$host'}", + "interval": "", + 
"legendFormat": "", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "CPU utilization per core", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 7, + "w": 24, + "x": 0, + "y": 14 + }, + "hiddenSeries": false, + "id": 5, + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "rightSide": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pluginVersion": "7.1.1", + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "collectd_cpu_percent{cpu='$core',exported_instance='$host'}", + "interval": "", + "legendFormat": "", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "CPU Usage per core", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + 
"min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + } + ], + "refresh": "10s", + "schemaVersion": 26, + "style": "dark", + "tags": [ + "monitoring" + ], + "templating": { + "list": [ + { + "current": { + "selected": true, + "text": "prometheus", + "value": "prometheus" + }, + "hide": 0, + "includeAll": false, + "label": null, + "multi": false, + "name": "datasource", + "options": [], + "query": "prometheus", + "queryValue": "", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "type": "datasource" + }, + { + "allValue": null, + "current": { + "selected": false, + "text": "pod12-node4", + "value": "pod12-node4" + }, + "hide": 0, + "includeAll": false, + "label": null, + "multi": false, + "name": "host", + "options": [ + { + "selected": true, + "text": "pod12-node4", + "value": "pod12-node4" + } + ], + "query": "pod12-node4,", + "queryValue": "", + "skipUrlSync": false, + "type": "custom" + }, + { + "allValue": null, + "current": { + "selected": true, + "text": "0", + "value": "0" + }, + "hide": 0, + "includeAll": true, + "label": null, + "multi": false, + "name": "core", + "options": [ + { + "selected": false, + "text": "All", + "value": "$__all" + }, + { + "selected": true, + "text": "0", + "value": "0" + }, + { + "selected": false, + "text": "1", + "value": "1" + }, + { + "selected": false, + "text": "2", + "value": "2" + }, + { + "selected": false, + "text": "3", + "value": "3" + }, + { + "selected": false, + "text": "4", + "value": "4" + }, + { + "selected": false, + "text": "5", + "value": "5" + }, + { + "selected": false, + "text": "6", + "value": "6" + }, + { + "selected": false, + "text": "7", + "value": "7" + }, + { + "selected": false, + "text": "8", + "value": "8" + }, + { + "selected": false, + "text": "9", + "value": "9" + }, + { + "selected": false, + "text": "10", + "value": "10" + }, + { + 
"selected": false, + "text": "11", + "value": "11" + }, + { + "selected": false, + "text": "12", + "value": "12" + }, + { + "selected": false, + "text": "13", + "value": "13" + }, + { + "selected": false, + "text": "14", + "value": "14" + }, + { + "selected": false, + "text": "15", + "value": "15" + }, + { + "selected": false, + "text": "16", + "value": "16" + }, + { + "selected": false, + "text": "17", + "value": "17" + }, + { + "selected": false, + "text": "18", + "value": "18" + }, + { + "selected": false, + "text": "19", + "value": "19" + }, + { + "selected": false, + "text": "20", + "value": "20" + }, + { + "selected": false, + "text": "21", + "value": "21" + }, + { + "selected": false, + "text": "22", + "value": "22" + }, + { + "selected": false, + "text": "23", + "value": "23" + }, + { + "selected": false, + "text": "24", + "value": "24" + }, + { + "selected": false, + "text": "25", + "value": "25" + }, + { + "selected": false, + "text": "26", + "value": "26" + }, + { + "selected": false, + "text": "27", + "value": "27" + }, + { + "selected": false, + "text": "28", + "value": "28" + }, + { + "selected": false, + "text": "29", + "value": "29" + }, + { + "selected": false, + "text": "30", + "value": "30" + }, + { + "selected": false, + "text": "31", + "value": "31" + }, + { + "selected": false, + "text": "32", + "value": "32" + }, + { + "selected": false, + "text": "33", + "value": "33" + }, + { + "selected": false, + "text": "34", + "value": "34" + }, + { + "selected": false, + "text": "35", + "value": "35" + }, + { + "selected": false, + "text": "36", + "value": "36" + }, + { + "selected": false, + "text": "37", + "value": "37" + }, + { + "selected": false, + "text": "38", + "value": "38" + }, + { + "selected": false, + "text": "39", + "value": "39" + }, + { + "selected": false, + "text": "40", + "value": "40" + }, + { + "selected": false, + "text": "41", + "value": "41" + }, + { + "selected": false, + "text": "42", + "value": "42" + }, + { + "selected": 
false, + "text": "43", + "value": "43" + }, + { + "selected": false, + "text": "44", + "value": "44" + }, + { + "selected": false, + "text": "45", + "value": "45" + }, + { + "selected": false, + "text": "46", + "value": "46" + }, + { + "selected": false, + "text": "47", + "value": "47" + }, + { + "selected": false, + "text": "48", + "value": "48" + }, + { + "selected": false, + "text": "49", + "value": "49" + }, + { + "selected": false, + "text": "50", + "value": "50" + }, + { + "selected": false, + "text": "51", + "value": "51" + }, + { + "selected": false, + "text": "52", + "value": "52" + }, + { + "selected": false, + "text": "53", + "value": "53" + }, + { + "selected": false, + "text": "54", + "value": "54" + }, + { + "selected": false, + "text": "55", + "value": "55" + }, + { + "selected": false, + "text": "56", + "value": "56" + }, + { + "selected": false, + "text": "57", + "value": "57" + }, + { + "selected": false, + "text": "58", + "value": "58" + }, + { + "selected": false, + "text": "59", + "value": "59" + }, + { + "selected": false, + "text": "60", + "value": "60" + }, + { + "selected": false, + "text": "61", + "value": "61" + }, + { + "selected": false, + "text": "62", + "value": "62" + }, + { + "selected": false, + "text": "63", + "value": "63" + }, + { + "selected": false, + "text": "64", + "value": "64" + } + ], + "query": "0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64", + "queryValue": "", + "skipUrlSync": false, + "type": "custom" + } + ] + }, + "time": { + "from": "now-5m", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "browser", + "title": "CPU Usage", + "uid": "XeDwSiSGk", + "version": 13 +} \ No 
newline at end of file diff --git a/tools/lma/metrics/dashboard/memory_using.json b/tools/lma/metrics/dashboard/memory_using.json new file mode 100644 index 00000000..3b92d8f5 --- /dev/null +++ b/tools/lma/metrics/dashboard/memory_using.json @@ -0,0 +1,337 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "prometheus", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "limit": 100, + "name": "Monitoring", + "showIn": 0, + "type": "dashboard" + } + ] + }, + "editable": true, + "gnetId": null, + "graphTooltip": 0, + "id": 6, + "iteration": 1597616052316, + "links": [], + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "description": "", + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 15, + "w": 24, + "x": 0, + "y": 0 + }, + "hiddenSeries": false, + "id": 1, + "interval": "1s", + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "rightSide": true, + "show": false, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pluginVersion": "7.1.3", + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 12, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(collectd_memory{exported_instance='$host', memory='$type'}[$range])", + "interval": "", + "legendFormat": "", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Bytes", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + 
"logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + } + ], + "refresh": "30s", + "schemaVersion": 26, + "style": "dark", + "tags": [ + "monitoring" + ], + "templating": { + "list": [ + { + "current": { + "selected": false, + "text": "prometheus", + "value": "prometheus" + }, + "hide": 0, + "includeAll": false, + "label": null, + "multi": false, + "name": "datasource", + "options": [], + "query": "prometheus", + "queryValue": "", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "type": "datasource" + }, + { + "allValue": null, + "current": { + "selected": false, + "text": "pod12-node4", + "value": "pod12-node4" + }, + "hide": 0, + "includeAll": false, + "label": null, + "multi": false, + "name": "host", + "options": [ + { + "selected": true, + "text": "pod12-node4", + "value": "pod12-node4" + } + ], + "query": "pod12-node4,", + "queryValue": "", + "skipUrlSync": false, + "type": "custom" + }, + { + "auto": false, + "auto_count": 30, + "auto_min": "10s", + "current": { + "selected": false, + "text": "30s", + "value": "30s" + }, + "hide": 0, + "label": null, + "name": "range", + "options": [ + { + "selected": true, + "text": "30s", + "value": "30s" + }, + { + "selected": false, + "text": "1m", + "value": "1m" + }, + { + "selected": false, + "text": "5m", + "value": "5m" + }, + { + "selected": false, + "text": "10m", + "value": "10m" + }, + { + "selected": false, + "text": "30m", + "value": "30m" + }, + { + "selected": false, + "text": "1h", + "value": "1h" + }, + { + "selected": false, + "text": "6h", + "value": "6h" + }, + { + "selected": false, + "text": "12h", + "value": "12h" + }, + { + "selected": false, + "text": "1d", + "value": "1d" + }, + { + "selected": false, + "text": "7d", + "value": "7d" + }, + { + "selected": false, + "text": "14d", + "value": "14d" + }, + { + 
"selected": false, + "text": "30d", + "value": "30d" + } + ], + "query": "30s,1m,5m,10m,30m,1h,6h,12h,1d,7d,14d,30d", + "queryValue": "", + "refresh": 2, + "skipUrlSync": false, + "type": "interval" + }, + { + "allValue": null, + "current": { + "selected": true, + "text": "used", + "value": "used" + }, + "hide": 0, + "includeAll": false, + "label": null, + "multi": false, + "name": "type", + "options": [ + { + "selected": false, + "text": "buffered", + "value": "buffered" + }, + { + "selected": false, + "text": "cached", + "value": "cached" + }, + { + "selected": false, + "text": "free", + "value": "free" + }, + { + "selected": false, + "text": "slab_recl", + "value": "slab_recl" + }, + { + "selected": false, + "text": "slab_unrecl", + "value": "slab_unrecl" + }, + { + "selected": true, + "text": "used", + "value": "used" + } + ], + "query": "buffered,cached,free,slab_recl,slab_unrecl,used", + "queryValue": "", + "skipUrlSync": false, + "type": "custom" + } + ] + }, + "time": { + "from": "now-5m", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "browser", + "title": "Memory", + "uid": "kuro-mem", + "version": 4 +} \ No newline at end of file diff --git a/tools/lma/metrics/dashboard/ovs_stats_using.json b/tools/lma/metrics/dashboard/ovs_stats_using.json new file mode 100644 index 00000000..1e679fbe --- /dev/null +++ b/tools/lma/metrics/dashboard/ovs_stats_using.json @@ -0,0 +1,854 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "prometheus", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "limit": 100, + "name": "Monitoring", + "showIn": 0, + "type": "dashboard" + } + ] + }, + "editable": true, + "gnetId": null, + "graphTooltip": 0, + "id": 6, + "iteration": 1596643135141, + "links": [], + "panels": [ + { + 
"aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 6, + "w": 24, + "x": 0, + "y": 0 + }, + "hiddenSeries": false, + "id": 1, + "interval": "1s", + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "rightSide": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pluginVersion": "7.1.1", + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 12, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(collectd_ovs_stats_if_rx_octets_total{exported_instance='$host'}[$__interval])", + "interval": "", + "legendFormat": "", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Average RX values", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 6, + "w": 24, + "x": 0, + "y": 6 + }, + "hiddenSeries": false, + "id": 2, + "interval": "1s", + "legend": { + "alignAsTable": true, + "avg": true, + "current": 
true, + "max": true, + "min": true, + "rightSide": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pluginVersion": "7.1.1", + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 12, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(collectd_ovs_stats_if_tx_octets_total{exported_instance='$host'}[$__interval])", + "interval": "", + "legendFormat": "", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Average TX values", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 5, + "w": 24, + "x": 0, + "y": 12 + }, + "hiddenSeries": false, + "id": 3, + "interval": "1s", + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "rightSide": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pluginVersion": "7.1.1", + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 12, + "stack": false, + "steppedLine": 
false, + "targets": [ + { + "expr": "rate(collectd_ovs_stats_if_collisions_total{exported_instance='$host'}[$range])", + "interval": "", + "legendFormat": "", + "refId": "A" + }, + { + "expr": "rate(collectd_ovs_stats_if_dropped_0_total{exported_instance='$host'}[$range])", + "interval": "", + "legendFormat": "", + "refId": "B" + }, + { + "expr": "rate(collectd_ovs_stats_if_dropped_1_total{exported_instance='$host'}[$range])", + "interval": "", + "legendFormat": "", + "refId": "C" + }, + { + "expr": "rate(collectd_ovs_stats_if_errors_0_total{exported_instance='$host'}[$range])", + "interval": "", + "legendFormat": "", + "refId": "D" + }, + { + "expr": "rate(collectd_ovs_stats_if_errors_1_total{exported_instance='$host'}[$range])", + "interval": "", + "legendFormat": "", + "refId": "E" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Average Collisions, Drops and Error values", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + } + ], + "refresh": "30s", + "schemaVersion": 26, + "style": "dark", + "tags": [ + "monitoring" + ], + "templating": { + "list": [ + { + "current": { + "selected": false, + "text": "prometheus", + "value": "prometheus" + }, + "hide": 0, + "includeAll": false, + "label": null, + "multi": false, + "name": "datasource", + "options": [], + "query": "prometheus", + "queryValue": "", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "type": "datasource" + }, + { + "allValue": null, + "current": { + "selected": false, + "text": "pod12-node4", + "value": 
"pod12-node4" + }, + "hide": 0, + "includeAll": false, + "label": null, + "multi": false, + "name": "host", + "options": [ + { + "selected": true, + "text": "pod12-node4", + "value": "pod12-node4" + } + ], + "query": "pod12-node4,", + "queryValue": "", + "skipUrlSync": false, + "type": "custom" + }, + { + "allValue": null, + "current": { + "selected": true, + "text": "0", + "value": "0" + }, + "hide": 0, + "includeAll": true, + "label": null, + "multi": false, + "name": "core", + "options": [ + { + "selected": false, + "text": "All", + "value": "$__all" + }, + { + "selected": true, + "text": "0", + "value": "0" + }, + { + "selected": false, + "text": "1", + "value": "1" + }, + { + "selected": false, + "text": "2", + "value": "2" + }, + { + "selected": false, + "text": "3", + "value": "3" + }, + { + "selected": false, + "text": "4", + "value": "4" + }, + { + "selected": false, + "text": "5", + "value": "5" + }, + { + "selected": false, + "text": "6", + "value": "6" + }, + { + "selected": false, + "text": "7", + "value": "7" + }, + { + "selected": false, + "text": "8", + "value": "8" + }, + { + "selected": false, + "text": "9", + "value": "9" + }, + { + "selected": false, + "text": "10", + "value": "10" + }, + { + "selected": false, + "text": "11", + "value": "11" + }, + { + "selected": false, + "text": "12", + "value": "12" + }, + { + "selected": false, + "text": "13", + "value": "13" + }, + { + "selected": false, + "text": "14", + "value": "14" + }, + { + "selected": false, + "text": "15", + "value": "15" + }, + { + "selected": false, + "text": "16", + "value": "16" + }, + { + "selected": false, + "text": "17", + "value": "17" + }, + { + "selected": false, + "text": "18", + "value": "18" + }, + { + "selected": false, + "text": "19", + "value": "19" + }, + { + "selected": false, + "text": "20", + "value": "20" + }, + { + "selected": false, + "text": "21", + "value": "21" + }, + { + "selected": false, + "text": "22", + "value": "22" + }, + { + "selected": false, + 
"text": "23", + "value": "23" + }, + { + "selected": false, + "text": "24", + "value": "24" + }, + { + "selected": false, + "text": "25", + "value": "25" + }, + { + "selected": false, + "text": "26", + "value": "26" + }, + { + "selected": false, + "text": "27", + "value": "27" + }, + { + "selected": false, + "text": "28", + "value": "28" + }, + { + "selected": false, + "text": "29", + "value": "29" + }, + { + "selected": false, + "text": "30", + "value": "30" + }, + { + "selected": false, + "text": "31", + "value": "31" + }, + { + "selected": false, + "text": "32", + "value": "32" + }, + { + "selected": false, + "text": "33", + "value": "33" + }, + { + "selected": false, + "text": "34", + "value": "34" + }, + { + "selected": false, + "text": "35", + "value": "35" + }, + { + "selected": false, + "text": "36", + "value": "36" + }, + { + "selected": false, + "text": "37", + "value": "37" + }, + { + "selected": false, + "text": "38", + "value": "38" + }, + { + "selected": false, + "text": "39", + "value": "39" + }, + { + "selected": false, + "text": "40", + "value": "40" + }, + { + "selected": false, + "text": "41", + "value": "41" + }, + { + "selected": false, + "text": "42", + "value": "42" + }, + { + "selected": false, + "text": "43", + "value": "43" + }, + { + "selected": false, + "text": "44", + "value": "44" + }, + { + "selected": false, + "text": "45", + "value": "45" + }, + { + "selected": false, + "text": "46", + "value": "46" + }, + { + "selected": false, + "text": "47", + "value": "47" + }, + { + "selected": false, + "text": "48", + "value": "48" + }, + { + "selected": false, + "text": "49", + "value": "49" + }, + { + "selected": false, + "text": "50", + "value": "50" + }, + { + "selected": false, + "text": "51", + "value": "51" + }, + { + "selected": false, + "text": "52", + "value": "52" + }, + { + "selected": false, + "text": "53", + "value": "53" + }, + { + "selected": false, + "text": "54", + "value": "54" + }, + { + "selected": false, + "text": "55", + 
"value": "55" + }, + { + "selected": false, + "text": "56", + "value": "56" + }, + { + "selected": false, + "text": "57", + "value": "57" + }, + { + "selected": false, + "text": "58", + "value": "58" + }, + { + "selected": false, + "text": "59", + "value": "59" + }, + { + "selected": false, + "text": "60", + "value": "60" + }, + { + "selected": false, + "text": "61", + "value": "61" + }, + { + "selected": false, + "text": "62", + "value": "62" + }, + { + "selected": false, + "text": "63", + "value": "63" + }, + { + "selected": false, + "text": "64", + "value": "64" + } + ], + "query": "0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64", + "queryValue": "", + "skipUrlSync": false, + "type": "custom" + }, + { + "auto": false, + "auto_count": 30, + "auto_min": "10s", + "current": { + "selected": false, + "text": "30s", + "value": "30s" + }, + "hide": 0, + "label": null, + "name": "range", + "options": [ + { + "selected": true, + "text": "30s", + "value": "30s" + }, + { + "selected": false, + "text": "1m", + "value": "1m" + }, + { + "selected": false, + "text": "5m", + "value": "5m" + }, + { + "selected": false, + "text": "10m", + "value": "10m" + }, + { + "selected": false, + "text": "30m", + "value": "30m" + }, + { + "selected": false, + "text": "1h", + "value": "1h" + }, + { + "selected": false, + "text": "6h", + "value": "6h" + }, + { + "selected": false, + "text": "12h", + "value": "12h" + }, + { + "selected": false, + "text": "1d", + "value": "1d" + }, + { + "selected": false, + "text": "7d", + "value": "7d" + }, + { + "selected": false, + "text": "14d", + "value": "14d" + }, + { + "selected": false, + "text": "30d", + "value": "30d" + } + ], + "query": "30s,1m,5m,10m,30m,1h,6h,12h,1d,7d,14d,30d", + "queryValue": "", + "refresh": 2, + "skipUrlSync": false, + "type": "interval" + } + ] + }, + "time": { + "from": "now-5m", + "to": 
"now" + }, + "timepicker": { + "refresh_intervals": [ + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "browser", + "title": "OVS Stats", + "uid": "K1N5ciIGz", + "version": 7 + } \ No newline at end of file diff --git a/tools/lma/metrics/dashboard/rdt_using.json b/tools/lma/metrics/dashboard/rdt_using.json new file mode 100644 index 00000000..a0ce7987 --- /dev/null +++ b/tools/lma/metrics/dashboard/rdt_using.json @@ -0,0 +1,833 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "prometheus", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "limit": 100, + "name": "Monitoring", + "showIn": 0, + "type": "dashboard" + } + ] + }, + "editable": true, + "gnetId": null, + "graphTooltip": 0, + "id": 7, + "iteration": 1597615840124, + "links": [], + "panels": [ + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 6, + "w": 24, + "x": 0, + "y": 0 + }, + "hiddenSeries": false, + "id": 1, + "interval": "1s", + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "rightSide": true, + "show": false, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pluginVersion": "7.1.3", + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 12, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(collectd_intel_rdt_bytes{exported_instance='$host', intel_rdt='$intel_rdt'}[$range])", + "interval": "", + "legendFormat": "", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + 
"timeRegions": [], + "timeShift": null, + "title": "RDT Bytes", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 6, + "w": 24, + "x": 0, + "y": 6 + }, + "hiddenSeries": false, + "id": 2, + "interval": "1s", + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "rightSide": true, + "show": false, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pluginVersion": "7.1.3", + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 12, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(collectd_intel_rdt_ipc{exported_instance='$host', intel_rdt='$intel_rdt'}[$range])", + "interval": "", + "legendFormat": "", + "refId": "A" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "IPC values", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": 
"short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": "$datasource", + "fieldConfig": { + "defaults": { + "custom": {} + }, + "overrides": [] + }, + "fill": 1, + "fillGradient": 0, + "gridPos": { + "h": 5, + "w": 24, + "x": 0, + "y": 12 + }, + "hiddenSeries": false, + "id": 3, + "interval": "1s", + "legend": { + "alignAsTable": true, + "avg": true, + "current": true, + "max": true, + "min": true, + "rightSide": true, + "show": false, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [], + "nullPointMode": "null", + "percentage": false, + "pluginVersion": "7.1.3", + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "span": 12, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "rate(collectd_intel_rdt_memory_bandwidth_total{exported_instance='$host', type='local'}[$range])", + "hide": false, + "interval": "", + "legendFormat": "", + "refId": "A" + }, + { + "expr": "rate(collectd_intel_rdt_memory_bandwidth_total{exported_instance='$host', type='remote'}[$range])", + "hide": false, + "interval": "", + "legendFormat": "", + "refId": "B" + } + ], + "thresholds": [], + "timeFrom": null, + "timeRegions": [], + "timeShift": null, + "title": "Memory Bandwidth Total", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + } + ], + 
"refresh": "30s", + "schemaVersion": 26, + "style": "dark", + "tags": [ + "monitoring" + ], + "templating": { + "list": [ + { + "current": { + "selected": false, + "text": "prometheus", + "value": "prometheus" + }, + "hide": 0, + "includeAll": false, + "label": null, + "multi": false, + "name": "datasource", + "options": [], + "query": "prometheus", + "queryValue": "", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "type": "datasource" + }, + { + "allValue": null, + "current": { + "selected": false, + "text": "pod12-node4", + "value": "pod12-node4" + }, + "hide": 0, + "includeAll": false, + "label": null, + "multi": false, + "name": "host", + "options": [ + { + "selected": true, + "text": "pod12-node4", + "value": "pod12-node4" + } + ], + "query": "pod12-node4,", + "queryValue": "", + "skipUrlSync": false, + "type": "custom" + }, + { + "auto": false, + "auto_count": 30, + "auto_min": "10s", + "current": { + "selected": false, + "text": "30s", + "value": "30s" + }, + "hide": 0, + "label": null, + "name": "range", + "options": [ + { + "selected": true, + "text": "30s", + "value": "30s" + }, + { + "selected": false, + "text": "1m", + "value": "1m" + }, + { + "selected": false, + "text": "5m", + "value": "5m" + }, + { + "selected": false, + "text": "10m", + "value": "10m" + }, + { + "selected": false, + "text": "30m", + "value": "30m" + }, + { + "selected": false, + "text": "1h", + "value": "1h" + }, + { + "selected": false, + "text": "6h", + "value": "6h" + }, + { + "selected": false, + "text": "12h", + "value": "12h" + }, + { + "selected": false, + "text": "1d", + "value": "1d" + }, + { + "selected": false, + "text": "7d", + "value": "7d" + }, + { + "selected": false, + "text": "14d", + "value": "14d" + }, + { + "selected": false, + "text": "30d", + "value": "30d" + } + ], + "query": "30s,1m,5m,10m,30m,1h,6h,12h,1d,7d,14d,30d", + "queryValue": "", + "refresh": 2, + "skipUrlSync": false, + "type": "interval" + }, + { + "allValue": null, + "current": { + 
"selected": true, + "text": "2", + "value": "2" + }, + "hide": 0, + "includeAll": false, + "label": null, + "multi": false, + "name": "intel_rdt", + "options": [ + { + "selected": false, + "text": "0", + "value": "0" + }, + { + "selected": false, + "text": "1", + "value": "1" + }, + { + "selected": true, + "text": "2", + "value": "2" + }, + { + "selected": false, + "text": "3", + "value": "3" + }, + { + "selected": false, + "text": "4", + "value": "4" + }, + { + "selected": false, + "text": "5", + "value": "5" + }, + { + "selected": false, + "text": "6", + "value": "6" + }, + { + "selected": false, + "text": "7", + "value": "7" + }, + { + "selected": false, + "text": "8", + "value": "8" + }, + { + "selected": false, + "text": "9", + "value": "9" + }, + { + "selected": false, + "text": "10", + "value": "10" + }, + { + "selected": false, + "text": "11", + "value": "11" + }, + { + "selected": false, + "text": "12", + "value": "12" + }, + { + "selected": false, + "text": "13", + "value": "13" + }, + { + "selected": false, + "text": "14", + "value": "14" + }, + { + "selected": false, + "text": "15", + "value": "15" + }, + { + "selected": false, + "text": "16", + "value": "16" + }, + { + "selected": false, + "text": "17", + "value": "17" + }, + { + "selected": false, + "text": "18", + "value": "18" + }, + { + "selected": false, + "text": "19", + "value": "19" + }, + { + "selected": false, + "text": "20", + "value": "20" + }, + { + "selected": false, + "text": "21", + "value": "21" + }, + { + "selected": false, + "text": "22", + "value": "22" + }, + { + "selected": false, + "text": "23", + "value": "23" + }, + { + "selected": false, + "text": "24", + "value": "24" + }, + { + "selected": false, + "text": "25", + "value": "25" + }, + { + "selected": false, + "text": "26", + "value": "26" + }, + { + "selected": false, + "text": "27", + "value": "27" + }, + { + "selected": false, + "text": "28", + "value": "28" + }, + { + "selected": false, + "text": "29", + "value": "29" + 
}, + { + "selected": false, + "text": "30", + "value": "30" + }, + { + "selected": false, + "text": "31", + "value": "31" + }, + { + "selected": false, + "text": "32", + "value": "32" + }, + { + "selected": false, + "text": "33", + "value": "33" + }, + { + "selected": false, + "text": "34", + "value": "34" + }, + { + "selected": false, + "text": "35", + "value": "35" + }, + { + "selected": false, + "text": "36", + "value": "36" + }, + { + "selected": false, + "text": "37", + "value": "37" + }, + { + "selected": false, + "text": "38", + "value": "38" + }, + { + "selected": false, + "text": "39", + "value": "39" + }, + { + "selected": false, + "text": "40", + "value": "40" + }, + { + "selected": false, + "text": "41", + "value": "41" + }, + { + "selected": false, + "text": "42", + "value": "42" + }, + { + "selected": false, + "text": "43", + "value": "43" + }, + { + "selected": false, + "text": "44", + "value": "44" + }, + { + "selected": false, + "text": "45", + "value": "45" + }, + { + "selected": false, + "text": "46", + "value": "46" + }, + { + "selected": false, + "text": "47", + "value": "47" + }, + { + "selected": false, + "text": "48", + "value": "48" + }, + { + "selected": false, + "text": "49", + "value": "49" + }, + { + "selected": false, + "text": "50", + "value": "50" + }, + { + "selected": false, + "text": "51", + "value": "51" + }, + { + "selected": false, + "text": "52", + "value": "52" + }, + { + "selected": false, + "text": "53", + "value": "53" + }, + { + "selected": false, + "text": "54", + "value": "54" + }, + { + "selected": false, + "text": "55", + "value": "55" + }, + { + "selected": false, + "text": "56", + "value": "56" + }, + { + "selected": false, + "text": "57", + "value": "57" + }, + { + "selected": false, + "text": "58", + "value": "58" + }, + { + "selected": false, + "text": "59", + "value": "59" + }, + { + "selected": false, + "text": "60", + "value": "60" + }, + { + "selected": false, + "text": "61", + "value": "61" + }, + { + 
"selected": false, + "text": "62", + "value": "62" + }, + { + "selected": false, + "text": "63", + "value": "63" + }, + { + "selected": false, + "text": "64", + "value": "64" + } + ], + "query": "0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64", + "queryValue": "", + "skipUrlSync": false, + "type": "custom" + } + ] + }, + "time": { + "from": "now-5m", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "browser", + "title": "RDT (L3 Cache)", + "uid": "kuro-rdt", + "version": 9 +} \ No newline at end of file -- cgit 1.2.3-korg From 8f3d8b3d1072ca33cf3503e95f8fd3bc629ace18 Mon Sep 17 00:00:00 2001 From: Aditya Srivastava Date: Mon, 24 Aug 2020 02:06:54 +0530 Subject: Docs: Add monitoring cluster related documentation This patch adds documentation related to deployment, configuration and usage of K8s monitoring cluster. Also adds the devguide explaining mapping of each yaml file with its associated task. 
Signed-off-by: Aditya Srivastava Change-Id: Ib6252f7c853a643eb5cb9f562a55ee366f9c71ea --- docs/lma/metrics/devguide.rst | 474 +++++++++++++++++++++++++++++++++++ docs/lma/metrics/images/dataflow.png | Bin 0 -> 42443 bytes docs/lma/metrics/images/setup.png | Bin 0 -> 15019 bytes docs/lma/metrics/userguide.rst | 230 +++++++++++++++++ 4 files changed, 704 insertions(+) create mode 100644 docs/lma/metrics/devguide.rst create mode 100644 docs/lma/metrics/images/dataflow.png create mode 100644 docs/lma/metrics/images/setup.png create mode 100644 docs/lma/metrics/userguide.rst diff --git a/docs/lma/metrics/devguide.rst b/docs/lma/metrics/devguide.rst new file mode 100644 index 00000000..93d33016 --- /dev/null +++ b/docs/lma/metrics/devguide.rst @@ -0,0 +1,474 @@ +==================== +Metrics Dev Guide +==================== +Table of Contents +================= +.. contents:: +.. section-numbering:: + + +Ansible File Organization +============================ + +Ansible-Server +---------------- + +Please follow the following file structure: + +.. 
code-block:: bash + + ansible-server + | ansible.cfg + | hosts + | + +---group_vars + | all.yml + | + +---playbooks + | clean.yaml + | setup.yaml + | + \---roles + +---clean-monitoring + | \---tasks + | main.yml + | + +---monitoring + +---files + | | monitoring-namespace.yaml + | | + | +---alertmanager + | | alertmanager-config.yaml + | | alertmanager-deployment.yaml + | | alertmanager-service.yaml + | | alertmanager1-deployment.yaml + | | alertmanager1-service.yaml + | | + | +---cadvisor + | | cadvisor-daemonset.yaml + | | cadvisor-service.yaml + | | + | +---collectd-exporter + | | collectd-exporter-deployment.yaml + | | collectd-exporter-service.yaml + | | + | +---grafana + | | grafana-datasource-config.yaml + | | grafana-deployment.yaml + | | grafana-pv.yaml + | | grafana-pvc.yaml + | | grafana-service.yaml + | | + | +---kube-state-metrics + | | kube-state-metrics-deployment.yaml + | | kube-state-metrics-service.yaml + | | + | +---node-exporter + | | nodeexporter-daemonset.yaml + | | nodeexporter-service.yaml + | | + | \---prometheus + | main-prometheus-service.yaml + | prometheus-config.yaml + | prometheus-deployment.yaml + | prometheus-pv.yaml + | prometheus-pvc.yaml + | prometheus-service.yaml + | prometheus1-deployment.yaml + | prometheus1-service.yaml + | + \---tasks + main.yml + + +Ansible - Client +------------------ + +Please follow the following file structure: + +.. 
code-block:: bash + + ansible-client + | ansible.cfg + | hosts + | + +---group_vars + | all.yml + | + +---playbooks + | clean.yaml + | setup.yaml + | + \---roles + +---clean-collectd + | \---tasks + | main.yml + | + +---collectd + +---files + | collectd.conf.j2 + | + \---tasks + main.yml + + +Summary of Roles +================== + +A brief description of the Ansible playbook roles, +which are used to deploy the monitoring cluster + +Ansible Server Roles +---------------------- + +This part consists of the roles used to deploy the +Prometheus, Alertmanager and Grafana (PAG) stack on the server side + +Role: Monitoring +~~~~~~~~~~~~~~~~~~ + +Deployment and configuration of PAG stack along with collectd-exporter, +cadvisor and node-exporter. + +Role: Clean-Monitoring +~~~~~~~~~~~~~~~~~~~~~~~~ + +Removes all the components deployed by the Monitoring role. + + +File-Task Mapping and Configurable Parameters +================================================ + +Ansible Server +---------------- + +Role: Monitoring +~~~~~~~~~~~~~~~~~~~ + +Alert Manager +^^^^^^^^^^^^^^^ + +File: alertmanager-config.yaml +''''''''''''''''''''''''''''''''' +Path : monitoring/files/alertmanager/alertmanager-config.yaml + +Task: Configures Receivers for alertmanager + +Summary: A configmap, currently configures webhook for alertmanager, +can be used to configure any kind of receiver + +Configurable Parameters: + receiver.url: change to the webhook receiver's URL + route: Can be used to add receivers + + +File: alertmanager-deployment.yaml +''''''''''''''''''''''''''''''''' +Path : monitoring/files/alertmanager/alertmanager-deployment.yaml + +Task: Deploys alertmanager instance + +Summary: A Deployment, deploys 1 replica of alertmanager + + +File: alertmanager-service.yaml +''''''''''''''''''''''''''''''''' +Path : monitoring/files/alertmanager/alertmanager-service.yaml + +Task: Creates a K8s service for alertmanager + +Summary: A Nodeport type of service, so that user can create "silences", 
+view the status of alerts from the native alertmanager dashboard / UI. + +Configurable Parameters: + spec.type: Options : NodePort, ClusterIP, LoadBalancer + spec.ports: Edit / add ports to be handled by the service + +**Note: alertmanager1-deployment, alertmanager1-service are the same as +alertmanager-deployment and alertmanager-service respectively.** + +CAdvisor +^^^^^^^^^^^ + +File: cadvisor-daemonset.yaml +''''''''''''''''''''''''''''''''' +Path : monitoring/files/cadvisor/cadvisor-daemonset.yaml + +Task: To create a cadvisor daemonset + +Summary: A daemonset, used to scrape data of the kubernetes cluster itself, +it's a daemonset so an instance is run on every node. + +Configurable Parameters: + spec.template.spec.ports: Port of the container + + +File: cadvisor-service.yaml +''''''''''''''''''''''''''''''''' +Path : monitoring/files/cadvisor/cadvisor-service.yaml + +Task: To create a cadvisor service + +Summary: A ClusterIP service for cadvisor to communicate with prometheus + +Configurable Parameters: + spec.ports: Add / Edit ports + + +Collectd Exporter +^^^^^^^^^^^^^^^^^^^^ + +File: collectd-exporter-deployment.yaml +'''''''''''''''''''''''''''''''''''''''''' +Path : monitoring/files/collectd-exporter/collectd-exporter-deployment.yaml + +Task: To create a collectd-exporter replica + +Summary: A deployment, acts as receiver for collectd data sent by client machines, +prometheus pulls data from this exporter + +Configurable Parameters: + spec.template.spec.ports: Port of the container + + +File: collectd-exporter-service.yaml +'''''''''''''''''''''''''''''''''''''''''' +Path : monitoring/files/collectd-exporter/collectd-exporter-service.yaml + +Task: To create a collectd-exporter service + +Summary: A NodePort service for collectd-exporter to hold data for prometheus +to scrape + +Configurable Parameters: + spec.ports: Add / Edit ports + + +Grafana +^^^^^^^^^ + +File: grafana-datasource-config.yaml +'''''''''''''''''''''''''''''''''''''''''' +Path : 
monitoring/files/grafana/grafana-datasource-config.yaml + +Task: To create config file for grafana + +Summary: A configmap, adds prometheus datasource in grafana + + +File: grafana-deployment.yaml +''''''''''''''''''''''''''''''''' +Path : monitoring/files/grafana/grafana-deployment.yaml + +Task: To create a grafana deployment + +Summary: The grafana deployment creates a single replica of grafana, +with preconfigured prometheus datasource. + +Configurable Parameters: + spec.template.spec.ports: Edit ports + spec.template.spec.env: Add / Edit environment variables + + +File: grafana-pv.yaml +''''''''''''''''''''''''''''''''' +Path : monitoring/files/grafana/grafana-pv.yaml + +Task: To create a persistent volume for grafana + +Summary: A persistent volume for grafana. + +Configurable Parameters: + spec.capacity.storage: Increase / decrease size + spec.accessModes: To change the way PV is accessed. + spec.nfs.server: To change the ip address of NFS server + spec.nfs.path: To change the path of the server + + +File: grafana-pvc.yaml +''''''''''''''''''''''''''''''''' +Path : monitoring/files/grafana/grafana-pvc.yaml + +Task: To create a persistent volume claim for grafana + +Summary: A persistent volume claim for grafana. + +Configurable Parameters: + spec.resources.requests.storage: Increase / decrease size + + +File: grafana-service.yaml +''''''''''''''''''''''''''''''''' +Path : monitoring/files/grafana/grafana-service.yaml + +Task: To create a service for grafana + +Summary: A Nodeport type of service, so that users actually connect to, +view the dashboard / UI. 
+ +Configurable Parameters: + spec.type: Options : NodePort, ClusterIP, LoadBalancer + spec.ports: Edit / add ports to be handled by the service + + +Kube State Metrics +^^^^^^^^^^^^^^^^^^^^ + +File: kube-state-metrics-deployment.yaml +'''''''''''''''''''''''''''''''''''''''''' +Path : monitoring/files/kube-state-metrics/kube-state-metrics-deployment.yaml + +Task: To create a kube-state-metrics instance + +Summary: A deployment, used to collect metrics of the kubernetes cluster itself + +Configurable Parameters: + spec.template.spec.containers.ports: Port of the container + + +File: kube-state-metrics-service.yaml +''''''''''''''''''''''''''''''''' +Path : monitoring/files/kube-state-metrics/kube-state-metrics-service.yaml + +Task: To create a kube-state-metrics service + +Summary: A service for kube-state-metrics to expose its data for prometheus +to scrape + +Configurable Parameters: + spec.ports: Add / Edit ports + + +Node Exporter +^^^^^^^^^^^^^^^ + +File: nodeexporter-daemonset.yaml +''''''''''''''''''''''''''''''''' +Path : monitoring/files/node-exporter/nodeexporter-daemonset.yaml + +Task: To create a node exporter daemonset + +Summary: A daemonset, used to scrape data of the host machines / node, +it's a daemonset so an instance is run on every node. + +Configurable Parameters: + spec.template.spec.ports: Port of the container + + +File: nodeexporter-service.yaml +''''''''''''''''''''''''''''''''' +Path : monitoring/files/node-exporter/nodeexporter-service.yaml + +Task: To create a node exporter service + +Summary: A ClusterIP service for node exporter to communicate with Prometheus + +Configurable Parameters: + spec.ports: Add / Edit ports + + +Prometheus +^^^^^^^^^^^^^ + +File: prometheus-config.yaml +'''''''''''''''''''''''''''''''''''''''''' +Path : monitoring/files/prometheus/prometheus-config.yaml + +Task: To create a config file for Prometheus + +Summary: A configmap, adds alert rules. 
+ +Configurable Parameters: + data.alert.rules: Add / Edit alert rules + + +File: prometheus-deployment.yaml +''''''''''''''''''''''''''''''''' +Path : monitoring/files/prometheus/prometheus-deployment.yaml + +Task: To create a Prometheus deployment + +Summary: The Prometheus deployment creates a single replica of Prometheus, +with preconfigured Prometheus datasource. + +Configurable Parameters: + spec.template.spec.affinity: To change the node affinity, + make sure only 1 instance of prometheus is + running on 1 node. + + spec.template.spec.ports: Add / Edit container port + + +File: prometheus-pv.yaml +''''''''''''''''''''''''''''''''' +Path : monitoring/files/prometheus/prometheus-pv.yaml + +Task: To create a persistent volume for Prometheus + +Summary: A persistent volume for Prometheus. + +Configurable Parameters: + spec.capacity.storage: Increase / decrease size + spec.accessModes: To change the way PV is accessed. + spec.hostpath.path: To change the path of the volume + + +File: prometheus-pvc.yaml +''''''''''''''''''''''''''''''''' +Path : monitoring/files/prometheus/prometheus-pvc.yaml + +Task: To create a persistent volume claim for Prometheus + +Summary: A persistent volume claim for Prometheus. + +Configurable Parameters: + spec.resources.requests.storage: Increase / decrease size + + +File: prometheus-service.yaml +''''''''''''''''''''''''''''''''' +Path : monitoring/files/prometheus/prometheus-service.yaml + +Task: To create a service for prometheus + +Summary: A Nodeport type of service, prometheus native dashboard +available here. + +Configurable Parameters: + spec.type: Options : NodePort, ClusterIP, LoadBalancer + spec.ports: Edit / add ports to be handled by the service + + +File: main-prometheus-service.yaml +''''''''''''''''''''''''''''''''''' +Path: monitoring/files/prometheus/main-prometheus-service.yaml + +Task: A service that connects both prometheus instances. 
+ +Summary: A Nodeport service for other services to connect to the Prometheus cluster. +HA Prometheus needs two independent instances of Prometheus scraping the same inputs +and having the same configuration. + +**Note: prometheus1-deployment, prometheus1-service are the same as +prometheus-deployment and prometheus-service respectively.** + + +Ansible Client Roles +---------------------- + +Role: Collectd +~~~~~~~~~~~~~~~~~~ + +File: main.yml +^^^^^^^^^^^^^^^^ +Path: collectd/tasks/main.yml + +Task: Install collectd along with prerequisites + +Associated template file: + +- collectd.conf.j2 +Path: collectd/files/collectd.conf.j2 + +Summary: Edit this file to change the default configuration to +be installed on the client's machine diff --git a/docs/lma/metrics/images/dataflow.png b/docs/lma/metrics/images/dataflow.png new file mode 100644 index 00000000..ca1ec908 Binary files /dev/null and b/docs/lma/metrics/images/dataflow.png differ diff --git a/docs/lma/metrics/images/setup.png b/docs/lma/metrics/images/setup.png new file mode 100644 index 00000000..ce6a1274 Binary files /dev/null and b/docs/lma/metrics/images/setup.png differ diff --git a/docs/lma/metrics/userguide.rst b/docs/lma/metrics/userguide.rst new file mode 100644 index 00000000..0ee4a238 --- /dev/null +++ b/docs/lma/metrics/userguide.rst @@ -0,0 +1,230 @@ +================= +Metrics +================= +Table of Contents +================= +.. contents:: +.. 
section-numbering:: + +Setup +======= + +Prerequisites +------------------------- +- Require 3 VMs to setup K8s +- ``$ sudo yum install ansible`` +- ``$ pip install openshift pyyaml kubernetes`` (required for ansible K8s module) +- Update IPs in all these files (if changed) + - ``ansible-server/group_vars/all.yml`` (IP of apiserver and hostname) + - ``ansible-server/hosts`` (IP of VMs to install) + - ``ansible-server/roles/monitoring/files/grafana/grafana-pv.yaml`` (IP of NFS-Server) + - ``ansible-server/roles/monitoring/files/alertmanager/alertmanager-config.yaml`` (IP of alert-receiver) + +Setup Structure +--------------- +.. image:: images/setup.png + +Installation - Client Side +---------------------------- + +Nodes +````` +- **Node1** = 10.10.120.21 +- **Node4** = 10.10.120.24 + +How installation is done? +````````````````````````` +Ansible playbook available in ``tools/lma/ansible-client`` folder + +- ``cd tools/lma/ansible-client`` +- ``ansible-playbook setup.yaml`` + +This deploys collectd and configures it to send data to collectd exporter +configured at 10.10.120.211 (ip address of current instance of collectd-exporter) +Please make appropriate changes in the config file present in ``tools/lma/ansible-client/roles/collectd/files/`` + +Installation - Server Side +---------------------------- + +Nodes +`````` + +Inside Jumphost - POD12 + - **VM1** = 10.10.120.211 + - **VM2** = 10.10.120.203 + - **VM3** = 10.10.120.204 + + +How installation is done? 
+````````````````````````` +**Using Ansible:** + - **K8s** + - **Prometheus:** 2 independent deployments + - **Alertmanager:** 2 independent deployments (cluster peers) + - **Grafana:** 1 Replica deployment + - **cAdvisor:** 1 daemonset, i.e 3 replicas, one on each node + - **collectd-exporter:** 1 Replica + - **node-exporter:** 1 daemonset, i.e 3 replicas, one on each node + - **kube-state-metrics:** 1 deployment + - **NFS Server:** at each VM to store grafana data at following path + - ``/usr/share/monitoring_data/grafana`` + +How to setup? +````````````` +- **To setup K8s cluster, EFK and PAG:** Run the ansible-playbook ``tools/lma/ansible-server/playbooks/setup.yaml`` +- **To clean everything:** Run the ansible-playbook ``tools/lma/ansible-server/playbooks/clean.yaml`` + +Do we have HA? +```````````````` +Yes + +Configuration +============= + +K8s +--- +Path to all yamls (Server Side) +```````````````````````````````` +``tools/lma/ansible-server/roles/monitoring/files/`` + +K8s namespace +````````````` +``monitoring`` + +Configuration +--------------------------- + +Services and Ports +`````````````````````````` + +Services and their ports are listed below, +one can go to IP of any node on the following ports, +service will correctly redirect you + + + ====================== ======= + Service Port + ====================== ======= + Prometheus 30900 + Prometheus1 30901 + Main-Prometheus 30902 + Alertmanager 30930 + Alertmanager1 30931 + Grafana 30000 + Collectd-exporter 30130 + ====================== ======= + +How to change Configuration? +------------------------------ +- Ports, names of the containers, pretty much every configuration can be modified by changing the required values in the respective yaml files (``/tools/lma/ansible-server/roles/monitoring/``) +- For metrics, on the client's machine, edit the collectd's configuration (jinja2 template) file, and add required plugins (``/tools/lma/ansible-client/roles/collectd/files/collectd.conf.j2``). + For more details refer `this `_ + +Where to send metrics? 
+------------------------ + +Metrics are sent to collectd exporter. +UDP packets are sent to port 38026 +(can be configured and checked at +``tools/lma/ansible-server/roles/monitoring/files/collectd-exporter/collectd-exporter-deployment.yaml``) + +Data Management +================================ + +DataFlow: +-------------- +.. image:: images/dataflow.png + +Where is the data stored now? +---------------------------------- + - Grafana data (including dashboards) ==> On master, at ``/usr/share/monitoring_data/grafana`` (it is accessed by a Persistent Volume via NFS) + - Prometheus Data ==> On VM2 and VM3, at /usr/share/monitoring_data/prometheus + + **Note: Promethei data also are independent of each other, a shared data solution gave errors** + +Do we have backup of data? +------------------------------- + Promethei, even though independent, scrape the same targets and + have the same alert rules, and therefore generate very similar data. + + Grafana's NFS part of the data has no backup + Dashboards' json are available in the ``/tools/lma/metrics/dashboard`` directory + +When containers are restarted, the data is still accessible? +----------------------------------------------------------------- + Yes, unless the data directories are deleted ``(/usr/share/monitoring_data/*)`` from each node + +Alert Management +================== + +Configure Alert receiver +-------------------------- +- Go to file ``/tools/lma/ansible-server/roles/monitoring/files/alertmanager/alertmanager-config.yaml`` +- Under the config.yml section under receivers, add, update, delete receivers +- Currently ip of unified alert receiver is used. +- Alertmanager supports multiple types of receivers, you can get a `list here `_ + +Add new alerts +-------------------------------------- +- Go to file ``/tools/lma/ansible-server/roles/monitoring/files/prometheus/prometheus-config.yaml`` +- Under the data section alert.rules file is mounted on the config-map. 
+- In this file alerts are divided in 4 groups, namely: + - targets + - host and hardware + - container + - kubernetes +- Add alerts under an existing group or add a new group. Please follow the structure of the file when adding a new group +- To add new alert: + - Use the following structure: + + alert: alertname + + expr: alert rule (generally promql conditional query) + + for: time-range (eg. 5m, 10s, etc, the amount of time the condition needs to be true for the alert to be triggered) + + labels: + + severity: critical (other severity options and other labels can be added here) + + type: hardware + + annotations: + + summary: + + description: + +- For an exhaustive alerts list you can have a look `here `_ + +Troubleshooting +=============== +No metrics received in grafana plot +--------------------------------------------- +- Check if all configurations are correctly done. +- Go to main-prometheus's port and any one VMs' ip, and check if prometheus is getting the metrics +- If prometheus is getting them, read grafana's logs (``kubectl -n monitoring logs <grafana-pod-name>``) +- Else, have a look at collectd exporter's metrics endpoint (eg. 10.10.120.211:30103/metrics) +- If collectd is getting them, check prometheus's config file if collectd's ip is correct over there. +- Else ssh to master, check which node collectd-exporter is scheduled (let's say vm2) +- Now ssh to vm2 +- Use ``tcpdump -i ens3 > testdump`` (``ens3`` being the interface used to connect to the internet) +- Grep your client node's ip and check if packets are reaching our monitoring cluster (``cat testdump | grep <client-node-ip>``) +- Ideally you should see packets reaching the node, if so please see if the collectd-exporter is running correctly, check its logs. +- If no packets are received, error is on the client side, check collectd's config file and make sure correct collectd-exporter ip is used in the ``<Plugin network>`` section. + +If no notification received +--------------------------- +- Go to main-prometheus's port and any one VMs' ip,(eg. 
10.10.120.211:30902) and check if prometheus is getting the metrics +- If no, read "No metrics received in grafana plot" section, else read ahead. +- Check IP of alert-receiver, you can see this by going to alertmanager-ip:port and check if alertmanager is configured correctly. +- If yes, paste the alert rule in the prometheus' query-box and see if any metric satisfies the condition. +- You may need to change alert rules in the alert.rules section of prometheus-config.yaml if there was a bug in the alert's rule. (please read the "Add new alerts" section for detailed instructions) + +Reference +========= +- `Prometheus K8S deployment `_ +- `HA Prometheus `_ +- `Data Flow Diagram `_ +- `Collectd Configuration `_ +- `Alertmanager Rule Config `_ -- cgit 1.2.3-korg