aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--docs/lma/metrics/devguide.rst474
-rw-r--r--docs/lma/metrics/images/dataflow.pngbin0 -> 42443 bytes
-rw-r--r--docs/lma/metrics/images/setup.pngbin0 -> 15019 bytes
-rw-r--r--docs/lma/metrics/userguide.rst230
-rw-r--r--tools/lma/ansible-client/playbooks/clean.yaml25
-rw-r--r--tools/lma/ansible-client/roles/clean-collectd/main.yml44
-rw-r--r--tools/lma/ansible-client/roles/collectd/files/collectd.conf.j244
-rw-r--r--tools/lma/ansible-client/roles/collectd/tasks/main.yml60
-rw-r--r--tools/lma/ansible-server/playbooks/clean.yaml52
-rw-r--r--tools/lma/ansible-server/roles/clean-k8s-cluster/tasks/main.yml34
-rw-r--r--tools/lma/ansible-server/roles/clean-k8s-pre/tasks/main.yml65
-rw-r--r--tools/lma/ansible-server/roles/clean-k8s-worker-reset/tasks/main.yml26
-rw-r--r--tools/lma/ansible-server/roles/clean-monitoring/tasks/main.yml48
-rw-r--r--tools/lma/ansible-server/roles/clean-nfs/tasks/main.yml44
-rw-r--r--tools/lma/ansible-server/roles/monitoring/files/alertmanager/alertmanager-config.yaml37
-rw-r--r--tools/lma/ansible-server/roles/monitoring/files/alertmanager/alertmanager-deployment.yaml62
-rw-r--r--tools/lma/ansible-server/roles/monitoring/files/alertmanager/alertmanager-service.yaml41
-rw-r--r--tools/lma/ansible-server/roles/monitoring/files/alertmanager/alertmanager1-deployment.yaml62
-rw-r--r--tools/lma/ansible-server/roles/monitoring/files/alertmanager/alertmanager1-service.yaml42
-rw-r--r--tools/lma/ansible-server/roles/monitoring/files/cadvisor/cadvisor-deamonset.yaml79
-rw-r--r--tools/lma/ansible-server/roles/monitoring/files/cadvisor/cadvisor-service.yaml30
-rw-r--r--tools/lma/ansible-server/roles/monitoring/files/collectd-exporter/collectd-exporter-deployment.yaml51
-rw-r--r--tools/lma/ansible-server/roles/monitoring/files/collectd-exporter/collectd-exporter-service.yaml35
-rw-r--r--tools/lma/ansible-server/roles/monitoring/files/grafana/grafana-datasource-config.yaml35
-rw-r--r--tools/lma/ansible-server/roles/monitoring/files/grafana/grafana-deployment.yaml68
-rw-r--r--tools/lma/ansible-server/roles/monitoring/files/grafana/grafana-pv.yaml31
-rw-r--r--tools/lma/ansible-server/roles/monitoring/files/grafana/grafana-pvc.yaml33
-rw-r--r--tools/lma/ansible-server/roles/monitoring/files/grafana/grafana-service.yaml36
-rw-r--r--tools/lma/ansible-server/roles/monitoring/files/kube-state-metrics/kube-state-metrics-deployment.yaml36
-rw-r--r--tools/lma/ansible-server/roles/monitoring/files/kube-state-metrics/kube-state-metrics-service.yaml26
-rw-r--r--tools/lma/ansible-server/roles/monitoring/files/monitoring-namespace.yaml18
-rw-r--r--tools/lma/ansible-server/roles/monitoring/files/node-exporter/nodeexporter-daemonset.yaml80
-rw-r--r--tools/lma/ansible-server/roles/monitoring/files/node-exporter/nodeexporter-service.yaml33
-rw-r--r--tools/lma/ansible-server/roles/monitoring/files/prometheus/main-prometheus-service.yaml35
-rw-r--r--tools/lma/ansible-server/roles/monitoring/files/prometheus/prometheus-config.yaml609
-rw-r--r--tools/lma/ansible-server/roles/monitoring/files/prometheus/prometheus-deployment.yaml73
-rw-r--r--tools/lma/ansible-server/roles/monitoring/files/prometheus/prometheus-pv.yaml30
-rw-r--r--tools/lma/ansible-server/roles/monitoring/files/prometheus/prometheus-pvc.yaml33
-rw-r--r--tools/lma/ansible-server/roles/monitoring/files/prometheus/prometheus-service.yaml34
-rw-r--r--tools/lma/ansible-server/roles/monitoring/files/prometheus/prometheus1-deployment.yaml73
-rw-r--r--tools/lma/ansible-server/roles/monitoring/files/prometheus/prometheus1-service.yaml35
-rw-r--r--tools/lma/ansible-server/roles/monitoring/tasks/main.yml273
-rw-r--r--tools/lma/metrics/dashboard/cpu_usage_using.json750
-rw-r--r--tools/lma/metrics/dashboard/memory_using.json337
-rw-r--r--tools/lma/metrics/dashboard/ovs_stats_using.json854
-rw-r--r--tools/lma/metrics/dashboard/rdt_using.json833
46 files changed, 5950 insertions, 0 deletions
diff --git a/docs/lma/metrics/devguide.rst b/docs/lma/metrics/devguide.rst
new file mode 100644
index 00000000..93d33016
--- /dev/null
+++ b/docs/lma/metrics/devguide.rst
@@ -0,0 +1,474 @@
+====================
+Metrics Dev Guide
+====================
+Table of Contents
+=================
+.. contents::
+.. section-numbering::
+
+
+Ansible File Organization
+============================
+
+Ansible-Server
+----------------
+
+Please follow the following file structure:
+
+.. code-block:: bash
+
+ ansible-server
+ | ansible.cfg
+ | hosts
+ |
+ +---group_vars
+ | all.yml
+ |
+ +---playbooks
+ | clean.yaml
+ | setup.yaml
+ |
+ \---roles
+ +---clean-monitoring
+ | \---tasks
+ | main.yml
+ |
+ +---monitoring
+ +---files
+ | | monitoring-namespace.yaml
+ | |
+ | +---alertmanager
+ | | alertmanager-config.yaml
+ | | alertmanager-deployment.yaml
+ | | alertmanager-service.yaml
+ | | alertmanager1-deployment.yaml
+ | | alertmanager1-service.yaml
+ | |
+ | +---cadvisor
+ | | cadvisor-daemonset.yaml
+ | | cadvisor-service.yaml
+ | |
+ | +---collectd-exporter
+ | | collectd-exporter-deployment.yaml
+ | | collectd-exporter-service.yaml
+ | |
+ | +---grafana
+ | | grafana-datasource-config.yaml
+ | | grafana-deployment.yaml
+ | | grafana-pv.yaml
+ | | grafana-pvc.yaml
+ | | grafana-service.yaml
+ | |
+ | +---kube-state-metrics
+ | | kube-state-metrics-deployment.yaml
+ | | kube-state-metrics-service.yaml
+ | |
+ | +---node-exporter
+ | | nodeexporter-daemonset.yaml
+ | | nodeexporter-service.yaml
+ | |
+ | \---prometheus
+ | main-prometheus-service.yaml
+ | prometheus-config.yaml
+ | prometheus-deployment.yaml
+ | prometheus-pv.yaml
+ | prometheus-pvc.yaml
+ | prometheus-service.yaml
+ | prometheus1-deployment.yaml
+ | prometheus1-service.yaml
+ |
+ \---tasks
+ main.yml
+
+
+Ansible - Client
+------------------
+
+Please follow the following file structure:
+
+.. code-block:: bash
+
+ ansible-client
+ | ansible.cfg
+ | hosts
+ |
+ +---group_vars
+ | all.yml
+ |
+ +---playbooks
+ | clean.yaml
+ | setup.yaml
+ |
+ \---roles
+ +---clean-collectd
+ | \---tasks
+ | main.yml
+ |
+ +---collectd
+ +---files
+ | collectd.conf.j2
+ |
+ \---tasks
+ main.yml
+
+
+Summary of Roles
+==================
+
+A brief description of the Ansible playbook roles,
+which are used to deploy the monitoring cluster
+
+Ansible Server Roles
+----------------------
+
+Ansible Server, this part consists of the roles used to deploy
+Prometheus Alertmanager Grafana stack on the server-side
+
+Role: Monitoring
+~~~~~~~~~~~~~~~~~~
+
+Deployment and configuration of PAG stack along with collectd-exporter,
+cadvisor and node-exporter.
+
+Role: Clean-Monitoring
+~~~~~~~~~~~~~~~~~~~~~~~~
+
+Removes all the components deployed by the Monitoring role.
+
+
+File-Task Mapping and Configurable Parameters
+================================================
+
+Ansible Server
+----------------
+
+Role: Monitoring
+~~~~~~~~~~~~~~~~~~~
+
+Alert Manager
+^^^^^^^^^^^^^^^
+
+File: alertmanager-config.yaml
+'''''''''''''''''''''''''''''''''
+Path : monitoring/files/alertmanager/alertmanager-config.yaml
+
+Task: Configures Receivers for alertmanager
+
+Summary: A configmap, currently configures webhook for alertmanager,
+can be used to configure any kind of receiver
+
+Configurable Parameters:
+ receiver.url: change to the webhook receiver's URL
+ route: Can be used to add receivers
+
+
+File: alertmanager-deployment.yaml
+'''''''''''''''''''''''''''''''''
+Path : monitoring/files/alertmanager/alertmanager-deployment.yaml
+
+Task: Deploys alertmanager instance
+
+Summary: A Deployment, deploys 1 replica of alertmanager
+
+
+File: alertmanager-service.yaml
+'''''''''''''''''''''''''''''''''
+Path : monitoring/files/alertmanager/alertmanager-service.yaml
+
+Task: Creates a K8s service for alertmanager
+
+Summary: A Nodeport type of service, so that user can create "silences",
+view the status of alerts from the native alertmanager dashboard / UI.
+
+Configurable Parameters:
+ spec.type: Options : NodePort, ClusterIP, LoadBalancer
+ spec.ports: Edit / add ports to be handled by the service
+
+**Note: alertmanager1-deployment, alertmanager1-service are the same as
+alertmanager-deployment and alertmanager-service respectively.**
+
+CAdvisor
+^^^^^^^^^^^
+
+File: cadvisor-daemonset.yaml
+'''''''''''''''''''''''''''''''''
+Path : monitoring/files/cadvisor/cadvisor-daemonset.yaml
+
+Task: To create a cadvisor daemonset
+
+Summary: A daemonset, used to scrape data of the kubernetes cluster itself.
+It's a daemonset, so an instance is run on every node.
+
+Configurable Parameters:
+ spec.template.spec.ports: Port of the container
+
+
+File: cadvisor-service.yaml
+'''''''''''''''''''''''''''''''''
+Path : monitoring/files/cadvisor/cadvisor-service.yaml
+
+Task: To create a cadvisor service
+
+Summary: A ClusterIP service for cadvisor to communicate with prometheus
+
+Configurable Parameters:
+ spec.ports: Add / Edit ports
+
+
+Collectd Exporter
+^^^^^^^^^^^^^^^^^^^^
+
+File: collectd-exporter-deployment.yaml
+''''''''''''''''''''''''''''''''''''''''''
+Path : monitoring/files/collectd-exporter/collectd-exporter-deployment.yaml
+
+Task: To create a collectd-exporter replica
+
+Summary: A deployment, acts as receiver for collectd data sent by client machines,
+prometheus pulls data from this exporter
+
+Configurable Parameters:
+ spec.template.spec.ports: Port of the container
+
+
+File: collectd-exporter-service.yaml
+''''''''''''''''''''''''''''''''''''''
+Path : monitoring/files/collectd-exporter/collectd-exporter-service.yaml
+
+Task: To create a collectd-exporter service
+
+Summary: A NodePort service for collectd-exporter to hold data for prometheus
+to scrape
+
+Configurable Parameters:
+ spec.ports: Add / Edit ports
+
+
+Grafana
+^^^^^^^^^
+
+File: grafana-datasource-config.yaml
+''''''''''''''''''''''''''''''''''''''''''
+Path : monitoring/files/grafana/grafana-datasource-config.yaml
+
+Task: To create config file for grafana
+
+Summary: A configmap, adds prometheus datasource in grafana
+
+
+File: grafana-deployment.yaml
+'''''''''''''''''''''''''''''''''
+Path : monitoring/files/grafana/grafana-deployment.yaml
+
+Task: To create a grafana deployment
+
+Summary: The grafana deployment creates a single replica of grafana,
+with preconfigured prometheus datasource.
+
+Configurable Parameters:
+ spec.template.spec.ports: Edit ports
+ spec.template.spec.env: Add / Edit environment variables
+
+
+File: grafana-pv.yaml
+'''''''''''''''''''''''''''''''''
+Path : monitoring/files/grafana/grafana-pv.yaml
+
+Task: To create a persistent volume for grafana
+
+Summary: A persistent volume for grafana.
+
+Configurable Parameters:
+ spec.capacity.storage: Increase / decrease size
+ spec.accessModes: To change the way PV is accessed.
+ spec.nfs.server: To change the ip address of NFS server
+ spec.nfs.path: To change the path of the server
+
+
+File: grafana-pvc.yaml
+'''''''''''''''''''''''''''''''''
+Path : monitoring/files/grafana/grafana-pvc.yaml
+
+Task: To create a persistent volume claim for grafana
+
+Summary: A persistent volume claim for grafana.
+
+Configurable Parameters:
+ spec.resources.requests.storage: Increase / decrease size
+
+
+File: grafana-service.yaml
+'''''''''''''''''''''''''''''''''
+Path : monitoring/files/grafana/grafana-service.yaml
+
+Task: To create a service for grafana
+
+Summary: A Nodeport type of service, so that users can connect to and
+view the dashboard / UI.
+
+Configurable Parameters:
+ spec.type: Options : NodePort, ClusterIP, LoadBalancer
+ spec.ports: Edit / add ports to be handled by the service
+
+
+Kube State Metrics
+^^^^^^^^^^^^^^^^^^^^
+
+File: kube-state-metrics-deployment.yaml
+''''''''''''''''''''''''''''''''''''''''''
+Path : monitoring/files/kube-state-metrics/kube-state-metrics-deployment.yaml
+
+Task: To create a kube-state-metrics instance
+
+Summary: A deployment, used to collect metrics of the kubernetes cluster itself
+
+Configurable Parameters:
+ spec.template.spec.containers.ports: Port of the container
+
+
+File: kube-state-metrics-service.yaml
+'''''''''''''''''''''''''''''''''
+Path : monitoring/files/kube-state-metrics/kube-state-metrics-service.yaml
+
+Task: To create a kube-state-metrics service
+
+Summary: A service for kube-state-metrics to hold data for prometheus
+to scrape
+
+Configurable Parameters:
+ spec.ports: Add / Edit ports
+
+
+Node Exporter
+^^^^^^^^^^^^^^^
+
+File: nodeexporter-daemonset.yaml
+'''''''''''''''''''''''''''''''''
+Path : monitoring/files/node-exporter/nodeexporter-daemonset.yaml
+
+Task: To create a node exporter daemonset
+
+Summary: A daemonset, used to scrape data of the host machines / node.
+It's a daemonset, so an instance is run on every node.
+
+Configurable Parameters:
+ spec.template.spec.ports: Port of the container
+
+
+File: nodeexporter-service.yaml
+'''''''''''''''''''''''''''''''''
+Path : monitoring/files/node-exporter/nodeexporter-service.yaml
+
+Task: To create a node exporter service
+
+Summary: A ClusterIP service for node exporter to communicate with Prometheus
+
+Configurable Parameters:
+ spec.ports: Add / Edit ports
+
+
+Prometheus
+^^^^^^^^^^^^^
+
+File: prometheus-config.yaml
+''''''''''''''''''''''''''''''''''''''''''
+Path : monitoring/files/prometheus/prometheus-config.yaml
+
+Task: To create a config file for Prometheus
+
+Summary: A configmap, adds alert rules.
+
+Configurable Parameters:
+ data.alert.rules: Add / Edit alert rules
+
+
+File: prometheus-deployment.yaml
+'''''''''''''''''''''''''''''''''
+Path : monitoring/files/prometheus/prometheus-deployment.yaml
+
+Task: To create a Prometheus deployment
+
+Summary: The Prometheus deployment creates a single replica of Prometheus,
+with preconfigured Prometheus datasource.
+
+Configurable Parameters:
+ spec.template.spec.affinity: To change the node affinity,
+ make sure only 1 instance of prometheus is
+ running on 1 node.
+
+ spec.template.spec.ports: Add / Edit container port
+
+
+File: prometheus-pv.yaml
+'''''''''''''''''''''''''''''''''
+Path : monitoring/files/prometheus/prometheus-pv.yaml
+
+Task: To create a persistent volume for Prometheus
+
+Summary: A persistent volume for Prometheus.
+
+Configurable Parameters:
+ spec.capacity.storage: Increase / decrease size
+ spec.accessModes: To change the way PV is accessed.
+ spec.hostpath.path: To change the path of the volume
+
+
+File: prometheus-pvc.yaml
+'''''''''''''''''''''''''''''''''
+Path : monitoring/files/prometheus/prometheus-pvc.yaml
+
+Task: To create a persistent volume claim for Prometheus
+
+Summary: A persistent volume claim for Prometheus.
+
+Configurable Parameters:
+ spec.resources.requests.storage: Increase / decrease size
+
+
+File: prometheus-service.yaml
+'''''''''''''''''''''''''''''''''
+Path : monitoring/files/prometheus/prometheus-service.yaml
+
+Task: To create a service for prometheus
+
+Summary: A Nodeport type of service, prometheus native dashboard
+available here.
+
+Configurable Parameters:
+ spec.type: Options : NodePort, ClusterIP, LoadBalancer
+ spec.ports: Edit / add ports to be handled by the service
+
+
+File: main-prometheus-service.yaml
+''''''''''''''''''''''''''''''''''''
+Path: monitoring/files/prometheus/main-prometheus-service.yaml
+
+Task: A service that connects both prometheus instances.
+
+Summary: A Nodeport service for other services to connect to the Prometheus cluster,
+as HA Prometheus needs two independent instances of Prometheus scraping the same inputs
+and having the same configuration.
+
+**Note: prometheus1-deployment, prometheus1-service are the same as
+prometheus-deployment and prometheus-service respectively.**
+
+
+Ansible Client Roles
+----------------------
+
+Role: Collectd
+~~~~~~~~~~~~~~~~~~
+
+File: main.yml
+^^^^^^^^^^^^^^^^
+Path: collectd/tasks/main.yml
+
+Task: Install collectd along with prerequisites
+
+Associated template file:
+
+- collectd.conf.j2
+
+Path: collectd/files/collectd.conf.j2
+
+Summary: Edit this file to change the default configuration to
+be installed on the client's machine
diff --git a/docs/lma/metrics/images/dataflow.png b/docs/lma/metrics/images/dataflow.png
new file mode 100644
index 00000000..ca1ec908
--- /dev/null
+++ b/docs/lma/metrics/images/dataflow.png
Binary files differ
diff --git a/docs/lma/metrics/images/setup.png b/docs/lma/metrics/images/setup.png
new file mode 100644
index 00000000..ce6a1274
--- /dev/null
+++ b/docs/lma/metrics/images/setup.png
Binary files differ
diff --git a/docs/lma/metrics/userguide.rst b/docs/lma/metrics/userguide.rst
new file mode 100644
index 00000000..0ee4a238
--- /dev/null
+++ b/docs/lma/metrics/userguide.rst
@@ -0,0 +1,230 @@
+=================
+Metrics
+=================
+Table of Contents
+=================
+.. contents::
+.. section-numbering::
+
+Setup
+=======
+
+Prerequisites
+-------------------------
+- Require 3 VMs to setup K8s
+- ``$ sudo yum install ansible``
+- ``$ pip install openshift pyyaml kubernetes`` (required for ansible K8s module)
+- Update IPs in all these files (if changed)
+ - ``ansible-server/group_vars/all.yml`` (IP of apiserver and hostname)
+ - ``ansible-server/hosts`` (IP of VMs to install)
+ - ``ansible-server/roles/monitoring/files/grafana/grafana-pv.yaml`` (IP of NFS-Server)
+ - ``ansible-server/roles/monitoring/files/alertmanager/alertmanager-config.yaml`` (IP of alert-receiver)
+
+Setup Structure
+---------------
+.. image:: images/setup.png
+
+Installation - Client Side
+----------------------------
+
+Nodes
+`````
+- **Node1** = 10.10.120.21
+- **Node4** = 10.10.120.24
+
+How installation is done?
+`````````````````````````
+Ansible playbook available in ``tools/lma/ansible-client`` folder
+
+- ``cd tools/lma/ansible-client``
+- ``ansible-playbook playbooks/setup.yaml``
+
+This deploys collectd and configures it to send data to collectd exporter
+configured at 10.10.120.211 (ip address of current instance of collectd-exporter)
+Please make appropriate changes in the config file present in ``tools/lma/ansible-client/roles/collectd/files/``
+
+Installation - Server Side
+----------------------------
+
+Nodes
+``````
+
+Inside Jumphost - POD12
+ - **VM1** = 10.10.120.211
+ - **VM2** = 10.10.120.203
+ - **VM3** = 10.10.120.204
+
+
+How installation is done?
+`````````````````````````
+**Using Ansible:**
+ - **K8s**
+ - **Prometheus:** 2 independent deployments
+ - **Alertmanager:** 2 independent deployments (cluster peers)
+ - **Grafana:** 1 Replica deployment
+ - **cAdvisor:** 1 daemonset, i.e 3 replicas, one on each node
+ - **collectd-exporter:** 1 Replica
+ - **node-exporter:** 1 daemonset, i.e 3 replicas, one on each node
+ - **kube-state-metrics:** 1 deployment
+ - **NFS Server:** at each VM to store grafana data at following path
+ - ``/usr/share/monitoring_data/grafana``
+
+How to setup?
+`````````````
+- **To setup K8s cluster, EFK and PAG:** Run the ansible-playbook ``tools/lma/ansible-server/playbooks/setup.yaml``
+- **To clean everything:** Run the ansible-playbook ``tools/lma/ansible-server/playbooks/clean.yaml``
+
+Do we have HA?
+````````````````
+Yes
+
+Configuration
+=============
+
+K8s
+---
+Path to all yamls (Server Side)
+````````````````````````````````
+``tools/lma/ansible-server/roles/monitoring/files/``
+
+K8s namespace
+`````````````
+``monitoring``
+
+Configuration
+---------------------------
+
+Services and Ports
+``````````````````````````
+
+Services and their ports are listed below,
+one can go to IP of any node on the following ports,
+service will correctly redirect you
+
+
+ ====================== =======
+ Service Port
+ ====================== =======
+ Prometheus 30900
+ Prometheus1 30901
+ Main-Prometheus 30902
+ Alertmanager 30930
+ Alertmanager1 30931
+ Grafana 30000
+ Collectd-exporter 30130
+ ====================== =======
+
+How to change Configuration?
+------------------------------
+- Ports, names of the containers, pretty much every configuration can be modified by changing the required values in the respective yaml files (``/tools/lma/ansible-server/roles/monitoring/``)
+- For metrics, on the client's machine, edit the collectd's configuration (jinja2 template) file, and add required plugins (``/tools/lma/ansible-client/roles/collectd/files/collectd.conf.j2``).
+ For more details refer `this <https://collectd.org/wiki/index.php/First_steps>`_
+
+Where to send metrics?
+------------------------
+
+Metrics are sent to collectd exporter.
+UDP packets are sent to port 30826
+(can be configured and checked at
+``tools/lma/ansible-server/roles/monitoring/files/collectd-exporter/collectd-exporter-deployment.yaml``)
+
+Data Management
+================================
+
+DataFlow:
+--------------
+.. image:: images/dataflow.png
+
+Where is the data stored now?
+----------------------------------
+ - Grafana data (including dashboards) ==> On master, at ``/usr/share/monitoring_data/grafana`` (it's accessed by a Persistent Volume via NFS)
+ - Prometheus Data ==> On VM2 and VM3, at /usr/share/monitoring_data/prometheus
+
+ **Note: Promethei data also are independent of each other, a shared data solution gave errors**
+
+Do we have backup of data?
+-------------------------------
+ Promethei even though independent scrape same targets,
+ have same alert rules, therefore generate very similar data.
+
+ Grafana's NFS part of the data has no backup
+ Dashboards' json are available in the ``/tools/lma/metrics/dashboard`` directory
+
+When containers are restarted, the data is still accessible?
+-----------------------------------------------------------------
+ Yes, unless the data directories are deleted ``(/usr/share/monitoring_data/*)`` from each node
+
+Alert Management
+==================
+
+Configure Alert receiver
+--------------------------
+- Go to file ``/tools/lma/ansible-server/roles/monitoring/files/alertmanager/alertmanager-config.yaml``
+- Under the config.yml section under receivers, add, update, delete receivers
+- Currently ip of unified alert receiver is used.
+- Alertmanager supports multiple types of receivers, you can get a `list here <https://prometheus.io/docs/alerting/latest/configuration/>`_
+
+Add new alerts
+--------------------------------------
+- Go to file ``/tools/lma/ansible-server/roles/monitoring/files/prometheus/prometheus-config.yaml``
+- Under the data section alert.rules file is mounted on the config-map.
+- In this file alerts are divided in 4 groups, namely:
+ - targets
+ - host and hardware
+ - container
+ - kubernetes
+- Add alerts under an existing group or add a new group. Please follow the structure of the file when adding a new group
+- To add new alert:
+ - Use the following structure:
+
+ alert: alertname
+
+ expr: alert rule (generally promql conditional query)
+
+ for: time-range (eg. 5m, 10s, etc, the amount of time the condition needs to be true for the alert to be triggered)
+
+ labels:
+
+ severity: critical (other severity options and other labels can be added here)
+
+ type: hardware
+
+ annotations:
+
+ summary: <summary of the alert>
+
+ description: <describe the alert here>
+
+- For an exhaustive alerts list you can have a look `here <https://awesome-prometheus-alerts.grep.to/>`_
+
+Troubleshooting
+===============
+No metrics received in grafana plot
+---------------------------------------------
+- Check if all configurations are correctly done.
+- Go to main-prometheus's port and any one VMs' ip, and check if prometheus is getting the metrics
+- If prometheus is getting them, read grafana's logs (``kubectl -n monitoring logs <name_of_grafana_pod>``)
+- Else, have a look at collectd exporter's metrics endpoint (eg. 10.10.120.211:30103/metrics)
+- If collectd is getting them, check prometheus's config file if collectd's ip is correct over there.
+- Else ssh to master, check which node collectd-exporter is scheduled on (let's say vm2)
+- Now ssh to vm2
+- Use ``tcpdump -i ens3 > testdump`` (replace ``ens3`` with the interface used to connect to the internet)
+- Grep your client node's ip and check if packets are reaching our monitoring cluster (``cat testdump | grep <ip of client>``)
+- Ideally you should see packets reaching the node, if so please see if the collectd-exporter is running correctly, check its logs.
+- If no packets are received, error is on the client side, check collectd's config file and make sure correct collectd-exporter ip is used in the ``<network>`` section.
+
+If no notification received
+---------------------------
+- Go to main-prometheus's port and any one VMs' ip,(eg. 10.10.120.211:30902) and check if prometheus is getting the metrics
+- If no, read "No metrics received in grafana plot" section, else read ahead.
+- Check IP of alert-receiver, you can see this by going to alertmanager-ip:port and check if alertmanager is configured correctly.
+- If yes, paste the alert rule in the prometheus' query-box and see if any metric satisfies the condition.
+- You may need to change alert rules in the alert.rules section of prometheus-config.yaml if there was a bug in the alert's rule. (please read the "Add new alerts" section for detailed instructions)
+
+Reference
+=========
+- `Prometheus K8S deployment <https://www.metricfire.com/blog/how-to-deploy-prometheus-on-kubernetes/>`_
+- `HA Prometheus <https://prometheus.io/docs/introduction/faq/#can-prometheus-be-made-highly-available>`_
+- `Data Flow Diagram <https://drive.google.com/file/d/1D--LXFqU_H-fqpD57H3lJFOqcqWHoF0U/view?usp=sharing>`_
+- `Collectd Configuration <https://docs.opnfv.org/en/stable-fraser/submodules/barometer/docs/release/userguide/docker.userguide.html#build-the-collectd-docker-image>`_
+- `Alertmanager Rule Config <https://awesome-prometheus-alerts.grep.to/>`_
diff --git a/tools/lma/ansible-client/playbooks/clean.yaml b/tools/lma/ansible-client/playbooks/clean.yaml
new file mode 100644
index 00000000..4f77b062
--- /dev/null
+++ b/tools/lma/ansible-client/playbooks/clean.yaml
@@ -0,0 +1,25 @@
+# Copyright 2020 Adarsh yadav, Aditya Srivastava
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Play 1: apply the clean-td-agent role to all hosts.
+# NOTE(review): the clean-td-agent role is not among the files added by this change - confirm it exists in the tree.
+- name: clean td-agent
+ hosts: all
+ roles:
+ - clean-td-agent
+
+# Play 2: apply the clean-collectd role to all hosts.
+- name: clean collectd
+ hosts: all
+ roles:
+ - clean-collectd
diff --git a/tools/lma/ansible-client/roles/clean-collectd/main.yml b/tools/lma/ansible-client/roles/clean-collectd/main.yml
new file mode 100644
index 00000000..97100cad
--- /dev/null
+++ b/tools/lma/ansible-client/roles/clean-collectd/main.yml
@@ -0,0 +1,44 @@
+# Copyright 2020 Aditya Srivastava
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+---
+# Stops and removes the collectd Docker container.
+# NOTE(review): this file is written as a complete play (hosts/tasks) and sits at
+# roles/clean-collectd/main.yml; Ansible roles normally keep a bare task list in
+# roles/<role>/tasks/main.yml - confirm the role is actually picked up by playbooks/clean.yaml.
+# NOTE(review): hosts is localhost here, while playbooks/clean.yaml runs clean-collectd on "all" - confirm intent.
+- hosts: localhost
+
+ tasks:
+ # The docker_container tasks below need Docker and its Python bindings on the target.
+ - name: Check and install dependencies
+ yum:
+ name: docker
+ state: present
+
+ - name: Install python sdk
+ yum:
+ name: python-docker-py
+ state: present
+
+ # Stop the container named "collectd" first, then remove it.
+ - name: Stopping collectd container
+ docker_container:
+ name: collectd
+ state: stopped
+
+ - name: Removing collectd container
+ docker_container:
+ name: collectd
+ state: absent
+
+ # Removes the image (not recommended)
+ # - name: Remove image
+ # docker_image:
+ # state: absent
+ # name: opnfv/barometer-collectd
+ # tag: latest
diff --git a/tools/lma/ansible-client/roles/collectd/files/collectd.conf.j2 b/tools/lma/ansible-client/roles/collectd/files/collectd.conf.j2
new file mode 100644
index 00000000..ba953e3a
--- /dev/null
+++ b/tools/lma/ansible-client/roles/collectd/files/collectd.conf.j2
@@ -0,0 +1,44 @@
+# collectd configuration template; the host_name variable is filled in when the template is rendered.
+Hostname "{{ host_name }}"
+Interval 10
+# Plugins loaded by this configuration.
+LoadPlugin intel_rdt
+LoadPlugin processes
+LoadPlugin interface
+LoadPlugin network
+LoadPlugin ovs_stats
+LoadPlugin cpu
+LoadPlugin memory
+# Plugins below are commented out and therefore not loaded.
+#LoadPlugin csv
+#LoadPlugin write_http
+#LoadPlugin dpdkstat
+##############################################################################
+# Plugin configuration #
+##############################################################################
+# Processes selected for monitoring: the OVS daemons and collectd itself.
+<Plugin processes>
+ ProcessMatch "ovs-vswitchd" "ovs-vswitchd"
+ ProcessMatch "ovsdb-server" "ovsdb-server"
+ ProcessMatch "collectd" "collectd"
+</Plugin>
+
+<Plugin cpu>
+ ReportByCpu true
+ ReportByState true
+ ValuesPercentage true
+ ReportNumCpu true
+ ReportGuestState false
+ SubtractGuestState false
+</Plugin>
+
+# Address and port of the collectd-exporter instance that receives the metrics.
+# Change these when the exporter is deployed elsewhere.
+<Plugin network>
+ Server "10.10.120.211" "30826"
+</Plugin>
+
+# NOTE(review): socket path and bridge name look specific to the vsperf test host - confirm for other targets.
+<Plugin ovs_stats>
+ Port "6640"
+ Address "127.0.0.1"
+ Socket "/usr/local/var/run/openvswitch/db.sock"
+ Bridges "vsperf-br0"
+</Plugin>
+
+# NOTE(review): the core list is host specific - confirm against the CPU layout of the target machine.
+<Plugin "intel_rdt">
+ Cores "2" "4-5" "6-7" "8" "9" "22" "23" "24" "25" "26" "27"
+</Plugin>
diff --git a/tools/lma/ansible-client/roles/collectd/tasks/main.yml b/tools/lma/ansible-client/roles/collectd/tasks/main.yml
new file mode 100644
index 00000000..0befb22b
--- /dev/null
+++ b/tools/lma/ansible-client/roles/collectd/tasks/main.yml
@@ -0,0 +1,60 @@
+# Copyright 2020 Aditya Srivastava
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+---
+
+# Dependency check: install Docker and the Python Docker bindings with yum.
+- name: Check and install dependencies
+ yum:
+ name: ['docker', 'python-docker-py']
+ state: present
+
+# pip is required by the next task, which installs docker-py through the pip module.
+- name: Install pip
+ yum:
+ name: python-pip
+ state: present
+
+- name: install docker-py
+ pip: name=docker-py
+
+# Fetch the barometer sources into /tmp/barometer; the image is built from its Dockerfile below.
+- name: Cloning barometer
+ git:
+ repo: https://gerrit.opnfv.org/gerrit/barometer
+ dest: /tmp/barometer
+
+# Directory that receives the rendered collectd.conf and is bind-mounted into the container later.
+- name: Create Folder
+ file:
+ path: /tmp/barometer/docker/src/collectd_sample_configs
+ state: directory
+
+# Build collectd: runs docker build from /tmp to produce the opnfv/barometer-collectd image.
+- name: Downlaod and Build Image
+ command: chdir=/tmp/ {{ item }}
+ become: true
+ with_items:
+ - docker build -t opnfv/barometer-collectd -f barometer/docker/barometer-collectd/Dockerfile barometer/docker/barometer-collectd
+
+# Configuring collectd: render the collectd.conf.j2 template into the directory created above.
+- name: Ensure collectd is configured
+ template:
+ src: ../files/collectd.conf.j2
+ dest: /tmp/barometer/docker/src/collectd_sample_configs/collectd.conf
+
+# Running collectd container: start it with the rendered configuration mounted, then list containers.
+# NOTE(review): "docker run --name collectd" is issued through the command module, so a second run
+# presumably fails while a container named collectd still exists - confirm, or run the clean playbook first.
+- name: Running collectd
+ command : chdir=/tmp/ {{ item }}
+ become: true
+ with_items:
+ - docker run -tid --name collectd --net=host -v /tmp/barometer/docker/src/collectd_sample_configs:/opt/collectd/etc/collectd.conf.d -v /var/run:/var/run -v /tmp:/tmp --privileged opnfv/barometer-collectd /run_collectd.sh
+ - docker ps
diff --git a/tools/lma/ansible-server/playbooks/clean.yaml b/tools/lma/ansible-server/playbooks/clean.yaml
new file mode 100644
index 00000000..b4da66da
--- /dev/null
+++ b/tools/lma/ansible-server/playbooks/clean.yaml
@@ -0,0 +1,52 @@
+# Copyright 2020 Adarsh yadav, Aditya Srivastava
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Clean monitoring
+- hosts: master
+  name: Clean PAG setup
+  roles:
+    - clean-monitoring
+
+# Clean logging
+- hosts: master
+  name: Clean EFK setup
+  roles:
+    - clean-logging
+
+# Only applicable while kubelet is running
+# Clean k8s cluster
+- hosts: master
+  name: Clean k8s cluster
+  roles:
+    - clean-k8s-cluster
+
+# Reset worker-nodes
+- hosts: worker-nodes
+  name: Reset worker-nodes
+  roles:
+    - clean-k8s-worker-reset
+
+# Uninstall pre-requisites for k8s
+- hosts: all
+  name: unistall pre-requisites for k8s
+  roles:
+    - clean-k8s-pre
+
+#*************************************************************************************************************
+#THIS WILL DELETE DATA OF ELASTICSEARCH AND GRAFANA
+#*************************************************************************************************************
+# - name: Clean nfs server
+# hosts: all
+# roles:
+# - clean-nfs
diff --git a/tools/lma/ansible-server/roles/clean-k8s-cluster/tasks/main.yml b/tools/lma/ansible-server/roles/clean-k8s-cluster/tasks/main.yml
new file mode 100644
index 00000000..83ac086d
--- /dev/null
+++ b/tools/lma/ansible-server/roles/clean-k8s-cluster/tasks/main.yml
@@ -0,0 +1,34 @@
+# Copyright 2020 Adarsh yadav, Aditya Srivastava
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+# check whether kubelet is running; rc 3 (unit installed but stopped) is a valid "not running" answer, not an error
+- name: check for kubelet
+  command: systemctl status kubelet
+  register: _svc_kubelet
+  failed_when: _svc_kubelet.rc not in [0, 3] and ("could not be found" not in _svc_kubelet.stderr)
+
+#IF KUBELET IS RUNNING, THEN
+#reset k8s: drain and delete the three nodes, reset kubeadm, drop the kubeconfig (rm -f so a missing file does not fail the task)
+- name: reset k8s
+  shell: |
+    kubectl drain {{vm3}} --delete-local-data --force --ignore-daemonsets
+    kubectl drain {{vm2}} --delete-local-data --force --ignore-daemonsets
+    kubectl drain {{vm1}} --delete-local-data --force --ignore-daemonsets
+    kubectl delete node {{vm3}}
+    kubectl delete node {{vm2}}
+    kubectl delete node {{vm1}}
+    sudo kubeadm reset -f
+    sudo rm -f $HOME/.kube/config
+  when: "_svc_kubelet.rc == 0"
+
diff --git a/tools/lma/ansible-server/roles/clean-k8s-pre/tasks/main.yml b/tools/lma/ansible-server/roles/clean-k8s-pre/tasks/main.yml
new file mode 100644
index 00000000..6d12bd5f
--- /dev/null
+++ b/tools/lma/ansible-server/roles/clean-k8s-pre/tasks/main.yml
@@ -0,0 +1,65 @@
+# Copyright 2020 Adarsh yadav, Aditya Srivastava
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+
+# Remove the Kubernetes packages and docker-ce
+- name: Uninstalling K8s
+  yum:
+    state: absent
+    name: ["kubeadm", "kubectl", "kubelet", "docker-ce"]
+
+# Turn swap back on; a failure of swapon is tolerated
+- name: Enabling Swap on all nodes
+  ignore_errors: true
+  shell: "swapon -a"
+
+# Uncomment swap entries: the capture group includes the leading "/" so only the "# " prefix is stripped
+- name: Uncommenting Swap entries in /etc/fstab
+  replace:
+    path: /etc/fstab
+    regexp: '^# (/.*swap.*)'
+    replace: '\1'
+
+
+# Start firewalld now and enable it for subsequent boots
+- name: "Starting firewall"
+  service:
+    enabled: true
+    state: started
+    name: firewalld
+
+# Enforce SELinux immediately and switch permissive -> enforcing in /etc/selinux/config
+- name: Enabling SELinux on all nodes
+  shell: |
+    setenforce 1
+    sudo sed -i 's/^SELINUX=permissive$/SELINUX=enforcing/' /etc/selinux/config
+
+# Disable (rather than delete) the docker-ce-stable yum repository
+- name: removing Docker repo
+  command: "yum-config-manager --disable docker-ce-stable"
+
+# Remove the [kubernetes] stanza from kubernetes.repo. NOTE(review): presumably relies on the blockinfile markers written at install time - confirm
+- name: removing repository details in Kubernetes repo file.
+ blockinfile:
+ path: /etc/yum.repos.d/kubernetes.repo
+ state: absent
+ block: |
+ [kubernetes]
+ name=Kubernetes
+ baseurl=https://packages.cloud.google.com/yum/repos/kubernetes-el7-x86_64
+ enabled=1
+ gpgcheck=1
+ repo_gpgcheck=1
+ gpgkey=https://packages.cloud.google.com/yum/doc/yum-key.gpg
+ https://packages.cloud.google.com/yum/doc/rpm-package-key.gpg
diff --git a/tools/lma/ansible-server/roles/clean-k8s-worker-reset/tasks/main.yml b/tools/lma/ansible-server/roles/clean-k8s-worker-reset/tasks/main.yml
new file mode 100644
index 00000000..3ba9c9ea
--- /dev/null
+++ b/tools/lma/ansible-server/roles/clean-k8s-worker-reset/tasks/main.yml
@@ -0,0 +1,26 @@
+# Copyright 2020 Adarsh yadav, Aditya Srivastava
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+# check whether kubelet is running; rc 3 (unit installed but stopped) is a valid "not running" answer, not an error
+- name: check for kubelet
+  command: systemctl status kubelet
+  register: _svc_kubelet
+  failed_when: _svc_kubelet.rc not in [0, 3] and ("could not be found" not in _svc_kubelet.stderr)
+
+# Runs only when the kubelet probe returned 0 (kubelet running):
+# reset this worker's kubeadm state, forced via -f
+- name: reset k8s
+  when: _svc_kubelet.rc == 0
+  command: "kubeadm reset -f"
+
diff --git a/tools/lma/ansible-server/roles/clean-monitoring/tasks/main.yml b/tools/lma/ansible-server/roles/clean-monitoring/tasks/main.yml
new file mode 100644
index 00000000..49943ec0
--- /dev/null
+++ b/tools/lma/ansible-server/roles/clean-monitoring/tasks/main.yml
@@ -0,0 +1,48 @@
+# Copyright 2020 Aditya Srivastava.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+---
+#Deleting PAG setup from k8s cluster
+
+# check whether kubelet is running; rc 3 (unit installed but stopped) is a valid "not running" answer, not an error
+- name: check for kubelet
+  command: systemctl status kubelet
+  register: _svc_kubelet
+  failed_when: _svc_kubelet.rc not in [0, 3] and ("could not be found" not in _svc_kubelet.stderr)
+
+# -----------------------------------------------------------------------------------------------------------
+# Stage the namespace manifest on the target as /tmp/monitoring-namespace.yaml
+# -----------------------------------------------------------------------------------------------------------
+- name: copy namespace yaml to /tmp/files/
+  copy:
+    dest: /tmp/monitoring-namespace.yaml
+    src: "../../monitoring/files/monitoring-namespace.yaml"
+
+# -----------------------------------------------------------------------------------------------------------
+# Delete the monitoring namespace (skipped unless the kubelet probe returned 0)
+# -----------------------------------------------------------------------------------------------------------
+- name: Deleting Namespace
+  when: _svc_kubelet.rc == 0
+  k8s:
+    namespace: monitoring
+    src: /tmp/monitoring-namespace.yaml
+    state: absent
+
+# -----------------------------------------------------------------------------------------------------------
+# Remove the staged manifest /tmp/monitoring-namespace.yaml again
+# -----------------------------------------------------------------------------------------------------------
+- name: Removing /tmp/monitoring-namespace.yaml
+  file:
+    state: absent
+    path: /tmp/monitoring-namespace.yaml
diff --git a/tools/lma/ansible-server/roles/clean-nfs/tasks/main.yml b/tools/lma/ansible-server/roles/clean-nfs/tasks/main.yml
new file mode 100644
index 00000000..157db849
--- /dev/null
+++ b/tools/lma/ansible-server/roles/clean-nfs/tasks/main.yml
@@ -0,0 +1,44 @@
+# Copyright 2020 Adarsh yadav, Aditya Srivastava
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+---
+# Drop the three NFS export lines from /etc/exports
+- name: Edit /etc/export file for NFS
+  with_items:
+    - "/srv/nfs/master *(rw,sync,no_root_squash,no_subtree_check)"
+    - "/srv/nfs/data *(rw,sync,no_root_squash,no_subtree_check)"
+    - "/usr/share/monitoring_data/grafana *(rw,sync,no_root_squash,no_subtree_check)"
+  lineinfile:
+    state: absent
+    line: "{{ item }}"
+    path: /etc/exports
+
+# Uninstall the NFS server package
+- name: Uninstalling NFS server utils
+  yum:
+    state: absent
+    name: nfs-utils
+
+# Remove Elasticsearch data: /srv/nfs/data and /srv/nfs/master
+- name: Removing Directory for elasticsearch
+  with_items:
+    - ["data", "master"]
+  file:
+    state: absent
+    path: "/srv/nfs/{{ item }}"
+
+# Remove Grafana data
+- name: Removing Directory for grafana
+  file:
+    state: absent
+    path: /usr/share/monitoring_data/grafana
diff --git a/tools/lma/ansible-server/roles/monitoring/files/alertmanager/alertmanager-config.yaml b/tools/lma/ansible-server/roles/monitoring/files/alertmanager/alertmanager-config.yaml
new file mode 100644
index 00000000..7b9abc47
--- /dev/null
+++ b/tools/lma/ansible-server/roles/monitoring/files/alertmanager/alertmanager-config.yaml
@@ -0,0 +1,37 @@
+# Copyright 2020 Aditya Srivastava.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+kind: ConfigMap  # Alertmanager configuration; the alertmanager deployments mount it as config-volume at /etc/alertmanager
+apiVersion: v1
+metadata:
+ name: alertmanager-config
+ namespace: monitoring
+data:  # config.yml: default route sends alerts to the single "webhook" receiver
+ config.yml: |-
+ global:
+ route:
+ receiver: "webhook"
+ group_by: ['alertname', 'priority']
+ group_wait: 1s
+ group_interval: 5s
+ repeat_interval: 5s
+ routes:
+ - match:
+ severity: critical
+
+ receivers:
+ - name: "webhook"
+ webhook_configs:
+ - url: 'http://10.10.120.20/alertmanager'
+ send_resolved: true
diff --git a/tools/lma/ansible-server/roles/monitoring/files/alertmanager/alertmanager-deployment.yaml b/tools/lma/ansible-server/roles/monitoring/files/alertmanager/alertmanager-deployment.yaml
new file mode 100644
index 00000000..f1c3d78e
--- /dev/null
+++ b/tools/lma/ansible-server/roles/monitoring/files/alertmanager/alertmanager-deployment.yaml
@@ -0,0 +1,62 @@
+# Copyright 2020 Aditya Srivastava.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+kind: Deployment  # first Alertmanager instance; peers with "alertmanager1" on port 6783
+apiVersion: apps/v1
+metadata:
+  name: alertmanager
+  namespace: monitoring
+  labels:
+    adi10hero.monitoring: alertmanager
+    app: alertmanager
+spec:
+  replicas: 1
+  strategy:
+    type: Recreate
+  selector:
+    matchLabels:
+      adi10hero.monitoring: alertmanager
+      app: alertmanager
+  template:
+    metadata:
+      name: alertmanager
+      labels:
+        adi10hero.monitoring: alertmanager
+        app: alertmanager
+    spec:
+      restartPolicy: Always
+      volumes:
+        - name: config-volume  # backed by the alertmanager-config ConfigMap
+          configMap:
+            name: alertmanager-config
+        - name: alertmanager  # emptyDir backing for --storage.path
+          emptyDir: {}
+      containers:
+        - name: alertmanager
+          image: prom/alertmanager
+          args:
+            - "--config.file=/etc/alertmanager/config.yml"
+            - "--storage.path=/alertmanager"
+            - "--cluster.peer=alertmanager1:6783"
+            - "--cluster.listen-address=0.0.0.0:6783"
+          ports:
+            - containerPort: 9093
+            - containerPort: 6783
+          securityContext:
+            runAsUser: 0
+          volumeMounts:
+            - name: config-volume
+              mountPath: /etc/alertmanager
+            - name: alertmanager
+              mountPath: /alertmanager
diff --git a/tools/lma/ansible-server/roles/monitoring/files/alertmanager/alertmanager-service.yaml b/tools/lma/ansible-server/roles/monitoring/files/alertmanager/alertmanager-service.yaml
new file mode 100644
index 00000000..c67517d3
--- /dev/null
+++ b/tools/lma/ansible-server/roles/monitoring/files/alertmanager/alertmanager-service.yaml
@@ -0,0 +1,41 @@
+# Copyright 2020 Aditya Srivastava.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+apiVersion: v1  # NodePort Service exposing the alertmanager pods (web 9093, cluster 6783)
+kind: Service
+metadata:
+  labels:
+    adi10hero.monitoring: alertmanager
+    app: alertmanager
+  name: alertmanager
+  namespace: monitoring
+  annotations:
+    prometheus.io/scrape: 'true'
+    prometheus.io/path: /metrics  # Alertmanager serves its metrics under /metrics, not "/"
+    prometheus.io/port: '9093'  # the container exposes only 9093 and 6783; nothing listens on 8080
+
+spec:
+  selector:
+    app: alertmanager
+    adi10hero.monitoring: alertmanager
+  type: NodePort
+  ports:
+    - name: "9093"
+      port: 9093
+      targetPort: 9093
+      nodePort: 30930
+    - name: "6783"
+      port: 6783
+      targetPort: 6783
+      nodePort: 30679
diff --git a/tools/lma/ansible-server/roles/monitoring/files/alertmanager/alertmanager1-deployment.yaml b/tools/lma/ansible-server/roles/monitoring/files/alertmanager/alertmanager1-deployment.yaml
new file mode 100644
index 00000000..18b76456
--- /dev/null
+++ b/tools/lma/ansible-server/roles/monitoring/files/alertmanager/alertmanager1-deployment.yaml
@@ -0,0 +1,62 @@
+# Copyright 2020 Aditya Srivastava.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+kind: Deployment  # second Alertmanager instance; peers with "alertmanager" on port 6783
+apiVersion: apps/v1
+metadata:
+  name: alertmanager1
+  namespace: monitoring
+  labels:
+    adi10hero.monitoring: alertmanager1
+    app: alertmanager1
+spec:
+  replicas: 1
+  strategy:
+    type: Recreate
+  selector:
+    matchLabels:
+      adi10hero.monitoring: alertmanager1
+      app: alertmanager1
+  template:
+    metadata:
+      name: alertmanager1
+      labels:
+        adi10hero.monitoring: alertmanager1
+        app: alertmanager1
+    spec:
+      restartPolicy: Always
+      volumes:
+        - name: config-volume  # backed by the alertmanager-config ConfigMap
+          configMap:
+            name: alertmanager-config
+        - name: alertmanager  # emptyDir backing for --storage.path
+          emptyDir: {}
+      containers:
+        - name: alertmanager1
+          image: prom/alertmanager
+          args:
+            - "--config.file=/etc/alertmanager/config.yml"
+            - "--storage.path=/alertmanager"
+            - "--cluster.peer=alertmanager:6783"
+            - "--cluster.listen-address=0.0.0.0:6783"
+          ports:
+            - containerPort: 9093
+            - containerPort: 6783
+          securityContext:
+            runAsUser: 0
+          volumeMounts:
+            - name: config-volume
+              mountPath: /etc/alertmanager
+            - name: alertmanager
+              mountPath: /alertmanager
diff --git a/tools/lma/ansible-server/roles/monitoring/files/alertmanager/alertmanager1-service.yaml b/tools/lma/ansible-server/roles/monitoring/files/alertmanager/alertmanager1-service.yaml
new file mode 100644
index 00000000..66d0d2b1
--- /dev/null
+++ b/tools/lma/ansible-server/roles/monitoring/files/alertmanager/alertmanager1-service.yaml
@@ -0,0 +1,42 @@
+# Copyright 2020 Aditya Srivastava.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+apiVersion: v1  # NodePort Service exposing the alertmanager1 pods (web 9093, cluster 6783)
+kind: Service
+metadata:
+  labels:
+    adi10hero.monitoring: alertmanager1
+    app: alertmanager1
+  name: alertmanager1
+  namespace: monitoring
+  annotations:
+    prometheus.io/scrape: 'true'
+    prometheus.io/path: /metrics  # Alertmanager serves its metrics under /metrics, not "/"
+    prometheus.io/port: '9093'  # the container exposes only 9093 and 6783; nothing listens on 8080
+
+spec:
+  selector:
+    app: alertmanager1
+    adi10hero.monitoring: alertmanager1
+  type: NodePort
+  ports:
+    - name: "9093"
+      port: 9093
+      targetPort: 9093
+      nodePort: 30931
+    - name: "6783"
+      port: 6783
+      targetPort: 6783
+      nodePort: 30678
+
diff --git a/tools/lma/ansible-server/roles/monitoring/files/cadvisor/cadvisor-deamonset.yaml b/tools/lma/ansible-server/roles/monitoring/files/cadvisor/cadvisor-deamonset.yaml
new file mode 100644
index 00000000..6a62985e
--- /dev/null
+++ b/tools/lma/ansible-server/roles/monitoring/files/cadvisor/cadvisor-deamonset.yaml
@@ -0,0 +1,79 @@
+# Copyright 2020 Aditya Srivastava.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+apiVersion: apps/v1  # cAdvisor on every node, reading host paths through hostPath mounts
+kind: DaemonSet
+metadata:
+  name: cadvisor
+  namespace: monitoring
+  labels:
+    adi10hero.monitoring: cadvisor
+    app: cadvisor
+spec:
+  selector:
+    matchLabels:
+      app: cadvisor
+      adi10hero.monitoring: cadvisor
+  template:
+    metadata:
+      name: cadvisor
+      labels:
+        adi10hero.monitoring: cadvisor
+        app: cadvisor
+    spec:
+      containers:
+        - image: gcr.io/google-containers/cadvisor
+          name: cadvisor
+          ports:
+            - containerPort: 8080
+          securityContext:
+            runAsUser: 0
+          volumeMounts:
+            - mountPath: /rootfs
+              name: cadvisor-hostpath0
+              readOnly: true
+            - mountPath: /var/run
+              name: cadvisor-hostpath1
+            - mountPath: /sys
+              name: cadvisor-hostpath2
+              readOnly: true
+            - mountPath: /sys/fs/cgroup
+              name: cadvisor-hostpath3
+              readOnly: true
+            - mountPath: /dev/disk
+              name: cadvisor-hostpath4
+              readOnly: true
+            - mountPath: /var/lib/docker
+              name: cadvisor-hostpath5
+              readOnly: true
+      restartPolicy: Always
+      volumes:
+        - hostPath:
+            path: /
+          name: cadvisor-hostpath0
+        - hostPath:
+            path: /var/run
+          name: cadvisor-hostpath1
+        - hostPath:
+            path: /sys
+          name: cadvisor-hostpath2
+        - hostPath:
+            path: /sys/fs/cgroup  # was /cgroup, which would mask the container's /sys/fs/cgroup with an unrelated host directory
+          name: cadvisor-hostpath3
+        - hostPath:
+            path: /dev/disk/
+          name: cadvisor-hostpath4
+        - hostPath:
+            path: /var/lib/docker/
+          name: cadvisor-hostpath5
diff --git a/tools/lma/ansible-server/roles/monitoring/files/cadvisor/cadvisor-service.yaml b/tools/lma/ansible-server/roles/monitoring/files/cadvisor/cadvisor-service.yaml
new file mode 100644
index 00000000..734240b8
--- /dev/null
+++ b/tools/lma/ansible-server/roles/monitoring/files/cadvisor/cadvisor-service.yaml
@@ -0,0 +1,30 @@
+# Copyright 2020 Aditya Srivastava.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+kind: Service  # exposes the cadvisor pods on port 8080 (no explicit Service type set)
+apiVersion: v1
+metadata:
+  name: cadvisor
+  namespace: monitoring
+  labels:
+    adi10hero.monitoring: cadvisor
+    app: cadvisor
+spec:
+  selector:
+    adi10hero.monitoring: cadvisor
+    app: cadvisor
+  ports:
+    - name: "8080"
+      targetPort: 8080
+      port: 8080
diff --git a/tools/lma/ansible-server/roles/monitoring/files/collectd-exporter/collectd-exporter-deployment.yaml b/tools/lma/ansible-server/roles/monitoring/files/collectd-exporter/collectd-exporter-deployment.yaml
new file mode 100644
index 00000000..b6bfe0b6
--- /dev/null
+++ b/tools/lma/ansible-server/roles/monitoring/files/collectd-exporter/collectd-exporter-deployment.yaml
@@ -0,0 +1,51 @@
+# Copyright 2020 Aditya Srivastava.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+kind: Deployment  # collectd-exporter: receives collectd traffic on UDP 25826, also exposes 9103
+apiVersion: apps/v1
+metadata:
+  name: collectd-exporter
+  namespace: monitoring
+  labels:
+    adi10hero.monitoring: collectd-exporter
+    app: collectd-exporter
+spec:
+  replicas: 1
+  strategy:
+    type: Recreate
+  selector:
+    matchLabels:
+      adi10hero.monitoring: collectd-exporter
+      app: collectd-exporter
+  template:
+    metadata:
+      name: collectd-exporter
+      labels:
+        adi10hero.monitoring: collectd-exporter
+        app: collectd-exporter
+    spec:
+      restartPolicy: Always
+      volumes: null
+      containers:
+        - name: collectd-exporter
+          image: prom/collectd-exporter
+          args:
+            - "--collectd.listen-address=0.0.0.0:25826"
+          ports:
+            - containerPort: 9103
+            - containerPort: 25826
+              protocol: UDP
+          securityContext:  # assumed container-level, as in the sibling manifests - TODO confirm
+            runAsUser: 0
+
diff --git a/tools/lma/ansible-server/roles/monitoring/files/collectd-exporter/collectd-exporter-service.yaml b/tools/lma/ansible-server/roles/monitoring/files/collectd-exporter/collectd-exporter-service.yaml
new file mode 100644
index 00000000..5609d04a
--- /dev/null
+++ b/tools/lma/ansible-server/roles/monitoring/files/collectd-exporter/collectd-exporter-service.yaml
@@ -0,0 +1,35 @@
+# Copyright 2020 Aditya Srivastava.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+kind: Service  # NodePort access to collectd-exporter: 9103 -> 30103, UDP 25826 -> 30826
+apiVersion: v1
+metadata:
+  name: collectd-exporter
+  namespace: monitoring
+  labels:
+    adi10hero.monitoring: collectd-exporter
+    app: collectd-exporter
+spec:
+  type: NodePort
+  selector:
+    adi10hero.monitoring: collectd-exporter
+    app: collectd-exporter
+  ports:
+    - name: "9103"
+      nodePort: 30103
+      port: 9103
+    - name: "25826"
+      nodePort: 30826
+      port: 25826
+      protocol: UDP
diff --git a/tools/lma/ansible-server/roles/monitoring/files/grafana/grafana-datasource-config.yaml b/tools/lma/ansible-server/roles/monitoring/files/grafana/grafana-datasource-config.yaml
new file mode 100644
index 00000000..e2b8c9fa
--- /dev/null
+++ b/tools/lma/ansible-server/roles/monitoring/files/grafana/grafana-datasource-config.yaml
@@ -0,0 +1,35 @@
+# Copyright 2020 Aditya Srivastava.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+apiVersion: v1
+kind: ConfigMap  # Grafana datasource definition; the grafana deployment mounts it at /etc/grafana/provisioning/datasources
+metadata:
+ name: grafana-datasources
+ namespace: monitoring
+data:  # prometheus.yaml holds a JSON document declaring one "prometheus" datasource at http://prometheus-main:9090
+ prometheus.yaml: |-
+ {
+ "apiVersion": 1,
+ "datasources": [
+ {
+ "access":"proxy",
+ "editable": true,
+ "name": "prometheus",
+ "orgId": 1,
+ "type": "prometheus",
+ "url": "http://prometheus-main:9090",
+ "version": 1
+ }
+ ]
+ }
diff --git a/tools/lma/ansible-server/roles/monitoring/files/grafana/grafana-deployment.yaml b/tools/lma/ansible-server/roles/monitoring/files/grafana/grafana-deployment.yaml
new file mode 100644
index 00000000..afb00948
--- /dev/null
+++ b/tools/lma/ansible-server/roles/monitoring/files/grafana/grafana-deployment.yaml
@@ -0,0 +1,68 @@
+# Copyright 2020 Aditya Srivastava.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+kind: Deployment  # single Grafana instance; data on the grafana-pvc claim, datasources from a ConfigMap
+apiVersion: apps/v1
+metadata:
+  name: grafana
+  namespace: monitoring
+  labels:
+    app: grafana
+    adi10hero.monitoring: grafana
+spec:
+  replicas: 1
+  strategy:
+    type: Recreate
+  selector:
+    matchLabels:
+      app: grafana
+      adi10hero.monitoring: grafana
+  template:
+    metadata:
+      name: grafana
+      labels:
+        app: grafana
+        adi10hero.monitoring: grafana
+    spec:
+      restartPolicy: Always
+      volumes:
+        - name: grafana-storage
+          persistentVolumeClaim:
+            claimName: grafana-pvc
+        - name: grafana-datasources
+          configMap:
+            name: grafana-datasources
+            defaultMode: 420  # decimal notation; equals octal 0644
+      containers:
+        - name: grafana
+          image: grafana/grafana
+          ports:
+            - containerPort: 3000
+          env:  # NOTE(review): admin/admin credentials are committed in clear text - consider a Secret
+            - name: GF_SECURITY_ADMIN_PASSWORD
+              value: "admin"
+            - name: GF_SECURITY_ADMIN_USER
+              value: "admin"
+            - name: GF_SERVER_DOMAIN
+              value: "10.10.120.20"
+            - name: GF_SERVER_ROOT_URL
+              value: "%(protocol)s://%(domain)s:/metrics"
+          securityContext:
+            runAsUser: 0
+          volumeMounts:
+            - name: grafana-storage
+              mountPath: /var/lib/grafana
+            - name: grafana-datasources
+              mountPath: /etc/grafana/provisioning/datasources
+              readOnly: false
diff --git a/tools/lma/ansible-server/roles/monitoring/files/grafana/grafana-pv.yaml b/tools/lma/ansible-server/roles/monitoring/files/grafana/grafana-pv.yaml
new file mode 100644
index 00000000..06bcc31b
--- /dev/null
+++ b/tools/lma/ansible-server/roles/monitoring/files/grafana/grafana-pv.yaml
@@ -0,0 +1,31 @@
+# Copyright 2020 Aditya Srivastava.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+kind: PersistentVolume  # 5Gi NFS-backed volume; its labels are what grafana-pvc selects on
+apiVersion: v1
+metadata:
+  name: grafana-pv
+  namespace: monitoring  # NOTE(review): PersistentVolumes are cluster-scoped; this field is presumably ignored - confirm
+  labels:
+    adi10hero.monitoring: grafana-pv
+    app: grafana-pv
+spec:
+  storageClassName: monitoring
+  accessModes:
+    - ReadWriteMany
+  capacity:
+    storage: 5Gi
+  nfs:
+    path: /usr/share/monitoring_data/grafana
+    server: 10.10.120.211
diff --git a/tools/lma/ansible-server/roles/monitoring/files/grafana/grafana-pvc.yaml b/tools/lma/ansible-server/roles/monitoring/files/grafana/grafana-pvc.yaml
new file mode 100644
index 00000000..2c2955c8
--- /dev/null
+++ b/tools/lma/ansible-server/roles/monitoring/files/grafana/grafana-pvc.yaml
@@ -0,0 +1,33 @@
+# Copyright 2020 Aditya Srivastava.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+kind: PersistentVolumeClaim  # requests 4Gi from a volume labelled grafana-pv in storage class "monitoring"
+apiVersion: v1
+metadata:
+  name: grafana-pvc
+  namespace: monitoring
+  labels:
+    adi10hero.monitoring: grafana-pvc
+    app: grafana-pvc
+spec:
+  storageClassName: monitoring
+  accessModes:
+    - ReadWriteMany
+  selector:
+    matchLabels:
+      adi10hero.monitoring: grafana-pv
+      app: grafana-pv
+  resources:
+    requests:
+      storage: 4Gi
diff --git a/tools/lma/ansible-server/roles/monitoring/files/grafana/grafana-service.yaml b/tools/lma/ansible-server/roles/monitoring/files/grafana/grafana-service.yaml
new file mode 100644
index 00000000..d1c9c9cc
--- /dev/null
+++ b/tools/lma/ansible-server/roles/monitoring/files/grafana/grafana-service.yaml
@@ -0,0 +1,36 @@
+# Copyright 2020 Aditya Srivastava.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+kind: Service  # NodePort access to Grafana: port 3000 -> nodePort 30000
+apiVersion: v1
+metadata:
+  name: grafana
+  namespace: monitoring
+  annotations:
+    prometheus.io/port: '3000'
+    prometheus.io/scrape: 'true'
+  labels:
+    adi10hero.monitoring: grafana
+    app: grafana
+spec:
+  type: NodePort
+  selector:
+    adi10hero.monitoring: grafana
+    app: grafana
+  ports:
+    - name: "3000"
+      nodePort: 30000
+      targetPort: 3000
+      port: 3000
+
diff --git a/tools/lma/ansible-server/roles/monitoring/files/kube-state-metrics/kube-state-metrics-deployment.yaml b/tools/lma/ansible-server/roles/monitoring/files/kube-state-metrics/kube-state-metrics-deployment.yaml
new file mode 100644
index 00000000..af3c5469
--- /dev/null
+++ b/tools/lma/ansible-server/roles/monitoring/files/kube-state-metrics/kube-state-metrics-deployment.yaml
@@ -0,0 +1,36 @@
+# Copyright 2020 Aditya Srivastava.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+ name: kube-state-metrics
+ namespace: kube-system
+spec:
+ selector:
+ matchLabels:
+ app: kube-state-metrics
+ replicas: 1
+ template:
+ metadata:
+ labels:
+ app: kube-state-metrics
+ spec:
+ #serviceAccountName: prometheus
+ containers:
+ - name: kube-state-metrics
+ image: quay.io/coreos/kube-state-metrics:v1.2.0
+ ports:
+ - containerPort: 8080
+ name: monitoring
diff --git a/tools/lma/ansible-server/roles/monitoring/files/kube-state-metrics/kube-state-metrics-service.yaml b/tools/lma/ansible-server/roles/monitoring/files/kube-state-metrics/kube-state-metrics-service.yaml
new file mode 100644
index 00000000..8d294391
--- /dev/null
+++ b/tools/lma/ansible-server/roles/monitoring/files/kube-state-metrics/kube-state-metrics-service.yaml
@@ -0,0 +1,26 @@
+# Copyright 2020 Aditya Srivastava.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+kind: Service
+apiVersion: v1
+metadata:
+ name: kube-state-metrics
+ namespace: kube-system
+spec:
+ selector:
+ app: kube-state-metrics
+ ports:
+ - protocol: TCP
+ port: 8080
+ targetPort: 8080
diff --git a/tools/lma/ansible-server/roles/monitoring/files/monitoring-namespace.yaml b/tools/lma/ansible-server/roles/monitoring/files/monitoring-namespace.yaml
new file mode 100644
index 00000000..f1c9b889
--- /dev/null
+++ b/tools/lma/ansible-server/roles/monitoring/files/monitoring-namespace.yaml
@@ -0,0 +1,18 @@
+# Copyright 2020 Aditya Srivastava.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+apiVersion: v1
+kind: Namespace
+metadata:
+ name: monitoring
diff --git a/tools/lma/ansible-server/roles/monitoring/files/node-exporter/nodeexporter-daemonset.yaml b/tools/lma/ansible-server/roles/monitoring/files/node-exporter/nodeexporter-daemonset.yaml
new file mode 100644
index 00000000..9334b2f4
--- /dev/null
+++ b/tools/lma/ansible-server/roles/monitoring/files/node-exporter/nodeexporter-daemonset.yaml
@@ -0,0 +1,80 @@
+# Copyright 2020 Aditya Srivastava.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+apiVersion: apps/v1
+kind: DaemonSet
+metadata:
+ name: node-exporter-daemonset
+ namespace: monitoring
+ labels:
+ app: node-exporter
+ adi10hero.monitoring: node-exporter
+spec:
+ selector:
+ matchLabels:
+ app: node-exporter
+ adi10hero.monitoring: node-exporter
+ template:
+ metadata:
+ labels:
+ app: node-exporter
+ adi10hero.monitoring: node-exporter
+ annotations:
+ prometheus.io/scrape: "true"
+ prometheus.io/port: "9100"
+ spec:
+ hostPID: true
+ hostIPC: true
+ hostNetwork: true
+ containers:
+ - ports:
+ - containerPort: 9100
+ protocol: TCP
+ resources:
+ requests:
+ cpu: 0.15
+ securityContext:
+ runAsUser: 0
+ privileged: true
+ image: prom/node-exporter:v0.15.2
+ args:
+ - --path.procfs
+ - /host/proc
+ - --path.sysfs
+ - /host/sys
+ - --collector.filesystem.ignored-mount-points  # value below is passed verbatim (no shell), so it must not carry extra quotes
+ - '^/(sys|proc|dev|host|etc)($|/)'
+ name: node-exporter
+ volumeMounts:
+ - name: dev
+ mountPath: /host/dev
+ - name: proc
+ mountPath: /host/proc
+ - name: sys
+ mountPath: /host/sys
+ - name: rootfs
+ mountPath: /rootfs
+ volumes:
+ - name: proc
+ hostPath:
+ path: /proc
+ - name: dev
+ hostPath:
+ path: /dev
+ - name: sys
+ hostPath:
+ path: /sys
+ - name: rootfs
+ hostPath:
+ path: /
diff --git a/tools/lma/ansible-server/roles/monitoring/files/node-exporter/nodeexporter-service.yaml b/tools/lma/ansible-server/roles/monitoring/files/node-exporter/nodeexporter-service.yaml
new file mode 100644
index 00000000..dd0aea4d
--- /dev/null
+++ b/tools/lma/ansible-server/roles/monitoring/files/node-exporter/nodeexporter-service.yaml
@@ -0,0 +1,33 @@
+# Copyright 2020 Aditya Srivastava.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+apiVersion: v1
+kind: Service
+metadata:
+ labels:
+ adi10hero.monitoring: node-exporter
+ app: node-exporter
+ name: node-exporter
+ namespace: monitoring
+ annotations:
+ prometheus.io/scrape: "true"
+ prometheus.io/port: "9100"
+spec:
+ ports:
+ - name: "node-exporter"
+ port: 9100
+ targetPort: 9100
+ selector:
+ adi10hero.monitoring: node-exporter
+ app: node-exporter
diff --git a/tools/lma/ansible-server/roles/monitoring/files/prometheus/main-prometheus-service.yaml b/tools/lma/ansible-server/roles/monitoring/files/prometheus/main-prometheus-service.yaml
new file mode 100644
index 00000000..58b220a8
--- /dev/null
+++ b/tools/lma/ansible-server/roles/monitoring/files/prometheus/main-prometheus-service.yaml
@@ -0,0 +1,35 @@
+# Copyright 2020 Aditya Srivastava.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+apiVersion: v1
+kind: Service
+metadata:
+ labels:
+ adi10hero.monitoring: prometheus-main
+ app: prometheus-main
+ name: prometheus-main
+ namespace: monitoring
+ annotations:
+ prometheus.io/scrape: 'true'
+ prometheus.io/port: '9090'
+spec:
+ type: NodePort
+ ports:
+ - name: prometheus-main
+ protocol: TCP
+ port: 9090
+ nodePort: 30902
+ selector:
+ adi10hero.monitoring: prometheus1
+ app: prometheus
diff --git a/tools/lma/ansible-server/roles/monitoring/files/prometheus/prometheus-config.yaml b/tools/lma/ansible-server/roles/monitoring/files/prometheus/prometheus-config.yaml
new file mode 100644
index 00000000..917f978f
--- /dev/null
+++ b/tools/lma/ansible-server/roles/monitoring/files/prometheus/prometheus-config.yaml
@@ -0,0 +1,609 @@
+# Copyright 2020 Aditya Srivastava.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+apiVersion: v1
+kind: ConfigMap
+metadata:
+ name: prometheus-config
+ namespace: monitoring
+data:
+ alert.rules: |-
+ groups:
+ - name: targets
+ rules:
+ - alert: MonitorServiceDown
+ expr: up == 0
+ for: 30s
+ labels:
+ severity: critical
+ annotations:
+ summary: "Monitor service non-operational"
+ description: "Service {{ $labels.instance }} is down."
+ - alert: HighCpuLoad
+ expr: node_load1 > 1.9
+ for: 15s
+ labels:
+ severity: critical
+ annotations:
+ summary: "Service under high load"
+ description: "Docker host is under high load, the avg load 1m is at {{ $value}}. Reported by instance {{ $labels.instance }} of job {{ $labels.job }}."
+
+ - name: host and hardware
+ rules:
+ - alert: HostHighCpuLoad
+ expr: 100 - (avg by(instance) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
+ for: 5m
+ labels:
+ severity: warning
+ annotations:
+ summary: "Host high CPU load (instance {{ $labels.instance }})"
+ description: "CPU load is > 80%\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
+
+ - alert: HostSwapIsFillingUp
+ expr: (1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80
+ for: 5m
+ labels:
+ severity: warning
+ annotations:
+ summary: "Host swap is filling up (instance {{ $labels.instance }})"
+ description: "Swap is filling up (>80%)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
+
+ - alert: HighMemoryLoad
+ expr: (sum(node_memory_MemTotal_bytes) - sum(node_memory_MemFree_bytes + node_memory_Buffers_bytes + node_memory_Cached_bytes) ) / sum(node_memory_MemTotal_bytes) * 100 > 85
+ for: 30s
+ labels:
+ severity: warning
+ annotations:
+ summary: "Server memory is almost full"
+ description: "Docker host memory usage is {{ humanize $value}}%. Reported by instance {{ $labels.instance }} of job {{ $labels.job }}."
+
+ - alert: HighStorageLoad
+ expr: (node_filesystem_size_bytes{fstype="aufs"} - node_filesystem_free_bytes{fstype="aufs"}) / node_filesystem_size_bytes{fstype="aufs"} * 100 > 85
+ for: 30s
+ labels:
+ severity: warning
+ annotations:
+ summary: "Server storage is almost full"
+ description: "Docker host storage usage is {{ humanize $value}}%. Reported by instance {{ $labels.instance }} of job {{ $labels.job }}."
+
+ - alert: HostNetworkTransmitErrors
+ expr: increase(node_network_transmit_errs_total[5m]) > 0
+ for: 5m
+ labels:
+ severity: warning
+ annotations:
+ summary: "Host Network Transmit Errors (instance {{ $labels.instance }})"
+ description: "{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf \"%.0f\" $value }} transmit errors in the last five minutes.\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
+
+ - alert: HostOutOfMemory
+ expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10
+ for: 5m
+ labels:
+ severity: warning
+ annotations:
+ summary: "Host out of memory (instance {{ $labels.instance }})"
+ description: "Node memory is filling up (< 10% left)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
+
+ - alert: HostMemoryUnderMemoryPressure
+ expr: rate(node_vmstat_pgmajfault[1m]) > 1000
+ for: 5m
+ labels:
+ severity: warning
+ annotations:
+ summary: "Host memory under memory pressure (instance {{ $labels.instance }})"
+ description: "The node is under heavy memory pressure. High rate of major page faults\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
+
+ - alert: HostUnusualNetworkThroughputIn
+ expr: sum by (instance) (irate(node_network_receive_bytes_total[2m])) / 1024 / 1024 > 100
+ for: 5m
+ labels:
+ severity: warning
+ annotations:
+ summary: "Host unusual network throughput in (instance {{ $labels.instance }})"
+ description: "Host network interfaces are probably receiving too much data (> 100 MB/s)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
+
+ - alert: HostUnusualNetworkThroughputOut
+ expr: sum by (instance) (irate(node_network_transmit_bytes_total[2m])) / 1024 / 1024 > 100
+ for: 5m
+ labels:
+ severity: warning
+ annotations:
+ summary: "Host unusual network throughput out (instance {{ $labels.instance }})"
+ description: "Host network interfaces are probably sending too much data (> 100 MB/s)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
+
+ - alert: HostUnusualDiskRateRead
+ expr: sum by (instance) (irate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50
+ for: 5m
+ labels:
+ severity: warning
+ annotations:
+ summary: "Host unusual disk read rate (instance {{ $labels.instance }})"
+ description: "Disk is probably reading too much data (> 50 MB/s)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
+
+ - alert: HostUnusualDiskRateWrite
+ expr: sum by (instance) (irate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50
+ for: 5m
+ labels:
+ severity: warning
+ annotations:
+ summary: "Host unusual disk write rate (instance {{ $labels.instance }})"
+ description: "Disk is probably writing too much data (> 50 MB/s)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
+
+ - alert: HostOutOfDiskSpace
+ expr: (node_filesystem_avail_bytes{mountpoint="/rootfs"} * 100) / node_filesystem_size_bytes{mountpoint="/rootfs"} < 10
+ for: 5m
+ labels:
+ severity: warning
+ annotations:
+ summary: "Host out of disk space (instance {{ $labels.instance }})"
+ description: "Disk is almost full (< 10% left)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
+
+ - alert: HostDiskWillFillIn4Hours
+ expr: predict_linear(node_filesystem_free_bytes{fstype!~"tmpfs"}[1h], 4 * 3600) < 0
+ for: 5m
+ labels:
+ severity: warning
+ annotations:
+ summary: "Host disk will fill in 4 hours (instance {{ $labels.instance }})"
+ description: "Disk will fill in 4 hours at current write rate\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
+
+ - alert: HostPhysicalComponentTooHot
+ expr: node_hwmon_temp_celsius > 75
+ for: 5m
+ labels:
+ severity: warning
+ annotations:
+ summary: "Host physical component too hot (instance {{ $labels.instance }})"
+ description: "Physical hardware component too hot\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
+
+ - alert: HostNodeOvertemperatureAlarm
+ expr: node_hwmon_temp_alarm == 1
+ for: 5m
+ labels:
+ severity: critical
+ annotations:
+ summary: "Host node overtemperature alarm (instance {{ $labels.instance }})"
+ description: "Physical node temperature alarm triggered\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
+
+ - alert: HostKernelVersionDeviations
+ expr: count(sum(label_replace(node_uname_info, "kernel", "$1", "release", "([0-9]+.[0-9]+.[0-9]+).*")) by (kernel)) > 1
+ for: 5m
+ labels:
+ severity: warning
+ annotations:
+ summary: "Host kernel version deviations (instance {{ $labels.instance }})"
+ description: "Different kernel versions are running\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
+
+ - alert: HostOomKillDetected
+ expr: increase(node_vmstat_oom_kill[5m]) > 0
+ for: 5m
+ labels:
+ severity: warning
+ annotations:
+ summary: "Host OOM kill detected (instance {{ $labels.instance }})"
+ description: "OOM kill detected\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
+
+ - alert: HostEdacCorrectableErrorsDetected
+ expr: increase(node_edac_correctable_errors_total[5m]) > 0
+ for: 5m
+ labels:
+ severity: info
+ annotations:
+ summary: "Host EDAC Correctable Errors detected (instance {{ $labels.instance }})"
+ description: "{{ $labels.instance }} has had {{ printf \"%.0f\" $value }} correctable memory errors reported by EDAC in the last 5 minutes.\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
+
+ - alert: HostEdacUncorrectableErrorsDetected
+ expr: node_edac_uncorrectable_errors_total > 0
+ for: 5m
+ labels:
+ severity: warning
+ annotations:
+ summary: "Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }})"
+ description: "{{ $labels.instance }} has had {{ printf \"%.0f\" $value }} uncorrectable memory errors reported by EDAC in the last 5 minutes.\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
+
+ - alert: HostNetworkReceiveErrors
+ expr: increase(node_network_receive_errs_total[5m]) > 0
+ for: 5m
+ labels:
+ severity: warning
+ annotations:
+ summary: "Host Network Receive Errors (instance {{ $labels.instance }})"
+ description: "{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf \"%.0f\" $value }} receive errors in the last five minutes.\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
+
+ - alert: HostNetworkTransmitErrors
+ expr: increase(node_network_transmit_errs_total[5m]) > 0
+ for: 5m
+ labels:
+ severity: warning
+ annotations:
+ summary: "Host Network Transmit Errors (instance {{ $labels.instance }})"
+ description: "{{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf \"%.0f\" $value }} transmit errors in the last five minutes.\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
+
+ - name: container
+ rules:
+ - alert: ContainerKilled
+ expr: time() - container_last_seen > 60
+ for: 5m
+ labels:
+ severity: warning
+ annotations:
+ summary: "Container killed (instance {{ $labels.instance }})"
+ description: "A container has disappeared\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
+
+ - alert: ContainerCpuUsage
+ expr: sum by(instance, name) (rate(container_cpu_usage_seconds_total[3m]) * 100 > 80)
+ for: 5m
+ labels:
+ severity: warning
+ annotations:
+ summary: "Container CPU usage (instance {{ $labels.instance }})"
+ description: "Container CPU usage is above 80%\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
+
+ - alert: ContainerMemoryUsage
+ expr: (sum(container_memory_usage_bytes) BY (instance, name) / sum(container_spec_memory_limit_bytes > 0) BY (instance, name) * 100) > 80  # percent of the memory limit; threshold matches the description
+ for: 5m
+ labels:
+ severity: warning
+ annotations:
+ summary: "Container Memory usage (instance {{ $labels.instance }})"
+ description: "Container Memory usage is above 80%\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
+
+ - alert: ContainerVolumeUsage
+ expr: (1 - (sum(container_fs_inodes_free) BY (instance) / sum(container_fs_inodes_total) BY (instance))) * 100 > 80  # used fraction is computed first, then scaled to percent
+ for: 5m
+ labels:
+ severity: warning
+ annotations:
+ summary: "Container Volume usage (instance {{ $labels.instance }})"
+ description: "Container Volume usage is above 80%\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
+
+ - alert: ContainerVolumeIoUsage
+ expr: (sum(container_fs_io_current) BY (instance, name) * 100) > 80
+ for: 5m
+ labels:
+ severity: warning
+ annotations:
+ summary: "Container Volume IO usage (instance {{ $labels.instance }})"
+ description: "Container Volume IO usage is above 80%\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
+
+ - alert: ContainerHighThrottleRate
+ expr: rate(container_cpu_cfs_throttled_seconds_total[3m]) > 1
+ for: 5m
+ labels:
+ severity: warning
+ annotations:
+ summary: "Container high throttle rate (instance {{ $labels.instance }})"
+ description: "Container is being throttled\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
+
+ - name: kubernetes
+ rules:
+ - alert: KubernetesNodeReady
+ expr: kube_node_status_condition{condition="Ready",status="true"} == 0
+ for: 5m
+ labels:
+ severity: critical
+ annotations:
+ summary: "Kubernetes Node ready (instance {{ $labels.instance }})"
+ description: "Node {{ $labels.node }} has been unready for a long time\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
+
+ - alert: KubernetesMemoryPressure
+ expr: kube_node_status_condition{condition="MemoryPressure",status="true"} == 1
+ for: 5m
+ labels:
+ severity: critical
+ annotations:
+ summary: "Kubernetes memory pressure (instance {{ $labels.instance }})"
+ description: "{{ $labels.node }} has MemoryPressure condition\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
+
+ - alert: KubernetesDiskPressure
+ expr: kube_node_status_condition{condition="DiskPressure",status="true"} == 1
+ for: 5m
+ labels:
+ severity: critical
+ annotations:
+ summary: "Kubernetes disk pressure (instance {{ $labels.instance }})"
+ description: "{{ $labels.node }} has DiskPressure condition\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
+
+ - alert: KubernetesOutOfDisk
+ expr: kube_node_status_condition{condition="OutOfDisk",status="true"} == 1
+ for: 5m
+ labels:
+ severity: critical
+ annotations:
+ summary: "Kubernetes out of disk (instance {{ $labels.instance }})"
+ description: "{{ $labels.node }} has OutOfDisk condition\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
+
+ - alert: KubernetesJobFailed
+ expr: kube_job_status_failed > 0
+ for: 5m
+ labels:
+ severity: warning
+ annotations:
+ summary: "Kubernetes Job failed (instance {{ $labels.instance }})"
+ description: "Job {{$labels.namespace}}/{{$labels.exported_job}} failed to complete\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
+
+ - alert: KubernetesCronjobSuspended
+ expr: kube_cronjob_spec_suspend != 0
+ for: 5m
+ labels:
+ severity: warning
+ annotations:
+ summary: "Kubernetes CronJob suspended (instance {{ $labels.instance }})"
+ description: "CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} is suspended\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
+
+ - alert: KubernetesPersistentvolumeclaimPending
+ expr: kube_persistentvolumeclaim_status_phase{phase="Pending"} == 1
+ for: 5m
+ labels:
+ severity: warning
+ annotations:
+ summary: "Kubernetes PersistentVolumeClaim pending (instance {{ $labels.instance }})"
+ description: "PersistentVolumeClaim {{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} is pending\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
+
+ - alert: KubernetesVolumeOutOfDiskSpace
+ expr: kubelet_volume_stats_available_bytes / kubelet_volume_stats_capacity_bytes * 100 < 10
+ for: 5m
+ labels:
+ severity: warning
+ annotations:
+ summary: "Kubernetes Volume out of disk space (instance {{ $labels.instance }})"
+ description: "Volume is almost full (< 10% left)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
+
+ - alert: KubernetesVolumeFullInFourDays
+ expr: predict_linear(kubelet_volume_stats_available_bytes[6h], 4 * 24 * 3600) < 0
+ for: 5m
+ labels:
+ severity: critical
+ annotations:
+ summary: "Kubernetes Volume full in four days (instance {{ $labels.instance }})"
+ description: "{{ $labels.namespace }}/{{ $labels.persistentvolumeclaim }} is expected to fill up within four days. Currently {{ $value | humanize }}% is available.\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
+
+ - alert: KubernetesPersistentvolumeError
+ expr: kube_persistentvolume_status_phase{phase=~"Failed|Pending",job="kube-state-metrics"} > 0
+ for: 5m
+ labels:
+ severity: critical
+ annotations:
+ summary: "Kubernetes PersistentVolume error (instance {{ $labels.instance }})"
+ description: "Persistent volume is in bad state\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
+
+ - alert: KubernetesStatefulsetDown
+ expr: (kube_statefulset_status_replicas_ready / kube_statefulset_status_replicas_current) != 1
+ for: 5m
+ labels:
+ severity: critical
+ annotations:
+ summary: "Kubernetes StatefulSet down (instance {{ $labels.instance }})"
+ description: "A StatefulSet went down\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
+
+ - alert: KubernetesHpaScalingAbility
+ expr: kube_hpa_status_condition{condition="AbleToScale", status="false"} == 1  # condition carries the condition name, status carries true/false/unknown
+ for: 5m
+ labels:
+ severity: warning
+ annotations:
+ summary: "Kubernetes HPA scaling ability (instance {{ $labels.instance }})"
+ description: "Pod is unable to scale\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
+
+ - alert: KubernetesHpaMetricAvailability
+ expr: kube_hpa_status_condition{condition="ScalingActive", status="false"} == 1  # condition carries the condition name, status carries true/false/unknown
+ for: 5m
+ labels:
+ severity: warning
+ annotations:
+ summary: "Kubernetes HPA metric availability (instance {{ $labels.instance }})"
+ description: "HPA is not able to collect metrics\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
+
+ - alert: KubernetesHpaScaleCapability
+ expr: kube_hpa_status_desired_replicas >= kube_hpa_spec_max_replicas
+ for: 5m
+ labels:
+ severity: warning
+ annotations:
+ summary: "Kubernetes HPA scale capability (instance {{ $labels.instance }})"
+ description: "The maximum number of desired Pods has been hit\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
+
+ - alert: KubernetesPodNotHealthy
+ expr: min_over_time(sum by (namespace, pod) (kube_pod_status_phase{phase=~"Pending|Unknown|Failed"})[1h:]) > 0
+ for: 5m
+ labels:
+ severity: critical
+ annotations:
+ summary: "Kubernetes Pod not healthy (instance {{ $labels.instance }})"
+ description: "Pod has been in a non-ready state for longer than an hour.\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
+
+ - alert: KubernetesPodCrashLooping
+ expr: rate(kube_pod_container_status_restarts_total[15m]) * 60 * 5 > 5
+ for: 5m
+ labels:
+ severity: warning
+ annotations:
+ summary: "Kubernetes pod crash looping (instance {{ $labels.instance }})"
+ description: "Pod {{ $labels.pod }} is crash looping\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
+
+ - alert: KubernetesReplicassetMismatch
+ expr: kube_replicaset_spec_replicas != kube_replicaset_status_ready_replicas
+ for: 5m
+ labels:
+ severity: warning
+ annotations:
+ summary: "Kubernetes ReplicasSet mismatch (instance {{ $labels.instance }})"
+ description: "Deployment Replicas mismatch\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
+
+ - alert: KubernetesDeploymentReplicasMismatch
+ expr: kube_deployment_spec_replicas != kube_deployment_status_replicas_available
+ for: 5m
+ labels:
+ severity: warning
+ annotations:
+ summary: "Kubernetes Deployment replicas mismatch (instance {{ $labels.instance }})"
+ description: "Deployment Replicas mismatch\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
+
+ - alert: KubernetesStatefulsetReplicasMismatch
+ expr: kube_statefulset_status_replicas_ready != kube_statefulset_status_replicas
+ for: 5m
+ labels:
+ severity: warning
+ annotations:
+ summary: "Kubernetes StatefulSet replicas mismatch (instance {{ $labels.instance }})"
+ description: "A StatefulSet has not matched the expected number of replicas for longer than 15 minutes.\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
+
+ - alert: KubernetesDeploymentGenerationMismatch
+ expr: kube_deployment_status_observed_generation != kube_deployment_metadata_generation
+ for: 5m
+ labels:
+ severity: critical
+ annotations:
+ summary: "Kubernetes Deployment generation mismatch (instance {{ $labels.instance }})"
+ description: "A Deployment has failed but has not been rolled back.\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
+
+ - alert: KubernetesStatefulsetGenerationMismatch
+ expr: kube_statefulset_status_observed_generation != kube_statefulset_metadata_generation
+ for: 5m
+ labels:
+ severity: critical
+ annotations:
+ summary: "Kubernetes StatefulSet generation mismatch (instance {{ $labels.instance }})"
+ description: "A StatefulSet has failed but has not been rolled back.\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
+
+ - alert: KubernetesStatefulsetUpdateNotRolledOut
+ expr: max without (revision) (kube_statefulset_status_current_revision unless kube_statefulset_status_update_revision) * (kube_statefulset_replicas != kube_statefulset_status_replicas_updated)
+ for: 5m
+ labels:
+ severity: critical
+ annotations:
+ summary: "Kubernetes StatefulSet update not rolled out (instance {{ $labels.instance }})"
+ description: "StatefulSet update has not been rolled out.\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
+
+ - alert: KubernetesDaemonsetRolloutStuck
+ expr: kube_daemonset_status_number_ready / kube_daemonset_status_desired_number_scheduled * 100 < 100 or kube_daemonset_status_desired_number_scheduled - kube_daemonset_status_current_number_scheduled > 0
+ for: 5m
+ labels:
+ severity: critical
+ annotations:
+ summary: "Kubernetes DaemonSet rollout stuck (instance {{ $labels.instance }})"
+ description: "Some Pods of DaemonSet are not scheduled or not ready\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
+
+ - alert: KubernetesDaemonsetMisscheduled
+ expr: kube_daemonset_status_number_misscheduled > 0
+ for: 5m
+ labels:
+ severity: critical
+ annotations:
+ summary: "Kubernetes DaemonSet misscheduled (instance {{ $labels.instance }})"
+ description: "Some DaemonSet Pods are running where they are not supposed to run\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
+
+ - alert: KubernetesCronjobTooLong
+ expr: time() - kube_cronjob_next_schedule_time > 3600
+ for: 5m
+ labels:
+ severity: warning
+ annotations:
+ summary: "Kubernetes CronJob too long (instance {{ $labels.instance }})"
+ description: "CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} is taking more than 1h to complete.\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
+
+ - alert: KubernetesJobCompletion
+ expr: kube_job_spec_completions - kube_job_status_succeeded > 0 or kube_job_status_failed > 0
+ for: 5m
+ labels:
+ severity: critical
+ annotations:
+ summary: "Kubernetes job completion (instance {{ $labels.instance }})"
+ description: "Kubernetes Job failed to complete\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
+
+ - alert: KubernetesApiServerErrors
+ expr: sum(rate(apiserver_request_count{job="apiserver",code=~"^(?:5..)$"}[2m])) / sum(rate(apiserver_request_count{job="apiserver"}[2m])) * 100 > 3
+ for: 5m
+ labels:
+ severity: critical
+ annotations:
+ summary: "Kubernetes API server errors (instance {{ $labels.instance }})"
+ description: "Kubernetes API server is experiencing high error rate\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
+
+ - alert: KubernetesApiClientErrors
+ expr: (sum(rate(rest_client_requests_total{code=~"(4|5).."}[2m])) by (instance, job) / sum(rate(rest_client_requests_total[2m])) by (instance, job)) * 100 > 1
+ for: 5m
+ labels:
+ severity: critical
+ annotations:
+ summary: "Kubernetes API client errors (instance {{ $labels.instance }})"
+ description: "Kubernetes API client is experiencing high error rate\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
+
+ - alert: KubernetesClientCertificateExpiresNextWeek
+ expr: apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 7*24*60*60
+ for: 5m
+ labels:
+ severity: warning
+ annotations:
+ summary: "Kubernetes client certificate expires next week (instance {{ $labels.instance }})"
+ description: "A client certificate used to authenticate to the apiserver is expiring next week.\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
+
+ - alert: KubernetesClientCertificateExpiresSoon
+ expr: apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0 and histogram_quantile(0.01, sum by (job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m]))) < 24*60*60
+ for: 5m
+ labels:
+ severity: critical
+ annotations:
+ summary: "Kubernetes client certificate expires soon (instance {{ $labels.instance }})"
+ description: "A client certificate used to authenticate to the apiserver is expiring in less than 24.0 hours.\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
+
+ - alert: KubernetesApiServerLatency
+ expr: histogram_quantile(0.99, sum(apiserver_request_latencies_bucket{verb!~"CONNECT|WATCHLIST|WATCH|PROXY"}) WITHOUT (instance, resource)) / 1e+06 > 1
+ for: 5m
+ labels:
+ severity: warning
+ annotations:
+ summary: "Kubernetes API server latency (instance {{ $labels.instance }})"
+ description: "Kubernetes API server has a 99th percentile latency of {{ $value }} seconds for {{ $labels.verb }} {{ $labels.resource }}.\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
+
+
+ prometheus.yml: |-
+ global:
+ scrape_interval: 15s
+ evaluation_interval: 15s
+
+ rule_files:
+ - "/etc/prometheus/alert.rules"
+
+ scrape_configs:
+ - job_name: 'collectd-exporter'
+ scrape_interval: 5s
+ static_configs:
+ - targets: ['collectd-exporter:9103']
+
+ - job_name: 'cadvisor'
+ scrape_interval: 5s
+ static_configs:
+ - targets: ['cadvisor:8080']
+
+ - job_name: 'node-exporter'
+ scrape_interval: 5s
+ static_configs:
+ - targets: ['node-exporter:9100']
+
+ - job_name: 'prometheus'
+ scrape_interval: 10s
+ static_configs:
+ - targets: ['localhost:9090']
+
+ - job_name: 'kube-state-metrics'
+ scrape_interval: 10s
+ static_configs:
+ - targets: ['kube-state-metrics.kube-system.svc.cluster.local:8080']
+
+ alerting:
+ alertmanagers:
+ - scheme: http
+ static_configs:
+ - targets: ['alertmanager:9093', 'alertmanager1:9093']
diff --git a/tools/lma/ansible-server/roles/monitoring/files/prometheus/prometheus-deployment.yaml b/tools/lma/ansible-server/roles/monitoring/files/prometheus/prometheus-deployment.yaml
new file mode 100644
index 00000000..5b98b154
--- /dev/null
+++ b/tools/lma/ansible-server/roles/monitoring/files/prometheus/prometheus-deployment.yaml
@@ -0,0 +1,73 @@
+# Copyright 2020 Aditya Srivastava.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+ name: prometheus-deployment
+ namespace: monitoring
+ labels:
+ app: prometheus
+ adi10hero.monitoring: prometheus
+spec:
+ replicas: 1
+ selector:
+ matchLabels:
+ adi10hero.monitoring: prometheus
+ app: prometheus
+ strategy:
+ type: Recreate
+ template:
+ metadata:
+ labels:
+ adi10hero.monitoring: prometheus
+ app: prometheus
+ spec:
+ affinity:
+ nodeAffinity:
+ requiredDuringSchedulingIgnoredDuringExecution:
+ nodeSelectorTerms:
+ - matchExpressions:
+ - key: kubernetes.io/hostname
+ operator: In
+ values:
+ - vm2
+ containers:
+ - name: prometheus
+ image: prom/prometheus
+ args:
+ - --config.file=/etc/prometheus/prometheus.yml
+ - --storage.tsdb.path=/prometheus
+ - --storage.tsdb.retention.size=3GB
+ - --storage.tsdb.retention.time=30d
+ - --web.console.libraries=/etc/prometheus/console_libraries
+ - --web.console.templates=/etc/prometheus/consoles
+ ports:
+ - containerPort: 9090
+ securityContext:
+ runAsUser: 0
+ volumeMounts:
+ - name: prometheus-config-volume
+ mountPath: /etc/prometheus/
+ - name: prometheus-storage-volume
+ mountPath: /prometheus/
+ restartPolicy: Always
+ volumes:
+ - name: prometheus-config-volume
+ configMap:
+ defaultMode: 420
+ name: prometheus-config
+ - name: prometheus-storage-volume
+ persistentVolumeClaim:
+ claimName: prometheus-pvc
diff --git a/tools/lma/ansible-server/roles/monitoring/files/prometheus/prometheus-pv.yaml b/tools/lma/ansible-server/roles/monitoring/files/prometheus/prometheus-pv.yaml
new file mode 100644
index 00000000..f10cd073
--- /dev/null
+++ b/tools/lma/ansible-server/roles/monitoring/files/prometheus/prometheus-pv.yaml
@@ -0,0 +1,30 @@
+# Copyright 2020 Aditya Srivastava.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+apiVersion: v1
+kind: PersistentVolume
+metadata:
+ name: prometheus-pv
+ namespace: monitoring
+ labels:
+ app: prometheus-pv
+ adi10hero.monitoring: prometheus-pv
+spec:
+ storageClassName: monitoring
+ capacity:
+ storage: 6Gi
+ accessModes:
+ - ReadWriteMany
+ hostPath:
+ path: "/usr/share/monitoring_data/prometheus"
diff --git a/tools/lma/ansible-server/roles/monitoring/files/prometheus/prometheus-pvc.yaml b/tools/lma/ansible-server/roles/monitoring/files/prometheus/prometheus-pvc.yaml
new file mode 100644
index 00000000..812fcc73
--- /dev/null
+++ b/tools/lma/ansible-server/roles/monitoring/files/prometheus/prometheus-pvc.yaml
@@ -0,0 +1,33 @@
+# Copyright 2020 Aditya Srivastava.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+apiVersion: v1
+kind: PersistentVolumeClaim
+metadata:
+ name: prometheus-pvc
+ namespace: monitoring
+ labels:
+ app: prometheus-pvc
+ adi10hero.monitoring: prometheus-pvc
+spec:
+ accessModes:
+ - ReadWriteMany
+ storageClassName: monitoring
+ resources:
+ requests:
+ storage: 3Gi
+ selector:
+ matchLabels:
+ app: prometheus-pv
+ adi10hero.monitoring: prometheus-pv
diff --git a/tools/lma/ansible-server/roles/monitoring/files/prometheus/prometheus-service.yaml b/tools/lma/ansible-server/roles/monitoring/files/prometheus/prometheus-service.yaml
new file mode 100644
index 00000000..5be76d3e
--- /dev/null
+++ b/tools/lma/ansible-server/roles/monitoring/files/prometheus/prometheus-service.yaml
@@ -0,0 +1,34 @@
+# Copyright 2020 Aditya Srivastava.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+apiVersion: v1
+kind: Service
+metadata:
+ labels:
+ adi10hero.monitoring: prometheus
+ app: prometheus
+ name: prometheus
+ namespace: monitoring
+ annotations:
+ prometheus.io/scrape: 'true'
+ prometheus.io/port: '9090'
+spec:
+ type: NodePort
+ ports:
+ - name: prometheus
+ protocol: TCP
+ port: 9090
+ nodePort: 30900
+ selector:
+ adi10hero.monitoring: prometheus
diff --git a/tools/lma/ansible-server/roles/monitoring/files/prometheus/prometheus1-deployment.yaml b/tools/lma/ansible-server/roles/monitoring/files/prometheus/prometheus1-deployment.yaml
new file mode 100644
index 00000000..149bea84
--- /dev/null
+++ b/tools/lma/ansible-server/roles/monitoring/files/prometheus/prometheus1-deployment.yaml
@@ -0,0 +1,73 @@
+# Copyright 2020 Aditya Srivastava.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+ name: prometheus1-deployment
+ namespace: monitoring
+ labels:
+ app: prometheus1
+ adi10hero.monitoring: prometheus1
+spec:
+ replicas: 1
+ selector:
+ matchLabels:
+ adi10hero.monitoring: prometheus1
+ app: prometheus1
+ strategy:
+ type: Recreate
+ template:
+ metadata:
+ labels:
+ adi10hero.monitoring: prometheus1
+ app: prometheus1
+ spec:
+ affinity:
+ nodeAffinity:
+ requiredDuringSchedulingIgnoredDuringExecution:
+ nodeSelectorTerms:
+ - matchExpressions:
+ - key: kubernetes.io/hostname
+ operator: In
+ values:
+ - vm3
+ containers:
+ - name: prometheus
+ image: prom/prometheus
+ args:
+ - --config.file=/etc/prometheus/prometheus.yml
+ - --storage.tsdb.path=/prometheus
+ - --storage.tsdb.retention.size=3GB
+ - --storage.tsdb.retention.time=30d
+ - --web.console.libraries=/etc/prometheus/console_libraries
+ - --web.console.templates=/etc/prometheus/consoles
+ ports:
+ - containerPort: 9090
+ securityContext:
+ runAsUser: 0
+ volumeMounts:
+ - name: prometheus-config-volume
+ mountPath: /etc/prometheus/
+ - name: prometheus-storage-volume
+ mountPath: /prometheus/
+ restartPolicy: Always
+ volumes:
+ - name: prometheus-config-volume
+ configMap:
+ defaultMode: 420
+ name: prometheus-config
+ - name: prometheus-storage-volume
+ persistentVolumeClaim:
+ claimName: prometheus-pvc
diff --git a/tools/lma/ansible-server/roles/monitoring/files/prometheus/prometheus1-service.yaml b/tools/lma/ansible-server/roles/monitoring/files/prometheus/prometheus1-service.yaml
new file mode 100644
index 00000000..439deec1
--- /dev/null
+++ b/tools/lma/ansible-server/roles/monitoring/files/prometheus/prometheus1-service.yaml
@@ -0,0 +1,35 @@
+# Copyright 2020 Aditya Srivastava.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+apiVersion: v1
+kind: Service
+metadata:
+ labels:
+ adi10hero.monitoring: prometheus1
+ app: prometheus1
+ name: prometheus1
+ namespace: monitoring
+ annotations:
+ prometheus.io/scrape: 'true'
+ prometheus.io/port: '9090'
+spec:
+ type: NodePort
+ ports:
+ - name: prometheus1
+ protocol: TCP
+ port: 9090
+ nodePort: 30901
+ selector:
+ adi10hero.monitoring: prometheus1
+ app: prometheus1
diff --git a/tools/lma/ansible-server/roles/monitoring/tasks/main.yml b/tools/lma/ansible-server/roles/monitoring/tasks/main.yml
new file mode 100644
index 00000000..cd4e6aca
--- /dev/null
+++ b/tools/lma/ansible-server/roles/monitoring/tasks/main.yml
@@ -0,0 +1,273 @@
+# Copyright 2020 Aditya Srivastava.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+---
+#PAG (Prometheus, Alertmanager, Grafana) stack setup in the k8s cluster
+
+#***********************************************************************************************************
+#copy all yaml to /tmp/files/
+#***********************************************************************************************************
+- name: copy all yaml to /tmp/files/
+ copy:
+ src: ../files/
+ dest: /tmp/files/
+
+#***********************************************************************************************************
+#Creating Namespace
+#***********************************************************************************************************
+- name: Creating Monitoring Namespace
+ k8s:
+ state: present
+ src: /tmp/files/monitoring-namespace.yaml
+ namespace: monitoring
+
+#***********************************************************************************************************
+#Creating Persistent Volume for Prometheus
+#***********************************************************************************************************
+- name: creating Persistent Volume for Prometheus
+ k8s:
+ state: present
+ src: /tmp/files/prometheus/prometheus-pv.yaml
+ namespace: monitoring
+
+#***********************************************************************************************************
+#Creating Persistent Volume for Grafana
+#***********************************************************************************************************
+- name: creating Persistent Volume for Grafana
+ k8s:
+ state: present
+ src: /tmp/files/grafana/grafana-pv.yaml
+ namespace: monitoring
+
+#***********************************************************************************************************
+#Creating Persistent Volume Claim for Prometheus
+#***********************************************************************************************************
+- name: creating Persistent Volume Claim for Prometheus
+ k8s:
+ state: present
+ src: /tmp/files/prometheus/prometheus-pvc.yaml
+ namespace: monitoring
+
+#***********************************************************************************************************
+#Creating Persistent Volume Claim for Grafana
+#***********************************************************************************************************
+- name: creating Persistent Volume Claim for Grafana
+ k8s:
+ state: present
+ src: /tmp/files/grafana/grafana-pvc.yaml
+ namespace: monitoring
+
+#***********************************************************************************************************
+#Making the cAdvisor daemonset
+#***********************************************************************************************************
+- name: Creating cAdvisor deamonset
+ k8s:
+ state: present
+ src: /tmp/files/cadvisor/cadvisor-deamonset.yaml
+ namespace: monitoring
+
+#***********************************************************************************************************
+#Starting the CAdvisor service
+#***********************************************************************************************************
+- name: Starting cAdvisor service
+ k8s:
+ state: present
+ src: /tmp/files/cadvisor/cadvisor-service.yaml
+ namespace: monitoring
+
+#***********************************************************************************************************
+#Deploying and Starting the kube-state-metrics service
+#***********************************************************************************************************
+- name: Deploying kube-state-metrics
+ k8s:
+ state: present
+ src: /tmp/files/kube-state-metrics/kube-state-metrics-deployment.yaml
+ namespace: kube-system
+
+- name: Starting kube-state-metrics service
+ k8s:
+ state: present
+ src: /tmp/files/kube-state-metrics/kube-state-metrics-service.yaml
+ namespace: kube-system
+
+#***********************************************************************************************************
+#Making the NodeExporter daemonset
+#***********************************************************************************************************
+- name: Creating NodeExporter deamonset
+ k8s:
+ state: present
+ src: /tmp/files/node-exporter/nodeexporter-daemonset.yaml
+ namespace: monitoring
+
+#***********************************************************************************************************
+#Starting the NodeExporter service
+#***********************************************************************************************************
+- name: Starting NodeExporter service
+ k8s:
+ state: present
+ src: /tmp/files/node-exporter/nodeexporter-service.yaml
+ namespace: monitoring
+
+#***********************************************************************************************************
+#Making the collectd-exporter deployment
+#***********************************************************************************************************
+- name: Creating collectd-exporter deployment
+ k8s:
+ state: present
+ src: /tmp/files/collectd-exporter/collectd-exporter-deployment.yaml
+ namespace: monitoring
+
+#***********************************************************************************************************
+#Making the collectd-exporter service
+#***********************************************************************************************************
+- name: Creating collectd-exporter service
+ k8s:
+ state: present
+ src: /tmp/files/collectd-exporter/collectd-exporter-service.yaml
+ namespace: monitoring
+
+#***********************************************************************************************************
+#Webhook goes here
+#***********************************************************************************************************
+
+#***********************************************************************************************************
+#Making the config file for Alertmanagers
+#***********************************************************************************************************
+- name: Creating config map for Alertmanagers
+ k8s:
+ state: present
+ src: /tmp/files/alertmanager/alertmanager-config.yaml
+ namespace: monitoring
+
+# - name: Creating config map for Alertmanagers
+# k8s:
+# state: present
+# src: /tmp/files/alertmanager1-config.yaml
+# namespace: monitoring
+
+#***********************************************************************************************************
+#Making the 1st alertmanager deployment
+#***********************************************************************************************************
+- name: Creating 1st alertmanager deployment
+ k8s:
+ state: present
+ src: /tmp/files/alertmanager/alertmanager-deployment.yaml
+ namespace: monitoring
+
+#***********************************************************************************************************
+#Making the 1st alertmanager service
+#***********************************************************************************************************
+- name: Creating 1st alertmanager service
+ k8s:
+ state: present
+ src: /tmp/files/alertmanager/alertmanager-service.yaml
+ namespace: monitoring
+
+#***********************************************************************************************************
+#Making the 2nd alertmanager deployment
+#***********************************************************************************************************
+- name: Creating 2nd alertmanager deployment
+ k8s:
+ state: present
+ src: /tmp/files/alertmanager/alertmanager1-deployment.yaml
+ namespace: monitoring
+
+#***********************************************************************************************************
+#Making the 2nd alertmanager service
+#***********************************************************************************************************
+- name: Creating 2nd alertmanager service
+ k8s:
+ state: present
+ src: /tmp/files/alertmanager/alertmanager1-service.yaml
+ namespace: monitoring
+
+#***********************************************************************************************************
+#Making the config file for Prometheus
+#***********************************************************************************************************
+- name: Creating 1st Prometheus Config
+ k8s:
+ state: present
+ src: /tmp/files/prometheus/prometheus-config.yaml
+ namespace: monitoring
+
+# - name: Creating 2nd Prometheus Config
+# k8s:
+# state: present
+# src: /tmp/files/prometheus1-config.yaml
+# namespace: monitoring
+
+#***********************************************************************************************************
+#Starting Prometheus
+#***********************************************************************************************************
+- name: Starting Prometheus 1
+ k8s:
+ state: present
+ src: /tmp/files/prometheus/prometheus-deployment.yaml
+ namespace: monitoring
+
+- name: Starting Prometheus 2
+ k8s:
+ state: present
+ src: /tmp/files/prometheus/prometheus1-deployment.yaml
+ namespace: monitoring
+
+#***********************************************************************************************************
+#Starting Prometheus Service
+#***********************************************************************************************************
+- name: Starting Prometheus 1 Service
+ k8s:
+ state: present
+ src: /tmp/files/prometheus/prometheus-service.yaml
+ namespace: monitoring
+
+- name: Starting Prometheus 2 Service
+ k8s:
+ state: present
+ src: /tmp/files/prometheus/prometheus1-service.yaml
+ namespace: monitoring
+
+- name: Starting Main Prometheus Service
+ k8s:
+ state: present
+ src: /tmp/files/prometheus/main-prometheus-service.yaml
+ namespace: monitoring
+
+#***********************************************************************************************************
+#Starting Grafana
+#***********************************************************************************************************
+- name: Creating Grafana Datasource Config
+ k8s:
+ state: present
+ src: /tmp/files/grafana/grafana-datasource-config.yaml
+ namespace: monitoring
+
+- name: Starting Grafana
+ k8s:
+ state: present
+ src: /tmp/files/grafana/grafana-deployment.yaml
+ namespace: monitoring
+
+- name: Starting Grafana Service
+ k8s:
+ state: present
+ src: /tmp/files/grafana/grafana-service.yaml
+ namespace: monitoring
+
+#***********************************************************************************************************
+#removing /tmp/files
+#***********************************************************************************************************
+- name: Removing /tmp/files
+ file:
+ path: "/tmp/files"
+ state: absent
diff --git a/tools/lma/metrics/dashboard/cpu_usage_using.json b/tools/lma/metrics/dashboard/cpu_usage_using.json
new file mode 100644
index 00000000..85f7f122
--- /dev/null
+++ b/tools/lma/metrics/dashboard/cpu_usage_using.json
@@ -0,0 +1,750 @@
+{
+ "annotations": {
+ "list": [
+ {
+ "builtIn": 1,
+ "datasource": "prometheus",
+ "enable": true,
+ "hide": true,
+ "iconColor": "rgba(0, 211, 255, 1)",
+ "limit": 100,
+ "name": "Monitoring",
+ "showIn": 0,
+ "type": "dashboard"
+ }
+ ]
+ },
+ "editable": true,
+ "gnetId": null,
+ "graphTooltip": 0,
+ "id": 4,
+ "iteration": 1596637894836,
+ "links": [],
+ "panels": [
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "$datasource",
+ "editable": true,
+ "error": false,
+ "fieldConfig": {
+ "defaults": {
+ "custom": {}
+ },
+ "overrides": []
+ },
+ "fill": 1,
+ "fillGradient": 0,
+ "grid": {},
+ "gridPos": {
+ "h": 7,
+ "w": 24,
+ "x": 0,
+ "y": 0
+ },
+ "hiddenSeries": false,
+ "id": 3,
+ "legend": {
+ "alignAsTable": true,
+ "avg": true,
+ "current": false,
+ "hideZero": true,
+ "max": true,
+ "min": true,
+ "rightSide": true,
+ "show": true,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "connected",
+ "percentage": false,
+ "pluginVersion": "7.1.1",
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "collectd_cpu_percent{exported_instance='$host'}",
+ "hide": false,
+ "interval": "",
+ "legendFormat": "",
+ "refId": "A"
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "CPU Usage",
+ "tooltip": {
+ "msResolution": true,
+ "shared": true,
+ "sort": 0,
+ "value_type": "cumulative"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "$datasource",
+ "fieldConfig": {
+ "defaults": {
+ "custom": {}
+ },
+ "overrides": []
+ },
+ "fill": 1,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 7,
+ "w": 24,
+ "x": 0,
+ "y": 7
+ },
+ "hiddenSeries": false,
+ "id": 4,
+ "legend": {
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "max": true,
+ "min": true,
+ "rightSide": true,
+ "show": true,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pluginVersion": "7.1.1",
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "collectd_cpu_percent{cpu=~'$core', exported_instance='$host'}",
+ "interval": "",
+ "legendFormat": "",
+ "refId": "A"
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "CPU utilization per core",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "$datasource",
+ "fieldConfig": {
+ "defaults": {
+ "custom": {}
+ },
+ "overrides": []
+ },
+ "fill": 1,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 7,
+ "w": 24,
+ "x": 0,
+ "y": 14
+ },
+ "hiddenSeries": false,
+ "id": 5,
+ "legend": {
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "max": true,
+ "min": true,
+ "rightSide": true,
+ "show": true,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pluginVersion": "7.1.1",
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "collectd_cpu_percent{cpu=~'$core',exported_instance='$host'}",
+ "interval": "",
+ "legendFormat": "",
+ "refId": "A"
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "CPU Usage per core",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ }
+ ],
+ "refresh": "10s",
+ "schemaVersion": 26,
+ "style": "dark",
+ "tags": [
+ "monitoring"
+ ],
+ "templating": {
+ "list": [
+ {
+ "current": {
+ "selected": true,
+ "text": "prometheus",
+ "value": "prometheus"
+ },
+ "hide": 0,
+ "includeAll": false,
+ "label": null,
+ "multi": false,
+ "name": "datasource",
+ "options": [],
+ "query": "prometheus",
+ "queryValue": "",
+ "refresh": 1,
+ "regex": "",
+ "skipUrlSync": false,
+ "type": "datasource"
+ },
+ {
+ "allValue": null,
+ "current": {
+ "selected": false,
+ "text": "pod12-node4",
+ "value": "pod12-node4"
+ },
+ "hide": 0,
+ "includeAll": false,
+ "label": null,
+ "multi": false,
+ "name": "host",
+ "options": [
+ {
+ "selected": true,
+ "text": "pod12-node4",
+ "value": "pod12-node4"
+ }
+ ],
+ "query": "pod12-node4,",
+ "queryValue": "",
+ "skipUrlSync": false,
+ "type": "custom"
+ },
+ {
+ "allValue": null,
+ "current": {
+ "selected": true,
+ "text": "0",
+ "value": "0"
+ },
+ "hide": 0,
+ "includeAll": true,
+ "label": null,
+ "multi": false,
+ "name": "core",
+ "options": [
+ {
+ "selected": false,
+ "text": "All",
+ "value": "$__all"
+ },
+ {
+ "selected": true,
+ "text": "0",
+ "value": "0"
+ },
+ {
+ "selected": false,
+ "text": "1",
+ "value": "1"
+ },
+ {
+ "selected": false,
+ "text": "2",
+ "value": "2"
+ },
+ {
+ "selected": false,
+ "text": "3",
+ "value": "3"
+ },
+ {
+ "selected": false,
+ "text": "4",
+ "value": "4"
+ },
+ {
+ "selected": false,
+ "text": "5",
+ "value": "5"
+ },
+ {
+ "selected": false,
+ "text": "6",
+ "value": "6"
+ },
+ {
+ "selected": false,
+ "text": "7",
+ "value": "7"
+ },
+ {
+ "selected": false,
+ "text": "8",
+ "value": "8"
+ },
+ {
+ "selected": false,
+ "text": "9",
+ "value": "9"
+ },
+ {
+ "selected": false,
+ "text": "10",
+ "value": "10"
+ },
+ {
+ "selected": false,
+ "text": "11",
+ "value": "11"
+ },
+ {
+ "selected": false,
+ "text": "12",
+ "value": "12"
+ },
+ {
+ "selected": false,
+ "text": "13",
+ "value": "13"
+ },
+ {
+ "selected": false,
+ "text": "14",
+ "value": "14"
+ },
+ {
+ "selected": false,
+ "text": "15",
+ "value": "15"
+ },
+ {
+ "selected": false,
+ "text": "16",
+ "value": "16"
+ },
+ {
+ "selected": false,
+ "text": "17",
+ "value": "17"
+ },
+ {
+ "selected": false,
+ "text": "18",
+ "value": "18"
+ },
+ {
+ "selected": false,
+ "text": "19",
+ "value": "19"
+ },
+ {
+ "selected": false,
+ "text": "20",
+ "value": "20"
+ },
+ {
+ "selected": false,
+ "text": "21",
+ "value": "21"
+ },
+ {
+ "selected": false,
+ "text": "22",
+ "value": "22"
+ },
+ {
+ "selected": false,
+ "text": "23",
+ "value": "23"
+ },
+ {
+ "selected": false,
+ "text": "24",
+ "value": "24"
+ },
+ {
+ "selected": false,
+ "text": "25",
+ "value": "25"
+ },
+ {
+ "selected": false,
+ "text": "26",
+ "value": "26"
+ },
+ {
+ "selected": false,
+ "text": "27",
+ "value": "27"
+ },
+ {
+ "selected": false,
+ "text": "28",
+ "value": "28"
+ },
+ {
+ "selected": false,
+ "text": "29",
+ "value": "29"
+ },
+ {
+ "selected": false,
+ "text": "30",
+ "value": "30"
+ },
+ {
+ "selected": false,
+ "text": "31",
+ "value": "31"
+ },
+ {
+ "selected": false,
+ "text": "32",
+ "value": "32"
+ },
+ {
+ "selected": false,
+ "text": "33",
+ "value": "33"
+ },
+ {
+ "selected": false,
+ "text": "34",
+ "value": "34"
+ },
+ {
+ "selected": false,
+ "text": "35",
+ "value": "35"
+ },
+ {
+ "selected": false,
+ "text": "36",
+ "value": "36"
+ },
+ {
+ "selected": false,
+ "text": "37",
+ "value": "37"
+ },
+ {
+ "selected": false,
+ "text": "38",
+ "value": "38"
+ },
+ {
+ "selected": false,
+ "text": "39",
+ "value": "39"
+ },
+ {
+ "selected": false,
+ "text": "40",
+ "value": "40"
+ },
+ {
+ "selected": false,
+ "text": "41",
+ "value": "41"
+ },
+ {
+ "selected": false,
+ "text": "42",
+ "value": "42"
+ },
+ {
+ "selected": false,
+ "text": "43",
+ "value": "43"
+ },
+ {
+ "selected": false,
+ "text": "44",
+ "value": "44"
+ },
+ {
+ "selected": false,
+ "text": "45",
+ "value": "45"
+ },
+ {
+ "selected": false,
+ "text": "46",
+ "value": "46"
+ },
+ {
+ "selected": false,
+ "text": "47",
+ "value": "47"
+ },
+ {
+ "selected": false,
+ "text": "48",
+ "value": "48"
+ },
+ {
+ "selected": false,
+ "text": "49",
+ "value": "49"
+ },
+ {
+ "selected": false,
+ "text": "50",
+ "value": "50"
+ },
+ {
+ "selected": false,
+ "text": "51",
+ "value": "51"
+ },
+ {
+ "selected": false,
+ "text": "52",
+ "value": "52"
+ },
+ {
+ "selected": false,
+ "text": "53",
+ "value": "53"
+ },
+ {
+ "selected": false,
+ "text": "54",
+ "value": "54"
+ },
+ {
+ "selected": false,
+ "text": "55",
+ "value": "55"
+ },
+ {
+ "selected": false,
+ "text": "56",
+ "value": "56"
+ },
+ {
+ "selected": false,
+ "text": "57",
+ "value": "57"
+ },
+ {
+ "selected": false,
+ "text": "58",
+ "value": "58"
+ },
+ {
+ "selected": false,
+ "text": "59",
+ "value": "59"
+ },
+ {
+ "selected": false,
+ "text": "60",
+ "value": "60"
+ },
+ {
+ "selected": false,
+ "text": "61",
+ "value": "61"
+ },
+ {
+ "selected": false,
+ "text": "62",
+ "value": "62"
+ },
+ {
+ "selected": false,
+ "text": "63",
+ "value": "63"
+ },
+ {
+ "selected": false,
+ "text": "64",
+ "value": "64"
+ }
+ ],
+ "query": "0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64",
+ "queryValue": "",
+ "skipUrlSync": false,
+ "type": "custom"
+ }
+ ]
+ },
+ "time": {
+ "from": "now-5m",
+ "to": "now"
+ },
+ "timepicker": {
+ "refresh_intervals": [
+ "10s",
+ "30s",
+ "1m",
+ "5m",
+ "15m",
+ "30m",
+ "1h",
+ "2h",
+ "1d"
+ ],
+ "time_options": [
+ "5m",
+ "15m",
+ "1h",
+ "6h",
+ "12h",
+ "24h",
+ "2d",
+ "7d",
+ "30d"
+ ]
+ },
+ "timezone": "browser",
+ "title": "CPU Usage",
+ "uid": "XeDwSiSGk",
+ "version": 13
+} \ No newline at end of file
diff --git a/tools/lma/metrics/dashboard/memory_using.json b/tools/lma/metrics/dashboard/memory_using.json
new file mode 100644
index 00000000..3b92d8f5
--- /dev/null
+++ b/tools/lma/metrics/dashboard/memory_using.json
@@ -0,0 +1,337 @@
+{
+ "annotations": {
+ "list": [
+ {
+ "builtIn": 1,
+ "datasource": "prometheus",
+ "enable": true,
+ "hide": true,
+ "iconColor": "rgba(0, 211, 255, 1)",
+ "limit": 100,
+ "name": "Monitoring",
+ "showIn": 0,
+ "type": "dashboard"
+ }
+ ]
+ },
+ "editable": true,
+ "gnetId": null,
+ "graphTooltip": 0,
+ "id": 6,
+ "iteration": 1597616052316,
+ "links": [],
+ "panels": [
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "$datasource",
+ "description": "",
+ "fieldConfig": {
+ "defaults": {
+ "custom": {}
+ },
+ "overrides": []
+ },
+ "fill": 1,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 15,
+ "w": 24,
+ "x": 0,
+ "y": 0
+ },
+ "hiddenSeries": false,
+ "id": 1,
+ "interval": "1s",
+ "legend": {
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "max": true,
+ "min": true,
+ "rightSide": true,
+ "show": false,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pluginVersion": "7.1.3",
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "span": 12,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "rate(collectd_memory{exported_instance='$host', memory='$type'}[$range])",
+ "interval": "",
+ "legendFormat": "",
+ "refId": "A"
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "Bytes",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ }
+ ],
+ "refresh": "30s",
+ "schemaVersion": 26,
+ "style": "dark",
+ "tags": [
+ "monitoring"
+ ],
+ "templating": {
+ "list": [
+ {
+ "current": {
+ "selected": false,
+ "text": "prometheus",
+ "value": "prometheus"
+ },
+ "hide": 0,
+ "includeAll": false,
+ "label": null,
+ "multi": false,
+ "name": "datasource",
+ "options": [],
+ "query": "prometheus",
+ "queryValue": "",
+ "refresh": 1,
+ "regex": "",
+ "skipUrlSync": false,
+ "type": "datasource"
+ },
+ {
+ "allValue": null,
+ "current": {
+ "selected": false,
+ "text": "pod12-node4",
+ "value": "pod12-node4"
+ },
+ "hide": 0,
+ "includeAll": false,
+ "label": null,
+ "multi": false,
+ "name": "host",
+ "options": [
+ {
+ "selected": true,
+ "text": "pod12-node4",
+ "value": "pod12-node4"
+ }
+ ],
+ "query": "pod12-node4,",
+ "queryValue": "",
+ "skipUrlSync": false,
+ "type": "custom"
+ },
+ {
+ "auto": false,
+ "auto_count": 30,
+ "auto_min": "10s",
+ "current": {
+ "selected": false,
+ "text": "30s",
+ "value": "30s"
+ },
+ "hide": 0,
+ "label": null,
+ "name": "range",
+ "options": [
+ {
+ "selected": true,
+ "text": "30s",
+ "value": "30s"
+ },
+ {
+ "selected": false,
+ "text": "1m",
+ "value": "1m"
+ },
+ {
+ "selected": false,
+ "text": "5m",
+ "value": "5m"
+ },
+ {
+ "selected": false,
+ "text": "10m",
+ "value": "10m"
+ },
+ {
+ "selected": false,
+ "text": "30m",
+ "value": "30m"
+ },
+ {
+ "selected": false,
+ "text": "1h",
+ "value": "1h"
+ },
+ {
+ "selected": false,
+ "text": "6h",
+ "value": "6h"
+ },
+ {
+ "selected": false,
+ "text": "12h",
+ "value": "12h"
+ },
+ {
+ "selected": false,
+ "text": "1d",
+ "value": "1d"
+ },
+ {
+ "selected": false,
+ "text": "7d",
+ "value": "7d"
+ },
+ {
+ "selected": false,
+ "text": "14d",
+ "value": "14d"
+ },
+ {
+ "selected": false,
+ "text": "30d",
+ "value": "30d"
+ }
+ ],
+ "query": "30s,1m,5m,10m,30m,1h,6h,12h,1d,7d,14d,30d",
+ "queryValue": "",
+ "refresh": 2,
+ "skipUrlSync": false,
+ "type": "interval"
+ },
+ {
+ "allValue": null,
+ "current": {
+ "selected": true,
+ "text": "used",
+ "value": "used"
+ },
+ "hide": 0,
+ "includeAll": false,
+ "label": null,
+ "multi": false,
+ "name": "type",
+ "options": [
+ {
+ "selected": false,
+ "text": "buffered",
+ "value": "buffered"
+ },
+ {
+ "selected": false,
+ "text": "cached",
+ "value": "cached"
+ },
+ {
+ "selected": false,
+ "text": "free",
+ "value": "free"
+ },
+ {
+ "selected": false,
+ "text": "slab_recl",
+ "value": "slab_recl"
+ },
+ {
+ "selected": false,
+ "text": "slab_unrecl",
+ "value": "slab_unrecl"
+ },
+ {
+ "selected": true,
+ "text": "used",
+ "value": "used"
+ }
+ ],
+ "query": "buffered,cached,free,slab_recl,slab_unrecl,used",
+ "queryValue": "",
+ "skipUrlSync": false,
+ "type": "custom"
+ }
+ ]
+ },
+ "time": {
+ "from": "now-5m",
+ "to": "now"
+ },
+ "timepicker": {
+ "refresh_intervals": [
+ "10s",
+ "30s",
+ "1m",
+ "5m",
+ "15m",
+ "30m",
+ "1h",
+ "2h",
+ "1d"
+ ],
+ "time_options": [
+ "5m",
+ "15m",
+ "1h",
+ "6h",
+ "12h",
+ "24h",
+ "2d",
+ "7d",
+ "30d"
+ ]
+ },
+ "timezone": "browser",
+ "title": "Memory",
+ "uid": "kuro-mem",
+ "version": 4
+} \ No newline at end of file
diff --git a/tools/lma/metrics/dashboard/ovs_stats_using.json b/tools/lma/metrics/dashboard/ovs_stats_using.json
new file mode 100644
index 00000000..1e679fbe
--- /dev/null
+++ b/tools/lma/metrics/dashboard/ovs_stats_using.json
@@ -0,0 +1,854 @@
+{
+ "annotations": {
+ "list": [
+ {
+ "builtIn": 1,
+ "datasource": "prometheus",
+ "enable": true,
+ "hide": true,
+ "iconColor": "rgba(0, 211, 255, 1)",
+ "limit": 100,
+ "name": "Monitoring",
+ "showIn": 0,
+ "type": "dashboard"
+ }
+ ]
+ },
+ "editable": true,
+ "gnetId": null,
+ "graphTooltip": 0,
+ "id": 6,
+ "iteration": 1596643135141,
+ "links": [],
+ "panels": [
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "$datasource",
+ "fieldConfig": {
+ "defaults": {
+ "custom": {}
+ },
+ "overrides": []
+ },
+ "fill": 1,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 6,
+ "w": 24,
+ "x": 0,
+ "y": 0
+ },
+ "hiddenSeries": false,
+ "id": 1,
+ "interval": "1s",
+ "legend": {
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "max": true,
+ "min": true,
+ "rightSide": true,
+ "show": true,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pluginVersion": "7.1.1",
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "span": 12,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "rate(collectd_ovs_stats_if_rx_octets_total{exported_instance='$host'}[$__interval])",
+ "interval": "",
+ "legendFormat": "",
+ "refId": "A"
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "Average RX values",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "$datasource",
+ "fieldConfig": {
+ "defaults": {
+ "custom": {}
+ },
+ "overrides": []
+ },
+ "fill": 1,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 6,
+ "w": 24,
+ "x": 0,
+ "y": 6
+ },
+ "hiddenSeries": false,
+ "id": 2,
+ "interval": "1s",
+ "legend": {
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "max": true,
+ "min": true,
+ "rightSide": true,
+ "show": true,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pluginVersion": "7.1.1",
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "span": 12,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "rate(collectd_ovs_stats_if_tx_octets_total{exported_instance='$host'}[$__interval])",
+ "interval": "",
+ "legendFormat": "",
+ "refId": "A"
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "Average TX values",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "$datasource",
+ "fieldConfig": {
+ "defaults": {
+ "custom": {}
+ },
+ "overrides": []
+ },
+ "fill": 1,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 5,
+ "w": 24,
+ "x": 0,
+ "y": 12
+ },
+ "hiddenSeries": false,
+ "id": 3,
+ "interval": "1s",
+ "legend": {
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "max": true,
+ "min": true,
+ "rightSide": true,
+ "show": true,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pluginVersion": "7.1.1",
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "span": 12,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "rate(collectd_ovs_stats_if_collisions_total{exported_instance='$host'}[$range])",
+ "interval": "",
+ "legendFormat": "",
+ "refId": "A"
+ },
+ {
+ "expr": "rate(collectd_ovs_stats_if_dropped_0_total{exported_instance='$host'}[$range])",
+ "interval": "",
+ "legendFormat": "",
+ "refId": "B"
+ },
+ {
+ "expr": "rate(collectd_ovs_stats_if_dropped_1_total{exported_instance='$host'}[$range])",
+ "interval": "",
+ "legendFormat": "",
+ "refId": "C"
+ },
+ {
+ "expr": "rate(collectd_ovs_stats_if_errors_0_total{exported_instance='$host'}[$range])",
+ "interval": "",
+ "legendFormat": "",
+ "refId": "D"
+ },
+ {
+ "expr": "rate(collectd_ovs_stats_if_errors_1_total{exported_instance='$host'}[$range])",
+ "interval": "",
+ "legendFormat": "",
+ "refId": "E"
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "Average Collisions, Drops and Error values",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ }
+ ],
+ "refresh": "30s",
+ "schemaVersion": 26,
+ "style": "dark",
+ "tags": [
+ "monitoring"
+ ],
+ "templating": {
+ "list": [
+ {
+ "current": {
+ "selected": false,
+ "text": "prometheus",
+ "value": "prometheus"
+ },
+ "hide": 0,
+ "includeAll": false,
+ "label": null,
+ "multi": false,
+ "name": "datasource",
+ "options": [],
+ "query": "prometheus",
+ "queryValue": "",
+ "refresh": 1,
+ "regex": "",
+ "skipUrlSync": false,
+ "type": "datasource"
+ },
+ {
+ "allValue": null,
+ "current": {
+ "selected": false,
+ "text": "pod12-node4",
+ "value": "pod12-node4"
+ },
+ "hide": 0,
+ "includeAll": false,
+ "label": null,
+ "multi": false,
+ "name": "host",
+ "options": [
+ {
+ "selected": true,
+ "text": "pod12-node4",
+ "value": "pod12-node4"
+ }
+ ],
+ "query": "pod12-node4,",
+ "queryValue": "",
+ "skipUrlSync": false,
+ "type": "custom"
+ },
+ {
+ "allValue": null,
+ "current": {
+ "selected": true,
+ "text": "0",
+ "value": "0"
+ },
+ "hide": 0,
+ "includeAll": true,
+ "label": null,
+ "multi": false,
+ "name": "core",
+ "options": [
+ {
+ "selected": false,
+ "text": "All",
+ "value": "$__all"
+ },
+ {
+ "selected": true,
+ "text": "0",
+ "value": "0"
+ },
+ {
+ "selected": false,
+ "text": "1",
+ "value": "1"
+ },
+ {
+ "selected": false,
+ "text": "2",
+ "value": "2"
+ },
+ {
+ "selected": false,
+ "text": "3",
+ "value": "3"
+ },
+ {
+ "selected": false,
+ "text": "4",
+ "value": "4"
+ },
+ {
+ "selected": false,
+ "text": "5",
+ "value": "5"
+ },
+ {
+ "selected": false,
+ "text": "6",
+ "value": "6"
+ },
+ {
+ "selected": false,
+ "text": "7",
+ "value": "7"
+ },
+ {
+ "selected": false,
+ "text": "8",
+ "value": "8"
+ },
+ {
+ "selected": false,
+ "text": "9",
+ "value": "9"
+ },
+ {
+ "selected": false,
+ "text": "10",
+ "value": "10"
+ },
+ {
+ "selected": false,
+ "text": "11",
+ "value": "11"
+ },
+ {
+ "selected": false,
+ "text": "12",
+ "value": "12"
+ },
+ {
+ "selected": false,
+ "text": "13",
+ "value": "13"
+ },
+ {
+ "selected": false,
+ "text": "14",
+ "value": "14"
+ },
+ {
+ "selected": false,
+ "text": "15",
+ "value": "15"
+ },
+ {
+ "selected": false,
+ "text": "16",
+ "value": "16"
+ },
+ {
+ "selected": false,
+ "text": "17",
+ "value": "17"
+ },
+ {
+ "selected": false,
+ "text": "18",
+ "value": "18"
+ },
+ {
+ "selected": false,
+ "text": "19",
+ "value": "19"
+ },
+ {
+ "selected": false,
+ "text": "20",
+ "value": "20"
+ },
+ {
+ "selected": false,
+ "text": "21",
+ "value": "21"
+ },
+ {
+ "selected": false,
+ "text": "22",
+ "value": "22"
+ },
+ {
+ "selected": false,
+ "text": "23",
+ "value": "23"
+ },
+ {
+ "selected": false,
+ "text": "24",
+ "value": "24"
+ },
+ {
+ "selected": false,
+ "text": "25",
+ "value": "25"
+ },
+ {
+ "selected": false,
+ "text": "26",
+ "value": "26"
+ },
+ {
+ "selected": false,
+ "text": "27",
+ "value": "27"
+ },
+ {
+ "selected": false,
+ "text": "28",
+ "value": "28"
+ },
+ {
+ "selected": false,
+ "text": "29",
+ "value": "29"
+ },
+ {
+ "selected": false,
+ "text": "30",
+ "value": "30"
+ },
+ {
+ "selected": false,
+ "text": "31",
+ "value": "31"
+ },
+ {
+ "selected": false,
+ "text": "32",
+ "value": "32"
+ },
+ {
+ "selected": false,
+ "text": "33",
+ "value": "33"
+ },
+ {
+ "selected": false,
+ "text": "34",
+ "value": "34"
+ },
+ {
+ "selected": false,
+ "text": "35",
+ "value": "35"
+ },
+ {
+ "selected": false,
+ "text": "36",
+ "value": "36"
+ },
+ {
+ "selected": false,
+ "text": "37",
+ "value": "37"
+ },
+ {
+ "selected": false,
+ "text": "38",
+ "value": "38"
+ },
+ {
+ "selected": false,
+ "text": "39",
+ "value": "39"
+ },
+ {
+ "selected": false,
+ "text": "40",
+ "value": "40"
+ },
+ {
+ "selected": false,
+ "text": "41",
+ "value": "41"
+ },
+ {
+ "selected": false,
+ "text": "42",
+ "value": "42"
+ },
+ {
+ "selected": false,
+ "text": "43",
+ "value": "43"
+ },
+ {
+ "selected": false,
+ "text": "44",
+ "value": "44"
+ },
+ {
+ "selected": false,
+ "text": "45",
+ "value": "45"
+ },
+ {
+ "selected": false,
+ "text": "46",
+ "value": "46"
+ },
+ {
+ "selected": false,
+ "text": "47",
+ "value": "47"
+ },
+ {
+ "selected": false,
+ "text": "48",
+ "value": "48"
+ },
+ {
+ "selected": false,
+ "text": "49",
+ "value": "49"
+ },
+ {
+ "selected": false,
+ "text": "50",
+ "value": "50"
+ },
+ {
+ "selected": false,
+ "text": "51",
+ "value": "51"
+ },
+ {
+ "selected": false,
+ "text": "52",
+ "value": "52"
+ },
+ {
+ "selected": false,
+ "text": "53",
+ "value": "53"
+ },
+ {
+ "selected": false,
+ "text": "54",
+ "value": "54"
+ },
+ {
+ "selected": false,
+ "text": "55",
+ "value": "55"
+ },
+ {
+ "selected": false,
+ "text": "56",
+ "value": "56"
+ },
+ {
+ "selected": false,
+ "text": "57",
+ "value": "57"
+ },
+ {
+ "selected": false,
+ "text": "58",
+ "value": "58"
+ },
+ {
+ "selected": false,
+ "text": "59",
+ "value": "59"
+ },
+ {
+ "selected": false,
+ "text": "60",
+ "value": "60"
+ },
+ {
+ "selected": false,
+ "text": "61",
+ "value": "61"
+ },
+ {
+ "selected": false,
+ "text": "62",
+ "value": "62"
+ },
+ {
+ "selected": false,
+ "text": "63",
+ "value": "63"
+ },
+ {
+ "selected": false,
+ "text": "64",
+ "value": "64"
+ }
+ ],
+ "query": "0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64",
+ "queryValue": "",
+ "skipUrlSync": false,
+ "type": "custom"
+ },
+ {
+ "auto": false,
+ "auto_count": 30,
+ "auto_min": "10s",
+ "current": {
+ "selected": false,
+ "text": "30s",
+ "value": "30s"
+ },
+ "hide": 0,
+ "label": null,
+ "name": "range",
+ "options": [
+ {
+ "selected": true,
+ "text": "30s",
+ "value": "30s"
+ },
+ {
+ "selected": false,
+ "text": "1m",
+ "value": "1m"
+ },
+ {
+ "selected": false,
+ "text": "5m",
+ "value": "5m"
+ },
+ {
+ "selected": false,
+ "text": "10m",
+ "value": "10m"
+ },
+ {
+ "selected": false,
+ "text": "30m",
+ "value": "30m"
+ },
+ {
+ "selected": false,
+ "text": "1h",
+ "value": "1h"
+ },
+ {
+ "selected": false,
+ "text": "6h",
+ "value": "6h"
+ },
+ {
+ "selected": false,
+ "text": "12h",
+ "value": "12h"
+ },
+ {
+ "selected": false,
+ "text": "1d",
+ "value": "1d"
+ },
+ {
+ "selected": false,
+ "text": "7d",
+ "value": "7d"
+ },
+ {
+ "selected": false,
+ "text": "14d",
+ "value": "14d"
+ },
+ {
+ "selected": false,
+ "text": "30d",
+ "value": "30d"
+ }
+ ],
+ "query": "30s,1m,5m,10m,30m,1h,6h,12h,1d,7d,14d,30d",
+ "queryValue": "",
+ "refresh": 2,
+ "skipUrlSync": false,
+ "type": "interval"
+ }
+ ]
+ },
+ "time": {
+ "from": "now-5m",
+ "to": "now"
+ },
+ "timepicker": {
+ "refresh_intervals": [
+ "10s",
+ "30s",
+ "1m",
+ "5m",
+ "15m",
+ "30m",
+ "1h",
+ "2h",
+ "1d"
+ ],
+ "time_options": [
+ "5m",
+ "15m",
+ "1h",
+ "6h",
+ "12h",
+ "24h",
+ "2d",
+ "7d",
+ "30d"
+ ]
+ },
+ "timezone": "browser",
+ "title": "OVS Stats",
+ "uid": "K1N5ciIGz",
+ "version": 7
+ } \ No newline at end of file
diff --git a/tools/lma/metrics/dashboard/rdt_using.json b/tools/lma/metrics/dashboard/rdt_using.json
new file mode 100644
index 00000000..a0ce7987
--- /dev/null
+++ b/tools/lma/metrics/dashboard/rdt_using.json
@@ -0,0 +1,833 @@
+{
+ "annotations": {
+ "list": [
+ {
+ "builtIn": 1,
+ "datasource": "prometheus",
+ "enable": true,
+ "hide": true,
+ "iconColor": "rgba(0, 211, 255, 1)",
+ "limit": 100,
+ "name": "Monitoring",
+ "showIn": 0,
+ "type": "dashboard"
+ }
+ ]
+ },
+ "editable": true,
+ "gnetId": null,
+ "graphTooltip": 0,
+ "id": 7,
+ "iteration": 1597615840124,
+ "links": [],
+ "panels": [
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "$datasource",
+ "fieldConfig": {
+ "defaults": {
+ "custom": {}
+ },
+ "overrides": []
+ },
+ "fill": 1,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 6,
+ "w": 24,
+ "x": 0,
+ "y": 0
+ },
+ "hiddenSeries": false,
+ "id": 1,
+ "interval": "1s",
+ "legend": {
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "max": true,
+ "min": true,
+ "rightSide": true,
+ "show": false,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pluginVersion": "7.1.3",
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "span": 12,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "rate(collectd_intel_rdt_bytes{exported_instance='$host', intel_rdt='$intel_rdt'}[$range])",
+ "interval": "",
+ "legendFormat": "",
+ "refId": "A"
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "RDT Bytes",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "$datasource",
+ "fieldConfig": {
+ "defaults": {
+ "custom": {}
+ },
+ "overrides": []
+ },
+ "fill": 1,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 6,
+ "w": 24,
+ "x": 0,
+ "y": 6
+ },
+ "hiddenSeries": false,
+ "id": 2,
+ "interval": "1s",
+ "legend": {
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "max": true,
+ "min": true,
+ "rightSide": true,
+ "show": false,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pluginVersion": "7.1.3",
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "span": 12,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "rate(collectd_intel_rdt_ipc{exported_instance='$host', intel_rdt='$intel_rdt'}[$range])",
+ "interval": "",
+ "legendFormat": "",
+ "refId": "A"
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "IPC values",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ },
+ {
+ "aliasColors": {},
+ "bars": false,
+ "dashLength": 10,
+ "dashes": false,
+ "datasource": "$datasource",
+ "fieldConfig": {
+ "defaults": {
+ "custom": {}
+ },
+ "overrides": []
+ },
+ "fill": 1,
+ "fillGradient": 0,
+ "gridPos": {
+ "h": 5,
+ "w": 24,
+ "x": 0,
+ "y": 12
+ },
+ "hiddenSeries": false,
+ "id": 3,
+ "interval": "1s",
+ "legend": {
+ "alignAsTable": true,
+ "avg": true,
+ "current": true,
+ "max": true,
+ "min": true,
+ "rightSide": true,
+ "show": false,
+ "total": false,
+ "values": true
+ },
+ "lines": true,
+ "linewidth": 1,
+ "links": [],
+ "nullPointMode": "null",
+ "percentage": false,
+ "pluginVersion": "7.1.3",
+ "pointradius": 5,
+ "points": false,
+ "renderer": "flot",
+ "seriesOverrides": [],
+ "spaceLength": 10,
+ "span": 12,
+ "stack": false,
+ "steppedLine": false,
+ "targets": [
+ {
+ "expr": "rate(collectd_intel_rdt_memory_bandwidth_total{exported_instance='$host', type='local'}[$range])",
+ "hide": false,
+ "interval": "",
+ "legendFormat": "",
+ "refId": "A"
+ },
+ {
+ "expr": "rate(collectd_intel_rdt_memory_bandwidth_total{exported_instance='$host', type='remote'}[$range])",
+ "hide": false,
+ "interval": "",
+ "legendFormat": "",
+ "refId": "B"
+ }
+ ],
+ "thresholds": [],
+ "timeFrom": null,
+ "timeRegions": [],
+ "timeShift": null,
+ "title": "Memory Bandwidth Total",
+ "tooltip": {
+ "shared": true,
+ "sort": 0,
+ "value_type": "individual"
+ },
+ "type": "graph",
+ "xaxis": {
+ "buckets": null,
+ "mode": "time",
+ "name": null,
+ "show": true,
+ "values": []
+ },
+ "yaxes": [
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ },
+ {
+ "format": "short",
+ "label": null,
+ "logBase": 1,
+ "max": null,
+ "min": null,
+ "show": true
+ }
+ ],
+ "yaxis": {
+ "align": false,
+ "alignLevel": null
+ }
+ }
+ ],
+ "refresh": "30s",
+ "schemaVersion": 26,
+ "style": "dark",
+ "tags": [
+ "monitoring"
+ ],
+ "templating": {
+ "list": [
+ {
+ "current": {
+ "selected": false,
+ "text": "prometheus",
+ "value": "prometheus"
+ },
+ "hide": 0,
+ "includeAll": false,
+ "label": null,
+ "multi": false,
+ "name": "datasource",
+ "options": [],
+ "query": "prometheus",
+ "queryValue": "",
+ "refresh": 1,
+ "regex": "",
+ "skipUrlSync": false,
+ "type": "datasource"
+ },
+ {
+ "allValue": null,
+ "current": {
+ "selected": false,
+ "text": "pod12-node4",
+ "value": "pod12-node4"
+ },
+ "hide": 0,
+ "includeAll": false,
+ "label": null,
+ "multi": false,
+ "name": "host",
+ "options": [
+ {
+ "selected": true,
+ "text": "pod12-node4",
+ "value": "pod12-node4"
+ }
+ ],
+ "query": "pod12-node4,",
+ "queryValue": "",
+ "skipUrlSync": false,
+ "type": "custom"
+ },
+ {
+ "auto": false,
+ "auto_count": 30,
+ "auto_min": "10s",
+ "current": {
+ "selected": false,
+ "text": "30s",
+ "value": "30s"
+ },
+ "hide": 0,
+ "label": null,
+ "name": "range",
+ "options": [
+ {
+ "selected": true,
+ "text": "30s",
+ "value": "30s"
+ },
+ {
+ "selected": false,
+ "text": "1m",
+ "value": "1m"
+ },
+ {
+ "selected": false,
+ "text": "5m",
+ "value": "5m"
+ },
+ {
+ "selected": false,
+ "text": "10m",
+ "value": "10m"
+ },
+ {
+ "selected": false,
+ "text": "30m",
+ "value": "30m"
+ },
+ {
+ "selected": false,
+ "text": "1h",
+ "value": "1h"
+ },
+ {
+ "selected": false,
+ "text": "6h",
+ "value": "6h"
+ },
+ {
+ "selected": false,
+ "text": "12h",
+ "value": "12h"
+ },
+ {
+ "selected": false,
+ "text": "1d",
+ "value": "1d"
+ },
+ {
+ "selected": false,
+ "text": "7d",
+ "value": "7d"
+ },
+ {
+ "selected": false,
+ "text": "14d",
+ "value": "14d"
+ },
+ {
+ "selected": false,
+ "text": "30d",
+ "value": "30d"
+ }
+ ],
+ "query": "30s,1m,5m,10m,30m,1h,6h,12h,1d,7d,14d,30d",
+ "queryValue": "",
+ "refresh": 2,
+ "skipUrlSync": false,
+ "type": "interval"
+ },
+ {
+ "allValue": null,
+ "current": {
+ "selected": true,
+ "text": "2",
+ "value": "2"
+ },
+ "hide": 0,
+ "includeAll": false,
+ "label": null,
+ "multi": false,
+ "name": "intel_rdt",
+ "options": [
+ {
+ "selected": false,
+ "text": "0",
+ "value": "0"
+ },
+ {
+ "selected": false,
+ "text": "1",
+ "value": "1"
+ },
+ {
+ "selected": true,
+ "text": "2",
+ "value": "2"
+ },
+ {
+ "selected": false,
+ "text": "3",
+ "value": "3"
+ },
+ {
+ "selected": false,
+ "text": "4",
+ "value": "4"
+ },
+ {
+ "selected": false,
+ "text": "5",
+ "value": "5"
+ },
+ {
+ "selected": false,
+ "text": "6",
+ "value": "6"
+ },
+ {
+ "selected": false,
+ "text": "7",
+ "value": "7"
+ },
+ {
+ "selected": false,
+ "text": "8",
+ "value": "8"
+ },
+ {
+ "selected": false,
+ "text": "9",
+ "value": "9"
+ },
+ {
+ "selected": false,
+ "text": "10",
+ "value": "10"
+ },
+ {
+ "selected": false,
+ "text": "11",
+ "value": "11"
+ },
+ {
+ "selected": false,
+ "text": "12",
+ "value": "12"
+ },
+ {
+ "selected": false,
+ "text": "13",
+ "value": "13"
+ },
+ {
+ "selected": false,
+ "text": "14",
+ "value": "14"
+ },
+ {
+ "selected": false,
+ "text": "15",
+ "value": "15"
+ },
+ {
+ "selected": false,
+ "text": "16",
+ "value": "16"
+ },
+ {
+ "selected": false,
+ "text": "17",
+ "value": "17"
+ },
+ {
+ "selected": false,
+ "text": "18",
+ "value": "18"
+ },
+ {
+ "selected": false,
+ "text": "19",
+ "value": "19"
+ },
+ {
+ "selected": false,
+ "text": "20",
+ "value": "20"
+ },
+ {
+ "selected": false,
+ "text": "21",
+ "value": "21"
+ },
+ {
+ "selected": false,
+ "text": "22",
+ "value": "22"
+ },
+ {
+ "selected": false,
+ "text": "23",
+ "value": "23"
+ },
+ {
+ "selected": false,
+ "text": "24",
+ "value": "24"
+ },
+ {
+ "selected": false,
+ "text": "25",
+ "value": "25"
+ },
+ {
+ "selected": false,
+ "text": "26",
+ "value": "26"
+ },
+ {
+ "selected": false,
+ "text": "27",
+ "value": "27"
+ },
+ {
+ "selected": false,
+ "text": "28",
+ "value": "28"
+ },
+ {
+ "selected": false,
+ "text": "29",
+ "value": "29"
+ },
+ {
+ "selected": false,
+ "text": "30",
+ "value": "30"
+ },
+ {
+ "selected": false,
+ "text": "31",
+ "value": "31"
+ },
+ {
+ "selected": false,
+ "text": "32",
+ "value": "32"
+ },
+ {
+ "selected": false,
+ "text": "33",
+ "value": "33"
+ },
+ {
+ "selected": false,
+ "text": "34",
+ "value": "34"
+ },
+ {
+ "selected": false,
+ "text": "35",
+ "value": "35"
+ },
+ {
+ "selected": false,
+ "text": "36",
+ "value": "36"
+ },
+ {
+ "selected": false,
+ "text": "37",
+ "value": "37"
+ },
+ {
+ "selected": false,
+ "text": "38",
+ "value": "38"
+ },
+ {
+ "selected": false,
+ "text": "39",
+ "value": "39"
+ },
+ {
+ "selected": false,
+ "text": "40",
+ "value": "40"
+ },
+ {
+ "selected": false,
+ "text": "41",
+ "value": "41"
+ },
+ {
+ "selected": false,
+ "text": "42",
+ "value": "42"
+ },
+ {
+ "selected": false,
+ "text": "43",
+ "value": "43"
+ },
+ {
+ "selected": false,
+ "text": "44",
+ "value": "44"
+ },
+ {
+ "selected": false,
+ "text": "45",
+ "value": "45"
+ },
+ {
+ "selected": false,
+ "text": "46",
+ "value": "46"
+ },
+ {
+ "selected": false,
+ "text": "47",
+ "value": "47"
+ },
+ {
+ "selected": false,
+ "text": "48",
+ "value": "48"
+ },
+ {
+ "selected": false,
+ "text": "49",
+ "value": "49"
+ },
+ {
+ "selected": false,
+ "text": "50",
+ "value": "50"
+ },
+ {
+ "selected": false,
+ "text": "51",
+ "value": "51"
+ },
+ {
+ "selected": false,
+ "text": "52",
+ "value": "52"
+ },
+ {
+ "selected": false,
+ "text": "53",
+ "value": "53"
+ },
+ {
+ "selected": false,
+ "text": "54",
+ "value": "54"
+ },
+ {
+ "selected": false,
+ "text": "55",
+ "value": "55"
+ },
+ {
+ "selected": false,
+ "text": "56",
+ "value": "56"
+ },
+ {
+ "selected": false,
+ "text": "57",
+ "value": "57"
+ },
+ {
+ "selected": false,
+ "text": "58",
+ "value": "58"
+ },
+ {
+ "selected": false,
+ "text": "59",
+ "value": "59"
+ },
+ {
+ "selected": false,
+ "text": "60",
+ "value": "60"
+ },
+ {
+ "selected": false,
+ "text": "61",
+ "value": "61"
+ },
+ {
+ "selected": false,
+ "text": "62",
+ "value": "62"
+ },
+ {
+ "selected": false,
+ "text": "63",
+ "value": "63"
+ },
+ {
+ "selected": false,
+ "text": "64",
+ "value": "64"
+ }
+ ],
+ "query": "0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64",
+ "queryValue": "",
+ "skipUrlSync": false,
+ "type": "custom"
+ }
+ ]
+ },
+ "time": {
+ "from": "now-5m",
+ "to": "now"
+ },
+ "timepicker": {
+ "refresh_intervals": [
+ "10s",
+ "30s",
+ "1m",
+ "5m",
+ "15m",
+ "30m",
+ "1h",
+ "2h",
+ "1d"
+ ],
+ "time_options": [
+ "5m",
+ "15m",
+ "1h",
+ "6h",
+ "12h",
+ "24h",
+ "2d",
+ "7d",
+ "30d"
+ ]
+ },
+ "timezone": "browser",
+ "title": "RDT (L3 Cache)",
+ "uid": "kuro-rdt",
+ "version": 9
+} \ No newline at end of file