From 9423f2e5911a9933744e827c9b09e2e9025b4aee Mon Sep 17 00:00:00 2001 From: QiLiang Date: Thu, 15 Mar 2018 12:20:59 +0800 Subject: Initial commit for monitoring by prometheus - install prometheus - validate the installation - add prometheus query function - TODO: test collecting telemetry data from istio JIRA: CLOVER-7 Change-Id: I983be2db78c8c5c20c0acee9ae81e891884e07fb Signed-off-by: QiLiang --- clover/monitoring/monitoring.py | 140 ++++++++++++++++++++++++++++++++++++++++ clover/monitoring/validate.py | 70 ++++++++++++++++++++ docs/monitoring.rst | 31 +++++++++ 3 files changed, 241 insertions(+) create mode 100644 clover/monitoring/monitoring.py create mode 100644 clover/monitoring/validate.py create mode 100644 docs/monitoring.rst diff --git a/clover/monitoring/monitoring.py b/clover/monitoring/monitoring.py new file mode 100644 index 0000000..9726fd1 --- /dev/null +++ b/clover/monitoring/monitoring.py @@ -0,0 +1,140 @@ +# Copyright (c) Authors of Clover +# +# All rights reserved. This program and the accompanying materials +# are made available under the terms of the Apache License, Version 2.0 +# which accompanies this distribution, and is available at +# http://www.apache.org/licenses/LICENSE-2.0 + +from datetime import timedelta +import pprint +import requests +import time + +PROMETHEUS_URL = "http://127.0.0.1:9090" + + +class Monitoring(object): + PROMETHEUS_HEALTH_UP = "up" + PROMETHEUS_ISTIO_TARGETS = {"envoy", + "istio-mesh", + "kubernetes-apiservers", + "kubernetes-cadvisor", + "kubernetes-nodes", + "kubernetes-service-endpoints", + "mixer", + "pilot"} + PROMETHEUS_API_TARGETS = "/api/v1/targets" + PROMETHEUS_API_QUERY = "/api/v1/query" + PROMETHEUS_API_QUERY_RANGE = "/api/v1/query_range" + + def __init__(self, host): + self.host = host + + def get_targets(self): + try: + # Reference api: https://prometheus.io/docs/prometheus/latest/querying/api/#targets + response = requests.get('%s%s' % (self.host, Monitoring.PROMETHEUS_API_TARGETS)) + if 
response.status_code != 200: + print("ERROR: get targets status code: %r" % response.status_code) + return False + except Exception as e: + print("ERROR: Cannot connect to prometheus\n%s" % e) + return False + + return response.json() + + def is_targets_healthy(self): + targets = set() + + raw_targets = self.get_targets() + if raw_targets == False: + return False + + for target in raw_targets["data"]["activeTargets"]: + if target["health"] != Monitoring.PROMETHEUS_HEALTH_UP: + print("ERROR: target unhealth job: %s, health: %s" % \ + (target["labels"]["job"], target["health"])) + return False + targets.add(target["labels"]["job"]) + + diff = Monitoring.PROMETHEUS_ISTIO_TARGETS - targets + if len(diff): + print("ERROR: targets %r not found!" % diff) + return False + + return True + + # Reference links: + # - https://prometheus.io/docs/prometheus/latest/querying/api/#instant-queries + # - https://prometheus.io/docs/prometheus/latest/querying/api/#range-queries + # - https://github.com/prometheus/prombench/blob/master/apps/load-generator/main.py + def query(self, query_params): + try: + start = time.time() + + query_type = query_params.get("type", "instant") + params = {"query": query_params["query"]} + if query_type == "instant": + url = "%s%s" % (self.host, Monitoring.PROMETHEUS_API_QUERY) + elif query_type == "range": + url = "%s%s" % (self.host, Monitoring.PROMETHEUS_API_QUERY_RANGE) + params["start"] = start - duration_seconds(query_params.get("start", "0h")) + params["end"] = start - duration_seconds(query_params.get("end", "0h")) + params["step"] = query_params.get("step", "15s") + else: + print("ERROR: invalidate query type") + return + + resp = requests.get(url, params) + dur = time.time() - start + + print("query %s %s, status=%s, size=%d, dur=%.3f" % \ + (self.host, query_params["query"], resp.status_code, len(resp.text), dur)) + pp = pprint.PrettyPrinter(indent=2) + pp.pprint(resp.json()) + + except Exception as e: + print("ERROR: Could not query prometheus 
instance %s. \n %s" % (url, e)) + + +def duration_seconds(s): + num = int(s[:-1]) + + if s.endswith('s'): + return timedelta(seconds=num).total_seconds() + elif s.endswith('m'): + return timedelta(minutes=num).total_seconds() + elif s.endswith('h'): + return timedelta(hours=num).total_seconds() + + raise "ERROR: unknown duration %s" % s + + +def main(): + m = Monitoring(PROMETHEUS_URL) + if not m.is_targets_healthy(): + print("ERROR: Prometheus targets is unhealthy!") + else: + print("Prometheus targets are all healthy!") + + print "\n### query instant" + query_params = { + "type": "instant", + "query": "istio_double_request_count{destination='details.default.svc.cluster.local'}" + } + m.query(query_params) + + print "\n### query range" + query_range_param = { + "type": "range", + "query": "istio_double_request_count{destination='details.default.svc.cluster.local'}", + "start": "5m", + "end": "3m", + "step": "30s" + } + m.query(query_range_param) + + +if __name__ == '__main__': + main() + diff --git a/clover/monitoring/validate.py b/clover/monitoring/validate.py new file mode 100644 index 0000000..fafe5df --- /dev/null +++ b/clover/monitoring/validate.py @@ -0,0 +1,70 @@ +# Copyright (c) Authors of Clover +# +# All rights reserved. 
This program and the accompanying materials +# are made available under the terms of the Apache License, Version 2.0 +# which accompanies this distribution, and is available at +# http://www.apache.org/licenses/LICENSE-2.0 + +from monitoring import Monitoring +from kubernetes import client, config + +PROMETHEUS_URL = "http://127.0.0.1:9090" +PROMETHEUS_DEPLOYMENT = "prometheus" +PROMETHEUS_LABELS = "app=prometheus" +ISTIO_NAMESPACE = "istio-system" + + +def validateDeploy(): + config.load_kube_config() + appsv1 = client.AppsV1Api() + corev1 = client.CoreV1Api() + find_flag = False + prom_pod_name = None + + # check prometheus deploytment + ret = appsv1.list_deployment_for_all_namespaces(watch=False) + for i in ret.items: + if PROMETHEUS_DEPLOYMENT == i.metadata.name and \ + ISTIO_NAMESPACE == i.metadata.namespace: + find_flag = True + break + if find_flag == False: + print("ERROR: Deployment: {} doesn't present in {} namespace".format( + PROMETHEUS_DEPLOYMENT, ISTIO_NAMESPACE)) + return False + + # find prometheus pod by label selector + ret = corev1.list_namespaced_pod(ISTIO_NAMESPACE, label_selector=PROMETHEUS_LABELS) + for i in ret.items: + prom_pod_name = i.metadata.name + if prom_pod_name == None: + print("ERROR: prometheus pod not found") + return False + + # check prometheus pod status + ret = corev1.read_namespaced_pod_status(prom_pod_name, ISTIO_NAMESPACE) + if ret.status.phase != "Running": + print("ERROR: prometheus pod %s is under %s state" % (prom_pod_name, ret.status.phase)) + return False + + return True + + +def validateService(): + m = Monitoring(PROMETHEUS_URL) + + return m.is_targets_healthy() + + +def main(): + if validateDeploy() and validateService(): + print"Prometheus monitoring validation has passed" + return True + else: + print"ERROR: Prometheus monitoring validation has failed" + return False + + +if __name__ == '__main__': + main() + diff --git a/docs/monitoring.rst b/docs/monitoring.rst new file mode 100644 index 0000000..44b01e3 --- 
/dev/null +++ b/docs/monitoring.rst @@ -0,0 +1,31 @@ +########## +Monitoring +########## + +************ +Installation +************ + +Currently, we use the Istio built-in prometheus addon to install prometheus:: + + cd + kubectl apply -f install/kubernetes/addons/prometheus.yaml + +******** +Validate +******** + +Set up port-forwarding for prometheus by executing the following command:: + + kubectl -n istio-system port-forward $(kubectl -n istio-system get pod -l app=prometheus -o jsonpath='{.items[0].metadata.name}') 9090:9090 & + +Run the script in ``clover/monitoring`` to validate the prometheus installation:: + + python clover/monitoring/validate.py + +It validates the installation against the following criteria + +#. [DONE] prometheus pod is in Running state +#. [DONE] prometheus is connected to monitoring targets +#. [TODO] test collecting telemetry data from istio +#. [TODO] TBD -- cgit 1.2.3-korg