-rw-r--r--  Pipfile                                     |  15
-rw-r--r--  Pipfile.lock                                | 182
-rw-r--r--  clover/logging/install/fluentd-istio.yaml   |  40
-rw-r--r--  clover/logging/install/logging-stack.yaml   | 205
-rw-r--r--  clover/logging/validate.py                  |  56
-rw-r--r--  clover/monitoring/monitoring.py             | 140
-rw-r--r--  clover/monitoring/validate.py               |  70
-rw-r--r--  clover/tracing/tracing.py                   | 201
-rw-r--r--  clover/tracing/tracing_sample.py            |  47
-rw-r--r--  clover/tracing/validate.py                  |  66
-rw-r--r--  docs/logging.rst                            |  28
-rw-r--r--  docs/monitoring.rst                         |  31
-rw-r--r--  docs/tracing.rst                            |  44
13 files changed, 1125 insertions(+), 0 deletions(-)
diff --git a/Pipfile b/Pipfile
new file mode 100644
--- /dev/null
+++ b/Pipfile
@@ -0,0 +1,15 @@
+[[source]]
+
+url = "https://pypi.python.org/simple"
+verify_ssl = true
+name = "pypi"
+
+
+[dev-packages]
+
+
+
+[packages]
+
+kubernetes = "*"
+sh = "*"
diff --git a/Pipfile.lock b/Pipfile.lock
new file mode 100644
index 0000000..d840d68
--- /dev/null
+++ b/Pipfile.lock
@@ -0,0 +1,182 @@
+{
+    "_meta": {
+        "hash": {
+            "sha256": "3e773b6eb42a9dba3e5cb71bcac1c832939ab3b069641084d9f5ecd0967ce7cf"
+        },
+        "pipfile-spec": 6,
+        "requires": {},
+        "sources": [
+            {
+                "name": "pypi",
+                "url": "https://pypi.python.org/simple",
+                "verify_ssl": true
+            }
+        ]
+    },
+    "default": {
+        "cachetools": {
+            "hashes": [
+                "sha256:4319bbb78172e7bcf99423e1ecd6914b32336ccfe97d2058ffe62e641a7f3abe",
+                "sha256:ede01f2d3cbd6ddc9e35e16c2b0ce011d8bb70ce0dbaf282f5b4df24b213bc5d"
+            ],
+            "version": "==2.0.1"
+        },
+        "certifi": {
+            "hashes": [
+                "sha256:14131608ad2fd56836d33a71ee60fa1c82bc9d2c8d98b7bdbc631fe1b3cd1296",
+                "sha256:edbc3f203427eef571f79a7692bb160a2b0f7ccaa31953e99bd17e307cf63f7d"
+            ],
+            "version": "==2018.1.18"
+        },
+        "chardet": {
+            "hashes": [
+                "sha256:84ab92ed1c4d4f16916e05906b6b75a6c0fb5db821cc65e70cbd64a3e2a5eaae",
+                "sha256:fc323ffcaeaed0e0a02bf4d117757b98aed530d9ed4531e3e15460124c106691"
+            ],
+            "version": "==3.0.4"
+        },
+        "google-auth": {
+            "hashes": [
+                "sha256:34088434cb2a2409360b8f3cbc04195a465df1fb2aafad71ebbded77cbf08803",
+                "sha256:9051802d3dae256036cca9e34633a32c0ed1427730d4ebc513dff91ec8b6dd45"
+            ],
+            "version": "==1.4.1"
+        },
+        "idna": {
+            "hashes": [
+                "sha256:2c6a5de3089009e3da7c5dde64a141dbc8551d5b7f6cf4ed7c2568d0cc520a8f",
+                "sha256:8c7309c718f94b3a625cb648ace320157ad16ff131ae0af362c9f21b80ef6ec4"
+            ],
+            "version": "==2.6"
+        },
+        "ipaddress": {
+            "hashes": [
+                "sha256:200d8686011d470b5e4de207d803445deee427455cd0cb7c982b68cf82524f81"
+            ],
+            "version": "==1.0.19"
+        },
+        "kubernetes": {
+            "hashes": [
+                "sha256:2f1a05a9bb2549d6afb6d138b2767d61d8aeb735a7a12bf554440524205e2894",
+                "sha256:f81f145882471a1dd9d23360e99bd77027f07744729ef2728af4af7130cd19fd"
+            ],
+            "version": "==5.0.0"
+        },
+        "oauthlib": {
+            "hashes": [
+                "sha256:ce57b501e906ff4f614e71c36a3ab9eacbb96d35c24d1970d2539bbc3ec70ce1"
+            ],
+            "version": "==2.0.6"
+        },
+        "pyasn1": {
+            "hashes": [
+                "sha256:0d7f6e959fe53f3960a23d73f35e1fce61348b30915b6664309ca756de7c1f89",
+                "sha256:5a0db897b311d265cde49615cf783f1c78613138605cdd0f907ecfa5b2aba3ee",
+                "sha256:758cb50abddc03e4563fd9e7f03db56e3e87b58c0bd01247360326e5c0c7ffa5",
+                "sha256:7d626683e3d792cccc608da02498aff37ab4f3dafd8905d6bf755d11f9b26b43",
+                "sha256:a7efe807c4b83a859e2735c692b92ed7b567cfddc4163763412920041d876c2b",
+                "sha256:b5a9ca48055b9a20f6d1b3d68e38692e5431c86a0f99ea602e61294e891fee5b",
+                "sha256:c07d6e587b2f928366b1f67c09bda026a3e6fcc99e80a744dc67f8fca3895626",
+                "sha256:d258b0a71994f7770599835249cece1caef3c70def868c4915e6e5ca49b67d15",
+                "sha256:d5cd6ed995dba16fad0c521cfe31cd2d68400b53fcc2bce93326829be73ab6d1",
+                "sha256:d84c2aea3cf43780e9e6a19f4e4dddee9f6976519020e64e47c57e5c7a8c3dd2",
+                "sha256:e85895087905c65b5b594eb91f7522664c85545b147d5f4d4e7b1b07da8dcbdc",
+                "sha256:f81c96761fca60d64b1c9b79ec2e40cf9495a745cf570613079ef324aeb9672b"
+            ],
+            "version": "==0.4.2"
+        },
+        "pyasn1-modules": {
+            "hashes": [
+                "sha256:041e9fbafac548d095f5b6c3b328b80792f006196e15a232b731a83c93d59493",
+                "sha256:0cdca76a68dcb701fff58c397de0ef9922b472b1cb3ea9695ca19d03f1869787",
+                "sha256:0cea139045c38f84abaa803bcb4b5e8775ea12a42af10019d942f227acc426c3",
+                "sha256:0f2e50d20bc670be170966638fa0ae603f0bc9ed6ebe8e97a6d1d4cef30cc889",
+                "sha256:47fb6757ab78fe966e7c58b2030b546854f78416d653163f0ce9290cf2278e8b",
+                "sha256:598a6004ec26a8ab40a39ea955068cf2a3949ad9c0030da970f2e1ca4c9f1cc9",
+                "sha256:72fd8b0c11191da088147c6e4678ec53e573923ecf60b57eeac9e97433e09fc2",
+                "sha256:854700bbdd01394e2ada9c1bfbd0ed9f5d0c551350dbbd023e88b11d2771ae06",
+                "sha256:af00ea8f2022b6287dc375b2c70f31ab5af83989fc6fe9eacd4976ce26cd7ccc",
+                "sha256:b1f395cae2d669e0830cb023aa86f9f283b7a9aa32317d7f80d8e78aa2745812",
+                "sha256:c6747146e95d2b14cc2a8399b2b0bde3f93778f8f9ec704690d2b589c376c137",
+                "sha256:f53fe5bcebdf318f51399b250fe8325ef3a26d927f012cc0c8e0f9e9af7f9deb"
+            ],
+            "version": "==0.2.1"
+        },
+        "python-dateutil": {
+            "hashes": [
+                "sha256:07009062406cffd554a9b4135cd2ff167c9bf6b7aac61fe946c93e69fad1bbd8",
+                "sha256:8f95bb7e6edbb2456a51a1fb58c8dca942024b4f5844cae62c90aa88afe6e300"
+            ],
+            "version": "==2.7.0"
+        },
+        "pyyaml": {
+            "hashes": [
+                "sha256:0c507b7f74b3d2dd4d1322ec8a94794927305ab4cebbe89cc47fe5e81541e6e8",
+                "sha256:16b20e970597e051997d90dc2cddc713a2876c47e3d92d59ee198700c5427736",
+                "sha256:3262c96a1ca437e7e4763e2843746588a965426550f3797a79fca9c6199c431f",
+                "sha256:326420cbb492172dec84b0f65c80942de6cedb5233c413dd824483989c000608",
+                "sha256:4474f8ea030b5127225b8894d626bb66c01cda098d47a2b0d3429b6700af9fd8",
+                "sha256:592766c6303207a20efc445587778322d7f73b161bd994f227adaa341ba212ab",
+                "sha256:5ac82e411044fb129bae5cfbeb3ba626acb2af31a8d17d175004b70862a741a7",
+                "sha256:5f84523c076ad14ff5e6c037fe1c89a7f73a3e04cf0377cb4d017014976433f3",
+                "sha256:827dc04b8fa7d07c44de11fabbc888e627fa8293b695e0f99cb544fdfa1bf0d1",
+                "sha256:b4c423ab23291d3945ac61346feeb9a0dc4184999ede5e7c43e1ffb975130ae6",
+                "sha256:bc6bced57f826ca7cb5125a10b23fd0f2fff3b7c4701d64c439a300ce665fff8",
+                "sha256:c01b880ec30b5a6e6aa67b09a2fe3fb30473008c85cd6a67359a1b15ed6d83a4",
+                "sha256:ca233c64c6e40eaa6c66ef97058cdc80e8d0157a443655baa1b2966e812807ca",
+                "sha256:e863072cdf4c72eebf179342c94e6989c67185842d9997960b3e69290b2fa269"
+            ],
+            "version": "==3.12"
+        },
+        "requests": {
+            "hashes": [
+                "sha256:6a1b267aa90cac58ac3a765d067950e7dbbf75b1da07e895d1f594193a40a38b",
+                "sha256:9c443e7324ba5b85070c4a818ade28bfabedf16ea10206da1132edaa6dda237e"
+            ],
+            "version": "==2.18.4"
+        },
+        "requests-oauthlib": {
+            "hashes": [
+                "sha256:50a8ae2ce8273e384895972b56193c7409601a66d4975774c60c2aed869639ca",
+                "sha256:883ac416757eada6d3d07054ec7092ac21c7f35cb1d2cf82faf205637081f468"
+            ],
+            "version": "==0.8.0"
+        },
+        "rsa": {
+            "hashes": [
+                "sha256:25df4e10c263fb88b5ace923dd84bf9aa7f5019687b5e55382ffcdb8bede9db5",
+                "sha256:43f682fea81c452c98d09fc316aae12de6d30c4b5c84226642cf8f8fd1c93abd"
+            ],
+            "version": "==3.4.2"
+        },
+        "sh": {
+            "hashes": [
+                "sha256:ae3258c5249493cebe73cb4e18253a41ed69262484bad36fdb3efcb8ad8870bb",
+                "sha256:b52bf5833ed01c7b5c5fb73a7f71b3d98d48e9b9b8764236237bdc7ecae850fc"
+            ],
+            "version": "==1.12.14"
+        },
+        "six": {
+            "hashes": [
+                "sha256:70e8a77beed4562e7f14fe23a786b54f6296e34344c23bc42f07b15018ff98e9",
+                "sha256:832dc0e10feb1aa2c68dcc57dbb658f1c7e65b9b61af69048abc87a2db00a0eb"
+            ],
+            "version": "==1.11.0"
+        },
+        "urllib3": {
+            "hashes": [
+                "sha256:06330f386d6e4b195fbfc736b297f58c5a892e4440e54d294d7004e3a9bbea1b",
+                "sha256:cc44da8e1145637334317feebd728bd869a35285b93cbb4cca2577da7e62db4f"
+            ],
+            "version": "==1.22"
+        },
+        "websocket-client": {
+            "hashes": [
+                "sha256:188b68b14fdb2d8eb1a111f21b9ffd2dbf1dbc4e4c1d28cf2c37cdbf1dd1cae6",
+                "sha256:a453dc4dfa6e0db3d8fd7738a308a88effe6240c59f3226eb93e8f020c216149"
+            ],
+            "version": "==0.47.0"
+        }
+    },
+    "develop": {}
+}
diff --git a/clover/logging/install/fluentd-istio.yaml b/clover/logging/install/fluentd-istio.yaml
new file mode 100644
index 0000000..1853831
--- /dev/null
+++ b/clover/logging/install/fluentd-istio.yaml
@@ -0,0 +1,40 @@
+# Configuration for logentry instances
+apiVersion: "config.istio.io/v1alpha2"
+kind: logentry
+metadata:
+  name: newlog
+  namespace: istio-system
+spec:
+  severity: '"info"'
+  timestamp: request.time
+  variables:
+    source: source.labels["app"] | source.service | "unknown"
+    user: source.user | "unknown"
+    destination: destination.labels["app"] | destination.service | "unknown"
+    responseCode: response.code | 0
+    responseSize: response.size | 0
+    latency: response.duration | "0ms"
+  monitored_resource_type: '"UNSPECIFIED"'
+---
+# Configuration for a fluentd handler
+apiVersion: "config.istio.io/v1alpha2"
+kind: fluentd
+metadata:
+  name: handler
+  namespace: istio-system
+spec:
+  address: "fluentd-es.logging:24224"
+---
+# Rule to send logentry instances to the fluentd handler
+apiVersion: "config.istio.io/v1alpha2"
+kind: rule
+metadata:
+  name: newlogtofluentd
+  namespace: istio-system
+spec:
+  match: "true" # match for all requests
+  actions:
+  - handler: handler.fluentd
+    instances:
+    - newlog.logentry
+---
diff --git a/clover/logging/install/logging-stack.yaml b/clover/logging/install/logging-stack.yaml
new file mode 100644
index 0000000..9542496
--- /dev/null
+++ b/clover/logging/install/logging-stack.yaml
@@ -0,0 +1,205 @@
+# Logging Namespace. All below are a part of this namespace.
+apiVersion: v1
+kind: Namespace
+metadata:
+  name: logging
+---
+# Elasticsearch Service
+apiVersion: v1
+kind: Service
+metadata:
+  name: elasticsearch
+  namespace: logging
+  labels:
+    app: elasticsearch
+spec:
+  ports:
+  - port: 9200
+    protocol: TCP
+    targetPort: db
+  selector:
+    app: elasticsearch
+  type: NodePort
+---
+# Elasticsearch Deployment
+apiVersion: extensions/v1beta1
+kind: Deployment
+metadata:
+  name: elasticsearch
+  namespace: logging
+  labels:
+    app: elasticsearch
+  annotations:
+    sidecar.istio.io/inject: "false"
+spec:
+  template:
+    metadata:
+      labels:
+        app: elasticsearch
+    spec:
+      containers:
+      - image: docker.elastic.co/elasticsearch/elasticsearch-oss:6.1.1
+        name: elasticsearch
+        resources:
+          # need more cpu upon initialization, therefore burstable class
+          limits:
+            cpu: 1000m
+          requests:
+            cpu: 100m
+        env:
+        - name: discovery.type
+          value: single-node
+        ports:
+        - containerPort: 9200
+          name: db
+          protocol: TCP
+        - containerPort: 9300
+          name: transport
+          protocol: TCP
+        volumeMounts:
+        - name: elasticsearch
+          mountPath: /data
+      volumes:
+      - name: elasticsearch
+        emptyDir: {}
+---
+# Fluentd Service
+apiVersion: v1
+kind: Service
+metadata:
+  name: fluentd-es
+  namespace: logging
+  labels:
+    app: fluentd-es
+spec:
+  ports:
+  - name: fluentd-tcp
+    port: 24224
+    protocol: TCP
+    targetPort: 24224
+  - name: fluentd-udp
+    port: 24224
+    protocol: UDP
+    targetPort: 24224
+  selector:
+    app: fluentd-es
+---
+# Fluentd Deployment
+apiVersion: extensions/v1beta1
+kind: Deployment
+metadata:
+  name: fluentd-es
+  namespace: logging
+  labels:
+    app: fluentd-es
+  annotations:
+    sidecar.istio.io/inject: "false"
+spec:
+  template:
+    metadata:
+      labels:
+        app: fluentd-es
+    spec:
+      containers:
+      - name: fluentd-es
+        image: gcr.io/google-containers/fluentd-elasticsearch:v2.0.1
+        env:
+        - name: FLUENTD_ARGS
+          value: --no-supervisor -q
+        resources:
+          limits:
+            memory: 500Mi
+          requests:
+            cpu: 100m
+            memory: 200Mi
+        volumeMounts:
+        - name: config-volume
+          mountPath: /etc/fluent/config.d
+      terminationGracePeriodSeconds: 30
+      volumes:
+      - name: config-volume
+        configMap:
+          name: fluentd-es-config
+---
+# Fluentd ConfigMap, contains config files.
+kind: ConfigMap
+apiVersion: v1
+data:
+  forward.input.conf: |-
+    # Takes the messages sent over TCP
+    <source>
+      type forward
+    </source>
+  output.conf: |-
+    <match **>
+       type elasticsearch
+       log_level info
+       include_tag_key true
+       host elasticsearch
+       port 9200
+       logstash_format true
+       # Set the chunk limits.
+       buffer_chunk_limit 2M
+       buffer_queue_limit 8
+       flush_interval 5s
+       # Never wait longer than 30 seconds between retries.
+       max_retry_wait 30
+       # Disable the limit on the number of retries (retry forever).
+       disable_retry_limit
+       # Use multiple threads for processing.
+       num_threads 2
+    </match>
+metadata:
+  name: fluentd-es-config
+  namespace: logging
+---
+# Kibana Service
+apiVersion: v1
+kind: Service
+metadata:
+  name: kibana
+  namespace: logging
+  labels:
+    app: kibana
+spec:
+  ports:
+  - port: 5601
+    protocol: TCP
+    targetPort: ui
+  selector:
+    app: kibana
+  type: NodePort
+---
+# Kibana Deployment
+apiVersion: extensions/v1beta1
+kind: Deployment
+metadata:
+  name: kibana
+  namespace: logging
+  labels:
+    app: kibana
+  annotations:
+    sidecar.istio.io/inject: "false"
+spec:
+  template:
+    metadata:
+      labels:
+        app: kibana
+    spec:
+      containers:
+      - name: kibana
+        image: docker.elastic.co/kibana/kibana-oss:6.1.1
+        resources:
+          # need more cpu upon initialization, therefore burstable class
+          limits:
+            cpu: 1000m
+          requests:
+            cpu: 100m
+        env:
+        - name: ELASTICSEARCH_URL
+          value: http://elasticsearch:9200
+        ports:
+        - containerPort: 5601
+          name: ui
+          protocol: TCP
+---
diff --git a/clover/logging/validate.py b/clover/logging/validate.py
new file mode 100644
index 0000000..821f912
--- /dev/null
+++ b/clover/logging/validate.py
@@ -0,0 +1,56 @@
+# Copyright (c) Authors of Clover
+#
+# All rights reserved. This program and the accompanying materials
+# are made available under the terms of the Apache License, Version 2.0
+# which accompanies this distribution, and is available at
+# http://www.apache.org/licenses/LICENSE-2.0
+
+from kubernetes import client, config
+from kubernetes.stream import stream
+import sh
+import re
+
+FLUENTD_NAMESPACE = 'logging'
+FLUENTD_PATTERN = 'fluentd-.*'
+FLUENTD_LABELS = 'app=fluentd-es'
+FLUENTD_INPUT = """<source>
+  type forward
+</source>"""
+
+
+def main():
+    # Load config from default location.
+    config.load_kube_config()
+
+    v1 = client.CoreV1Api()
+
+    fluentd_pod_name = None
+
+    # find by name
+    print("Find fluentd pod by name '{}'".format(FLUENTD_PATTERN))
+    fluentd_regex = re.compile(FLUENTD_PATTERN)
+    resp = v1.list_namespaced_pod(FLUENTD_NAMESPACE)
+    for i in resp.items:
+        if fluentd_regex.search(i.metadata.name) is not None:
+            print(i.metadata.name)
+
+    # find by label selector
+    print("Find fluentd pod by label selector '{}'".format(FLUENTD_LABELS))
+    resp = v1.list_namespaced_pod(FLUENTD_NAMESPACE,
+                                  label_selector=FLUENTD_LABELS)
+    for i in resp.items:
+        print(i.metadata.name)
+        fluentd_pod_name = i.metadata.name
+
+    # check fluentd configuration
+    # NOTE: exec in the Python library does not work well, use a shell
+    # command as a workaround
+    # See https://github.com/kubernetes-client/python/issues/485
+    result = sh.kubectl((
+        'exec -n logging ' +
+        fluentd_pod_name +
+        ' cat /etc/fluent/config.d/forward.input.conf').split())
+    if FLUENTD_INPUT in result:
+        print("fluentd input configured correctly")
+    else:
+        print("fluentd input not configured\n{}".format(FLUENTD_INPUT))
+
+
+if __name__ == '__main__':
+    main()
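+
+# Optional follow-up check (sketch only, not run above): with logstash_format
+# enabled in logging-stack.yaml, Elasticsearch creates logstash-* indices once
+# Mixer forwards log entries to fluentd. Assuming a port-forward to the
+# elasticsearch service on 9200, the index listing confirms logs are arriving:
+#
+#   import requests
+#   print(requests.get('http://localhost:9200/_cat/indices?v').text)
+#
+# An empty listing means no log entries have reached Elasticsearch yet.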
diff --git a/clover/monitoring/monitoring.py b/clover/monitoring/monitoring.py
new file mode 100644
index 0000000..9726fd1
--- /dev/null
+++ b/clover/monitoring/monitoring.py
@@ -0,0 +1,140 @@
+# Copyright (c) Authors of Clover
+#
+# All rights reserved. This program and the accompanying materials
+# are made available under the terms of the Apache License, Version 2.0
+# which accompanies this distribution, and is available at
+# http://www.apache.org/licenses/LICENSE-2.0
+
+from datetime import timedelta
+import pprint
+import requests
+import time
+
+PROMETHEUS_URL = "http://127.0.0.1:9090"
+
+
+class Monitoring(object):
+    PROMETHEUS_HEALTH_UP = "up"
+    PROMETHEUS_ISTIO_TARGETS = {"envoy",
+                                "istio-mesh",
+                                "kubernetes-apiservers",
+                                "kubernetes-cadvisor",
+                                "kubernetes-nodes",
+                                "kubernetes-service-endpoints",
+                                "mixer",
+                                "pilot"}
+    PROMETHEUS_API_TARGETS = "/api/v1/targets"
+    PROMETHEUS_API_QUERY = "/api/v1/query"
+    PROMETHEUS_API_QUERY_RANGE = "/api/v1/query_range"
+
+    def __init__(self, host):
+        self.host = host
+
+    def get_targets(self):
+        try:
+            # API reference: https://prometheus.io/docs/prometheus/latest/querying/api/#targets
+            response = requests.get('%s%s' % (self.host,
+                                              Monitoring.PROMETHEUS_API_TARGETS))
+            if response.status_code != 200:
+                print("ERROR: get targets status code: %r" % response.status_code)
+                return False
+        except Exception as e:
+            print("ERROR: Cannot connect to prometheus\n%s" % e)
+            return False
+
+        return response.json()
+
+    def is_targets_healthy(self):
+        targets = set()
+
+        raw_targets = self.get_targets()
+        if raw_targets is False:
+            return False
+
+        for target in raw_targets["data"]["activeTargets"]:
+            if target["health"] != Monitoring.PROMETHEUS_HEALTH_UP:
+                print("ERROR: unhealthy target job: %s, health: %s" %
+                      (target["labels"]["job"], target["health"]))
+                return False
+            targets.add(target["labels"]["job"])
+
+        diff = Monitoring.PROMETHEUS_ISTIO_TARGETS - targets
+        if len(diff):
+            print("ERROR: targets %r not found!" % diff)
+            return False
+
+        return True
+
+    # Reference links:
+    # - https://prometheus.io/docs/prometheus/latest/querying/api/#instant-queries
+    # - https://prometheus.io/docs/prometheus/latest/querying/api/#range-queries
+    # - https://github.com/prometheus/prombench/blob/master/apps/load-generator/main.py
+    def query(self, query_params):
+        try:
+            start = time.time()
+
+            query_type = query_params.get("type", "instant")
+            params = {"query": query_params["query"]}
+            if query_type == "instant":
+                url = "%s%s" % (self.host, Monitoring.PROMETHEUS_API_QUERY)
+            elif query_type == "range":
+                url = "%s%s" % (self.host, Monitoring.PROMETHEUS_API_QUERY_RANGE)
+                params["start"] = start - duration_seconds(query_params.get("start", "0h"))
+                params["end"] = start - duration_seconds(query_params.get("end", "0h"))
+                params["step"] = query_params.get("step", "15s")
+            else:
+                print("ERROR: invalid query type")
+                return
+
+            resp = requests.get(url, params)
+            dur = time.time() - start
+
+            print("query %s %s, status=%s, size=%d, dur=%.3f" %
+                  (self.host, query_params["query"], resp.status_code,
+                   len(resp.text), dur))
+            pp = pprint.PrettyPrinter(indent=2)
+            pp.pprint(resp.json())
+
+        except Exception as e:
+            print("ERROR: Could not query prometheus instance %s. \n %s" % (url, e))
+
+
+def duration_seconds(s):
+    num = int(s[:-1])
+
+    if s.endswith('s'):
+        return timedelta(seconds=num).total_seconds()
+    elif s.endswith('m'):
+        return timedelta(minutes=num).total_seconds()
+    elif s.endswith('h'):
+        return timedelta(hours=num).total_seconds()
+
+    raise ValueError("unknown duration %s" % s)
+
+
+def main():
+    m = Monitoring(PROMETHEUS_URL)
+    if not m.is_targets_healthy():
+        print("ERROR: Prometheus targets are unhealthy!")
+    else:
+        print("Prometheus targets are all healthy!")
+
+    print("\n### query instant")
+    query_params = {
+        "type": "instant",
+        "query": "istio_double_request_count{destination='details.default.svc.cluster.local'}"
+    }
+    m.query(query_params)
+
+    print("\n### query range")
+    query_range_param = {
+        "type": "range",
+        "query": "istio_double_request_count{destination='details.default.svc.cluster.local'}",
+        "start": "5m",
+        "end": "3m",
+        "step": "30s"
+    }
+    m.query(query_range_param)
+
+
+if __name__ == '__main__':
+    main()
+
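+# Worked example of the range query above: duration_seconds("5m") returns
+# 300.0, so query_range_param covers the window from five minutes ago
+# ("start") to three minutes ago ("end"), sampled every 30 seconds ("step").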
diff --git a/clover/monitoring/validate.py b/clover/monitoring/validate.py
new file mode 100644
index 0000000..fafe5df
--- /dev/null
+++ b/clover/monitoring/validate.py
@@ -0,0 +1,70 @@
+# Copyright (c) Authors of Clover
+#
+# All rights reserved. This program and the accompanying materials
+# are made available under the terms of the Apache License, Version 2.0
+# which accompanies this distribution, and is available at
+# http://www.apache.org/licenses/LICENSE-2.0
+
+from monitoring import Monitoring
+from kubernetes import client, config
+
+PROMETHEUS_URL = "http://127.0.0.1:9090"
+PROMETHEUS_DEPLOYMENT = "prometheus"
+PROMETHEUS_LABELS = "app=prometheus"
+ISTIO_NAMESPACE = "istio-system"
+
+
+def validateDeploy():
+    config.load_kube_config()
+    appsv1 = client.AppsV1Api()
+    corev1 = client.CoreV1Api()
+    find_flag = False
+    prom_pod_name = None
+
+    # check prometheus deployment
+    ret = appsv1.list_deployment_for_all_namespaces(watch=False)
+    for i in ret.items:
+        if PROMETHEUS_DEPLOYMENT == i.metadata.name and \
+           ISTIO_NAMESPACE == i.metadata.namespace:
+            find_flag = True
+            break
+    if not find_flag:
+        print("ERROR: Deployment: {} not found in {} namespace".format(
+            PROMETHEUS_DEPLOYMENT, ISTIO_NAMESPACE))
+        return False
+
+    # find prometheus pod by label selector
+    ret = corev1.list_namespaced_pod(ISTIO_NAMESPACE,
+                                     label_selector=PROMETHEUS_LABELS)
+    for i in ret.items:
+        prom_pod_name = i.metadata.name
+    if prom_pod_name is None:
+        print("ERROR: prometheus pod not found")
+        return False
+
+    # check prometheus pod status
+    ret = corev1.read_namespaced_pod_status(prom_pod_name, ISTIO_NAMESPACE)
+    if ret.status.phase != "Running":
+        print("ERROR: prometheus pod %s is in %s state" %
+              (prom_pod_name, ret.status.phase))
+        return False
+
+    return True
+
+
+def validateService():
+    m = Monitoring(PROMETHEUS_URL)
+
+    return m.is_targets_healthy()
+
+
+def main():
+    if validateDeploy() and validateService():
+        print("Prometheus monitoring validation has passed")
+        return True
+    else:
+        print("ERROR: Prometheus monitoring validation has failed")
+        return False
+
+
+if __name__ == '__main__':
+    main()
+
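+# NOTE (usage assumption): PROMETHEUS_URL above expects the port-forward
+# described in docs/monitoring.rst to be active, e.g.:
+#   kubectl -n istio-system port-forward <prometheus-pod> 9090:9090 &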
diff --git a/clover/tracing/tracing.py b/clover/tracing/tracing.py
new file mode 100644
index 0000000..16b952c
--- /dev/null
+++ b/clover/tracing/tracing.py
@@ -0,0 +1,201 @@
+# Copyright (c) Authors of Clover
+#
+# All rights reserved. This program and the accompanying materials
+# are made available under the terms of the Apache License, Version 2.0
+# which accompanies this distribution, and is available at
+# http://www.apache.org/licenses/LICENSE-2.0
+
+import requests
+import time
+import redis
+
+TRACING_IP = "localhost"
+TRACING_PORT = "30888"
+
+
+class Tracing(object):
+
+    def __init__(
+            self, tracing_ip, tracing_port, redis_ip='localhost',
+            use_redis=True):
+        self.tracing_ip = tracing_ip
+        self.tracing_port = tracing_port
+        self.testid = '0'
+        self.test_start_time = 0
+        self.use_redis = use_redis
+        if use_redis:
+            try:
+                self.r = redis.StrictRedis(host=redis_ip, port=6379, db=0)
+            except Exception:
+                print("Failed to connect to redis")
+
+    def setRedisSet(self, rkey, rvalue):
+        if self.use_redis:
+            self.r.sadd(rkey, rvalue)
+
+    def setRedisList(self, rkey, rvalue):
+        if self.use_redis:
+            self.r.lpush(rkey, rvalue)
+
+    def setRedisHash(self, rkey, rvalue):
+        if self.use_redis:
+            self.r.hmset(rkey, rvalue)
+
+    def getRedisTestid(self, index):
+        testid = self.r.lrange("testids", index, index)
+        return testid[0]
+
+    def getRedisTraceids(self, testid):
+        rkey = "traceids:" + str(testid)
+        traceids = self.r.smembers(rkey)
+        return traceids
+
+    def getRedisSpanids(self, traceid):
+        rkey = "spanids:" + str(traceid)
+        spanids = self.r.smembers(rkey)
+        return spanids
+
+    def getRedisSpan(self, spanid, traceid):
+        rkey = "spans:" + str(traceid) + ':' + str(spanid)
+        span = self.r.hgetall(rkey)
+        return span
+
+    def getRedisSpanValue(self, spanid, traceid, span_key):
+        rkey = "spans:" + str(traceid) + ':' + str(spanid)
+        span_value = self.r.hget(rkey, span_key)
+        return span_value
+
+    def getRedisTags(self, spanid, traceid):
+        rkey = "tags:" + str(spanid) + ':' + str(traceid)
+        tags = self.r.hgetall(rkey)
+        return tags
+
+    def getRedisTagsValue(self, spanid, traceid, tag_key):
+        rkey = "tags:" + str(spanid) + ':' + str(traceid)
+        tag_value = self.r.hget(rkey, tag_key)
+        return tag_value
+
+    def getRedisTestAll(self, testid):
+        traceids = self.getRedisTraceids(testid)
+        for trace in traceids:
+            spanids = self.getRedisSpanids(trace)
+            for span in spanids:
+                # print(self.getRedisSpan(span, trace))
+                print(self.getRedisSpanValue(span, trace, 'duration'))
+                # print(self.getRedisTags(span, trace))
+                print(self.getRedisTagsValue(span, trace, 'node_id'))
+
+    def setTest(self, testid):
+        self.testid = testid
+        self.setRedisList("testids", testid)
+        self.test_start_time = int(time.time())
+
+    def getServices(self):
+        req_url = 'http://' + self.tracing_ip + ':' + self.tracing_port + \
+            '/api/services'
+        try:
+            response = requests.get(req_url)
+            if response.status_code != 200:
+                print("ERROR: Cannot connect to tracing: {}".format(
+                    response.status_code))
+                return False
+        except Exception as e:
+            print("ERROR: Cannot connect to tracing")
+            print(e)
+            return False
+
+        data = response.json()
+        services = data['data']
+        return services
+
+    def getTraces(self, service, time_back=3600, limit='1000'):
+        ref_time = int(time.time())
+        pad_time = '757000'
+        end_time = 'end=' + str(ref_time) + pad_time + '&'
+        if time_back == 0:
+            delta = self.test_start_time
+        else:
+            delta = ref_time - time_back
+        start_time = 'start=' + str(delta) + pad_time
+        limit = 'limit=' + limit + '&'
+        lookback = 'lookback=1h&'
+        max_dur = 'maxDuration&'
+        min_dur = 'minDuration&'
+        service = 'service=' + service + '&'
+        url_prefix = 'http://' + self.tracing_ip + ':' + self.tracing_port + \
+            '/api/traces?'
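+        # Example of the assembled request URL (illustrative timestamps):
+        #   http://localhost:30888/api/traces?end=1521000000757000&limit=1000&
+        #   lookback=1h&maxDuration&minDuration&service=istio-ingress&
+        #   start=1520996400757000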
+        req_url = url_prefix + end_time + limit + lookback + max_dur + \
+            min_dur + service + start_time
+
+        try:
+            response = requests.get(req_url)
+            if response.status_code != 200:
+                print("ERROR: Cannot connect to tracing: {}".format(
+                    response.status_code))
+                return False
+        except Exception as e:
+            print("ERROR: Cannot connect to tracing")
+            print(e)
+            return False
+
+        traces = response.json()
+        return traces
+
+    def numTraces(self, trace):
+        num_traces = len(trace['data'])
+        return str(num_traces)
+
+    def outProcesses(self, trace):
+        processes = []
+        if trace['data']:
+            first_trace = trace['data'][0]
+            for process in first_trace['processes']:
+                processes.append(process)
+        print(processes)
+        return processes
+
+    def outTraces(self, trace):
+        for traces in trace['data']:
+            print("TraceID: {}".format(traces['traceID']))
+            self.setRedisSet(
+                "traceids:{}".format(str(self.testid)), traces['traceID'])
+            for spans in traces['spans']:
+                print("SpanID: {}".format(spans['spanID']))
+                self.setRedisSet(
+                    "spanids:{}".format(traces['traceID']), spans['spanID'])
+                print("Duration: {} usec".format(spans['duration']))
+                span = {}
+                span['spanID'] = spans['spanID']
+                span['duration'] = spans['duration']
+                span['startTime'] = spans['startTime']
+                span['operationName'] = spans['operationName']
+                # print("Tags:\n {} \n".format(spans['tags']))
+                self.setRedisHash(
+                    "spans:{}:{}".format(
+                        traces['traceID'], spans['spanID']), span)
+                tag = {}
+                for tags in spans['tags']:
+                    print("Tag key: {}, value: {}".format(
+                        tags['key'], tags['value']))
+                    tag[tags['key']] = tags['value']
+                self.setRedisHash("tags:{}:{}".format(
+                    spans['spanID'], traces['traceID']), tag)
+
+    def monitorTraces(self, sample_interval, service='istio-ingress'):
+        loop = True
+        while loop:
+            try:
+                t = self.getTraces(service, 10)
+                num_traces = self.numTraces(t)
+                print("Number of traces: " + num_traces)
+                self.outTraces(t)
+                time.sleep(sample_interval)
+            except KeyboardInterrupt:
+                print("Test Start: {}".format(self.test_start_time))
+                loop = False
+
+    def main(self):
+        self.monitorTraces(1)
+
+
+if __name__ == '__main__':
+    Tracing(TRACING_IP, TRACING_PORT).main()
diff --git a/clover/tracing/tracing_sample.py b/clover/tracing/tracing_sample.py
new file mode 100644
index 0000000..f0234bf
--- /dev/null
+++ b/clover/tracing/tracing_sample.py
@@ -0,0 +1,47 @@
+# Copyright (c) Authors of Clover
+#
+# All rights reserved. This program and the accompanying materials
+# are made available under the terms of the Apache License, Version 2.0
+# which accompanies this distribution, and is available at
+# http://www.apache.org/licenses/LICENSE-2.0
+
+import uuid
+import time
+from tracing import Tracing
+
+t = Tracing('localhost', '30888')
+
+# Get top-level services stored in tracing
+services = t.getServices()
+print(services)
+
+# Get traces from the last hour for the istio-ingress service
+service = 'istio-ingress'
+traces = t.getTraces(service, 3600)
+# Get process names for the first trace service
+t.outProcesses(traces)
+
+# Turn off the redis tracing store and output basic trace info
+t.use_redis = False
+t.outTraces(traces)
+
+# Set up a basic test and store it in redis
+t.use_redis = True
+t.setTest(uuid.uuid4())
+time.sleep(20)
+# Get all traces from test start time when time_back=0
+traces = t.getTraces(service, 0)
+# Store traces in redis
+t.outTraces(traces)
+
+# Get the test id for some number of tests back
+testid = t.getRedisTestid('0')
+print(testid)
+traceids = t.getRedisTraceids(testid)
+print(traceids)
+
+# Print span and tag info for all traces in the test
+# Will continue to consider what to extract from hashes for e2e validation
+t.getRedisTestAll(testid)
+
+# t.monitorTraces(1)
diff --git a/clover/tracing/validate.py b/clover/tracing/validate.py
new file mode 100644
index 0000000..eed6f9a
--- /dev/null
+++ b/clover/tracing/validate.py
@@ -0,0 +1,66 @@
+# Copyright (c) Authors of Clover
+#
+# All rights reserved. This program and the accompanying materials
+# are made available under the terms of the Apache License, Version 2.0
+# which accompanies this distribution, and is available at
+# http://www.apache.org/licenses/LICENSE-2.0
+
+from tracing import Tracing
+from kubernetes import client, config
+
+
+JAEGER_IP = "localhost"
+# JAEGER_IP = "1.1.1.1"
+JAEGER_PORT = "30888"
+JAEGER_DEPLOYMENT = "jaeger-deployment"
+ISTIO_NAMESPACE = "istio-system"
+ISTIO_SERVICES = ["istio-ingress", "istio-mixer"]
+
+
+def validateDeploy():
+    config.load_kube_config()
+    v1 = client.AppsV1Api()
+
+    deployments = []
+    namespaces = []
+    validate = False
+    ret = v1.list_deployment_for_all_namespaces(watch=False)
+    for i in ret.items:
+        deployments.append(i.metadata.name)
+        namespaces.append(i.metadata.namespace)
+    if JAEGER_DEPLOYMENT in deployments:
+        d_index = deployments.index(JAEGER_DEPLOYMENT)
+        if namespaces[d_index] == ISTIO_NAMESPACE:
+            print("Deployment: {} present in {} namespace".format(
+                JAEGER_DEPLOYMENT, ISTIO_NAMESPACE))
+            validate = True
+    return validate
+
+
+# Services will only be present in Jaeger when traffic passes through Istio.
+# Requires a deployment in the Istio service mesh with some traffic
+# targeting its nodes.
+def validateServices():
+    t = Tracing(JAEGER_IP, JAEGER_PORT)
+    services = t.getServices()
+    validate = True
+    if services:
+        for s in ISTIO_SERVICES:
+            if s in services:
+                print("Service in tracing: {} present".format(s))
+            else:
+                validate = False
+    else:
+        validate = False
+    return validate
+
+
+def main():
+    if validateDeploy() and validateServices():
+        print("Jaeger tracing validation has passed")
+        return True
+    else:
+        print("Jaeger tracing validation has failed")
+        return False
+
+
+if __name__ == '__main__':
+    main()
diff --git a/docs/logging.rst b/docs/logging.rst
new file mode 100644
index 0000000..196ba40
--- /dev/null
+++ b/docs/logging.rst
@@ -0,0 +1,28 @@
+#######
+Logging
+#######
+
+************
+Installation
+************
+
+Currently, we use the `sample configuration`_ in Istio to install fluentd::
+
+    cd clover/logging
+    kubectl apply -f install
+
+.. _sample configuration: https://istio.io/docs/tasks/telemetry/fluentd.html
+
+********
+Validate
+********
+
+The script in ``clover/logging`` validates the fluentd installation::
+
+    python clover/logging/validate.py
+
+It validates the installation with the following criteria:
+
+#. existence of the fluentd pod
+#. fluentd input is configured correctly
+#. TBD
diff --git a/docs/monitoring.rst b/docs/monitoring.rst
new file mode 100644
index 0000000..44b01e3
--- /dev/null
+++ b/docs/monitoring.rst
@@ -0,0 +1,31 @@
+##########
+Monitoring
+##########
+
+************
+Installation
+************
+
+Currently, we use the Istio built-in Prometheus addon to install Prometheus::
+
+    cd <istio-release-path>
+    kubectl apply -f install/kubernetes/addons/prometheus.yaml
+
+********
+Validate
+********
+
+Set up port-forwarding for Prometheus by executing the following command::
+
+    kubectl -n istio-system port-forward $(kubectl -n istio-system get pod -l app=prometheus -o jsonpath='{.items[0].metadata.name}') 9090:9090 &
+
+Run the script in ``clover/monitoring`` to validate the Prometheus
+installation::
+
+    python clover/monitoring/validate.py
+
+It validates the installation with the following criteria:
+
+#. [DONE] prometheus pod is in Running state
+#. [DONE] prometheus is connected to monitoring targets
+#. [TODO] test collecting telemetry data from istio
+#. [TODO] TBD
diff --git a/docs/tracing.rst b/docs/tracing.rst
new file mode 100644
index 0000000..79d686c
--- /dev/null
+++ b/docs/tracing.rst
@@ -0,0 +1,44 @@
+#######
+Tracing
+#######
+
+************
+Installation
+************
+
+Currently, we use the Jaeger tracing all-in-one Kubernetes template for
+development and testing, which uses in-memory storage. It can be deployed
+to the istio-system namespace with the following command::
+
+    kubectl apply -n istio-system -f https://raw.githubusercontent.com/jaegertracing/jaeger-kubernetes/master/all-in-one/jaeger-all-in-one-template.yml
+
+The standard Jaeger REST port is 16686. To make this service available
+outside of the Kubernetes cluster, use the following command::
+
+    kubectl expose -n istio-system deployment jaeger-deployment --port=16686 --type=NodePort
+
+Kubernetes will expose the Jaeger service on another port, which can be
+found with::
+
+    kubectl get svc -n istio-system
+
+An example listing from the command above is shown below, where the Jaeger
+service is exposed externally on port 30888::
+
+    istio-system   jaeger-deployment   NodePort   10.104.113.94   <none>   16686:30888/TCP
+
+Jaeger will be accessible using the host IP of the Kubernetes cluster and
+the port provided.
+
+********
+Validate
+********
+
+The script in ``clover/tracing`` validates the Jaeger installation::
+
+    python clover/tracing/validate.py
+
+It validates the installation with the following criteria:
+
+#. Existence of the Jaeger all-in-one deployment using Kubernetes
+#. Jaeger service is accessible using the IP address and port configured
+   in the installation steps
+#. Jaeger can retrieve the default service listing for default Istio
+   components
+#. TBD - consider installation of a production setup with Cassandra or
+   Elasticsearch
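+
+The ``Tracing`` class used by the validation script can also be driven
+directly; ``clover/tracing/tracing_sample.py`` walks through the available
+calls. A minimal sketch (run from ``clover/tracing``; the host IP and
+NodePort are placeholders for the values found in the steps above)::
+
+    from tracing import Tracing
+
+    t = Tracing('<k8s-host-ip>', '30888')
+    print(t.getServices())               # service names known to Jaeger
+    traces = t.getTraces('istio-ingress', 3600)
+    print(t.numTraces(traces))           # trace count for the last hour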