 Pipfile                                    |  15 +
 Pipfile.lock                               | 182 +
 clover/logging/install/fluentd-istio.yaml  |  40 +
 clover/logging/install/logging-stack.yaml  | 205 +
 clover/logging/validate.py                 |  56 +
 clover/monitoring/monitoring.py            | 140 +
 clover/monitoring/validate.py              |  70 +
 clover/tracing/tracing.py                  | 201 +
 clover/tracing/tracing_sample.py           |  47 +
 clover/tracing/validate.py                 |  66 +
 docs/logging.rst                           |  28 +
 docs/monitoring.rst                        |  31 +
 docs/tracing.rst                           |  44 +
 13 files changed, 1125 insertions(+), 0 deletions(-)
diff --git a/Pipfile b/Pipfile
new file mode 100644
index 0000000..7bac042
--- /dev/null
+++ b/Pipfile
@@ -0,0 +1,15 @@
+[[source]]
+
+url = "https://pypi.python.org/simple"
+verify_ssl = true
+name = "pypi"
+
+
+[dev-packages]
+
+
+
+[packages]
+
+kubernetes = "*"
+sh = "*"
diff --git a/Pipfile.lock b/Pipfile.lock
new file mode 100644
index 0000000..d840d68
--- /dev/null
+++ b/Pipfile.lock
@@ -0,0 +1,182 @@
+{
+ "_meta": {
+ "hash": {
+ "sha256": "3e773b6eb42a9dba3e5cb71bcac1c832939ab3b069641084d9f5ecd0967ce7cf"
+ },
+ "pipfile-spec": 6,
+ "requires": {},
+ "sources": [
+ {
+ "name": "pypi",
+ "url": "https://pypi.python.org/simple",
+ "verify_ssl": true
+ }
+ ]
+ },
+ "default": {
+ "cachetools": {
+ "hashes": [
+ "sha256:4319bbb78172e7bcf99423e1ecd6914b32336ccfe97d2058ffe62e641a7f3abe",
+ "sha256:ede01f2d3cbd6ddc9e35e16c2b0ce011d8bb70ce0dbaf282f5b4df24b213bc5d"
+ ],
+ "version": "==2.0.1"
+ },
+ "certifi": {
+ "hashes": [
+ "sha256:14131608ad2fd56836d33a71ee60fa1c82bc9d2c8d98b7bdbc631fe1b3cd1296",
+ "sha256:edbc3f203427eef571f79a7692bb160a2b0f7ccaa31953e99bd17e307cf63f7d"
+ ],
+ "version": "==2018.1.18"
+ },
+ "chardet": {
+ "hashes": [
+ "sha256:84ab92ed1c4d4f16916e05906b6b75a6c0fb5db821cc65e70cbd64a3e2a5eaae",
+ "sha256:fc323ffcaeaed0e0a02bf4d117757b98aed530d9ed4531e3e15460124c106691"
+ ],
+ "version": "==3.0.4"
+ },
+ "google-auth": {
+ "hashes": [
+ "sha256:34088434cb2a2409360b8f3cbc04195a465df1fb2aafad71ebbded77cbf08803",
+ "sha256:9051802d3dae256036cca9e34633a32c0ed1427730d4ebc513dff91ec8b6dd45"
+ ],
+ "version": "==1.4.1"
+ },
+ "idna": {
+ "hashes": [
+ "sha256:2c6a5de3089009e3da7c5dde64a141dbc8551d5b7f6cf4ed7c2568d0cc520a8f",
+ "sha256:8c7309c718f94b3a625cb648ace320157ad16ff131ae0af362c9f21b80ef6ec4"
+ ],
+ "version": "==2.6"
+ },
+ "ipaddress": {
+ "hashes": [
+ "sha256:200d8686011d470b5e4de207d803445deee427455cd0cb7c982b68cf82524f81"
+ ],
+ "version": "==1.0.19"
+ },
+ "kubernetes": {
+ "hashes": [
+ "sha256:2f1a05a9bb2549d6afb6d138b2767d61d8aeb735a7a12bf554440524205e2894",
+ "sha256:f81f145882471a1dd9d23360e99bd77027f07744729ef2728af4af7130cd19fd"
+ ],
+ "version": "==5.0.0"
+ },
+ "oauthlib": {
+ "hashes": [
+ "sha256:ce57b501e906ff4f614e71c36a3ab9eacbb96d35c24d1970d2539bbc3ec70ce1"
+ ],
+ "version": "==2.0.6"
+ },
+ "pyasn1": {
+ "hashes": [
+ "sha256:0d7f6e959fe53f3960a23d73f35e1fce61348b30915b6664309ca756de7c1f89",
+ "sha256:5a0db897b311d265cde49615cf783f1c78613138605cdd0f907ecfa5b2aba3ee",
+ "sha256:758cb50abddc03e4563fd9e7f03db56e3e87b58c0bd01247360326e5c0c7ffa5",
+ "sha256:7d626683e3d792cccc608da02498aff37ab4f3dafd8905d6bf755d11f9b26b43",
+ "sha256:a7efe807c4b83a859e2735c692b92ed7b567cfddc4163763412920041d876c2b",
+ "sha256:b5a9ca48055b9a20f6d1b3d68e38692e5431c86a0f99ea602e61294e891fee5b",
+ "sha256:c07d6e587b2f928366b1f67c09bda026a3e6fcc99e80a744dc67f8fca3895626",
+ "sha256:d258b0a71994f7770599835249cece1caef3c70def868c4915e6e5ca49b67d15",
+ "sha256:d5cd6ed995dba16fad0c521cfe31cd2d68400b53fcc2bce93326829be73ab6d1",
+ "sha256:d84c2aea3cf43780e9e6a19f4e4dddee9f6976519020e64e47c57e5c7a8c3dd2",
+ "sha256:e85895087905c65b5b594eb91f7522664c85545b147d5f4d4e7b1b07da8dcbdc",
+ "sha256:f81c96761fca60d64b1c9b79ec2e40cf9495a745cf570613079ef324aeb9672b"
+ ],
+ "version": "==0.4.2"
+ },
+ "pyasn1-modules": {
+ "hashes": [
+ "sha256:041e9fbafac548d095f5b6c3b328b80792f006196e15a232b731a83c93d59493",
+ "sha256:0cdca76a68dcb701fff58c397de0ef9922b472b1cb3ea9695ca19d03f1869787",
+ "sha256:0cea139045c38f84abaa803bcb4b5e8775ea12a42af10019d942f227acc426c3",
+ "sha256:0f2e50d20bc670be170966638fa0ae603f0bc9ed6ebe8e97a6d1d4cef30cc889",
+ "sha256:47fb6757ab78fe966e7c58b2030b546854f78416d653163f0ce9290cf2278e8b",
+ "sha256:598a6004ec26a8ab40a39ea955068cf2a3949ad9c0030da970f2e1ca4c9f1cc9",
+ "sha256:72fd8b0c11191da088147c6e4678ec53e573923ecf60b57eeac9e97433e09fc2",
+ "sha256:854700bbdd01394e2ada9c1bfbd0ed9f5d0c551350dbbd023e88b11d2771ae06",
+ "sha256:af00ea8f2022b6287dc375b2c70f31ab5af83989fc6fe9eacd4976ce26cd7ccc",
+ "sha256:b1f395cae2d669e0830cb023aa86f9f283b7a9aa32317d7f80d8e78aa2745812",
+ "sha256:c6747146e95d2b14cc2a8399b2b0bde3f93778f8f9ec704690d2b589c376c137",
+ "sha256:f53fe5bcebdf318f51399b250fe8325ef3a26d927f012cc0c8e0f9e9af7f9deb"
+ ],
+ "version": "==0.2.1"
+ },
+ "python-dateutil": {
+ "hashes": [
+ "sha256:07009062406cffd554a9b4135cd2ff167c9bf6b7aac61fe946c93e69fad1bbd8",
+ "sha256:8f95bb7e6edbb2456a51a1fb58c8dca942024b4f5844cae62c90aa88afe6e300"
+ ],
+ "version": "==2.7.0"
+ },
+ "pyyaml": {
+ "hashes": [
+ "sha256:0c507b7f74b3d2dd4d1322ec8a94794927305ab4cebbe89cc47fe5e81541e6e8",
+ "sha256:16b20e970597e051997d90dc2cddc713a2876c47e3d92d59ee198700c5427736",
+ "sha256:3262c96a1ca437e7e4763e2843746588a965426550f3797a79fca9c6199c431f",
+ "sha256:326420cbb492172dec84b0f65c80942de6cedb5233c413dd824483989c000608",
+ "sha256:4474f8ea030b5127225b8894d626bb66c01cda098d47a2b0d3429b6700af9fd8",
+ "sha256:592766c6303207a20efc445587778322d7f73b161bd994f227adaa341ba212ab",
+ "sha256:5ac82e411044fb129bae5cfbeb3ba626acb2af31a8d17d175004b70862a741a7",
+ "sha256:5f84523c076ad14ff5e6c037fe1c89a7f73a3e04cf0377cb4d017014976433f3",
+ "sha256:827dc04b8fa7d07c44de11fabbc888e627fa8293b695e0f99cb544fdfa1bf0d1",
+ "sha256:b4c423ab23291d3945ac61346feeb9a0dc4184999ede5e7c43e1ffb975130ae6",
+ "sha256:bc6bced57f826ca7cb5125a10b23fd0f2fff3b7c4701d64c439a300ce665fff8",
+ "sha256:c01b880ec30b5a6e6aa67b09a2fe3fb30473008c85cd6a67359a1b15ed6d83a4",
+ "sha256:ca233c64c6e40eaa6c66ef97058cdc80e8d0157a443655baa1b2966e812807ca",
+ "sha256:e863072cdf4c72eebf179342c94e6989c67185842d9997960b3e69290b2fa269"
+ ],
+ "version": "==3.12"
+ },
+ "requests": {
+ "hashes": [
+ "sha256:6a1b267aa90cac58ac3a765d067950e7dbbf75b1da07e895d1f594193a40a38b",
+ "sha256:9c443e7324ba5b85070c4a818ade28bfabedf16ea10206da1132edaa6dda237e"
+ ],
+ "version": "==2.18.4"
+ },
+ "requests-oauthlib": {
+ "hashes": [
+ "sha256:50a8ae2ce8273e384895972b56193c7409601a66d4975774c60c2aed869639ca",
+ "sha256:883ac416757eada6d3d07054ec7092ac21c7f35cb1d2cf82faf205637081f468"
+ ],
+ "version": "==0.8.0"
+ },
+ "rsa": {
+ "hashes": [
+ "sha256:25df4e10c263fb88b5ace923dd84bf9aa7f5019687b5e55382ffcdb8bede9db5",
+ "sha256:43f682fea81c452c98d09fc316aae12de6d30c4b5c84226642cf8f8fd1c93abd"
+ ],
+ "version": "==3.4.2"
+ },
+ "sh": {
+ "hashes": [
+ "sha256:ae3258c5249493cebe73cb4e18253a41ed69262484bad36fdb3efcb8ad8870bb",
+ "sha256:b52bf5833ed01c7b5c5fb73a7f71b3d98d48e9b9b8764236237bdc7ecae850fc"
+ ],
+ "version": "==1.12.14"
+ },
+ "six": {
+ "hashes": [
+ "sha256:70e8a77beed4562e7f14fe23a786b54f6296e34344c23bc42f07b15018ff98e9",
+ "sha256:832dc0e10feb1aa2c68dcc57dbb658f1c7e65b9b61af69048abc87a2db00a0eb"
+ ],
+ "version": "==1.11.0"
+ },
+ "urllib3": {
+ "hashes": [
+ "sha256:06330f386d6e4b195fbfc736b297f58c5a892e4440e54d294d7004e3a9bbea1b",
+ "sha256:cc44da8e1145637334317feebd728bd869a35285b93cbb4cca2577da7e62db4f"
+ ],
+ "version": "==1.22"
+ },
+ "websocket-client": {
+ "hashes": [
+ "sha256:188b68b14fdb2d8eb1a111f21b9ffd2dbf1dbc4e4c1d28cf2c37cdbf1dd1cae6",
+ "sha256:a453dc4dfa6e0db3d8fd7738a308a88effe6240c59f3226eb93e8f020c216149"
+ ],
+ "version": "==0.47.0"
+ }
+ },
+ "develop": {}
+}
diff --git a/clover/logging/install/fluentd-istio.yaml b/clover/logging/install/fluentd-istio.yaml
new file mode 100644
index 0000000..1853831
--- /dev/null
+++ b/clover/logging/install/fluentd-istio.yaml
@@ -0,0 +1,40 @@
+# Configuration for logentry instances
+apiVersion: "config.istio.io/v1alpha2"
+kind: logentry
+metadata:
+ name: newlog
+ namespace: istio-system
+spec:
+ severity: '"info"'
+ timestamp: request.time
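+  # "|" below is the Mixer expression fallback operator: the first attribute
+  # that is defined supplies the value, else the trailing literal default is used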
+ variables:
+ source: source.labels["app"] | source.service | "unknown"
+ user: source.user | "unknown"
+ destination: destination.labels["app"] | destination.service | "unknown"
+ responseCode: response.code | 0
+ responseSize: response.size | 0
+ latency: response.duration | "0ms"
+ monitored_resource_type: '"UNSPECIFIED"'
+---
+# Configuration for a fluentd handler
+apiVersion: "config.istio.io/v1alpha2"
+kind: fluentd
+metadata:
+ name: handler
+ namespace: istio-system
+spec:
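+  # <service>.<namespace> DNS shorthand; targets the fluentd-es Service
+  # (TCP/UDP 24224) defined in logging-stack.yaml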
+ address: "fluentd-es.logging:24224"
+---
+# Rule to send logentry instances to the fluentd handler
+apiVersion: "config.istio.io/v1alpha2"
+kind: rule
+metadata:
+ name: newlogtofluentd
+ namespace: istio-system
+spec:
+ match: "true" # match for all requests
+ actions:
+ - handler: handler.fluentd
+ instances:
+ - newlog.logentry
+---
diff --git a/clover/logging/install/logging-stack.yaml b/clover/logging/install/logging-stack.yaml
new file mode 100644
index 0000000..9542496
--- /dev/null
+++ b/clover/logging/install/logging-stack.yaml
@@ -0,0 +1,205 @@
+# Logging Namespace. All below are a part of this namespace.
+apiVersion: v1
+kind: Namespace
+metadata:
+ name: logging
+---
+# Elasticsearch Service
+apiVersion: v1
+kind: Service
+metadata:
+ name: elasticsearch
+ namespace: logging
+ labels:
+ app: elasticsearch
+spec:
+ ports:
+ - port: 9200
+ protocol: TCP
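+    # "db" resolves to the named containerPort (9200) in the Deployment below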
+ targetPort: db
+ selector:
+ app: elasticsearch
+ type: NodePort
+---
+# Elasticsearch Deployment
+apiVersion: extensions/v1beta1
+kind: Deployment
+metadata:
+ name: elasticsearch
+ namespace: logging
+ labels:
+ app: elasticsearch
+ annotations:
+ sidecar.istio.io/inject: "false"
+spec:
+ template:
+ metadata:
+ labels:
+ app: elasticsearch
+ spec:
+ containers:
+ - image: docker.elastic.co/elasticsearch/elasticsearch-oss:6.1.1
+ name: elasticsearch
+ resources:
+ # need more cpu upon initialization, therefore burstable class
+ limits:
+ cpu: 1000m
+ requests:
+ cpu: 100m
+ env:
+ - name: discovery.type
+ value: single-node
+ ports:
+ - containerPort: 9200
+ name: db
+ protocol: TCP
+ - containerPort: 9300
+ name: transport
+ protocol: TCP
+ volumeMounts:
+ - name: elasticsearch
+ mountPath: /data
+ volumes:
+ - name: elasticsearch
+ emptyDir: {}
+---
+# Fluentd Service
+apiVersion: v1
+kind: Service
+metadata:
+ name: fluentd-es
+ namespace: logging
+ labels:
+ app: fluentd-es
+spec:
+ ports:
+ - name: fluentd-tcp
+ port: 24224
+ protocol: TCP
+ targetPort: 24224
+ - name: fluentd-udp
+ port: 24224
+ protocol: UDP
+ targetPort: 24224
+ selector:
+ app: fluentd-es
+---
+# Fluentd Deployment
+apiVersion: extensions/v1beta1
+kind: Deployment
+metadata:
+ name: fluentd-es
+ namespace: logging
+ labels:
+ app: fluentd-es
+ annotations:
+ sidecar.istio.io/inject: "false"
+spec:
+ template:
+ metadata:
+ labels:
+ app: fluentd-es
+ spec:
+ containers:
+ - name: fluentd-es
+ image: gcr.io/google-containers/fluentd-elasticsearch:v2.0.1
+ env:
+ - name: FLUENTD_ARGS
+ value: --no-supervisor -q
+ resources:
+ limits:
+ memory: 500Mi
+ requests:
+ cpu: 100m
+ memory: 200Mi
+ volumeMounts:
+ - name: config-volume
+ mountPath: /etc/fluent/config.d
+ terminationGracePeriodSeconds: 30
+ volumes:
+ - name: config-volume
+ configMap:
+ name: fluentd-es-config
+---
+# Fluentd ConfigMap, contains config files.
+kind: ConfigMap
+apiVersion: v1
+data:
+ forward.input.conf: |-
+ # Takes the messages sent over TCP
+ <source>
+ type forward
+ </source>
+ output.conf: |-
+ <match **>
+ type elasticsearch
+ log_level info
+ include_tag_key true
+ host elasticsearch
+ port 9200
+ logstash_format true
+ # Set the chunk limits.
+ buffer_chunk_limit 2M
+ buffer_queue_limit 8
+ flush_interval 5s
+ # Never wait longer than 5 minutes between retries.
+ max_retry_wait 30
+ # Disable the limit on the number of retries (retry forever).
+ disable_retry_limit
+ # Use multiple threads for processing.
+ num_threads 2
+ </match>
+metadata:
+ name: fluentd-es-config
+ namespace: logging
+---
+# Kibana Service
+apiVersion: v1
+kind: Service
+metadata:
+ name: kibana
+ namespace: logging
+ labels:
+ app: kibana
+spec:
+ ports:
+ - port: 5601
+ protocol: TCP
+ targetPort: ui
+ selector:
+ app: kibana
+ type: NodePort
+---
+# Kibana Deployment
+apiVersion: extensions/v1beta1
+kind: Deployment
+metadata:
+ name: kibana
+ namespace: logging
+ labels:
+ app: kibana
+ annotations:
+ sidecar.istio.io/inject: "false"
+spec:
+ template:
+ metadata:
+ labels:
+ app: kibana
+ spec:
+ containers:
+ - name: kibana
+ image: docker.elastic.co/kibana/kibana-oss:6.1.1
+ resources:
+ # need more cpu upon initialization, therefore burstable class
+ limits:
+ cpu: 1000m
+ requests:
+ cpu: 100m
+ env:
+ - name: ELASTICSEARCH_URL
+ value: http://elasticsearch:9200
+ ports:
+ - containerPort: 5601
+ name: ui
+ protocol: TCP
+---
diff --git a/clover/logging/validate.py b/clover/logging/validate.py
new file mode 100644
index 0000000..821f912
--- /dev/null
+++ b/clover/logging/validate.py
@@ -0,0 +1,56 @@
+# Copyright (c) Authors of Clover
+#
+# All rights reserved. This program and the accompanying materials
+# are made available under the terms of the Apache License, Version 2.0
+# which accompanies this distribution, and is available at
+# http://www.apache.org/licenses/LICENSE-2.0
+
+from kubernetes import client, config
+from kubernetes.stream import stream
+import sh
+import re
+
+FLUENTD_NAMESPACE = 'logging'
+FLUENTD_PATTERN = 'fluentd-.*'
+FLUENTD_LABELS = 'app=fluentd-es'
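+# expected <source> block from the forward.input.conf key of the
+# fluentd-es-config ConfigMap in install/logging-stack.yaml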
+FLUENTD_INPUT = """<source>
+ type forward
+</source>"""
+
+def main():
+ # Load config from default location.
+ config.load_kube_config()
+
+ v1 = client.CoreV1Api()
+
+ fluentd_pod_name = None
+
+ # find by name
+ print("Find fluentd pod by name '{}'".format(FLUENTD_PATTERN))
+ fluentd_regex = re.compile(FLUENTD_PATTERN)
+ resp = v1.list_namespaced_pod(FLUENTD_NAMESPACE)
+ for i in resp.items:
+ if fluentd_regex.search(i.metadata.name) is not None:
+ print(i.metadata.name)
+
+ # find by label selector
+ print("Find fluentd pod by label selector '{}'".format(FLUENTD_LABELS))
+ resp = v1.list_namespaced_pod(FLUENTD_NAMESPACE, label_selector=FLUENTD_LABELS)
+ for i in resp.items:
+ print(i.metadata.name)
+ fluentd_pod_name = i.metadata.name
+
+ # check fluentd configuration
+    # NOTE: exec via the Python client library does not work well; use a shell command as a workaround
+ # See https://github.com/kubernetes-client/python/issues/485
+ result = sh.kubectl((
+ 'exec -n logging ' +
+ fluentd_pod_name +
+ ' cat /etc/fluent/config.d/forward.input.conf').split())
+ if FLUENTD_INPUT in result:
+ print("fluentd input configured correctly")
+ else:
+ print("fluentd input not configured\n{}".format(FLUENTD_INPUT))
+
+if __name__ == '__main__':
+ main()
diff --git a/clover/monitoring/monitoring.py b/clover/monitoring/monitoring.py
new file mode 100644
index 0000000..9726fd1
--- /dev/null
+++ b/clover/monitoring/monitoring.py
@@ -0,0 +1,140 @@
+# Copyright (c) Authors of Clover
+#
+# All rights reserved. This program and the accompanying materials
+# are made available under the terms of the Apache License, Version 2.0
+# which accompanies this distribution, and is available at
+# http://www.apache.org/licenses/LICENSE-2.0
+
+from datetime import timedelta
+import pprint
+import requests
+import time
+
+PROMETHEUS_URL = "http://127.0.0.1:9090"
+
+
+class Monitoring(object):
+ PROMETHEUS_HEALTH_UP = "up"
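+    # scrape job names configured by the Istio Prometheus addon; all of these
+    # are expected to be present and reporting "up"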
+ PROMETHEUS_ISTIO_TARGETS = {"envoy",
+ "istio-mesh",
+ "kubernetes-apiservers",
+ "kubernetes-cadvisor",
+ "kubernetes-nodes",
+ "kubernetes-service-endpoints",
+ "mixer",
+ "pilot"}
+ PROMETHEUS_API_TARGETS = "/api/v1/targets"
+ PROMETHEUS_API_QUERY = "/api/v1/query"
+ PROMETHEUS_API_QUERY_RANGE = "/api/v1/query_range"
+
+ def __init__(self, host):
+ self.host = host
+
+ def get_targets(self):
+ try:
+ # Reference api: https://prometheus.io/docs/prometheus/latest/querying/api/#targets
+ response = requests.get('%s%s' % (self.host, Monitoring.PROMETHEUS_API_TARGETS))
+ if response.status_code != 200:
+ print("ERROR: get targets status code: %r" % response.status_code)
+ return False
+ except Exception as e:
+ print("ERROR: Cannot connect to prometheus\n%s" % e)
+ return False
+
+ return response.json()
+
+ def is_targets_healthy(self):
+ targets = set()
+
+ raw_targets = self.get_targets()
+        if raw_targets is False:
+ return False
+
+ for target in raw_targets["data"]["activeTargets"]:
+ if target["health"] != Monitoring.PROMETHEUS_HEALTH_UP:
+ print("ERROR: target unhealth job: %s, health: %s" % \
+ (target["labels"]["job"], target["health"]))
+ return False
+ targets.add(target["labels"]["job"])
+
+ diff = Monitoring.PROMETHEUS_ISTIO_TARGETS - targets
+ if len(diff):
+ print("ERROR: targets %r not found!" % diff)
+ return False
+
+ return True
+
+ # Reference links:
+ # - https://prometheus.io/docs/prometheus/latest/querying/api/#instant-queries
+ # - https://prometheus.io/docs/prometheus/latest/querying/api/#range-queries
+ # - https://github.com/prometheus/prombench/blob/master/apps/load-generator/main.py
+ def query(self, query_params):
+ try:
+ start = time.time()
+
+ query_type = query_params.get("type", "instant")
+ params = {"query": query_params["query"]}
+ if query_type == "instant":
+ url = "%s%s" % (self.host, Monitoring.PROMETHEUS_API_QUERY)
+ elif query_type == "range":
+ url = "%s%s" % (self.host, Monitoring.PROMETHEUS_API_QUERY_RANGE)
+ params["start"] = start - duration_seconds(query_params.get("start", "0h"))
+ params["end"] = start - duration_seconds(query_params.get("end", "0h"))
+ params["step"] = query_params.get("step", "15s")
+ else:
+ print("ERROR: invalidate query type")
+ return
+
+ resp = requests.get(url, params)
+ dur = time.time() - start
+
+ print("query %s %s, status=%s, size=%d, dur=%.3f" % \
+ (self.host, query_params["query"], resp.status_code, len(resp.text), dur))
+ pp = pprint.PrettyPrinter(indent=2)
+ pp.pprint(resp.json())
+
+ except Exception as e:
+ print("ERROR: Could not query prometheus instance %s. \n %s" % (url, e))
+
+
+def duration_seconds(s):
+ num = int(s[:-1])
+
+ if s.endswith('s'):
+ return timedelta(seconds=num).total_seconds()
+ elif s.endswith('m'):
+ return timedelta(minutes=num).total_seconds()
+ elif s.endswith('h'):
+ return timedelta(hours=num).total_seconds()
+
+ raise "ERROR: unknown duration %s" % s
+
+
+def main():
+ m = Monitoring(PROMETHEUS_URL)
+ if not m.is_targets_healthy():
+ print("ERROR: Prometheus targets is unhealthy!")
+ else:
+ print("Prometheus targets are all healthy!")
+
+ print "\n### query instant"
+ query_params = {
+ "type": "instant",
+ "query": "istio_double_request_count{destination='details.default.svc.cluster.local'}"
+ }
+ m.query(query_params)
+
+ print "\n### query range"
+ query_range_param = {
+ "type": "range",
+ "query": "istio_double_request_count{destination='details.default.svc.cluster.local'}",
+ "start": "5m",
+ "end": "3m",
+ "step": "30s"
+ }
+ m.query(query_range_param)
+
+
+if __name__ == '__main__':
+ main()
+
diff --git a/clover/monitoring/validate.py b/clover/monitoring/validate.py
new file mode 100644
index 0000000..fafe5df
--- /dev/null
+++ b/clover/monitoring/validate.py
@@ -0,0 +1,70 @@
+# Copyright (c) Authors of Clover
+#
+# All rights reserved. This program and the accompanying materials
+# are made available under the terms of the Apache License, Version 2.0
+# which accompanies this distribution, and is available at
+# http://www.apache.org/licenses/LICENSE-2.0
+
+from monitoring import Monitoring
+from kubernetes import client, config
+
+PROMETHEUS_URL = "http://127.0.0.1:9090"
+PROMETHEUS_DEPLOYMENT = "prometheus"
+PROMETHEUS_LABELS = "app=prometheus"
+ISTIO_NAMESPACE = "istio-system"
+
+
+def validateDeploy():
+ config.load_kube_config()
+ appsv1 = client.AppsV1Api()
+ corev1 = client.CoreV1Api()
+ find_flag = False
+ prom_pod_name = None
+
+    # check prometheus deployment
+ ret = appsv1.list_deployment_for_all_namespaces(watch=False)
+ for i in ret.items:
+ if PROMETHEUS_DEPLOYMENT == i.metadata.name and \
+ ISTIO_NAMESPACE == i.metadata.namespace:
+ find_flag = True
+ break
+    if not find_flag:
+        print("ERROR: Deployment {} is not present in the {} namespace".format(
+ PROMETHEUS_DEPLOYMENT, ISTIO_NAMESPACE))
+ return False
+
+ # find prometheus pod by label selector
+ ret = corev1.list_namespaced_pod(ISTIO_NAMESPACE, label_selector=PROMETHEUS_LABELS)
+ for i in ret.items:
+ prom_pod_name = i.metadata.name
+    if prom_pod_name is None:
+ print("ERROR: prometheus pod not found")
+ return False
+
+ # check prometheus pod status
+ ret = corev1.read_namespaced_pod_status(prom_pod_name, ISTIO_NAMESPACE)
+ if ret.status.phase != "Running":
+ print("ERROR: prometheus pod %s is under %s state" % (prom_pod_name, ret.status.phase))
+ return False
+
+ return True
+
+
+def validateService():
+ m = Monitoring(PROMETHEUS_URL)
+
+ return m.is_targets_healthy()
+
+
+def main():
+ if validateDeploy() and validateService():
+ print"Prometheus monitoring validation has passed"
+ return True
+ else:
+ print"ERROR: Prometheus monitoring validation has failed"
+ return False
+
+
+if __name__ == '__main__':
+ main()
+
diff --git a/clover/tracing/tracing.py b/clover/tracing/tracing.py
new file mode 100644
index 0000000..16b952c
--- /dev/null
+++ b/clover/tracing/tracing.py
@@ -0,0 +1,201 @@
+# Copyright (c) Authors of Clover
+#
+# All rights reserved. This program and the accompanying materials
+# are made available under the terms of the Apache License, Version 2.0
+# which accompanies this distribution, and is available at
+# http://www.apache.org/licenses/LICENSE-2.0
+
+import requests
+import time
+import redis
+
+TRACING_IP = "localhost"
+TRACING_PORT = "30888"
+
+
+class Tracing:
+
+ def __init__(
+ self, tracing_ip, tracing_port, redis_ip='localhost', use_redis=True):
+ self.tracing_ip = tracing_ip
+ self.tracing_port = tracing_port
+ self.testid = '0'
+ self.test_start_time = 0
+ self.use_redis = use_redis
+ if use_redis:
+ try:
+ self.r = redis.StrictRedis(host=redis_ip, port=6379, db=0)
+ except Exception:
+ print("Failed to connect to redis")
+
+ def setRedisSet(self, rkey, rvalue):
+ if self.use_redis:
+ self.r.sadd(rkey, rvalue)
+
+ def setRedisList(self, rkey, rvalue):
+ if self.use_redis:
+ self.r.lpush(rkey, rvalue)
+
+ def setRedisHash(self, rkey, rvalue):
+ if self.use_redis:
+ self.r.hmset(rkey, rvalue)
+
+ def getRedisTestid(self, index):
+ testid = self.r.lrange("testids", index, index)
+ return testid[0]
+
+ def getRedisTraceids(self, testid):
+ rkey = "traceids:" + str(testid)
+ traceids = self.r.smembers(rkey)
+ return traceids
+
+ def getRedisSpanids(self, traceid):
+ rkey = "spanids:" + str(traceid)
+ spanids = self.r.smembers(rkey)
+ return spanids
+
+ def getRedisSpan(self, spanid, traceid):
+ rkey = "spans:" + str(traceid) + ':' + str(spanid)
+ span = self.r.hgetall(rkey)
+ return span
+
+ def getRedisSpanValue(self, spanid, traceid, span_key):
+ rkey = "spans:" + str(traceid) + ':' + str(spanid)
+ span_value = self.r.hget(rkey, span_key)
+ return span_value
+
+ def getRedisTags(self, spanid, traceid):
+ rkey = "tags:" + str(spanid) + ':' + str(traceid)
+ tags = self.r.hgetall(rkey)
+ return tags
+
+ def getRedisTagsValue(self, spanid, traceid, tag_key):
+ rkey = "tags:" + str(spanid) + ':' + str(traceid)
+ tag_value = self.r.hget(rkey, tag_key)
+ return tag_value
+
+ def getRedisTestAll(self, testid):
+ traceids = self.getRedisTraceids(testid)
+ for trace in traceids:
+ spanids = self.getRedisSpanids(trace)
+ for span in spanids:
+ # print(self.getRedisSpan(span, trace))
+ print(self.getRedisSpanValue(span, trace, 'duration'))
+ # print(self.getRedisTags(span, trace))
+ print(self.getRedisTagsValue(span, trace, 'node_id'))
+
+ def setTest(self, testid):
+ self.testid = testid
+ self.setRedisList("testids", testid)
+ self.test_start_time = int(time.time())
+
+ def getServices(self):
+ req_url = 'http://' + self.tracing_ip + ':' + self.tracing_port + \
+ '/api/services'
+ try:
+ response = requests.get(req_url)
+ if response.status_code != 200:
+ print("ERROR: Cannot connect to tracing: {}".format(
+ response.status_code))
+ return False
+ except Exception as e:
+ print("ERROR: Cannot connect to tracing")
+ print(e)
+ return False
+
+ data = response.json()
+ services = data['data']
+ return services
+
+ def getTraces(self, service, time_back=3600, limit='1000'):
+ ref_time = int(time.time())
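+        # Jaeger API timestamps are epoch microseconds; pad_time extends the
+        # epoch seconds to a microsecond-resolution value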
+ pad_time = '757000'
+ end_time = 'end=' + str(ref_time) + pad_time + '&'
+ if time_back == 0:
+ delta = self.test_start_time
+ else:
+ delta = ref_time - time_back
+ start_time = 'start=' + str(delta) + pad_time
+ limit = 'limit=' + limit + '&'
+        lookback = 'lookback=1h&'
+ max_dur = 'maxDuration&'
+ min_dur = 'minDuration&'
+ service = 'service=' + service + '&'
+ url_prefix = 'http://' + self.tracing_ip + ':' + self.tracing_port + \
+ '/api/traces?'
+        req_url = url_prefix + end_time + limit + lookback + max_dur + \
+ min_dur + service + start_time
+
+ try:
+ response = requests.get(req_url)
+ if response.status_code != 200:
+ print("ERROR: Cannot connect to tracing: {}".format(
+ response.status_code))
+ return False
+ except Exception as e:
+ print("ERROR: Cannot connect to tracing")
+ print(e)
+ return False
+
+ traces = response.json()
+ return traces
+
+ def numTraces(self, trace):
+ num_traces = len(trace['data'])
+ return str(num_traces)
+
+ def outProcesses(self, trace):
+ processes = []
+ if trace['data']:
+ first_trace = trace['data'][0]
+ for process in first_trace['processes']:
+ processes.append(process)
+ print(processes)
+ return processes
+
+ def outTraces(self, trace):
+ for traces in trace['data']:
+ print("TraceID: {}".format(traces['traceID']))
+ self.setRedisSet(
+ "traceids:{}".format(str(self.testid)), traces['traceID'])
+ for spans in traces['spans']:
+ print("SpanID: {}".format(spans['spanID']))
+ self.setRedisSet(
+ "spanids:{}".format(traces['traceID']), spans['spanID'])
+ print("Duration: {} usec".format(spans['duration']))
+ span = {}
+ span['spanID'] = spans['spanID']
+ span['duration'] = spans['duration']
+ span['startTime'] = spans['startTime']
+ span['operationName'] = spans['operationName']
+ # print("Tags:\n {} \n".format(spans['tags']))
+ self.setRedisHash(
+ "spans:{}:{}".format(
+ traces['traceID'], spans['spanID']), span)
+ tag = {}
+ for tags in spans['tags']:
+ print("Tag key: {}, value: {}".format(
+ tags['key'], tags['value']))
+ tag[tags['key']] = tags['value']
+ self.setRedisHash("tags:{}:{}".format(
+ spans['spanID'], traces['traceID']), tag)
+
+ def monitorTraces(self, sample_interval, service='istio-ingress'):
+ loop = True
+ while loop:
+ try:
+ t = self.getTraces(service, 10)
+ num_traces = self.numTraces(t)
+ print("Number of traces: " + num_traces)
+ self.outTraces(t)
+ time.sleep(sample_interval)
+ except KeyboardInterrupt:
+ print("Test Start: {}".format(self.test_start_time))
+ loop = False
+
+ def main(self):
+ self.monitorTraces(1)
+
+
+if __name__ == '__main__':
+ Tracing(TRACING_IP, TRACING_PORT).main()
diff --git a/clover/tracing/tracing_sample.py b/clover/tracing/tracing_sample.py
new file mode 100644
index 0000000..f0234bf
--- /dev/null
+++ b/clover/tracing/tracing_sample.py
@@ -0,0 +1,47 @@
+# Copyright (c) Authors of Clover
+#
+# All rights reserved. This program and the accompanying materials
+# are made available under the terms of the Apache License, Version 2.0
+# which accompanies this distribution, and is available at
+# http://www.apache.org/licenses/LICENSE-2.0
+
+import uuid
+import time
+from tracing import Tracing
+
+t = Tracing('localhost', '30888')
+
+# Get toplevel services stored in tracing
+services = t.getServices()
+print(services)
+
+# Get traces from the last hour for istio-ingress service
+service = 'istio-ingress'
+traces = t.getTraces(service, 3600)
+# Get process names for first trace service
+t.outProcesses(traces)
+
+# Turn off redis tracing store and output basic trace info
+t.use_redis = False
+t.outTraces(traces)
+
+# Setup basic test and store in redis
+t.use_redis = True
+t.setTest(uuid.uuid4())
+time.sleep(20)
+# Get all traces from test start time when time_back=0
+traces = t.getTraces(service, 0)
+# Store traces in redis
+t.outTraces(traces)
+
+# Get test id for some number of tests back
+testid = t.getRedisTestid('0')
+print(testid)
+traceids = t.getRedisTraceids(testid)
+print(traceids)
+
+# Print out span and tag info for all traces in test
+# Will continue to consider what to extract from hashes for e2e validation
+t.getRedisTestAll(testid)
+
+# t.monitorTraces(1)
diff --git a/clover/tracing/validate.py b/clover/tracing/validate.py
new file mode 100644
index 0000000..eed6f9a
--- /dev/null
+++ b/clover/tracing/validate.py
@@ -0,0 +1,66 @@
+# Copyright (c) Authors of Clover
+#
+# All rights reserved. This program and the accompanying materials
+# are made available under the terms of the Apache License, Version 2.0
+# which accompanies this distribution, and is available at
+# http://www.apache.org/licenses/LICENSE-2.0
+
+from tracing import Tracing
+from kubernetes import client, config
+
+
+JAEGER_IP = "localhost"
+# JAEGER_IP = "1.1.1.1"
+JAEGER_PORT = "30888"
+JAEGER_DEPLOYMENT = "jaeger-deployment"
+ISTIO_NAMESPACE = "istio-system"
+ISTIO_SERVICES = ["istio-ingress", "istio-mixer"]
+
+
+def validateDeploy():
+ config.load_kube_config()
+ v1 = client.AppsV1Api()
+
+ deployments = []
+ namespaces = []
+ validate = False
+ ret = v1.list_deployment_for_all_namespaces(watch=False)
+ for i in ret.items:
+ deployments.append(i.metadata.name)
+ namespaces.append(i.metadata.namespace)
+ if JAEGER_DEPLOYMENT in deployments:
+ d_index = deployments.index(JAEGER_DEPLOYMENT)
+        if namespaces[d_index] == ISTIO_NAMESPACE:
+ print("Deployment: {} present in {} namespace".format(
+ JAEGER_DEPLOYMENT, ISTIO_NAMESPACE))
+ validate = True
+ return validate
+
+# Services in Jaeger will only be present when traffic passes through Istio
+# Requires a deployment in Istio service mesh with some traffic targeting nodes
+def validateServices():
+ t = Tracing(JAEGER_IP, JAEGER_PORT)
+ services = t.getServices()
+ validate = True
+ if services:
+ for s in ISTIO_SERVICES:
+ if s in services:
+ print("Service in tracing: {} present".format(s))
+ else:
+ validate = False
+ else:
+ validate = False
+ return validate
+
+
+def main():
+ if validateDeploy() and validateServices():
+ print"Jaeger tracing validation has passed"
+ return True
+ else:
+ print"Jaeger tracing validation has failed"
+ return False
+
+
+if __name__ == '__main__':
+ main()
diff --git a/docs/logging.rst b/docs/logging.rst
new file mode 100644
index 0000000..196ba40
--- /dev/null
+++ b/docs/logging.rst
@@ -0,0 +1,28 @@
+#######
+Logging
+#######
+
+************
+Installation
+************
+
+Currently, we use the `sample configuration`_ in Istio to install fluentd::
+
+ cd clover/logging
+ kubectl apply -f install
+
+.. _sample configuration: https://istio.io/docs/tasks/telemetry/fluentd.html
+
+********
+Validate
+********
+
+The script in ``clover/logging`` validates the fluentd installation::
+
+ python clover/logging/validate.py
+
+It validates the installation against the following criteria:
+
+#. existence of the fluentd pod
+#. fluentd input is configured correctly
+#. TBD
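+
+For reference, the pod lookup performed by the script can be reproduced with the
+Kubernetes Python client (a minimal sketch, assuming a default kubeconfig and the
+``logging`` namespace installed above)::
+
+    from kubernetes import client, config
+
+    config.load_kube_config()
+    v1 = client.CoreV1Api()
+    pods = v1.list_namespaced_pod('logging', label_selector='app=fluentd-es')
+    print([pod.metadata.name for pod in pods.items])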
diff --git a/docs/monitoring.rst b/docs/monitoring.rst
new file mode 100644
index 0000000..44b01e3
--- /dev/null
+++ b/docs/monitoring.rst
@@ -0,0 +1,31 @@
+##########
+Monitoring
+##########
+
+************
+Installation
+************
+
+Currently, we use the Istio built-in Prometheus addon to install Prometheus::
+
+ cd <istio-release-path>
+ kubectl apply -f install/kubernetes/addons/prometheus.yaml
+
+********
+Validate
+********
+
+Setup port-forwarding for prometheus by executing the following command::
+
+ kubectl -n istio-system port-forward $(kubectl -n istio-system get pod -l app=prometheus -o jsonpath='{.items[0].metadata.name}') 9090:9090 &
+
+The script in ``clover/monitoring`` validates the Prometheus installation::
+
+ python clover/monitoring/validate.py
+
+It validates the installation against the following criteria:
+
+#. [DONE] prometheus pod is in Running state
+#. [DONE] prometheus is connected to monitoring targets
+#. [TODO] test collecting telemetry data from istio
+#. [TODO] TBD
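+
+For a quick interactive check, the ``Monitoring`` class used by the script can
+also be driven directly (a sketch, assuming the port-forward above is active and
+the repository root is importable on ``PYTHONPATH``)::
+
+    from clover.monitoring.monitoring import Monitoring
+
+    m = Monitoring("http://127.0.0.1:9090")
+    print(m.is_targets_healthy())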
diff --git a/docs/tracing.rst b/docs/tracing.rst
new file mode 100644
index 0000000..79d686c
--- /dev/null
+++ b/docs/tracing.rst
@@ -0,0 +1,44 @@
+#######
+Tracing
+#######
+
+************
+Installation
+************
+
+Currently, we use the Jaeger tracing all-in-one Kubernetes template for development and testing,
+which uses in-memory storage. It can be deployed to the istio-system namespace with the
+following command::
+
+ kubectl apply -n istio-system -f https://raw.githubusercontent.com/jaegertracing/jaeger-kubernetes/master/all-in-one/jaeger-all-in-one-template.yml
+
+The standard Jaeger REST port is 16686. To make this service available outside of the
+Kubernetes cluster, use the following command::
+
+ kubectl expose -n istio-system deployment jaeger-deployment --port=16686 --type=NodePort
+
+Kubernetes will expose the Jaeger service on another port, which can be found with::
+
+ kubectl get svc -n istio-system
+
+An example listing from the command above is shown below where the Jaeger service is exposed
+externally on port 30888::
+
+ istio-system jaeger-deployment NodePort 10.104.113.94 <none> 16686:30888/TCP
+
+Jaeger will be accessible using the host IP of the Kubernetes cluster and port provided.
+
+********
+Validate
+********
+
+The script in ``clover/tracing`` validates the Jaeger installation::
+
+ python clover/tracing/validate.py
+
+It validates the installation with the following criteria:
+
+#. Existence of Jaeger all-in-one deployment using Kubernetes
+#. Jaeger service is accessible using IP address and port configured in installation steps
+#. Jaeger can retrieve default service listing for default Istio components
+#. TBD - consider installation of a production setup with Cassandra or Elasticsearch
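+
+The ``Tracing`` helper behind the script can also be used on its own (a sketch,
+assuming the NodePort from the example listing above and the repository root on
+``PYTHONPATH``)::
+
+    from clover.tracing.tracing import Tracing
+
+    t = Tracing("localhost", "30888")
+    print(t.getServices())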