# Logs and Metrics Analysis Notebook

#### Used to capture anomalies in the logs and to analyse / visualize the metrics around the time of each anomaly

##### Contributors:

- Adarsh Yadav 
 
 Log Analysis and Anomaly Finding
 



- Aditya Srivastava 
 
 Metrics Analysis and Visualization

### Metrics Analysis and Visualization

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import numpy as np

import datetime
import time
import requests

from pprint import pprint
import json
from datetime import datetime, timedelta

from elasticsearch import Elasticsearch
from elasticsearch_dsl import Search
from elasticsearch.connection import create_ssl_context
import ssl
import urllib3

In [None]:
# Base URL of the Prometheus server queried by the helpers below.
# Do not change unless sure.
PROMETHEUS = "http://10.10.120.211:30902/"

## Helper Functions

In [None]:
#function to make DF out of query json

def convert_to_df(res_json):
    """Build a DataFrame from the JSON of a Prometheus instant query.

    Each entry of ``res_json['data']['result']`` becomes one row holding the
    entry's ``metric`` labels plus a ``value`` column taken from the last
    element of the entry's ``value`` pair.

    Args:
        res_json: Decoded query response (dict) with a ``data.result`` list.

    Returns:
        pandas.DataFrame with one row per result entry, or an empty
        DataFrame when the result list is empty.
    """
    data_list = res_json['data']['result']
    if not data_list:
        return pd.DataFrame()

    # Build every row first and create the frame once: DataFrame.append no
    # longer exists in pandas 2.x. A fresh dict per row also keeps the
    # caller's response untouched.
    rows = [
        {**series['metric'], 'value': series['value'][-1]}
        for series in data_list
    ]
    return pd.DataFrame(rows)

def convert_to_df_range(res_json):
    """Build a DataFrame from the JSON of a Prometheus range query.

    Every ``[timestamp, value]`` pair of every entry in
    ``res_json['data']['result']`` becomes one row holding the entry's
    ``metric`` labels plus ``timestamp`` and ``value`` columns.

    Args:
        res_json: Decoded query response (dict) with a ``data.result`` list
            whose entries carry ``metric`` and ``values``.

    Returns:
        pandas.DataFrame with one row per sample, or an empty DataFrame when
        there are no results.
    """
    data_list = res_json['data']['result']
    if not data_list:
        return pd.DataFrame()

    rows = []
    for series in data_list:
        labels = series['metric']
        # np.array is kept on purpose: when a pair mixes a number with a
        # string, numpy coerces both to strings, and the analysis functions
        # compare 'value' against string literals such as '100'.
        for sample_time, sample_value in np.array(series['values']):
            # New dict per row so the response's label dicts are not mutated.
            rows.append({**labels, 'timestamp': sample_time, 'value': sample_value})

    # Create the frame once instead of appending row by row
    # (DataFrame.append no longer exists in pandas 2.x).
    return pd.DataFrame(rows)


In [None]:
# functions to query

def convert_to_timestamp(s):
    """Convert a ``'YYYY-MM-DD HH:MM:SS'`` string to epoch seconds.

    The string is parsed as a naive datetime and handed to ``time.mktime``,
    so it is interpreted in the local time zone of the machine running this.
    """
    parsed = datetime.strptime(s, "%Y-%m-%d %H:%M:%S")
    local_struct = parsed.timetuple()
    return time.mktime(local_struct)

def query_current(params=None):
    """Run an instant query against the Prometheus HTTP API.

    Args:
        params: dict of query-string parameters,
            e.g. ``{'query': 'container_cpu_user_seconds_total'}``.

    Returns:
        dict, the decoded JSON body of the response.
    """
    # PROMETHEUS ends with '/', so strip it to avoid '//api/...' in the URL.
    url = PROMETHEUS.rstrip('/') + '/api/v1/query'
    res = requests.get(url, params=params)
    return json.loads(res.text)


def query_range(start, end, params=None, steps='30s'):
    """Run a range query against the Prometheus HTTP API.

    Args:
        start: Window start as ``'YYYY-MM-DD HH:MM:SS'``.
        end: Window end in the same format.
        params: dict of query-string parameters,
            e.g. ``{'query': 'container_cpu_user_seconds_total'}``.
            The dict is not modified.
        steps: Query resolution step, sent as the ``step`` parameter.

    Returns:
        dict, the decoded JSON body of the response.
    """
    # Work on a copy: writing start/end/step into the argument would leak
    # them into the caller's dict (and into a shared default dict).
    request_params = dict(params) if params else {}
    request_params["start"] = convert_to_timestamp(start)
    request_params["end"] = convert_to_timestamp(end)
    request_params["step"] = steps

    # PROMETHEUS ends with '/', so strip it to avoid '//api/...' in the URL.
    url = PROMETHEUS.rstrip('/') + '/api/v1/query_range'
    res = requests.get(url, params=request_params)

    return json.loads(res.text)


## Analysis Function

#### CPU

In [None]:
# CPU Unused Cores
def unused_cores(start=None, end=None, node=None, steps='15s', csv=None, verbose=False):
    """Find the CPU cores of a node that were idle at the start of a window.

    Queries ``collectd_cpu_percent`` for ``node`` between ``start`` and
    ``end`` and reports every core whose first ``idle`` sample in the window
    is exactly ``'100'``.

    Args:
        start: Window start as ``'YYYY-MM-DD HH:MM:SS'``.
        end: Window end in the same format.
        node: Value of the ``exported_instance`` label to filter on.
        steps: Query resolution step passed to ``query_range``.
        csv: Optional CSV source; when given it is loaded and returned as-is,
            without any analysis.
        verbose: Print each unused core as it is found.

    Returns:
        A list of core numbers (``int``); the loaded DataFrame when ``csv``
        is given; or an error string when ``start``, ``end`` or ``node`` is
        missing.
    """
    if csv is not None:
        return pd.read_csv(csv)
    if start is None or end is None or node is None:
        return "Start, end and Node name required when fetching from prometheus"

    params = {'query': "collectd_cpu_percent{exported_instance='" + node + "'}"}
    df = convert_to_df_range(query_range(start, end, params, steps))

    if verbose:
        print("Unused Cores :")

    idle_cores = []
    # An empty result has none of the label columns, so there is nothing to
    # drop or group.
    if not df.empty:
        df = df.drop(['__name__', 'instance', 'job'], axis=1)
        # Group by the column name (not a one-element list) so the key is the
        # plain label value on every pandas version.
        for core, core_df in df.groupby('cpu'):
            idle_values = core_df.loc[core_df['type'] == 'idle', 'value']
            # Skip cores that have no 'idle' samples at all.
            if not idle_values.empty and idle_values.iloc[0] == '100':
                if verbose:
                    print("Core: ", core)
                idle_cores.append(int(core))

    print("Number of unused cores: ", len(idle_cores))
    return idle_cores


#CPU fully used cores
def fully_used_cores(start=None, end=None, node=None, steps='15s', csv=None, verbose=False):
    """Find the CPU cores of a node that were fully busy at the start of a window.

    Queries ``collectd_cpu_percent`` for ``node`` between ``start`` and
    ``end`` and reports every core whose first ``idle`` sample in the window
    is exactly ``'0'``.

    Args:
        start: Window start as ``'YYYY-MM-DD HH:MM:SS'``.
        end: Window end in the same format.
        node: Value of the ``exported_instance`` label to filter on.
        steps: Query resolution step passed to ``query_range``.
        csv: Optional CSV source; when given it is loaded and returned as-is,
            without any analysis.
        verbose: Print each fully used core as it is found.

    Returns:
        A list of core numbers (``int``); the loaded DataFrame when ``csv``
        is given; or an error string when ``start``, ``end`` or ``node`` is
        missing.
    """
    if csv is not None:
        return pd.read_csv(csv)
    if start is None or end is None or node is None:
        return "Start, end and Node name required when fetching from prometheus"

    params = {'query': "collectd_cpu_percent{exported_instance='" + node + "'}"}
    df = convert_to_df_range(query_range(start, end, params, steps))

    if verbose:
        print("Fully Used Cores :")

    busy_cores = []
    # An empty result has none of the label columns, so there is nothing to
    # drop or group.
    if not df.empty:
        df = df.drop(['__name__', 'instance', 'job'], axis=1)
        # Group by the column name (not a one-element list) so the key is the
        # plain label value on every pandas version.
        for core, core_df in df.groupby('cpu'):
            idle_values = core_df.loc[core_df['type'] == 'idle', 'value']
            # Skip cores that have no 'idle' samples at all.
            if not idle_values.empty and idle_values.iloc[0] == '0':
                if verbose:
                    print("Core: ", core)
                busy_cores.append(int(core))

    print("Number of fully used cores: ", len(busy_cores))
    return busy_cores


# CPU used cores plots
def plot_used_cores(start=None, end=None, node=None, steps='15s', csv=None, verbose=False):
    """Plot system / user / wait CPU usage for every used core of a node.

    Queries ``collectd_cpu_percent`` for ``node`` between ``start`` and
    ``end``. A core counts as used when any of its ``system`` or ``user``
    samples has a value other than ``'0'``; for each such core one figure
    with up to three subplots (system, user, wait) is shown.

    Args:
        start: Window start as ``'YYYY-MM-DD HH:MM:SS'``.
        end: Window end in the same format.
        node: Value of the ``exported_instance`` label to filter on.
        steps: Query resolution step passed to ``query_range``.
        csv: Optional CSV source; when given it is loaded and returned as-is,
            without any plotting.
        verbose: Currently unused.

    Returns:
        A list of the used cores' ``cpu`` label values; the loaded DataFrame
        when ``csv`` is given; or an error string when ``start``, ``end`` or
        ``node`` is missing.
    """
    if csv is not None:
        return pd.read_csv(csv)
    if start is None or end is None or node is None:
        return "Start, end and Node name required when fetching from prometheus"

    params = {'query': "collectd_cpu_percent{exported_instance='" + node + "'}"}
    df = convert_to_df_range(query_range(start, end, params, steps))

    used_cores = []
    # An empty result has none of the label columns, so there is nothing to
    # drop, group or plot.
    if df.empty:
        print("Number of used cores: ", len(used_cores))
        return used_cores

    df = df.drop(['__name__', 'instance', 'job'], axis=1)

    # Subplot position for each CPU state that gets plotted.
    subplot_slots = {'system': 131, 'user': 132, 'wait': 133}

    for core, core_df in df.groupby('cpu'):
        system_values = core_df.loc[core_df['type'] == 'system', 'value']
        user_values = core_df.loc[core_df['type'] == 'user', 'value']

        # Compare only the 'value' column: comparing whole rows against '0'
        # is always true because label columns such as 'type' never equal '0'.
        if not ((system_values != '0').any() or (user_values != '0').any()):
            continue

        used_cores.append(core)
        fig = plt.figure(figsize=(24, 6), facecolor='oldlace', edgecolor='red')

        for state, state_df in core_df.groupby('type'):
            slot = subplot_slots.get(state)
            if slot is None:
                continue
            ax = fig.add_subplot(slot)
            ax.title.set_text(state)
            ax.plot(state_df['timestamp'], state_df['value'])

        plt.suptitle('Used CPU Core {}'.format(core), fontsize=14)
        plt.show()

    print("Number of used cores: ", len(used_cores))
    return used_cores

#### Interface

In [None]:
# Interface Dropped (both type 0 and 1, i.e. rx and tx)
#TODO: Change this to separate functions later
def interface_dropped(start=None, end=None, node=None, steps='15s', csv=None, verbose=False):
    """Report and plot dropped-packet counters for each interface of a node.

    Queries ``collectd_interface_if_dropped_0_total`` and
    ``collectd_interface_if_dropped_1_total`` for ``node`` between ``start``
    and ``end``. For each of the two metrics, every interface is plotted and
    interfaces having a sample whose value is exactly ``'1'`` are recorded
    together with the timestamp of the first such sample.

    Args:
        start: Window start as ``'YYYY-MM-DD HH:MM:SS'``.
        end: Window end in the same format.
        node: Value of the ``exported_instance`` label to filter on.
        steps: Query resolution step passed to ``query_range``.
        csv: Optional CSV source; when given, the same data is used for both
            drop types.
        verbose: Currently unused.

    Returns:
        A two-element list (drop type 0, drop type 1); each element is a list
        of ``[interface, timestamp]`` pairs. An error string is returned when
        ``start``, ``end`` or ``node`` is missing and no ``csv`` is given.
    """
    if csv is not None:
        df = pd.read_csv(csv)
        frames = [df, df]  # TODO: Change this
    else:
        if start is None or end is None or node is None:
            return "Start, end and Node name required when fetching from prometheus"

        frames = []
        for metric in ("collectd_interface_if_dropped_0_total",
                       "collectd_interface_if_dropped_1_total"):
            params = {'query': metric + "{exported_instance='" + node + "'}"}
            frames.append(convert_to_df_range(query_range(start, end, params, steps)))

    color = ['oldlace', 'mistyrose']
    dropped_interfaces = []

    for drop_type, frame in enumerate(frames):
        dropped = []

        # An empty result has no label columns to drop or group by.
        if frame.empty:
            dropped_interfaces.append(dropped)
            continue

        frame = frame.drop(['__name__', 'instance', 'job'], axis=1)

        # Group by the column name (not a one-element list) so the key is the
        # plain interface name on every pandas version.
        for interface, iface_df in frame.groupby('interface'):
            hits = iface_df.loc[iface_df['value'] == '1']
            if not hits.empty:
                dropped.append([interface, hits['timestamp'].iloc[0]])

            # NOTE(review): plotting is assumed to happen for every interface,
            # not only for those with a recorded drop - confirm.
            fig = plt.figure(figsize=(24, 6), facecolor=color[drop_type], edgecolor='red')
            ax = fig.add_subplot(111)
            ax.title.set_text("Interface: {}".format(interface))
            ax.plot(iface_df['timestamp'], iface_df['value'])

        dropped_interfaces.append(dropped)
        plt.suptitle('Interfaces Drop type {}'.format(drop_type), fontsize=14)
        plt.show()

    return dropped_interfaces


# Interface Errors (both type 0 and 1, i.e. rx and tx)
#TODO: Change this to separate functions later
def interface_errors(start=None, end=None, node=None, steps='15s', csv=None, verbose=False):
    """Report and plot error counters for each interface of a node.

    Queries ``collectd_interface_if_errors_0_total`` and
    ``collectd_interface_if_errors_1_total`` for ``node`` between ``start``
    and ``end``. For each of the two metrics, every interface is plotted and
    interfaces having a sample whose value is exactly ``'1'`` are recorded
    together with the timestamp of the first such sample.

    Args:
        start: Window start as ``'YYYY-MM-DD HH:MM:SS'``.
        end: Window end in the same format.
        node: Value of the ``exported_instance`` label to filter on.
        steps: Query resolution step passed to ``query_range``.
        csv: Optional CSV source; when given, the same data is used for both
            error types.
        verbose: Currently unused.

    Returns:
        A two-element list (error type 0, error type 1); each element is a
        list of ``[interface, timestamp]`` pairs. An error string is returned
        when ``start``, ``end`` or ``node`` is missing and no ``csv`` is given.
    """
    if csv is not None:
        df = pd.read_csv(csv)
        frames = [df, df]  # TODO: Change this
    else:
        if start is None or end is None or node is None:
            return "Start, end and Node name required when fetching from prometheus"

        frames = []
        for metric in ("collectd_interface_if_errors_0_total",
                       "collectd_interface_if_errors_1_total"):
            params = {'query': metric + "{exported_instance='" + node + "'}"}
            frames.append(convert_to_df_range(query_range(start, end, params, steps)))

    color = ['oldlace', 'mistyrose']
    err_interfaces = []

    for err_type, frame in enumerate(frames):
        errors = []

        # An empty result has no label columns to drop or group by.
        if frame.empty:
            err_interfaces.append(errors)
            continue

        frame = frame.drop(['__name__', 'instance', 'job'], axis=1)

        # Group by the column name (not a one-element list) so the key is the
        # plain interface name on every pandas version.
        for interface, iface_df in frame.groupby('interface'):
            hits = iface_df.loc[iface_df['value'] == '1']
            if not hits.empty:
                # Record into the per-type list that is returned to the caller.
                errors.append([interface, hits['timestamp'].iloc[0]])

            # NOTE(review): plotting is assumed to happen for every interface,
            # not only for those with a recorded error - confirm.
            fig = plt.figure(figsize=(24, 6), facecolor=color[err_type], edgecolor='red')
            ax = fig.add_subplot(111)
            ax.title.set_text("Interface: {}".format(interface))
            ax.plot(iface_df['timestamp'], iface_df['value'])

        err_interfaces.append(errors)
        plt.suptitle('Interfaces Error type {}'.format(err_type), fontsize=14)
        plt.show()

    return err_interfaces

#### RDT 

In [None]:
# L3 cache bytes
def plot_rdt_bytes(start=None, end=None, node=None, steps='15s', csv=None, verbose=False):
    """Plot ``collectd_intel_rdt_bytes`` over time, one figure per RDT group.

    Args:
        start: Window start as ``'YYYY-MM-DD HH:MM:SS'``.
        end: Window end in the same format.
        node: Value of the ``exported_instance`` label to filter on.
        steps: Query resolution step passed to ``query_range``.
        csv: Optional CSV source to plot instead of querying Prometheus.
        verbose: Currently unused.

    Returns:
        None after plotting, or an error string when ``start``, ``end`` or
        ``node`` is missing and no ``csv`` is given.
    """
    if csv is not None:
        df = pd.read_csv(csv)
    else:
        if start is None or end is None or node is None:
            return "Start, end and Node name required when fetching from prometheus"

        params = {'query': "collectd_intel_rdt_bytes{exported_instance='" + node + "'}"}
        df = convert_to_df_range(query_range(start, end, params, steps))

    # An empty result has no label columns to drop or group by.
    if df.empty:
        return

    df = df.drop(['__name__', 'instance', 'job'], axis=1)

    # Group by the column name (not a one-element list) so the key shown in
    # the title is the plain label value on every pandas version.
    for rdt, rdt_df in df.groupby('intel_rdt'):
        fig = plt.figure(figsize=(24, 6), facecolor='oldlace', edgecolor='red')
        ax = fig.add_subplot(111)
        ax.title.set_text("Intel RDT Number: {}".format(rdt))
        ax.plot(rdt_df['timestamp'], rdt_df['value'])
        plt.show()
    return


# L3 IPC values
def plot_rdt_ipc(start=None, end=None, node=None, steps='15s', csv=None, verbose=False):
    """Plot ``collectd_intel_rdt_ipc`` over time, one figure per RDT group.

    Args:
        start: Window start as ``'YYYY-MM-DD HH:MM:SS'``.
        end: Window end in the same format.
        node: Value of the ``exported_instance`` label to filter on.
        steps: Query resolution step passed to ``query_range``.
        csv: Optional CSV source to plot instead of querying Prometheus.
        verbose: Currently unused.

    Returns:
        None after plotting, or an error string when ``start``, ``end`` or
        ``node`` is missing and no ``csv`` is given.
    """
    if csv is not None:
        df = pd.read_csv(csv)
    else:
        if start is None or end is None or node is None:
            return "Start, end and Node name required when fetching from prometheus"

        params = {'query': "collectd_intel_rdt_ipc{exported_instance='" + node + "'}"}
        df = convert_to_df_range(query_range(start, end, params, steps))

    # An empty result has no label columns to drop or group by.
    if df.empty:
        return

    df = df.drop(['__name__', 'instance', 'job'], axis=1)

    # Group by the column name (not a one-element list) so the key shown in
    # the title is the plain label value on every pandas version.
    for rdt, rdt_df in df.groupby('intel_rdt'):
        fig = plt.figure(figsize=(24, 6), facecolor='oldlace', edgecolor='red')
        ax = fig.add_subplot(111)
        ax.title.set_text("Intel RDT Number: {}, IPC value".format(rdt))
        ax.plot(rdt_df['timestamp'], rdt_df['value'])
        plt.show()
    return


# Memory bandwidth
def get_rdt_memory_bandwidth(start=None, end=None, node=None, steps='15s', csv=None, verbose=False):
    """Return ``collectd_intel_rdt_memory_bandwidth_total`` samples for a node.

    The data comes from ``csv`` when it is given, otherwise from a Prometheus
    range query over ``start``..``end`` filtered on ``node``. The
    ``__name__``, ``instance`` and ``job`` columns are removed from the
    result. ``verbose`` is currently unused.

    Returns:
        The resulting DataFrame, or an error string when ``start``, ``end``
        or ``node`` is missing and no ``csv`` is given.
    """
    if csv is None:
        missing_argument = any(arg is None for arg in (start, end, node))
        if missing_argument:
            return "Start, end and Node name required when fetching from prometheus"

        query = "collectd_intel_rdt_memory_bandwidth_total{exported_instance='" + node + "'}"
        response = query_range(start, end, {'query': query}, steps)
        bandwidth_df = convert_to_df_range(response)
    else:
        bandwidth_df = pd.read_csv(csv)

    return bandwidth_df.drop(['__name__', 'instance', 'job'], axis=1)

#### Memory

In [None]:
def get_memory_usage(start=None, end=None, node=None, steps='15s', csv=None, verbose=False):
    """Fetch and plot ``collectd_memory`` for a node, one figure per memory type.

    The Prometheus query divides the metric by ``1024*1024*1024``.

    Args:
        start: Window start as ``'YYYY-MM-DD HH:MM:SS'``.
        end: Window end in the same format.
        node: Value of the ``exported_instance`` label to filter on.
        steps: Query resolution step passed to ``query_range``.
        csv: Optional CSV source to use instead of querying Prometheus.
        verbose: Currently unused.

    Returns:
        The DataFrame that was plotted (without the ``instance`` and ``job``
        columns), an empty DataFrame when there is no data, or an error
        string when ``start``, ``end`` or ``node`` is missing and no ``csv``
        is given.
    """
    if csv is not None:
        df = pd.read_csv(csv)
    else:
        if start is None or end is None or node is None:
            return "Start, end and Node name required when fetching from prometheus"

        params = {'query': "collectd_memory{exported_instance='" + node + "'} / (1024*1024*1024) "}
        df = convert_to_df_range(query_range(start, end, params, steps))

    # An empty result has no label columns to drop or group by.
    if df.empty:
        return df

    df = df.drop(['instance', 'job'], axis=1)

    # Group by the column name (not a one-element list) so the key shown in
    # the title is the plain label value on every pandas version.
    for memory_type, memory_df in df.groupby('memory'):
        fig = plt.figure(figsize=(24, 6), facecolor='oldlace', edgecolor='red')
        ax = fig.add_subplot(111)
        ax.title.set_text("Memory Type: {}".format(memory_type))
        ax.plot(memory_df['timestamp'], memory_df['value'])
        plt.show()
    return df

## Testing Zone

In [None]:
# Try the memory analysis on one minute of data from pod12-node4.
get_memory_usage(
    '2020-08-03 08:00:12',
    '2020-08-03 08:01:12',
    'pod12-node4',
)

In [None]:
def analyse(timestamp, node):
    """Run every metric analysis for the 20 seconds around an alert.

    ``timestamp`` is an alert time such as ``'2020-08-03 08:00:12'``; anything
    after a comma is ignored. The analyses cover 10 seconds before to
    10 seconds after that time with a ``'5s'`` step. A ``node`` containing
    ``"node4"`` is mapped to ``'pod12-node4'``. Results are printed and
    plotted; nothing is returned.
    """
    alert_ts = datetime.strptime(timestamp.split(',')[0], "%Y-%m-%d %H:%M:%S")
    margin = timedelta(seconds=10)
    start, end = str(alert_ts - margin), str(alert_ts + margin)
    steps = '5s'

    print("Starting Analysis from", start, "to", end, '\n\n')

    if "node4" in node:
        node = 'pod12-node4'

    window = (start, end, node, steps)

    # CPU analysis
    print("=====CPU ANALYSIS=====\n")
    print("Unused Cores:", unused_cores(*window))
    print("Fully Used Cores:", fully_used_cores(*window))
    print("Plotting used cores:")
    plot_used_cores(*window)

    # Interface analysis
    print("=====Interfaces Dropped / Errors=====\n")
    interface_dropped(*window)
    interface_errors(*window)

    # RDT analysis
    print("=====RDT Analysis=====\n")
    plot_rdt_bytes(*window)
    plot_rdt_ipc(*window)
    get_rdt_memory_bandwidth(*window)

    # Memory analysis
    print("=====Memory Analysis=====\n")
    get_memory_usage(*window)

## Usage / Examples


##### CPU 

- For finding unused CPU cores

```py
# Fetching from prometheus
cores = unused_cores('2020-07-31 08:00:12', '2020-07-31 08:01:12', 'pod12-node4')

```

- For finding fully used cores

```py
# Fetching from prometheus
fully_used = fully_used_cores('2020-07-31 08:00:12', '2020-07-31 08:01:12', 'pod12-node4')

```

- Similarly for plotting used cores

```py
# Fetching
plot_used_cores('2020-07-31 08:00:12', '2020-07-31 08:01:12', 'pod12-node4')

#csv
# use Analysis-Monitoring-Local Notebook for correct analysis 
plot_used_cores(csv='metrics_data/cpu-0/cpu-user-2020-06-02')

```


##### Interface

- Interface Dropped 

```py
# Fetching from prom
dropped_interfaces = interface_dropped('2020-07-31 08:00:12', '2020-07-31 08:01:12', 'pod12-node4')

```

- Interface Errors

```py
# Fetching from prom
interface_errors('2020-07-31 08:00:12', '2020-07-31 08:01:12', 'pod12-node4')
```

##### RDT

- Plot bytes

```py
# fetch
plot_rdt_bytes('2020-07-31 08:00:12', '2020-07-31 08:01:12','pod12-node4')
```

- Plot ipc values

```py
#fetch
plot_rdt_ipc('2020-07-31 08:00:12', '2020-07-31 08:01:12', 'pod12-node4')
```

- Memory bandwidth

```py
#fetch
get_rdt_memory_bandwidth('2020-07-31 08:00:12', '2020-07-31 08:01:12', 'pod12-node4')
```

##### Memory

- Memory usage

```py
get_memory_usage('2020-08-03 08:00:12', '2020-08-03 08:01:12', 'pod12-node4')
```

##### Analyse everything

```py
# example alert_time: 2020-08-03 08:00:12
# example index: 'pod12-node4'
analyse(alert_time,index)
```

#### Checking Anomaly in logs

In [None]:
#Give file name
foldername = "results_2020-08-07_03-39-57"
#Give index name - "node1*" or "node4*"
index = "node4*"

In [None]:
# TLS context with hostname checking and certificate verification disabled.
ssl_context = create_ssl_context()
ssl_context.check_hostname = False
ssl_context.verify_mode = ssl.CERT_NONE
urllib3.disable_warnings()

# NOTE(review): the credentials are embedded in the URL in plain text.
client = Elasticsearch(
    ['https://elasticsearch:password123@10.10.120.211:31111'],
    verify_certs=False,
    ssl_context=ssl_context,
)

In [None]:
# Name of the vsperf log for the selected results folder: the first eight
# characters of the folder name ("results_") are dropped.
vsperf = "vsperf-overall_" + foldername[8:] + ".log"

# Documents of that log that carry an "alert" field.
s = (
    Search(index=index)
    .using(client)
    .query("exists", field="alert")
    .query("match_phrase", log_path=vsperf)
)

# Keep the alert time of the last hit returned by the scan.
alert_time = None
for hits in s.scan():
    alert_time = hits.alert_time

# Fail with a clear message instead of a NameError when nothing matched.
if alert_time is None:
    raise LookupError(
        "No alert found for log '{}' in index '{}'".format(vsperf, index)
    )

print(alert_time)

In [None]:
# Analyse the metrics around the alert found above.
analyse(alert_time, index)