diff options
Diffstat (limited to 'models/failure_prediction/python/decision_tree.py')
-rw-r--r-- | models/failure_prediction/python/decision_tree.py | 199 |
1 files changed, 199 insertions, 0 deletions
# pylint: disable=C0103, C0116, W0621, E0401, W0104, W0105, R0913, E1136, W0612, E0102, C0301, W0611, C0411, W0311, C0326, C0330, W0106, C0412
# -*- coding: utf-8 -*-
# File: models/failure_prediction/python/decision_tree.py
"""Decision_Tree.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1TdQCHMWu8lPA53-jFhxXDUPQdjqufrL1

Contributors: **Rohit Singh Rathaur, Girish L.**

Copyright [2021](2021) [*Rohit Singh Rathaur, BIT Mesra and Girish L., CIT GUBBI, Karnataka*]

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

We mounted the drive to access the data
"""

# Libraries used to read the CSV, train the model and report the results.
# NOTE(review): os, mpl, tf and LogisticRegression are not used below; they
# are kept so that nothing else relying on this module's imports breaks.
import os

import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn.metrics as metrics
import tensorflow as tf
from google.colab import drive
from sklearn import tree
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

# Location of the labelled Ellis metrics on the mounted Google Drive.
DATA_PATH = "/content/drive/MyDrive/Failure/lstm/Ellis_FinalTwoConditionwithOR.csv"

# One seed for both the train/test split and the tree so runs are repeatable.
RANDOM_STATE = 5


def print_results(y_test, y_pred, y_score=None):
    """Print classification metrics and plot the confusion matrix and ROC curve.

    Args:
        y_test: True binary labels.
        y_pred: Predicted class labels; used for the F1 score, the
            classification report and the confusion matrix.
        y_score: Optional continuous scores for the positive class (for
            example ``model.predict_proba(X)[:, 1]``) used for the ROC curve
            and the AUC. When omitted, ``y_pred`` is used, which reproduces
            the earlier behaviour of building the ROC curve from hard class
            predictions.
    """
    if y_score is None:
        y_score = y_pred

    # F1 score and per-class precision/recall
    f1 = metrics.f1_score(y_test, y_pred)
    print("F1 Score: ", f1)
    print(classification_report(y_test, y_pred))

    # Confusion matrix heatmap (top-left panel)
    conf_matrix = metrics.confusion_matrix(y_test, y_pred)
    plt.figure(figsize=(12, 12))
    plt.subplot(221)
    sns.heatmap(conf_matrix, fmt="d", annot=True, cmap='Blues')
    # Widen the y-limits by half a cell on each side. Presumably a workaround
    # for matplotlib releases that clipped the first and last heatmap rows --
    # TODO confirm it is still needed.
    b, t = plt.ylim()
    plt.ylim(b + 0.5, t - 0.5)
    plt.title('Confusion Matrix')
    plt.ylabel('True Values')
    plt.xlabel('Predicted Values')

    # ROC curve and area under it
    model_roc_auc = metrics.roc_auc_score(y_test, y_score)
    print("Area under curve : ", model_roc_auc, "\n")
    fpr, tpr, thresholds = metrics.roc_curve(y_test, y_score)
    # Mark the threshold that maximises the geometric mean of the true
    # positive rate and the true negative rate (1 - fpr).
    gmeans = np.sqrt(tpr * (1 - fpr))
    ix = np.argmax(gmeans)
    threshold = np.round(thresholds[ix], 3)

    # ROC plot (top-right panel)
    plt.subplot(222)
    plt.plot(
        fpr,
        tpr,
        color='darkorange',
        lw=1,
        label=f"Auc : {model_roc_auc:.3f}",
    )
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    plt.scatter(
        fpr[ix],
        tpr[ix],
        marker='o',
        color='black',
        label=f"Best Threshold:{threshold}",
    )
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic')
    plt.legend(loc="lower right")


# Mount Google Drive so the dataset is reachable (Colab only).
drive.mount('/content/drive')

# Read the CSV with `read_csv`, drop the `Timestamp` column and keep the
# result in a DataFrame called `df_Ellis`.
df_Ellis = pd.read_csv(DATA_PATH)
df_Ellis = df_Ellis.drop(columns='Timestamp')
# The notebook displayed these values as cell output; a plain script has to
# print them explicitly.
print(df_Ellis)

# Define the feature matrix `X` and the target vector `y`.
feature_cols = [
    'ellis-cpu.wait_perc',
    'ellis-load.avg_1_min',
    'ellis-net.in_bytes_sec',
    'ellis-cpu.system_perc',
    'ellis-mem.free_mb',
]

# X is a matrix, hence we use [] to select the columns listed in feature_cols
X = df_Ellis[feature_cols]

# y is a vector, hence we use dot access on the 'Label' column
y = df_Ellis.Label

# Split X and y into training and testing sets (70% / 30%).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=RANDOM_STATE)

# Train a decision tree classifier on the training set.
# NOTE: the variable keeps its historical name `logreg` although the model is
# a decision tree, not a logistic regression.
# A fixed random_state makes the fitted tree reproducible between runs.
logreg = tree.DecisionTreeClassifier(random_state=RANDOM_STATE)
logreg.fit(X_train, y_train)

# Make class predictions for the testing set.
y_pred_class = logreg.predict(X_test)

# Estimated probability of the positive class, used for the ROC curve.
# Assumes the labels are coded 0/1, as the rest of this script already does.
y_score = logreg.predict_proba(X_test)[:, 1]

# Accuracy computed with sklearn
print(metrics.accuracy_score(y_test, y_pred_class))

# Class distribution of the testing set (pandas Series method)
print(y_test.value_counts())

# Number of samples for each label in the training set
print(y_train.value_counts())

# Percentage of ones: because y_test only contains ones and zeros, the mean
# is the percentage of ones.
print(y_test.mean())

# Percentage of zeros
print(1 - y_test.mean())

# Null accuracy in a single line of code
# (only for binary classification problems coded as 0/1)
print(max(y_test.mean(), 1 - y_test.mean()))

# Null accuracy (for multi-class classification problems)
print(y_test.value_counts().head(1) / len(y_test))

# Print the first 50 true and predicted responses
print('True:', y_test.values[0:50])
print('Pred:', y_pred_class[0:50])

# IMPORTANT: first argument is true values, second argument is predicted values
# this produces a 2x2 numpy array (matrix)
confusion = metrics.confusion_matrix(y_test, y_pred_class)
print(confusion)

# Slice the confusion matrix into its four cells: [row, column]
TP = confusion[1, 1]
TN = confusion[0, 0]
FP = confusion[0, 1]
FN = confusion[1, 0]

# Accuracy derived from the confusion matrix; it should match accuracy_score.
print((TP + TN) / float(TP + TN + FP + FN))
print(metrics.accuracy_score(y_test, y_pred_class))

# Report F1, the classification report, the confusion matrix and the ROC curve.
print_results(y_test, y_pred_class, y_score=y_score)