From 945d43362584c536f685312d145d8f06a6012ca2 Mon Sep 17 00:00:00 2001
From: Linghui Zeng <linghui.zeng@huawei.com>
Date: Mon, 16 May 2016 15:09:54 +0800
Subject: Modify the predictor to run several models JIRA: PREDICTION-66

Change-Id: I60b7cd1c9c3c00000391767527346f9e774742e6
Signed-off-by: Linghui Zeng <linghui.zeng@huawei.com>
---
 predPy/predictor.py | 199 +++++++++++++++++++++++++++++++++++++---------------
 1 file changed, 144 insertions(+), 55 deletions(-)

diff --git a/predPy/predictor.py b/predPy/predictor.py
index 159fcf4..a24d541 100644
--- a/predPy/predictor.py
+++ b/predPy/predictor.py
@@ -1,3 +1,5 @@
+#!/usr/bin/python2
+
 # Copyright (c) 2016 Huawei
 # All Rights Reserved.
 #
@@ -17,7 +19,8 @@
 #   limitations under the License.
 #
 
-"""
+"""Summary of models here.
+
 SVM
 Logistic Regression with SGD
 Logistic Regression with LBFGS
@@ -26,40 +29,44 @@ Decision Tree
 Random Forest
 Gradient Boosted Trees
 """
-from __future__ import print_function
 
+from __future__ import print_function
 from pyspark import SparkContext
-
 import csv
 import StringIO
+# import tempfile
+# from shutil import rmtree
 
-import tempfile
-from shutil import rmtree
-
-# from pyspark.mllib.classification import SVMWithSGD, SVMModel
+from pyspark.mllib.classification import SVMWithSGD
+# from pyspark.mllib.classification import SVMModel
 from pyspark.mllib.classification import LogisticRegressionWithSGD
-# from pyspark.mllib.classification import LogisticRegressionWithLBFGS
-from pyspark.mllib.classification import LogisticRegressionModel
-# from pyspark.mllib.classification import NaiveBayes, NaiveBayesModel
-# from pyspark.mllib.tree import DecisionTree, DecisionTreeModel
-# from pyspark.mllib.tree import RandomForest, RandomForestModel
-# from pyspark.mllib.tree import GradientBoostedTrees, GradientBoostedTreesModel
+from pyspark.mllib.classification import LogisticRegressionWithLBFGS
+# from pyspark.mllib.classification import LogisticRegressionModel
+from pyspark.mllib.classification import NaiveBayes
+# from pyspark.mllib.classification import NaiveBayesModel
+from pyspark.mllib.tree import DecisionTree
+# from pyspark.mllib.tree import DecisionTreeModel
+from pyspark.mllib.tree import RandomForest
+# from pyspark.mllib.tree import RandomForestModel
+from pyspark.mllib.tree import GradientBoostedTrees
+# from pyspark.mllib.tree import GradientBoostedTreesModel
 from pyspark.mllib.linalg import Vectors
 from pyspark.mllib.regression import LabeledPoint
 
 
 def loadRecord(line):
     """Load a CSV line and select 26 indicative parameters"""
-    input = StringIO.StringIO(line)
-    reader = csv.reader(input)
+    inputLine = StringIO.StringIO(line)
+    reader = csv.reader(inputLine)
     parameters = reader.next()
     # Instances that were collected within seven days before the failures
     # are used to train the failing model
     if parameters[3] >= 168:
         parameters[-1] = 0
-    selectedParameters = parameters[12:17] + parameters[19:20] \
-        + parameters[23:26] + parameters[39:47] + parameters[54:61] \
-        + parameters[62:]
+    selectedParameters = (
+        parameters[12:17] + parameters[19:20] + parameters[23:26] +
+        parameters[39:47] + parameters[54:61] + parameters[62:]
+    )
     return selectedParameters
 
 
@@ -75,65 +82,147 @@ if __name__ == "__main__":
     sc = SparkContext(appName="HardDriveFailurePrediction")
 
     # $example on$
-    data = sc.textFile('hdd/harddrive1.csv').map(loadRecord)\
-        .map(parseLine)
+    data = (sc.textFile('hdd/harddrive1.csv').map(loadRecord).
+            map(parseLine))
 
+    print("===== Choose SVM model =====")
     # Split data aproximately into training (60%) and test (40%)
-    [trainingData, testData] = data.randomSplit([0.6, 0.4], seed=0)
+    trainingData, testData = data.randomSplit([0.6, 0.4], seed=0)
 
     # Train a SVM model
-#    model = SVMWithSGD.train(trainingData, iterations=2)
+    model = SVMWithSGD.train(trainingData, iterations=2)
+
+    # Make prediction and test accuracy.
+#    labelsAndPredictions = (testData
+#        .map(lambda p: (p.label, model.predict(p.features))))
+#    accuracy = (labelsAndPredictions
+#        .filter(lambda (x, v): x == v).count() / float(testData.count()))
+    predictions = model.predict(testData.map(lambda x: x.features))
+    labelsAndPredictions = testData.map(lambda p: p.label).zip(predictions)
+    accuracy = (labelsAndPredictions.filter(lambda (v, p): v == p).
+                count() / float(testData.count()))
+    print("The test accuracy of SVM model is: %.4f\n\n" % accuracy)
+
+    print("===== Choose Logistic Regression model with SGD algorithm =====")
+    # Split data approximately into training (60%) and test (40%)
+    trainingData, testData = data.randomSplit([0.6, 0.4], seed=0)
+
     # Train a logistic regression model
     model = LogisticRegressionWithSGD.train(trainingData, iterations=3)
-#    model = LogisticRegressionWithLBFGS.train(trainingData)
+
+    # Make prediction and test accuracy.
+    predictions = model.predict(testData.map(lambda x: x.features))
+    labelsAndPredictions = testData.map(lambda p: p.label).zip(predictions)
+    accuracy = (labelsAndPredictions.filter(lambda (v, p): v == p).
+                count() / float(testData.count()))
+    print("The test accuracy of Logistic Regression model with"
+          " SGD algorithm is: %.4f\n\n" % accuracy)
+
+    print("===== Choose Logistic Regression model with LBFGS algorithm =====")
+    # Split data approximately into training (60%) and test (40%)
+    trainingData, testData = data.randomSplit([0.6, 0.4], seed=0)
+
+    # Train a logistic regression model
+    model = LogisticRegressionWithLBFGS.train(trainingData)
+
+    # Make prediction and test accuracy.
+    predictions = model.predict(testData.map(lambda x: x.features))
+    labelsAndPredictions = testData.map(lambda p: p.label).zip(predictions)
+    accuracy = (labelsAndPredictions.filter(lambda (v, p): v == p).
+                count() / float(testData.count()))
+    print("The test accuracy of Logistic Regression model with"
+          " LBFGS algorithm is: %.4f\n\n" % accuracy)
+
+    print("===== Choose Multinomial Naive Bayes model =====")
+    # Split data approximately into training (60%) and test (40%)
+    trainingData, testData = data.randomSplit([0.6, 0.4], seed=0)
+
     # Train a multinomial naive Bayes model given an RDD of LabeledPoint.
-#    model = NaiveBayes.train(trainingData, 0.8)
+    model = NaiveBayes.train(trainingData, 0.8)
+
+    # Make prediction and test accuracy.
+    predictions = model.predict(testData.map(lambda x: x.features))
+    labelsAndPredictions = testData.map(lambda p: p.label).zip(predictions)
+    accuracy = (labelsAndPredictions.filter(lambda (v, p): v == p).
+                count() / float(testData.count()))
+    print("The test accuracy of Multinomial Naive Bayes "
+          "is: %.4f\n\n" % accuracy)
+
+    print("===== Choose Decision Tree  model =====")
+    # Split data approximately into training (60%) and test (40%)
+    trainingData, testData = data.randomSplit([0.6, 0.4], seed=0)
+
     # Train a decision tree model.
     # Empty categoricalFeaturesInfo indicates all features are continuous.
-#    model = DecisionTree.trainClassifier(trainingData, numClasses=2,
-#                                         categoricalFeaturesInfo={},
-#                                         impurity='entropy', maxDepth=5,
-#                                         maxBins=32)
-    # Train a RandomForest model.
+    model = DecisionTree.trainClassifier(trainingData, numClasses=2,
+                                         categoricalFeaturesInfo={},
+                                         impurity='entropy', maxDepth=5,
+                                         maxBins=32)
+    print('Learned classification tree model:')
+    print(model.toDebugString())
+
+    # Make prediction and test accuracy.
+    predictions = model.predict(testData.map(lambda x: x.features))
+    labelsAndPredictions = testData.map(lambda p: p.label).zip(predictions)
+    accuracy = (labelsAndPredictions.filter(lambda (v, p): v == p).
+                count() / float(testData.count()))
+    print("The test accuracy of decision tree model is: %.4f\n\n" % accuracy)
+
+    print("===== Choose Random Forest model =====")
+    # Split data approximately into training (60%) and test (40%)
+    trainingData, testData = data.randomSplit([0.6, 0.4], seed=0)
+
+    # Train a Random Forest model.
     # Empty categoricalFeaturesInfo indicates all features are continuous.
     # Note: Use larger numTrees in practice.
     # Setting featureSubsetStrategy="auto" lets the algorithm choose.
-#    model = RandomForest.trainClassifier(trainingData, numClasses=2,
-#                                         categoricalFeaturesInfo={},
-#                                         numTrees=3,
-#                                         featureSubsetStrategy="auto",
-#                                         impurity='gini', maxDepth=7,
-#                                         maxBins=32)
+    model = RandomForest.trainClassifier(trainingData, numClasses=2,
+                                         categoricalFeaturesInfo={},
+                                         numTrees=3,
+                                         featureSubsetStrategy="auto",
+                                         impurity='gini', maxDepth=7,
+                                         maxBins=32)
+    print('Learned classification tree model:')
+    print(model.toDebugString())
+
+    # Make prediction and test accuracy.
+    predictions = model.predict(testData.map(lambda x: x.features))
+    labelsAndPredictions = testData.map(lambda p: p.label).zip(predictions)
+    accuracy = (labelsAndPredictions.filter(lambda (v, p): v == p).
+                count() / float(testData.count()))
+    print("The test accuracy of random forest model is: %.4f\n\n" % accuracy)
+
+    print("===== Choose Gradient Boosted Trees model =====")
+    # Split data approximately into training (60%) and test (40%)
+    trainingData, testData = data.randomSplit([0.6, 0.4], seed=0)
 
     # Train a GradientBoostedTrees model.
     # Empty categoricalFeaturesInfo indicates all features are continuous.
-#    model = GradientBoostedTrees.trainClassifier(trainingData,
-#                                                 categoricalFeaturesInfo={},
-#                                                 numIterations=3, maxDepth=3,
-#                                                 maxBins=32)
+    model = GradientBoostedTrees.trainClassifier(trainingData,
+                                                 categoricalFeaturesInfo={},
+                                                 numIterations=3, maxDepth=3,
+                                                 maxBins=32)
+    print('Learned classification tree model:')
+    print(model.toDebugString())
+
     # Make prediction and test accuracy.
-#    labelsAndPredictions = testData\
-#        .map(lambda p: (p.label, model.predict(p.features)))
-#    accuracy = labelsAndPredictions\
-#        .filter(lambda (x, v): x == v).count() / float(testData.count())
     predictions = model.predict(testData.map(lambda x: x.features))
     labelsAndPredictions = testData.map(lambda p: p.label).zip(predictions)
-    accuracy = labelsAndPredictions.filter(lambda (v, p): v == p).\
-        count() / float(testData.count())
-    print('Test Accuracy = ' + str(accuracy))
-#    print('Learned classification tree model:')
-#    print(model.toDebugString())
+    accuracy = (labelsAndPredictions.filter(lambda (v, p): v == p).
+                count() / float(testData.count()))
+    print("The test accuracy of Gradient Boosted Trees "
+          "model is: %.4f" % accuracy)
 
     # Save and load model
-    path = tempfile.mkdtemp(dir='.')
-    model.save(sc, path)
+#    path = tempfile.mkdtemp(dir='.')
+#    model.save(sc, path)
 #    sameModel = SVMModel.load(sc, path)
-    sameModel = LogisticRegressionModel.load(sc, path)
+#    sameModel = LogisticRegressionModel.load(sc, path)
 #    sameModel = NaiveBayesModel.load(sc, path)
 #    sameModel = DecisionTreeModel.load(sc, path)
 #    sameModel = RandomForestModel.load(sc, path)
 #    sameModel = GradientBoostedTreesModel.load(sc, path)
-    try:
-        rmtree(path)
-    except OSError:
-        pass
+#    try:
+#        rmtree(path)
+#    except OSError:
+#        pass
-- 
cgit 1.2.3-korg