From 945d43362584c536f685312d145d8f06a6012ca2 Mon Sep 17 00:00:00 2001 From: Linghui Zeng Date: Mon, 16 May 2016 15:09:54 +0800 Subject: Modify the predictor to run several models JIRA: PREDICTION-66 Change-Id: I60b7cd1c9c3c00000391767527346f9e774742e6 Signed-off-by: Linghui Zeng --- predPy/predictor.py | 199 +++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 144 insertions(+), 55 deletions(-) diff --git a/predPy/predictor.py b/predPy/predictor.py index 159fcf4..a24d541 100644 --- a/predPy/predictor.py +++ b/predPy/predictor.py @@ -1,3 +1,5 @@ +#!/usr/bin/python2 + # Copyright (c) 2016 Huawei # All Rights Reserved. # @@ -17,7 +19,8 @@ # limitations under the License. # -""" +"""Summary of models here. + SVM Logistic Regression with SGD Logistic Regression with LBFGS @@ -26,40 +29,44 @@ Decision Tree Random Forest Gradient Boosted Trees """ -from __future__ import print_function +from __future__ import print_function from pyspark import SparkContext - import csv import StringIO +# import tempfile +# from shutil import rmtree -import tempfile -from shutil import rmtree - -# from pyspark.mllib.classification import SVMWithSGD, SVMModel +from pyspark.mllib.classification import SVMWithSGD +# from pyspark.mllib.classification import SVMModel from pyspark.mllib.classification import LogisticRegressionWithSGD -# from pyspark.mllib.classification import LogisticRegressionWithLBFGS -from pyspark.mllib.classification import LogisticRegressionModel -# from pyspark.mllib.classification import NaiveBayes, NaiveBayesModel -# from pyspark.mllib.tree import DecisionTree, DecisionTreeModel -# from pyspark.mllib.tree import RandomForest, RandomForestModel -# from pyspark.mllib.tree import GradientBoostedTrees, GradientBoostedTreesModel +from pyspark.mllib.classification import LogisticRegressionWithLBFGS +# from pyspark.mllib.classification import LogisticRegressionModel +from pyspark.mllib.classification import NaiveBayes +# from pyspark.mllib.classification import NaiveBayesModel +from pyspark.mllib.tree import DecisionTree +# from pyspark.mllib.tree import DecisionTreeModel +from pyspark.mllib.tree import RandomForest +# from pyspark.mllib.tree import RandomForestModel +from pyspark.mllib.tree import GradientBoostedTrees +# from pyspark.mllib.tree import GradientBoostedTreesModel from pyspark.mllib.linalg import Vectors from pyspark.mllib.regression import LabeledPoint def loadRecord(line): """Load a CSV line and select 26 indicative parameters""" - input = StringIO.StringIO(line) - reader = csv.reader(input) + inputLine = StringIO.StringIO(line) + reader = csv.reader(inputLine) parameters = reader.next() # Instances that were collected within seven days before the failures # are used to train the failing model if parameters[3] >= 168: parameters[-1] = 0 - selectedParameters = parameters[12:17] + parameters[19:20] \ - + parameters[23:26] + parameters[39:47] + parameters[54:61] \ - + parameters[62:] + selectedParameters = ( + parameters[12:17] + parameters[19:20] + parameters[23:26] + + parameters[39:47] + parameters[54:61] + parameters[62:] + ) return selectedParameters @@ -75,65 +82,147 @@ if __name__ == "__main__": sc = SparkContext(appName="HardDriveFailurePrediction") # $example on$ - data = sc.textFile('hdd/harddrive1.csv').map(loadRecord)\ - .map(parseLine) + data = (sc.textFile('hdd/harddrive1.csv').map(loadRecord). + map(parseLine)) + print("===== Choose SVM model =====") # Split data aproximately into training (60%) and test (40%) - [trainingData, testData] = data.randomSplit([0.6, 0.4], seed=0) + trainingData, testData = data.randomSplit([0.6, 0.4], seed=0) # Train a SVM model -# model = SVMWithSGD.train(trainingData, iterations=2) + model = SVMWithSGD.train(trainingData, iterations=2) + + # Make prediction and test accuracy. +# labelsAndPredictions = (testData +# .map(lambda p: (p.label, model.predict(p.features)))) +# accuracy = (labelsAndPredictions +# .filter(lambda (x, v): x == v).count() / float(testData.count())) + predictions = model.predict(testData.map(lambda x: x.features)) + labelsAndPredictions = testData.map(lambda p: p.label).zip(predictions) + accuracy = (labelsAndPredictions.filter(lambda (v, p): v == p). + count() / float(testData.count())) + print("The test accuracy of SVM model is: %.4f\n\n" % accuracy) + + print("===== Choose Logistic Regression model with SGD algorithm =====") + # Split data aproximately into training (60%) and test (40%) + trainingData, testData = data.randomSplit([0.6, 0.4], seed=0) + # Train a logistic regression model model = LogisticRegressionWithSGD.train(trainingData, iterations=3) -# model = LogisticRegressionWithLBFGS.train(trainingData) + + # Make prediction and test accuracy. + predictions = model.predict(testData.map(lambda x: x.features)) + labelsAndPredictions = testData.map(lambda p: p.label).zip(predictions) + accuracy = (labelsAndPredictions.filter(lambda (v, p): v == p). + count() / float(testData.count())) + print("The test accuracy of Logistic Regression model with" + " SGD algorithm is: %.4f\n\n" % accuracy) + + print("===== Choose Logistic Regression model with LBFGS algorithm =====") + # Split data aproximately into training (60%) and test (40%) + trainingData, testData = data.randomSplit([0.6, 0.4], seed=0) + + # Train a logistic regression model + model = LogisticRegressionWithLBFGS.train(trainingData) + + # Make prediction and test accuracy. + predictions = model.predict(testData.map(lambda x: x.features)) + labelsAndPredictions = testData.map(lambda p: p.label).zip(predictions) + accuracy = (labelsAndPredictions.filter(lambda (v, p): v == p). + count() / float(testData.count())) + print("The test accuracy of Logistic Regression model with" + " LBFGS algorithm is: %.4f\n\n" % accuracy) + + print("===== Choose Multinomial Naive Bayes model =====") + # Split data aproximately into training (60%) and test (40%) + trainingData, testData = data.randomSplit([0.6, 0.4], seed=0) + # Train a multinomial naive Bayes model given an RDD of LabeledPoint. -# model = NaiveBayes.train(trainingData, 0.8) + model = NaiveBayes.train(trainingData, 0.8) + + # Make prediction and test accuracy. + predictions = model.predict(testData.map(lambda x: x.features)) + labelsAndPredictions = testData.map(lambda p: p.label).zip(predictions) + accuracy = (labelsAndPredictions.filter(lambda (v, p): v == p). + count() / float(testData.count())) + print("The test accuracy of Multinomial Naive Bayes " + "is: %.4f\n\n" % accuracy) + + print("===== Choose Decision Tree model =====") + # Split data aproximately into training (60%) and test (40%) + trainingData, testData = data.randomSplit([0.6, 0.4], seed=0) + # Train a decision tree model. # Empty categoricalFeaturesInfo indicates all features are continuous. -# model = DecisionTree.trainClassifier(trainingData, numClasses=2, -# categoricalFeaturesInfo={}, -# impurity='entropy', maxDepth=5, -# maxBins=32) - # Train a RandomForest model. + model = DecisionTree.trainClassifier(trainingData, numClasses=2, + categoricalFeaturesInfo={}, + impurity='entropy', maxDepth=5, + maxBins=32) + print('Learned classification tree model:') + print(model.toDebugString()) + + # Make prediction and test accuracy. + predictions = model.predict(testData.map(lambda x: x.features)) + labelsAndPredictions = testData.map(lambda p: p.label).zip(predictions) + accuracy = (labelsAndPredictions.filter(lambda (v, p): v == p). + count() / float(testData.count())) + print("The test accuracy of decision tree model is: %.4f\n\n" % accuracy) + + print("===== Choose Random Forest model =====") + # Split data aproximately into training (60%) and test (40%) + trainingData, testData = data.randomSplit([0.6, 0.4], seed=0) + + # Train a Random Forest model. # Empty categoricalFeaturesInfo indicates all features are continuous. # Note: Use larger numTrees in practice. # Setting featureSubsetStrategy="auto" lets the algorithm choose. -# model = RandomForest.trainClassifier(trainingData, numClasses=2, -# categoricalFeaturesInfo={}, -# numTrees=3, -# featureSubsetStrategy="auto", -# impurity='gini', maxDepth=7, -# maxBins=32) + model = RandomForest.trainClassifier(trainingData, numClasses=2, + categoricalFeaturesInfo={}, + numTrees=3, + featureSubsetStrategy="auto", + impurity='gini', maxDepth=7, + maxBins=32) + print('Learned classification tree model:') + print(model.toDebugString()) + + # Make prediction and test accuracy. + predictions = model.predict(testData.map(lambda x: x.features)) + labelsAndPredictions = testData.map(lambda p: p.label).zip(predictions) + accuracy = (labelsAndPredictions.filter(lambda (v, p): v == p). + count() / float(testData.count())) + print("The test accuracy of random forest model is: %.4f\n\n" % accuracy) + + print("===== Choose Gradient Boosted Trees model =====") + # Split data aproximately into training (60%) and test (40%) + trainingData, testData = data.randomSplit([0.6, 0.4], seed=0) # Train a GradientBoostedTrees model. # Empty categoricalFeaturesInfo indicates all features are continuous. -# model = GradientBoostedTrees.trainClassifier(trainingData, -# categoricalFeaturesInfo={}, -# numIterations=3, maxDepth=3, -# maxBins=32) + model = GradientBoostedTrees.trainClassifier(trainingData, + categoricalFeaturesInfo={}, + numIterations=3, maxDepth=3, + maxBins=32) + print('Learned classification tree model:') + print(model.toDebugString()) + # Make prediction and test accuracy. -# labelsAndPredictions = testData\ -# .map(lambda p: (p.label, model.predict(p.features))) -# accuracy = labelsAndPredictions\ -# .filter(lambda (x, v): x == v).count() / float(testData.count()) predictions = model.predict(testData.map(lambda x: x.features)) labelsAndPredictions = testData.map(lambda p: p.label).zip(predictions) - accuracy = labelsAndPredictions.filter(lambda (v, p): v == p).\ - count() / float(testData.count()) - print('Test Accuracy = ' + str(accuracy)) -# print('Learned classification tree model:') -# print(model.toDebugString()) + accuracy = (labelsAndPredictions.filter(lambda (v, p): v == p). + count() / float(testData.count())) + print("The test accuracy of Gradient Boosted Trees " + "model is: %.4f" % accuracy) # Save and load model - path = tempfile.mkdtemp(dir='.') - model.save(sc, path) +# path = tempfile.mkdtemp(dir='.') +# model.save(sc, path) # sameModel = SVMModel.load(sc, path) - sameModel = LogisticRegressionModel.load(sc, path) +# sameModel = LogisticRegressionModel.load(sc, path) # sameModel = NaiveBayesModel.load(sc, path) # sameModel = DecisionTreeModel.load(sc, path) # sameModel = RandomForestModel.load(sc, path) # sameModel = GradientBoostedTreesModel.load(sc, path) - try: - rmtree(path) - except OSError: - pass +# try: +# rmtree(path) +# except OSError: +# pass -- cgit 1.2.3-korg