From 80b0ba4a8d41adee65adf73bebed4bbef3c065c3 Mon Sep 17 00:00:00 2001
From: Alassane Samba <alassane.samba@orange.com>
Date: Wed, 29 Mar 2017 16:19:15 +0200
Subject: init TOM: machine learning on test results

Change-Id: Ic66214e53d694bef9d4fac08a2ededa1f3ae4f82
Signed-off-by: Alassane Samba <alassane.samba@orange.com>
---
 contrib/TOM/R/boostedBoxPlot.R |  64 +++++++++++
 contrib/TOM/R/explainer.R      | 112 +++++++++++++++++++
 contrib/TOM/R/tom.R            | 238 +++++++++++++++++++++++++++++++++++++++++
 3 files changed, 414 insertions(+)
 create mode 100644 contrib/TOM/R/boostedBoxPlot.R
 create mode 100644 contrib/TOM/R/explainer.R
 create mode 100644 contrib/TOM/R/tom.R

(limited to 'contrib/TOM/R')

diff --git a/contrib/TOM/R/boostedBoxPlot.R b/contrib/TOM/R/boostedBoxPlot.R
new file mode 100644
index 00000000..c255864a
--- /dev/null
+++ b/contrib/TOM/R/boostedBoxPlot.R
@@ -0,0 +1,64 @@
+# -----------------------------------------------------------------------------
+# title: Boosted Boxplots
+# description: add several useful features to the classical R boxplot function
+# author: Alassane Samba (alassane.samba@orange.com)
+# Copyright (c) 2016 Orange
+# All rights reserved. This program and the accompanying materials
+# are made available under the terms of the Apache License, Version 2.0
+# which accompanies this distribution, and is available at
+# http://www.apache.org/licenses/LICENSE-2.0
+# -----------------------------------------------------------------------------
+boostedBoxplot<-function(y,x, main="", labx=NULL,laby=NULL, plot.mean=T, text.freq=T, las=1, ylim=c(0,0), limitVisibleModalities=30, decreasing=NULL, dynamic=F){
+  xlab=""
+  if(is.null(labx))labx=deparse(substitute(x))
+  if(is.null(laby))laby=deparse(substitute(y))
+  if(main==""){
+  main=labx
+  }else{
+  xlab=labx
+  }
+  x=droplevels(as.factor(x))
+  p=length(levels(as.factor(x)))
+  if(!is.null(decreasing)){
+    x=factor(x,levels = names(sort(tapply(y,x,median), decreasing = decreasing)), ordered = F)
+  }else{
+    decreasing=T
+  }
+  #limitVisibleModalities
+  if(limitVisibleModalities<p-1){
+    x=factor(x,levels = names(sort(tapply(y,x,median), decreasing = decreasing)), ordered = F)
+    lx=levels(as.factor(x))
+    leftl=lx[1:floor(limitVisibleModalities/2)]
+    rightl=lx[(p-floor(limitVisibleModalities/2)+1):p]
+    n_other=length(lx[!lx%in%c(leftl,rightl)])
+    x=as.character(x)
+    x[!x%in%c(leftl,rightl)]<-paste(c("other(",n_other,")"),collapse="")
+    x=as.factor(x)
+    x=factor(x,levels = names(sort(tapply(y,x,median), decreasing = decreasing)), ordered = F)
+  }
+  #dynamicity
+  if(dynamic){
+    dataf=data.frame(Y=y,X=x)
+    require(rAmCharts)
+    amBoxplot(Y~X,data=dataf,labelRotation = (las==2)*90, ylab = laby, main = main)
+  }else{
+  if(sum(ylim)==0){
+    rb<-boxplot(y~x, main=main, xlab=xlab, ylab=laby, las=las)
+    grid()
+    #rb<-boxplot(y~x, main=main, xlab=xlab, ylab=laby, las=las, add=T)
+  }else{
+    rb<-boxplot(y~x, main=main, xlab=xlab, ylab=laby, las=las, ylim=ylim)
+    grid()
+    #rb<-boxplot(y~x, main=main, xlab=xlab, ylab=laby, las=las, add=T)
+  }
+  if(plot.mean){
+    mn.t <- tapply(y, x, mean, na.rm=T)
+    sd.t <- tapply(y, x, sd, na.rm=T)
+    xi <- 0.3 + seq(rb$n)
+    points(xi, mn.t, col = "red", pch = 18, cex=1)
+    arrows(xi, mn.t - sd.t, xi, mn.t + sd.t,code = 3, col = "red", angle = 75, length = .1, lwd = 1)
+  }
+  if(text.freq)text(x=1:length(rb$names), y=(rb$stats[3,]+rb$stats[4,])/2,label=rb$n)
+  }
+}
+############
\ No newline at end of file
diff --git a/contrib/TOM/R/explainer.R b/contrib/TOM/R/explainer.R
new file mode 100644
index 00000000..fedda750
--- /dev/null
+++ b/contrib/TOM/R/explainer.R
@@ -0,0 +1,112 @@
+# ----------------------------------------------------------------------
+# title: TOM useful functions
+# author: Alassane Samba (alassane.samba@orange.com)
+# Copyright (c) 2017 Orange
+# All rights reserved. This program and the accompanying materials
+# are made available under the terms of the Apache License, Version 2.0
+# which accompanies this distribution, and is available at
+# http://www.apache.org/licenses/LICENSE-2.0
+# ----------------------------------------------------------------------
+############
+### Evaluate a prediction
+############
+prediction_evaluator_with_p<-function(pred, actual, p){
+  ##error
+  error=actual-pred
+  ## null ndeviance
+  null.deviance=sum((mean(actual)-actual)^2)
+  ## deviance : sum of squared error
+  se=sum(error^2)
+  ## n
+  n=length(actual)
+  ## mean squared error
+  mse=se/n
+  ## root mean squared error
+  rmse=sqrt(mse)
+  ## normalized root mean squared error
+  nrmse=sqrt(mse/(null.deviance/n))
+  ## r2 : coef of determination
+  r2=1-(se/null.deviance)
+  #adjusted r2
+  adfR2=1-(1-r2)*((n-1)/(n-p))
+  ## absolute error ratio
+  abs.error.ratio=abs(error/actual)
+  ## mean absolute error ratio
+  mean.abs.error.ratio=mean(abs.error.ratio)
+  ## median absolute error ratio
+  med.abs.error.ratio=median(abs.error.ratio)
+  ## 80th-percentile absolute error ratio
+  perc80.abs.error.ratio=quantile(abs.error.ratio,0.8)
+  ## return :
+  return(list(p=p,n=n,sd=sqrt(null.deviance/n),NRMSE=nrmse,RMSE=rmse,R2=r2,adj.R2=adfR2,mean.abs.error.ratio=mean.abs.error.ratio,med.abs.error.ratio=med.abs.error.ratio,perc80.abs.error.ratio=perc80.abs.error.ratio))
+  ### ajouter error et error ratio au return pour pouvoir faire les graphes, etc. 
+}
+#########
+## Calculate R2 (coef of determination / part of explained variance) from the GLM regression
+#########
+betterGenericR2calculator<-function(dataset,targetName,independantVariableNames){
+  dataset2=na.omit(dataset[,c(targetName,independantVariableNames)])
+  numVars=colnames(dataset2)[unlist(lapply(dataset2,is.numeric))&colnames(dataset2)%in%independantVariableNames]
+  factorVars=colnames(dataset2)[(!unlist(lapply(dataset2,is.numeric)))&colnames(dataset2)%in%independantVariableNames]
+  if(length(factorVars)>0&length(numVars)>0){
+    factorVarPasted=droplevels(as.factor(apply(cbind(rep("",nrow(dataset2)),as.data.frame(dataset2[,factorVars])),1,paste,collapse=":")))
+    theformula=paste(targetName,paste(numVars,collapse = "*"), sep='~')
+    nbNumVars=length(numVars)
+    resList=by(dataset2, factorVarPasted, FUN=function(x){m=lm(theformula,data=x,y=T); return(list(pred=m$fitted.values,actual=m$y))})
+    pred=unlist(lapply(resList,function(x){x$pred}))
+    actual=unlist(lapply(resList,function(x){x$actual}))
+    p=((length(levels(droplevels(as.factor(factorVarPasted))))-1)*nbNumVars)+(length(levels(droplevels(as.factor(factorVarPasted))))-1)+nbNumVars+1
+    res=prediction_evaluator_with_p(pred,actual,p)
+  }else if(length(factorVars)==0&length(numVars)>0){
+    theformula=paste(targetName,paste(numVars,collapse = "*"), sep='~')
+    nbNumVars=length(numVars)
+    m=lm(theformula,data=dataset2,y=T)
+    pred=m$fitted.values
+    actual=m$y
+    p=nbNumVars+1
+    res=prediction_evaluator_with_p(pred,actual,p)
+  }else if(length(factorVars)>0&length(numVars)==0){
+    factorVarPasted=droplevels(as.factor(apply(cbind(rep("",nrow(dataset2)),as.data.frame(dataset2[,factorVars])),1,paste,collapse=":")))
+    m=lm(dataset2[,targetName]~factorVarPasted,y=T)
+    pred=m$fitted.values
+    actual=m$y
+    p=length(levels(factorVarPasted))
+    res=prediction_evaluator_with_p(pred,actual,p)
+  }else{
+    res=NULL
+  }
+  return(res)
+}
+###################
+###### Determine the best predictor set (continuous and factor independant variables) to consider for a continuous dependant variable
+###################
+genericBestPredictor<-function(dataset,targetName,independantVariableNames, plot=T, text=T, las=1){
+  ordered_best_additional_predictors=list()
+  ordered_best_predictors_per_level=list(NULL)
+  ordered_best_r2=list()
+  for (i in 1:length(independantVariableNames)){
+    level_i_predictors=as.list(independantVariableNames)[!as.list(independantVariableNames)%in%ordered_best_additional_predictors]
+    varExp_i=lapply(level_i_predictors,function(x){betterGenericR2calculator(dataset,targetName,c(unlist(ordered_best_additional_predictors),x))$R2})
+    if(i==1){
+      bivariateR2<-varExp_i
+      names(bivariateR2)<-unlist(level_i_predictors)
+    }
+    ordered_best_additional_predictors=c(ordered_best_additional_predictors,level_i_predictors[varExp_i%in%max(unlist(varExp_i))])
+    ordered_best_predictors_per_level=c(ordered_best_predictors_per_level,paste(unlist(ordered_best_predictors_per_level[i]),unlist(ordered_best_additional_predictors[i]), sep=":"))
+    ordered_best_r2=c(ordered_best_r2,max(unlist(varExp_i)))
+    ordered_r2_progress=c(round(unlist(ordered_best_r2)[1],2),paste(rep("+",length(ordered_best_r2)-1),round(unlist(ordered_best_r2)[-1]-unlist(ordered_best_r2)[-length(ordered_best_r2)],2)))
+  }
+  ordered_best_predictors_per_level=ordered_best_predictors_per_level[-1]
+  
+  mynames=unlist(ordered_best_additional_predictors)
+  mynames=c(mynames[1],paste("+",mynames[2:length(mynames)]))
+  names(ordered_best_r2)<-mynames
+  
+  if(plot){
+    #barplot(unlist(ordered_best_r2), names.arg = unlist(ordered_best_predictors_per_level),las=las,ylab="R2",main=paste("Correlation with",targetName))
+    barplot(unlist(ordered_best_r2), names.arg = mynames, las=las, ylab="R2",main=paste("Correlation with",targetName))
+    text(y=0.1, x=((1:length(mynames))-0.4)*1.2,labels = ordered_r2_progress)
+  }
+  if(text) return(list(bivariateR2=bivariateR2,orderedBestR2=ordered_best_r2,orderedBestPredictorsPerLevel=ordered_best_predictors_per_level,orderedBestAdditionalPredictors=ordered_best_additional_predictors, targetName=targetName))
+}
+####################
\ No newline at end of file
diff --git a/contrib/TOM/R/tom.R b/contrib/TOM/R/tom.R
new file mode 100644
index 00000000..a9e00ff2
--- /dev/null
+++ b/contrib/TOM/R/tom.R
@@ -0,0 +1,238 @@
+# ----------------------------------------------------------------------
+# title: TOM R API
+# description: Machine Learning-Based Test Results Analysis
+# author: Alassane Samba (alassane.samba@orange.com)
+# Copyright (c) 2017 Orange
+# All rights reserved. This program and the accompanying materials
+# are made available under the terms of the Apache License, Version 2.0
+# which accompanies this distribution, and is available at
+# http://www.apache.org/licenses/LICENSE-2.0
+# ----------------------------------------------------------------------
+# Load useful fonctions
+source("R/explainer.R")
+source("R/boostedBoxPlot.R")
+# init
+tomData<-NULL
+tomAnalysis<-NULL
+#* Read new file (to change to post)
+#* @get /read
+tomRead<-function(file,res){
+  file_ok <- as.character(file)
+  if (is.na(file_ok)){
+    res$status <- 400
+    res$body <- "val parameter must be a number"
+    return(res)
+  }
+  st=Sys.time()
+  tomData<<-read.table(file = file_ok, sep=',', header=TRUE, quote = "\"", comment.char = "")
+  et=Sys.time()
+  #save(tomData,file = "tom.rdata")
+  list(result="success", durationInSeconds=as.numeric((et-st), units = "secs"))
+}
+#* Analyze correlations
+#* @get /analyze
+tomAnalyze<-function(input="pod_name:deploy_scenario:version:runner_id",output="bandwidth.MBps.",res){
+  if(is.null(tomData)){
+    res$status <- 400
+    res$body <- "please read data first : url/read"
+    return(res)
+  }
+  input_ok<-as.character(input)
+  input_ok<-unlist(strsplit(input_ok,split = ":"))
+  if(sum(is.na(input_ok))>0){
+    res$status <- 400
+    res$body <- "input parameter must be a string including data context parameters separatd by a double dot (:)"
+    return(res)
+  }
+  if(sum(!input_ok%in%colnames(tomData))==length(input_ok)){
+    res$status <- 400
+    res$body <- "input parameter items must all be inluded in data header"
+    return(res)
+  }
+  output_ok <- as.character(output)
+  if(is.na(output_ok)){
+    res$status <- 400
+    res$body <- "output parameter must be a string inluded in data header"
+    return(res)
+  }
+  if(!output_ok%in%colnames(tomData)){
+    res$status <- 400
+    res$body <- "output parameter must be inluded in data header"
+    return(res)
+  }
+  st=Sys.time()
+  tomAnalysis<<-genericBestPredictor(dataset = tomData, targetName = output_ok, independantVariableNames = input_ok, plot = F)
+  et=Sys.time()
+  list(result="success", durationInSeconds=as.numeric((et-st), units = "secs"))
+}
+#* Get bivariate R2 values
+#* @get /correlation
+tomBivariateR2<-function(res){
+  if(is.null(tomAnalysis)){
+    res$status <- 400
+    res$body <- "please analyze data first : url/analyze"
+    return(res)
+  }
+  return(as.list(tomAnalysis$bivariateR2))
+}
+#* Get bivariate R2 values
+#* @get /explain
+tomExplain<-function(res){
+  if(is.null(tomAnalysis)){
+    res$status <- 400
+    res$body <- "please analyze data first : url/analyze"
+    return(res)
+  }
+  return(as.list(tomAnalysis$orderedBestR2))
+}
+#* Get bivariate R2 values
+#* @png
+#* @get /explainGraph
+tomExplainGraph<-function(res){
+  if(is.null(tomAnalysis)){
+    res$status <- 400
+    res$body <- "please analyze data first : url/analyze"
+    return(res)
+  }
+  maxmargin=20
+  bottomheigth=0.5*max(nchar(names(tomAnalysis$orderedBestR2)),na.rm=T)
+  if(bottomheigth>maxmargin) bottomheigth<-maxmargin
+  op=par(mar=c(bottomheigth, 4.1, 4.1, 2.1))
+  barplot(unlist(tomAnalysis$orderedBestR2), names.arg = names(tomAnalysis$orderedBestR2), las=2, ylab="R squared correlation coef.",main=paste("Correlation with",tomAnalysis$targetName))
+  par(op)
+}
+#* Get the head contexts cases having highest KPI values
+#* @get /head
+tomHead<-function(input="pod_name",output="bandwidth.MBps.",limit=5,res){
+  if(is.null(tomData)){
+    res$status <- 400
+    res$body <- "please read data first : url/read"
+    return(res)
+  }
+  input_ok<-as.character(input)
+  input_ok<-unlist(strsplit(input_ok,split = ":"))
+  if(sum(is.na(input_ok))>0){
+    res$status <- 400
+    res$body <- "input parameter must be a string including data context parameters separatd by a double dot (:)"
+    return(res)
+  }
+  if(sum(!input_ok%in%colnames(tomData))==length(input_ok)){
+    res$status <- 400
+    res$body <- "input parameter items must all be inluded in data header"
+    return(res)
+  }
+  output_ok <- as.character(output)
+  if(is.na(output_ok)){
+    res$status <- 400
+    res$body <- "output parameter must be a string inluded in data header"
+    return(res)
+  }
+  if(!output_ok%in%colnames(tomData)){
+    res$status <- 400
+    res$body <- "output parameter must be inluded in data header"
+    return(res)
+  }
+  limit=as.integer(limit)
+  if(is.na(limit)){
+    res$status <- 400
+    res$body <- "limit parameter must be an integer"
+    return(res)
+  }
+  varFactor=apply(as.data.frame(tomData[,input_ok]),1,paste,collapse=":")
+  sorted=sort(tapply(tomData[,output_ok],varFactor,median, na.rm=T),decreasing = TRUE)[1:limit]
+  return(as.list(sorted))
+}
+#* Get the tail contexts cases having lowest KPI values
+#* @get /tail
+tomTail<-function(input="pod_name",output="bandwidth.MBps.",limit=5,res){
+  if(is.null(tomData)){
+    res$status <- 400
+    res$body <- "please read data first : url/read"
+    return(res)
+  }
+  input_ok<-as.character(input)
+  input_ok<-unlist(strsplit(input_ok,split = ":"))
+  if(sum(is.na(input_ok))>0){
+    res$status <- 400
+    res$body <- "input parameter must be a string including data context parameters separatd by a double dot (:)"
+    return(res)
+  }
+  if(sum(!input_ok%in%colnames(tomData))==length(input_ok)){
+    res$status <- 400
+    res$body <- "input parameter items must all be inluded in data header"
+    return(res)
+  }
+  output_ok <- as.character(output)
+  if(is.na(output_ok)){
+    res$status <- 400
+    res$body <- "output parameter must be a string inluded in data header"
+    return(res)
+  }
+  if(!output_ok%in%colnames(tomData)){
+    res$status <- 400
+    res$body <- "output parameter must be inluded in data header"
+    return(res)
+  }
+  limit=as.integer(limit)
+  if(is.na(limit)){
+    res$status <- 400
+    res$body <- "limit parameter must be an integer"
+    return(res)
+  }
+  varFactor=apply(as.data.frame(tomData[,input_ok]),1,paste,collapse=":")
+  sorted=sort(tapply(tomData[,output_ok],varFactor,median,na.rm=T),decreasing = FALSE)[1:limit]
+  return(as.list(sorted))
+}
+#* Get the comparison graph
+#* @png
+#* @get /comparGraph
+tomComparGraph<-function(input="pod_name",output="bandwidth.MBps.",limit=10,plotmean=TRUE,textfreq=TRUE,res){
+  if(is.null(tomData)){
+    res$status <- 400
+    res$body <- "please read data first : url/read"
+    return(res)
+  }
+  input_ok<-as.character(input)
+  input_ok<-unlist(strsplit(input_ok,split = ":"))
+  if(sum(is.na(input_ok))>0){
+    res$status <- 400
+    res$body <- "input parameter must be a string including data context parameters separatd by a double dot (:)"
+    return(res)
+  }
+  if(sum(!input_ok%in%colnames(tomData))==length(input_ok)){
+    res$status <- 400
+    res$body <- "input parameter items must all be inluded in data header"
+    return(res)
+  }
+  output_ok <- as.character(output)
+  if(is.na(output_ok)){
+    res$status <- 400
+    res$body <- "output parameter must be a string inluded in data header"
+    return(res)
+  }
+  if(!output_ok%in%colnames(tomData)){
+    res$status <- 400
+    res$body <- "output parameter must be inluded in data header"
+    return(res)
+  }
+  limit=as.integer(limit)
+  if(is.na(limit)){
+    res$status <- 400
+    res$body <- "n parameter must be an integer"
+    return(res)
+  }
+  varFactor=as.factor(apply(as.data.frame(tomData[,input_ok]),1,paste,collapse=":"))
+  maxmargin=20
+  bottomheigth=0.5*max(nchar(levels(varFactor)),na.rm=T)
+  if(bottomheigth>maxmargin) bottomheigth<-maxmargin
+  leftwidth=0.5*nchar(as.character(max(tomData[,output_ok],na.rm=T)))
+  if(leftwidth>maxmargin) leftwidth<-maxmargin
+  op=par(mar=c(bottomheigth, leftwidth+1, 4.1, 2.1))
+  boostedBoxplot(tomData[,output_ok],varFactor,decreasing=T,las=2,main=input,laby="",labx="",limitVisibleModalities=limit,dynamic=F,plot.mean=plotmean, text.freq=textfreq)
+  title(ylab = output_ok, line = leftwidth)
+  par(op)
+}
+# # to run it, do:
+# library(plumber)
+# r<-plumb("tom.r")
+# r$run(port=8000)
-- 
cgit 1.2.3-korg