diff options
author | Alassane Samba <alassane.samba@orange.com> | 2017-03-29 16:19:15 +0200 |
---|---|---|
committer | Alassane Samba <alassane.samba@orange.com> | 2017-03-29 14:46:47 +0000 |
commit | 80b0ba4a8d41adee65adf73bebed4bbef3c065c3 (patch) | |
tree | 1359a36885c5dd62e0f4f63a90cbdf406cde4f28 /contrib/TOM/R | |
parent | 1bcbfb8a99e0a42bb6d5b87ade3ec15fcddc4bf7 (diff) |
init TOM: machine learning on test results
Change-Id: Ic66214e53d694bef9d4fac08a2ededa1f3ae4f82
Signed-off-by: Alassane Samba <alassane.samba@orange.com>
Diffstat (limited to 'contrib/TOM/R')
-rw-r--r-- | contrib/TOM/R/boostedBoxPlot.R | 64 | ||||
-rw-r--r-- | contrib/TOM/R/explainer.R | 112 | ||||
-rw-r--r-- | contrib/TOM/R/tom.R | 238 |
3 files changed, 414 insertions, 0 deletions
diff --git a/contrib/TOM/R/boostedBoxPlot.R b/contrib/TOM/R/boostedBoxPlot.R new file mode 100644 index 00000000..c255864a --- /dev/null +++ b/contrib/TOM/R/boostedBoxPlot.R @@ -0,0 +1,64 @@ +# ----------------------------------------------------------------------------- +# title: Boosted Boxplots +# description: add several useful features to the classical R boxplot function +# author: Alassane Samba (alassane.samba@orange.com) +# Copyright (c) 2016 Orange +# All rights reserved. This program and the accompanying materials +# are made available under the terms of the Apache License, Version 2.0 +# which accompanies this distribution, and is available at +# http://www.apache.org/licenses/LICENSE-2.0 +# ----------------------------------------------------------------------------- +boostedBoxplot<-function(y,x, main="", labx=NULL,laby=NULL, plot.mean=T, text.freq=T, las=1, ylim=c(0,0), limitVisibleModalities=30, decreasing=NULL, dynamic=F){ + xlab="" + if(is.null(labx))labx=deparse(substitute(x)) + if(is.null(laby))laby=deparse(substitute(y)) + if(main==""){ + main=labx + }else{ + xlab=labx + } + x=droplevels(as.factor(x)) + p=length(levels(as.factor(x))) + if(!is.null(decreasing)){ + x=factor(x,levels = names(sort(tapply(y,x,median), decreasing = decreasing)), ordered = F) + }else{ + decreasing=T + } + #limitVisibleModalities + if(limitVisibleModalities<p-1){ + x=factor(x,levels = names(sort(tapply(y,x,median), decreasing = decreasing)), ordered = F) + lx=levels(as.factor(x)) + leftl=lx[1:floor(limitVisibleModalities/2)] + rightl=lx[(p-floor(limitVisibleModalities/2)+1):p] + n_other=length(lx[!lx%in%c(leftl,rightl)]) + x=as.character(x) + x[!x%in%c(leftl,rightl)]<-paste(c("other(",n_other,")"),collapse="") + x=as.factor(x) + x=factor(x,levels = names(sort(tapply(y,x,median), decreasing = decreasing)), ordered = F) + } + #dynamicity + if(dynamic){ + dataf=data.frame(Y=y,X=x) + require(rAmCharts) + amBoxplot(Y~X,data=dataf,labelRotation = (las==2)*90, ylab = laby, main = main) + }else{ + if(sum(ylim)==0){ + rb<-boxplot(y~x, main=main, xlab=xlab, ylab=laby, las=las) + grid() + #rb<-boxplot(y~x, main=main, xlab=xlab, ylab=laby, las=las, add=T) + }else{ + rb<-boxplot(y~x, main=main, xlab=xlab, ylab=laby, las=las, ylim=ylim) + grid() + #rb<-boxplot(y~x, main=main, xlab=xlab, ylab=laby, las=las, add=T) + } + if(plot.mean){ + mn.t <- tapply(y, x, mean, na.rm=T) + sd.t <- tapply(y, x, sd, na.rm=T) + xi <- 0.3 + seq(rb$n) + points(xi, mn.t, col = "red", pch = 18, cex=1) + arrows(xi, mn.t - sd.t, xi, mn.t + sd.t,code = 3, col = "red", angle = 75, length = .1, lwd = 1) + } + if(text.freq)text(x=1:length(rb$names), y=(rb$stats[3,]+rb$stats[4,])/2,label=rb$n) + } +} +############
\ No newline at end of file diff --git a/contrib/TOM/R/explainer.R b/contrib/TOM/R/explainer.R new file mode 100644 index 00000000..fedda750 --- /dev/null +++ b/contrib/TOM/R/explainer.R @@ -0,0 +1,112 @@ +# ---------------------------------------------------------------------- +# title: TOM useful functions +# author: Alassane Samba (alassane.samba@orange.com) +# Copyright (c) 2017 Orange +# All rights reserved. This program and the accompanying materials +# are made available under the terms of the Apache License, Version 2.0 +# which accompanies this distribution, and is available at +# http://www.apache.org/licenses/LICENSE-2.0 +# ---------------------------------------------------------------------- +############ +### Evaluate a prediction +############ +prediction_evaluator_with_p<-function(pred, actual, p){ + ##error + error=actual-pred + ## null ndeviance + null.deviance=sum((mean(actual)-actual)^2) + ## deviance : sum of squared error + se=sum(error^2) + ## n + n=length(actual) + ## mean squared error + mse=se/n + ## root mean squared error + rmse=sqrt(mse) + ## normalized root mean squared error + nrmse=sqrt(mse/(null.deviance/n)) + ## r2 : coef of determination + r2=1-(se/null.deviance) + #adjusted r2 + adfR2=1-(1-r2)*((n-1)/(n-p)) + ## absolute error ratio + abs.error.ratio=abs(error/actual) + ## mean absolute error ratio + mean.abs.error.ratio=mean(abs.error.ratio) + ## median absolute error ratio + med.abs.error.ratio=median(abs.error.ratio) + ## 80th-percentile absolute error ratio + perc80.abs.error.ratio=quantile(abs.error.ratio,0.8) + ## return : + return(list(p=p,n=n,sd=sqrt(null.deviance/n),NRMSE=nrmse,RMSE=rmse,R2=r2,adj.R2=adfR2,mean.abs.error.ratio=mean.abs.error.ratio,med.abs.error.ratio=med.abs.error.ratio,perc80.abs.error.ratio=perc80.abs.error.ratio)) + ### ajouter error et error ratio au return pour pouvoir faire les graphes, etc. +} +######### +## Calculate R2 (coef of determination / part of explained variance) from the GLM regression +######### +betterGenericR2calculator<-function(dataset,targetName,independantVariableNames){ + dataset2=na.omit(dataset[,c(targetName,independantVariableNames)]) + numVars=colnames(dataset2)[unlist(lapply(dataset2,is.numeric))&colnames(dataset2)%in%independantVariableNames] + factorVars=colnames(dataset2)[(!unlist(lapply(dataset2,is.numeric)))&colnames(dataset2)%in%independantVariableNames] + if(length(factorVars)>0&length(numVars)>0){ + factorVarPasted=droplevels(as.factor(apply(cbind(rep("",nrow(dataset2)),as.data.frame(dataset2[,factorVars])),1,paste,collapse=":"))) + theformula=paste(targetName,paste(numVars,collapse = "*"), sep='~') + nbNumVars=length(numVars) + resList=by(dataset2, factorVarPasted, FUN=function(x){m=lm(theformula,data=x,y=T); return(list(pred=m$fitted.values,actual=m$y))}) + pred=unlist(lapply(resList,function(x){x$pred})) + actual=unlist(lapply(resList,function(x){x$actual})) + p=((length(levels(droplevels(as.factor(factorVarPasted))))-1)*nbNumVars)+(length(levels(droplevels(as.factor(factorVarPasted))))-1)+nbNumVars+1 + res=prediction_evaluator_with_p(pred,actual,p) + }else if(length(factorVars)==0&length(numVars)>0){ + theformula=paste(targetName,paste(numVars,collapse = "*"), sep='~') + nbNumVars=length(numVars) + m=lm(theformula,data=dataset2,y=T) + pred=m$fitted.values + actual=m$y + p=nbNumVars+1 + res=prediction_evaluator_with_p(pred,actual,p) + }else if(length(factorVars)>0&length(numVars)==0){ + factorVarPasted=droplevels(as.factor(apply(cbind(rep("",nrow(dataset2)),as.data.frame(dataset2[,factorVars])),1,paste,collapse=":"))) + m=lm(dataset2[,targetName]~factorVarPasted,y=T) + pred=m$fitted.values + actual=m$y + p=length(levels(factorVarPasted)) + res=prediction_evaluator_with_p(pred,actual,p) + }else{ + res=NULL + } + return(res) +} +################### +###### Determine the best predictor set (continuous and factor independant variables) to consider for a continuous dependant variable +################### +genericBestPredictor<-function(dataset,targetName,independantVariableNames, plot=T, text=T, las=1){ + ordered_best_additional_predictors=list() + ordered_best_predictors_per_level=list(NULL) + ordered_best_r2=list() + for (i in 1:length(independantVariableNames)){ + level_i_predictors=as.list(independantVariableNames)[!as.list(independantVariableNames)%in%ordered_best_additional_predictors] + varExp_i=lapply(level_i_predictors,function(x){betterGenericR2calculator(dataset,targetName,c(unlist(ordered_best_additional_predictors),x))$R2}) + if(i==1){ + bivariateR2<-varExp_i + names(bivariateR2)<-unlist(level_i_predictors) + } + ordered_best_additional_predictors=c(ordered_best_additional_predictors,level_i_predictors[varExp_i%in%max(unlist(varExp_i))]) + ordered_best_predictors_per_level=c(ordered_best_predictors_per_level,paste(unlist(ordered_best_predictors_per_level[i]),unlist(ordered_best_additional_predictors[i]), sep=":")) + ordered_best_r2=c(ordered_best_r2,max(unlist(varExp_i))) + ordered_r2_progress=c(round(unlist(ordered_best_r2)[1],2),paste(rep("+",length(ordered_best_r2)-1),round(unlist(ordered_best_r2)[-1]-unlist(ordered_best_r2)[-length(ordered_best_r2)],2))) + } + ordered_best_predictors_per_level=ordered_best_predictors_per_level[-1] + + mynames=unlist(ordered_best_additional_predictors) + mynames=c(mynames[1],paste("+",mynames[2:length(mynames)])) + names(ordered_best_r2)<-mynames + + if(plot){ + #barplot(unlist(ordered_best_r2), names.arg = unlist(ordered_best_predictors_per_level),las=las,ylab="R2",main=paste("Correlation with",targetName)) + barplot(unlist(ordered_best_r2), names.arg = mynames, las=las, ylab="R2",main=paste("Correlation with",targetName)) + text(y=0.1, x=((1:length(mynames))-0.4)*1.2,labels = ordered_r2_progress) + } + if(text) return(list(bivariateR2=bivariateR2,orderedBestR2=ordered_best_r2,orderedBestPredictorsPerLevel=ordered_best_predictors_per_level,orderedBestAdditionalPredictors=ordered_best_additional_predictors, targetName=targetName)) +} +####################
\ No newline at end of file diff --git a/contrib/TOM/R/tom.R b/contrib/TOM/R/tom.R new file mode 100644 index 00000000..a9e00ff2 --- /dev/null +++ b/contrib/TOM/R/tom.R @@ -0,0 +1,238 @@ +# ---------------------------------------------------------------------- +# title: TOM R API +# description: Machine Learning-Based Test Results Analysis +# author: Alassane Samba (alassane.samba@orange.com) +# Copyright (c) 2017 Orange +# All rights reserved. This program and the accompanying materials +# are made available under the terms of the Apache License, Version 2.0 +# which accompanies this distribution, and is available at +# http://www.apache.org/licenses/LICENSE-2.0 +# ---------------------------------------------------------------------- +# Load useful fonctions +source("R/explainer.R") +source("R/boostedBoxPlot.R") +# init +tomData<-NULL +tomAnalysis<-NULL +#* Read new file (to change to post) +#* @get /read +tomRead<-function(file,res){ + file_ok <- as.character(file) + if (is.na(file_ok)){ + res$status <- 400 + res$body <- "val parameter must be a number" + return(res) + } + st=Sys.time() + tomData<<-read.table(file = file_ok, sep=',', header=TRUE, quote = "\"", comment.char = "") + et=Sys.time() + #save(tomData,file = "tom.rdata") + list(result="success", durationInSeconds=as.numeric((et-st), units = "secs")) +} +#* Analyze correlations +#* @get /analyze +tomAnalyze<-function(input="pod_name:deploy_scenario:version:runner_id",output="bandwidth.MBps.",res){ + if(is.null(tomData)){ + res$status <- 400 + res$body <- "please read data first : url/read" + return(res) + } + input_ok<-as.character(input) + input_ok<-unlist(strsplit(input_ok,split = ":")) + if(sum(is.na(input_ok))>0){ + res$status <- 400 + res$body <- "input parameter must be a string including data context parameters separatd by a double dot (:)" + return(res) + } + if(sum(!input_ok%in%colnames(tomData))==length(input_ok)){ + res$status <- 400 + res$body <- "input parameter items must all be inluded in data header" + return(res) + } + output_ok <- as.character(output) + if(is.na(output_ok)){ + res$status <- 400 + res$body <- "output parameter must be a string inluded in data header" + return(res) + } + if(!output_ok%in%colnames(tomData)){ + res$status <- 400 + res$body <- "output parameter must be inluded in data header" + return(res) + } + st=Sys.time() + tomAnalysis<<-genericBestPredictor(dataset = tomData, targetName = output_ok, independantVariableNames = input_ok, plot = F) + et=Sys.time() + list(result="success", durationInSeconds=as.numeric((et-st), units = "secs")) +} +#* Get bivariate R2 values +#* @get /correlation +tomBivariateR2<-function(res){ + if(is.null(tomAnalysis)){ + res$status <- 400 + res$body <- "please analyze data first : url/analyze" + return(res) + } + return(as.list(tomAnalysis$bivariateR2)) +} +#* Get bivariate R2 values +#* @get /explain +tomExplain<-function(res){ + if(is.null(tomAnalysis)){ + res$status <- 400 + res$body <- "please analyze data first : url/analyze" + return(res) + } + return(as.list(tomAnalysis$orderedBestR2)) +} +#* Get bivariate R2 values +#* @png +#* @get /explainGraph +tomExplainGraph<-function(res){ + if(is.null(tomAnalysis)){ + res$status <- 400 + res$body <- "please analyze data first : url/analyze" + return(res) + } + maxmargin=20 + bottomheigth=0.5*max(nchar(names(tomAnalysis$orderedBestR2)),na.rm=T) + if(bottomheigth>maxmargin) bottomheigth<-maxmargin + op=par(mar=c(bottomheigth, 4.1, 4.1, 2.1)) + barplot(unlist(tomAnalysis$orderedBestR2), names.arg = names(tomAnalysis$orderedBestR2), las=2, ylab="R squared correlation coef.",main=paste("Correlation with",tomAnalysis$targetName)) + par(op) +} +#* Get the head contexts cases having highest KPI values +#* @get /head +tomHead<-function(input="pod_name",output="bandwidth.MBps.",limit=5,res){ + if(is.null(tomData)){ + res$status <- 400 + res$body <- "please read data first : url/read" + return(res) + } + input_ok<-as.character(input) + input_ok<-unlist(strsplit(input_ok,split = ":")) + if(sum(is.na(input_ok))>0){ + res$status <- 400 + res$body <- "input parameter must be a string including data context parameters separatd by a double dot (:)" + return(res) + } + if(sum(!input_ok%in%colnames(tomData))==length(input_ok)){ + res$status <- 400 + res$body <- "input parameter items must all be inluded in data header" + return(res) + } + output_ok <- as.character(output) + if(is.na(output_ok)){ + res$status <- 400 + res$body <- "output parameter must be a string inluded in data header" + return(res) + } + if(!output_ok%in%colnames(tomData)){ + res$status <- 400 + res$body <- "output parameter must be inluded in data header" + return(res) + } + limit=as.integer(limit) + if(is.na(limit)){ + res$status <- 400 + res$body <- "limit parameter must be an integer" + return(res) + } + varFactor=apply(as.data.frame(tomData[,input_ok]),1,paste,collapse=":") + sorted=sort(tapply(tomData[,output_ok],varFactor,median, na.rm=T),decreasing = TRUE)[1:limit] + return(as.list(sorted)) +} +#* Get the tail contexts cases having lowest KPI values +#* @get /tail +tomTail<-function(input="pod_name",output="bandwidth.MBps.",limit=5,res){ + if(is.null(tomData)){ + res$status <- 400 + res$body <- "please read data first : url/read" + return(res) + } + input_ok<-as.character(input) + input_ok<-unlist(strsplit(input_ok,split = ":")) + if(sum(is.na(input_ok))>0){ + res$status <- 400 + res$body <- "input parameter must be a string including data context parameters separatd by a double dot (:)" + return(res) + } + if(sum(!input_ok%in%colnames(tomData))==length(input_ok)){ + res$status <- 400 + res$body <- "input parameter items must all be inluded in data header" + return(res) + } + output_ok <- as.character(output) + if(is.na(output_ok)){ + res$status <- 400 + res$body <- "output parameter must be a string inluded in data header" + return(res) + } + if(!output_ok%in%colnames(tomData)){ + res$status <- 400 + res$body <- "output parameter must be inluded in data header" + return(res) + } + limit=as.integer(limit) + if(is.na(limit)){ + res$status <- 400 + res$body <- "limit parameter must be an integer" + return(res) + } + varFactor=apply(as.data.frame(tomData[,input_ok]),1,paste,collapse=":") + sorted=sort(tapply(tomData[,output_ok],varFactor,median,na.rm=T),decreasing = FALSE)[1:limit] + return(as.list(sorted)) +} +#* Get the comparison graph +#* @png +#* @get /comparGraph +tomComparGraph<-function(input="pod_name",output="bandwidth.MBps.",limit=10,plotmean=TRUE,textfreq=TRUE,res){ + if(is.null(tomData)){ + res$status <- 400 + res$body <- "please read data first : url/read" + return(res) + } + input_ok<-as.character(input) + input_ok<-unlist(strsplit(input_ok,split = ":")) + if(sum(is.na(input_ok))>0){ + res$status <- 400 + res$body <- "input parameter must be a string including data context parameters separatd by a double dot (:)" + return(res) + } + if(sum(!input_ok%in%colnames(tomData))==length(input_ok)){ + res$status <- 400 + res$body <- "input parameter items must all be inluded in data header" + return(res) + } + output_ok <- as.character(output) + if(is.na(output_ok)){ + res$status <- 400 + res$body <- "output parameter must be a string inluded in data header" + return(res) + } + if(!output_ok%in%colnames(tomData)){ + res$status <- 400 + res$body <- "output parameter must be inluded in data header" + return(res) + } + limit=as.integer(limit) + if(is.na(limit)){ + res$status <- 400 + res$body <- "n parameter must be an integer" + return(res) + } + varFactor=as.factor(apply(as.data.frame(tomData[,input_ok]),1,paste,collapse=":")) + maxmargin=20 + bottomheigth=0.5*max(nchar(levels(varFactor)),na.rm=T) + if(bottomheigth>maxmargin) bottomheigth<-maxmargin + leftwidth=0.5*nchar(as.character(max(tomData[,output_ok],na.rm=T))) + if(leftwidth>maxmargin) leftwidth<-maxmargin + op=par(mar=c(bottomheigth, leftwidth+1, 4.1, 2.1)) + boostedBoxplot(tomData[,output_ok],varFactor,decreasing=T,las=2,main=input,laby="",labx="",limitVisibleModalities=limit,dynamic=F,plot.mean=plotmean, text.freq=textfreq) + title(ylab = output_ok, line = leftwidth) + par(op) +} +# # to run it, do: +# library(plumber) +# r<-plumb("tom.r") +# r$run(port=8000) |