suppressMessages(library(readr))
suppressMessages(library(ggplot2))
suppressMessages(library(gridExtra))
suppressMessages(library(tidyverse))
suppressMessages(library(lazyeval))
# Read the classification results; read_csv()'s messages are silenced.
dataset <- suppressMessages(
  read_csv("classification-results.csv")
)

# Round every column from the 8th through the last one to three decimals.
dataset[, 8:ncol(dataset)] <- round(
  dataset[, 8:ncol(dataset)],
  digits = 3
)

# Distinct values of the `var` and `alg` columns; they drive the loops below.
stations <- unique(dataset$var)
algoritmos <- unique(dataset$alg)

# Metric column names, in the order used for selection and reshaping.
metrics <- c(
  "FAR", "Sensitivity", "Specificity", "Accuracy",
  "Kappa", "F1", "Precision"
)
#' ## Carga dataset
#' Creación de dataset genérico para resultados clasificación
# Build the generic results table: add a `label` column by joining the six
# configuration columns with "-", keep the originals (remove = FALSE), and
# retain only the label, configuration and metric columns.
# The columns to unite are listed once, as strings; the former extra bare
# `dataset` selector was redundant ("dataset" is already in the vector) and
# easy to mistake for the `dataset` data frame being piped in.
df <- dataset %>%
  unite(
    col = label,
    c("dataset", "var", "config.train", "config.vars", "T", "alg"),
    sep = "-",
    remove = FALSE
  ) %>%
  select(
    label, dataset, var, config.train, config.vars, T, alg,
    FAR, Sensitivity, Specificity, Accuracy, Kappa, F1, Precision
  )
#' Estaciones vs algoritmos: ¿cuáles de los algoritmos son “mejores” respecto a F1, recall (sensitivity) y precisión?
# Stations vs. algorithms on the "dacc" dataset, restricted to rows with
# T == 1 (`T` here is the data column, not the logical shorthand).
df3 <- df %>%
  filter(dataset == "dacc", T == 1)

# One flipped boxplot per metric: stations on the axis, boxes filled by alg.
p <- ggplot(df3, aes(x = var, y = Precision, fill = alg)) +
  geom_boxplot() +
  coord_flip()
print(p)

p <- ggplot(df3, aes(x = var, y = Sensitivity, fill = alg)) +
  geom_boxplot() +
  coord_flip()
print(p)

# To save the F1 plot to disk, wrap the print() call between
# png("plot-estaciones-vs-algs-F1-filtro-datasetDacc-T1.png") and dev.off().
p <- ggplot(df3, aes(x = var, y = F1, fill = alg)) +
  geom_boxplot() +
  coord_flip()
print(p)
#' Ahora, lo mismo que lo anterior, pero considerando todos los valores de T.
# Same three plots as for the T == 1 case, but keeping every value of T
# for the "dacc" dataset.
df3 <- df %>%
  filter(dataset == "dacc")

# One flipped boxplot per metric: stations on the axis, boxes filled by alg.
p <- ggplot(df3, aes(x = var, y = Precision, fill = alg)) +
  geom_boxplot() +
  coord_flip()
print(p)

p <- ggplot(df3, aes(x = var, y = Sensitivity, fill = alg)) +
  geom_boxplot() +
  coord_flip()
print(p)

# To save the F1 plot to disk, wrap the print() call between a png(...)
# call and dev.off().
p <- ggplot(df3, aes(x = var, y = F1, fill = alg)) +
  geom_boxplot() +
  coord_flip()
print(p)
#' RF (random forest) por cada una de las estaciones
# Random forest only ("rf"), "dacc" dataset, T == 1: per-station boxplots
# with boxes filled by config.vars.
df3 <- df %>%
  filter(dataset == "dacc", T == 1, alg == "rf")

p <- ggplot(df3, aes(x = var, y = Precision, fill = config.vars)) +
  geom_boxplot() +
  coord_flip() +
  labs(title = "Random forest por estaciones")
print(p)

p <- ggplot(df3, aes(x = var, y = Sensitivity, fill = config.vars)) +
  geom_boxplot() +
  coord_flip() +
  labs(title = "Random forest por estaciones")
print(p)

# The F1 plot carries no title.
p <- ggplot(df3, aes(x = var, y = F1, fill = config.vars)) +
  geom_boxplot() +
  coord_flip()
print(p)
# GLM only ("glm"), "dacc" dataset, T == 1: per-station boxplots with boxes
# filled by config.vars.
# The titles previously read "Random forest por estaciones" (copied from the
# rf section) even though the data is filtered to alg == "glm"; they now
# name the model actually plotted.
df3 <- df %>%
  filter(dataset == "dacc", T == 1, alg == "glm")

p <- ggplot(df3, aes(x = var, y = Precision, fill = config.vars)) +
  geom_boxplot() +
  coord_flip() +
  labs(title = "GLM por estaciones")
print(p)

p <- ggplot(df3, aes(x = var, y = Sensitivity, fill = config.vars)) +
  geom_boxplot() +
  coord_flip() +
  labs(title = "GLM por estaciones")
print(p)

# The F1 plot carries no title, as in the rf section.
p <- ggplot(df3, aes(x = var, y = F1, fill = config.vars)) +
  geom_boxplot() +
  coord_flip()
print(p)
#' Comportamiento general de local vs all de los algoritmos por cada estación según la métrica. Cada línea incluye los casos T=1…4, normal/smote y all/local.
# For every station, draw three flipped boxplots (F1, Sensitivity,
# Precision) of the algorithms, with boxes filled by config.vars.
for (s in stations) {
  df1 <- df %>%
    filter(var == s)

  # Layers shared by the three plots of this station.
  capas <- list(
    geom_boxplot(),
    coord_flip(),
    labs(title = paste("Estación", s))
  )

  p <- ggplot(df1, aes(x = alg, y = F1, fill = config.vars)) + capas
  print(p)

  p <- ggplot(df1, aes(x = alg, y = Sensitivity, fill = config.vars)) + capas
  print(p)

  p <- ggplot(df1, aes(x = alg, y = Precision, fill = config.vars)) + capas
  print(p)
}
library(reshape)
##
## Attaching package: 'reshape'
## The following object is masked from 'package:dplyr':
##
## rename
## The following objects are masked from 'package:tidyr':
##
## expand, smiths
# "dacc" rows with the FAR column dropped.
df3 <- df %>%
  filter(dataset == "dacc") %>%
  select(-one_of("FAR"))

# Long format: one row per (configuration, metric) pair. metrics[-1] leaves
# out the first entry of `metrics`, so FAR is not melted.
df4 <- melt(
  as.data.frame(df3),
  id.vars = c(
    "label", "dataset", "var", "config.train", "config.vars", "T", "alg"
  ),
  measure.vars = metrics[-1]
)

# Spread of every metric per station, boxes filled by metric name.
p <- ggplot(df4, aes(x = var, y = value, fill = variable)) +
  geom_boxplot() +
  coord_flip() +
  labs(title = "Variabilidad de las métricas por las estaciones")
print(p)
# Long-format copy of the "dacc" rows, this time melting every metric
# listed in `metrics`.
df4 <- df %>%
  filter(dataset == "dacc")
df4 <- melt(
  as.data.frame(df4),
  id.vars = c(
    "label", "dataset", "var", "config.train", "config.vars", "T", "alg"
  ),
  measure.vars = metrics
)

# One flipped boxplot per (algorithm, metric) pair: stations on the axis,
# boxes filled by config.vars.
for (a in algoritmos) {
  for (m in metrics) {
    df5 <- df4 %>%
      filter(alg == a, variable == m)
    p <- ggplot(df5, aes(x = var, y = value, fill = config.vars)) +
      geom_boxplot() +
      coord_flip() +
      labs(title = paste0("Comportamiento de ", m, " en modelo ", a))
    print(p)
  }
}
#' Random forest
#' Comparación normal vs SMOTE por estación para las métricas sensitivity y precision
# Per-station boxplots of Precision, Sensitivity and F1 with boxes filled by
# config.train, drawn from the full `df` table.
# NOTE(review): the heading above this section says "Random forest", but no
# alg filter is applied here -- confirm whether all algorithms are intended.
p <- ggplot(df, aes(x = var, y = Precision, fill = config.train)) +
  geom_boxplot() +
  coord_flip()
print(p)

p <- ggplot(df, aes(x = var, y = Sensitivity, fill = config.train)) +
  geom_boxplot() +
  coord_flip()
print(p)

p <- ggplot(df, aes(x = var, y = F1, fill = config.train)) +
  geom_boxplot() +
  coord_flip()
print(p)
#' La configuración SMOTE aumenta la sensitivity/F1 en detrimento de la precisión. Es una consecuencia esperable.