Resultados clasificación

library(readr)
library(ggplot2)
library(gridExtra)
library(tidyverse)
## ── Attaching packages ───────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ tibble  1.4.2     ✔ dplyr   0.7.6
## ✔ tidyr   0.8.1     ✔ stringr 1.3.1
## ✔ purrr   0.2.5     ✔ forcats 0.3.0
## ── Conflicts ──────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::combine() masks gridExtra::combine()
## ✖ dplyr::filter()  masks stats::filter()
## ✖ dplyr::lag()     masks stats::lag()
library(lazyeval)
## 
## Attaching package: 'lazyeval'
## The following objects are masked from 'package:purrr':
## 
##     is_atomic, is_formula
# Load the classification results; the path is relative to the working directory.
dataset <- read_csv(file = "classification-results.csv")
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   dataset = col_character(),
##   var = col_character(),
##   config.train = col_character(),
##   config.vars = col_character(),
##   T = col_integer(),
##   alg = col_character()
## )
## See spec(...) for full column specifications.
# Round every column from the 8th to the last one to three decimals.
# The start position is hard-coded; the first seven columns are left untouched.
dataset[, 8:ncol(dataset)] <- round(dataset[, 8:ncol(dataset)], 3)

ESTACION: valores posibles en la columna var. Son las estaciones meteorológicas.

# Show the distinct values found in the `var` column.
print(unique(dataset[["var"]]))
## [1] "tunuyan.temp_min"     "agua_amarga.temp_min" "junin.temp_min"      
## [4] "la_llave.temp_min"    "las_paredes.temp_min"

DATASET: valores posibles “dacc”, faltan casos de “dacc-temp”,“dacc-spring”

Caso: columna o dato que evalúe si colabora o no el enfoque de agregar vecinos.

# Fixed selections; ESTACION and DATASET are used by the filters further down.
ESTACION <- "junin.temp_min"
DATASET <- "dacc"
ALGORITMO <- "glm"

# Distinct values of the `var` and `alg` columns in the loaded results.
stations <- unique(dataset[["var"]])
algoritmos <- unique(dataset[["alg"]])
# Names of the metric columns.
metrics <- c(
  "FAR", "Sensitivity", "Specificity", "Accuracy", "Kappa", "F1", "Precision"
)

Carga dataset

Creación de dataset genérico para resultados clasificación

# Build the generic results table: a `label` column joining the six identifying
# columns with "-", followed by those columns and the seven metric columns.
# The original call passed an extra positional `dataset` to unite(); it was
# absorbed into `...` as a redundant column selection, so it is dropped here
# without changing the result.
df <- dataset %>%
  unite(
    col = label,
    c("dataset", "var", "config.train", "config.vars", "T", "alg"),
    sep = "-",
    remove = FALSE
  ) %>%
  select(
    label, dataset, var, config.train, config.vars, T, alg,
    FAR, Sensitivity, Specificity, Accuracy, Kappa, F1, Precision
  )

ENFOQUE local vs all

Vistazo del dataset del que hacemos la resta:

# Preview of the rows behind the subtraction for the chosen DATASET / ESTACION.
# Within each (dataset, var, config.train, T, alg) group the rows are sorted by
# descending config.vars ("local" sorts before "all"), and each row's FAR is
# subtracted from the previous row's FAR.
# Optional extra filters that were tried: alg == "glm", T == 1.
df %>%
  select(dataset, var, config.train, config.vars, T, alg, FAR) %>%
  filter(dataset == DATASET, var == ESTACION) %>%
  group_by(dataset, var, config.train, T, alg) %>%
  arrange(dataset, var, config.train, T, alg, desc(config.vars)) %>%
  mutate(local_vs_all = lag(FAR) - FAR)
## # A tibble: 54 x 8
## # Groups:   dataset, var, config.train, T, alg [28]
##    dataset var     config.train config.vars     T alg     FAR local_vs_all
##    <chr>   <chr>   <chr>        <chr>       <int> <chr> <dbl>        <dbl>
##  1 dacc    junin.… normal       local           1 C5.0   0.2         NA   
##  2 dacc    junin.… normal       all             1 C5.0   0.28        -0.08
##  3 dacc    junin.… normal       local           1 glm    0.15        NA   
##  4 dacc    junin.… normal       all             1 glm    0.16        -0.01
##  5 dacc    junin.… normal       local           1 rf     0.15        NA   
##  6 dacc    junin.… normal       all             1 rf     0.17        -0.02
##  7 dacc    junin.… normal       local           1 rpart  0.18        NA   
##  8 dacc    junin.… normal       all             1 rpart  0.26        -0.08
##  9 dacc    junin.… normal       local           2 C5.0   0.21        NA   
## 10 dacc    junin.… normal       all             2 C5.0   0.25        -0.04
## # ... with 44 more rows

para crear columna local vs all

# Add a column holding round(lag(col1) - col2, 2).
#
# df            data frame / tibble; when it is grouped, mutate() computes the
#               lag inside each group.
# col1, col2    column names given as strings: col1 is lagged by one row and
#               col2 is subtracted from it.
# new_col_name  name (string) of the column to create.
#
# Returns `df` with the new column added.
#
# Uses tidy evaluation (`.data[[ ]]` and `!!name :=`) instead of the deprecated
# mutate_() + lazyeval::interp() combination. dplyr::lag is namespaced so that
# stats::lag can never be picked up by accident.
mutate_call_ <- function(df, col1, col2, new_col_name) {
  df %>%
    mutate(
      !!new_col_name := round(dplyr::lag(.data[[col1]]) - .data[[col2]], 2)
    )
}

df data.frame

NO IMPLEMENTADO filtro: filtro a pasar a ggplot en filter

m: métrica; los valores posibles están en el vector metrics (o mirar las columnas del dataset).

s: character, nombre de la estación o variable predicha

# Horizontal bar chart of the difference "local - all" of one metric, with one
# bar per (dataset, var, config.train, T, alg) combination.
#
# df  results table with the columns dataset, var, config.train, config.vars,
#     T, alg and the metric column named by `m`.
# m   metric column name as a string (for instance one element of `metrics`).
# s   station / predicted-variable name. It is only used in the plot title;
#     the function does not filter by it, so `df` should already be restricted.
#
# Prints the plot and returns the ggplot object invisibly, so an unassigned
# top-level call does not draw the same plot twice.
plot_local_vs_all <- function(df, m, s) {
  diffs <- df %>%
    # `!!m` selects the column whose name is stored in `m`; a bare `m` would
    # match a column literally called "m" if the table had one.
    select(dataset, var, config.train, config.vars, T, alg, !!m) %>%
    group_by(dataset, var, config.train, T, alg) %>%
    # Descending config.vars puts "local" before "all" inside each group.
    arrange(dataset, var, config.train, T, alg, desc(config.vars)) %>%
    mutate_call_(m, m, "local_vs_all") %>%
    # The first row of every group has no previous row, so its difference is NA.
    filter(!is.na(local_vs_all)) %>%
    # Drop the grouping so it does not leak into the plotting data.
    ungroup() %>%
    unite(
      col = label,
      c("dataset", "var", "config.train", "T", "alg"),
      sep = "-",
      remove = FALSE
    )

  p <- ggplot(
    data = diffs,
    aes(x = reorder(label, -local_vs_all), y = local_vs_all)
  ) +
    geom_bar(stat = "identity", fill = "green") +
    geom_text(aes(label = local_vs_all), vjust = 1.3, color = "black", size = 3) +
    coord_flip() +
    theme_minimal() +
    labs(x = "Models", title = paste(s, m, sep = "--"))

  print(p)
  invisible(p)
}

corro para cada estación para cada una de las métricas

IMPORTANTE, las barras representan la resta de local - all para alguna métrica.

Barra negativa significa que config ALL es mayor a local

(info de las otras estaciones)

# Draw the local-vs-all chart for every station and every metric.
# `lista` collects the ggplot objects under the key "<station>--<metric>".
# It starts as an explicit empty list instead of NULL, so its type does not
# depend on implicit coercion at the first `[[<-` assignment.
lista <- list()
for (s in stations) {
  # Rows of the chosen DATASET for this station (every algorithm and T value).
  df1 <- df %>%
    filter(dataset == DATASET, var == s)
  for (m in metrics) {
    lista[[paste(s, m, sep = "--")]] <- plot_local_vs_all(df1, m, s)
  }
}