librerias a utilizar

library(tidymodels) # para machine learning
library(ranger) # para random forest
library(themis) # SMOTE, ROSE etc
library(DataExplorer)

Load raw dataset

library(here)
source(here("utils","dataset-processing.R"))

# global variables

dacc_daily_tmin <- get.dataset("dacc")$data

Nombres de las estaciones meteorológicas

estaciones <- c("junin","tunuyan","agua_amarga","las_paredes","la_llave")

Función que genera el dataset para clasificación

get_dataset_for_classification <- function(station_name)
{
  
  if(is.null(station_name) | !(station_name %in% estaciones) ) stop("station_name must be a valid name from estaciones")

  junin <- dacc_daily_tmin %>% 
  select(starts_with(station_name)) %>% 
  rename_with( ~ tolower(gsub(paste0(station_name,"."), "", .x, fixed = TRUE))) 
  # quita el junin. de los nombres de las columnas

# Quiero predecir la temperatura mínima, si es o no es helada.

  tmin <- junin %>% 
    select(temp_min) 
  
  colnames(tmin) <- "tmin"
  
  
  tmin_helada <- tmin
  
  tmin_helada <- tmin_helada %>% mutate(tmin = case_when(
    tmin <= 0 ~ as.character("helada"),  # frost event
    TRUE ~ as.character("no-helada")   # no frost
  ))
  
  # para regresion
  # data_tmin <- cbind(junin[1:(nrow(junin)-1),],tmin[2:nrow(tmin),])
  
  data_clasificacion <- cbind(junin[1:(nrow(junin)-1),],tmin_helada[2:nrow(tmin_helada),])
  
  #Convierto a factor la columna tmin
  
  data_clasificacion$tmin <- as.factor(data_clasificacion$tmin)
  
  return(data_clasificacion)
}

Obtener dataset para clasificación

estacion <- "junin"   ### <<<<<<<<<<<<-----------------
data_clasificacion <- get_dataset_for_classification(estacion)
get_dataset_for_classification(estacion) %>% head()
##   temp_max humedad_max temp_med humedad_med temp_min humedad_min      tmin
## 1    38.50          79 27.84583    37.12500    16.89          13 no-helada
## 2    38.58          69 28.35833    41.66667    18.28          16 no-helada
## 3    31.53          78 25.12500    53.83333    20.55          26 no-helada
## 4    37.85          96 26.40417    58.58333    15.83           1 no-helada
## 5    34.82          97 26.53333    66.95833    17.87          42 no-helada
## 6    33.38          91 25.81250    56.04167    19.70          28 no-helada

clasificador sin SMOTE

Configuración entrenamiento

# split in trainint and testing sets
data_split <- function(dataset,prop = 0.75) {

    base_split <- dataset %>%
          initial_split(prop=prop) # divido en prop %
    return(base_split)
}
set.seed(4848) #setear la semilla

base_split <- data_split(data_clasificacion) 

# to extract the training and testing datasets 
#base_train <- training(base_split)
#base_testing <- testing(base_split)

base_split
## <Analysis/Assess/Total>
## <4138/1379/5517>
# para hacer validación cruzada estratificada
base_folds <- vfold_cv(training(base_split), strata = tmin) 

Vamos a entrenar con 4138 muestras, validar con 1379 y en total son 5517 muestras.

Especifico ahora la receta para el entrenamiento.

receta_smote <- function(un_split){
  
  base_recipe <- training(un_split) %>%
  recipe(tmin ~.) %>%
  step_corr(all_numeric()) %>%
  step_normalize(all_numeric(), -all_outcomes()) %>% 
  step_smote(tmin)
  
  
  base_recipe
}



receta_sin_smote <- function(un_split) {
  
  base_recipe <- training(base_split) %>%
  recipe(tmin ~.) %>%
  step_corr(all_numeric()) %>%
  step_normalize(all_numeric(), -all_outcomes()) 
  
  base_recipe

}
# receta
set.seed(4848) #setear la semilla

base_recipe <- training(base_split) %>%
  recipe(tmin ~.) %>%
  step_corr(all_numeric()) %>%
  step_normalize(all_numeric(), -all_outcomes()) 
#%>% 
#  step_smote(tmin)

base_recipe
## Data Recipe
## 
## Inputs:
## 
##       role #variables
##    outcome          1
##  predictor          6
## 
## Operations:
## 
## Correlation filter on all_numeric()
## Centering and scaling for all_numeric(), -all_outcomes()

Modelo random forest Junín solo para predecir helada o no_helada

# modelo
base_rf_spec <- rand_forest() %>% 
  set_engine("ranger") %>% 
  set_mode("classification")
#random forest fit 
set.seed(4848)

base_rf_model <- base_rf_spec %>% 
  fit_resamples(tmin ~ ., 
                resamples = base_folds)
base_rf_model %>% collect_metrics() 
## # A tibble: 2 x 5
##   .metric  .estimator  mean     n std_err
##   <chr>    <chr>      <dbl> <int>   <dbl>
## 1 accuracy binary     0.937    10 0.00348
## 2 roc_auc  binary     0.967    10 0.00299
show_best(base_rf_model, metric = "accuracy")
## # A tibble: 1 x 5
##   .metric  .estimator  mean     n std_err
##   <chr>    <chr>      <dbl> <int>   <dbl>
## 1 accuracy binary     0.937    10 0.00348

Predicción en test-set

Sin without SMOTE

base_split <- data_split(data_clasificacion) 
base_recipe <- receta_sin_smote(base_split)

wf <- workflow() %>%
  add_recipe(base_recipe) %>% #agrego la receta
  add_model(base_rf_spec) #agrego el modelo

final_fit <- last_fit(wf,split = base_split)


final_fit %>%
  collect_metrics()
## # A tibble: 2 x 3
##   .metric  .estimator .estimate
##   <chr>    <chr>          <dbl>
## 1 accuracy binary         0.921
## 2 roc_auc  binary         0.937
accuracy <- final_fit %>%
  collect_metrics() %>% 
  select(.estimate) %>% 
  as.vector() %>% 
  .[[1]] %>% 
  .[1] %>% round(4)
roc_auc <- final_fit %>%
  collect_metrics() %>% 
  select(.estimate) %>% 
  as.vector() %>% 
  .[[1]] %>% 
  .[2] %>% round(4)
recall <- final_fit$.predictions[[1]] %>%
  recall(truth= tmin, .pred_class) %>%
  select(.estimate) %>%
  as.numeric()  %>% round(4)
precision <- final_fit$.predictions[[1]] %>%
  precision(truth= tmin, .pred_class) %>%
  select(.estimate) %>%
  as.numeric()  %>% round(4)
config1 <- c(estacion,"sin-smote",accuracy,roc_auc,recall,precision)

pr_curve

pr_curve_sin_smote <- final_fit$.predictions[[1]] %>% pr_curve(truth=tmin,.pred_helada) 

roc curve

roc_curve_sin_smote <- final_fit$.predictions[[1]] %>% roc_curve(truth=tmin,.pred_helada) 

Con SMOTE

base_split <- data_split(data_clasificacion) 
base_recipe <- receta_smote(base_split)

wf <- workflow() %>%
  add_recipe(base_recipe) %>% #agrego la receta
  add_model(base_rf_spec) #agrego el modelo

final_fit <- last_fit(wf,split = base_split)


final_fit %>%
  collect_metrics()
## # A tibble: 2 x 3
##   .metric  .estimator .estimate
##   <chr>    <chr>          <dbl>
## 1 accuracy binary         0.899
## 2 roc_auc  binary         0.936
final_fit$.predictions[[1]] %>% recall(truth= tmin, .pred_class)   
## # A tibble: 1 x 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 recall  binary         0.734
accuracy <- final_fit %>%
  collect_metrics() %>% 
  select(.estimate) %>% 
  as.vector() %>% 
  .[[1]] %>% 
  .[1] %>% round(4)
roc_auc <- final_fit %>%
  collect_metrics() %>% 
  select(.estimate) %>% 
  as.vector() %>% 
  .[[1]] %>% 
  .[2] %>% round(4)
recall <- final_fit$.predictions[[1]] %>%
  recall(truth= tmin, .pred_class) %>%
  select(.estimate) %>%
  as.numeric()  %>% round(4)
precision <- final_fit$.predictions[[1]] %>%
  precision(truth= tmin, .pred_class) %>%
  select(.estimate) %>%
  as.numeric()  %>% round(4)
config2 <- c(estacion,"smote",accuracy,roc_auc,recall,precision)

pr_curve

pr_curve_smote <- final_fit$.predictions[[1]] %>% pr_curve(truth=tmin,.pred_helada) 

roc curve

roc_curve_smote <- final_fit$.predictions[[1]] %>% roc_curve(truth=tmin,.pred_helada) 

Comparación

df <- data.frame(rbind(config1,config2))
colnames(df) <- c("estacion","config","accuracy","roc_auc","recall","precision")
df
##         estacion    config accuracy roc_auc recall precision
## config1    junin sin-smote    0.921  0.9373 0.5875    0.6861
## config2    junin     smote   0.8992   0.936 0.7338    0.5355
tablita <- pr_curve_sin_smote %>% add_column(configuracion="Sin SMOTE")
pr_curve_smote %>% 
  add_column(configuracion="SMOTE") %>% 
  bind_rows(tablita) %>% 
  ggplot(aes(x=recall,y=precision,color=configuracion)) +
  geom_line()

tablita <- roc_curve_sin_smote %>% add_column(configuracion="Sin SMOTE")
roc_curve_smote %>% 
  add_column(configuracion="SMOTE") %>% 
  bind_rows(tablita) %>% 
  ggplot(aes(x=(1-specificity),y=sensitivity,color=configuracion)) +
  geom_line()