library(tidymodels)   # modeling framework
library(ranger)       # random forest engine
library(themis)       # SMOTE, ROSE, etc.
library(DataExplorer)
library(here)
source(here("utils", "dataset-processing.R"))
# global variables
dacc_daily_tmin <- get.dataset("dacc")$data
estaciones <- c("junin","tunuyan","agua_amarga","las_paredes","la_llave")
get_dataset_for_classification <- function(station_name) {
  if (is.null(station_name) || !(station_name %in% estaciones)) {
    stop("station_name must be a valid name from estaciones")
  }
  # keep only this station's columns and strip the "<station>." prefix from their names
  station_data <- dacc_daily_tmin %>%
    select(starts_with(station_name)) %>%
    rename_with(~ tolower(gsub(paste0(station_name, "."), "", .x, fixed = TRUE)))
  # I want to predict the minimum temperature: whether or not there is a frost.
  tmin <- station_data %>%
    select(temp_min)
  colnames(tmin) <- "tmin"
  tmin_helada <- tmin %>% mutate(tmin = case_when(
    tmin <= 0 ~ "helada",    # frost event
    TRUE      ~ "no-helada"  # no frost
  ))
  # for regression
  # data_tmin <- cbind(station_data[1:(nrow(station_data) - 1), ], tmin[2:nrow(tmin), ])
  # pair each day's predictors with the next day's frost label
  data_clasificacion <- cbind(station_data[1:(nrow(station_data) - 1), ],
                              tmin_helada[2:nrow(tmin_helada), ])
  # convert the tmin column to a factor
  data_clasificacion$tmin <- as.factor(data_clasificacion$tmin)
  return(data_clasificacion)
}
estacion <- "junin" ### <<<<<<<<<<<<-----------------
data_clasificacion <- get_dataset_for_classification(estacion)
data_clasificacion %>% head()
## temp_max humedad_max temp_med humedad_med temp_min humedad_min tmin
## 1 38.50 79 27.84583 37.12500 16.89 13 no-helada
## 2 38.58 69 28.35833 41.66667 18.28 16 no-helada
## 3 31.53 78 25.12500 53.83333 20.55 26 no-helada
## 4 37.85 96 26.40417 58.58333 15.83 1 no-helada
## 5 34.82 97 26.53333 66.95833 17.87 42 no-helada
## 6 33.38 91 25.81250 56.04167 19.70 28 no-helada
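Before modeling, it is worth checking how imbalanced the frost label is, since this is what motivates trying SMOTE later on. A minimal sketch (the exact counts depend on the station chosen):
data_clasificacion %>%
  count(tmin) %>%
  mutate(prop = n / sum(n))  # share of "helada" vs "no-helada" days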
# split into training and testing sets
data_split <- function(dataset, prop = 0.75) {
  base_split <- dataset %>%
    initial_split(prop = prop)  # keep prop % for training
  return(base_split)
}
set.seed(4848) # set the seed
base_split <- data_split(data_clasificacion)
# to extract the training and testing datasets
#base_train <- training(base_split)
#base_testing <- testing(base_split)
base_split
## <Analysis/Assess/Total>
## <4138/1379/5517>
# stratified cross-validation
base_folds <- vfold_cv(training(base_split), strata = tmin)
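Since frost days are the minority class, stratifying by tmin keeps a similar frost proportion in every fold. A quick way to check this (a sketch using purrr and rsample's analysis(), both loaded with tidymodels):
base_folds %>%
  mutate(frost_in_fold = map_int(splits, ~ sum(analysis(.x)$tmin == "helada")))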
We will train with 4138 samples, validate with 1379, for a total of 5517 samples.
I now specify the recipe for training.
receta_smote <- function(un_split) {
  base_recipe <- training(un_split) %>%
    recipe(tmin ~ .) %>%
    step_corr(all_numeric()) %>%                        # drop highly correlated predictors
    step_normalize(all_numeric(), -all_outcomes()) %>%  # center and scale
    step_smote(tmin)                                    # oversample the minority class
  base_recipe
}
receta_sin_smote <- function(un_split) {
  base_recipe <- training(un_split) %>%
    recipe(tmin ~ .) %>%
    step_corr(all_numeric()) %>%
    step_normalize(all_numeric(), -all_outcomes())
  base_recipe
}
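To see what each recipe actually produces, both can be prepped on the training data and the class counts compared; after step_smote the classes should come out roughly balanced. A minimal sketch:
receta_sin_smote(base_split) %>% prep() %>% juice() %>% count(tmin)
receta_smote(base_split) %>% prep() %>% juice() %>% count(tmin)  # balanced after SMOTE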
# recipe
set.seed(4848) # set the seed
base_recipe <- training(base_split) %>%
recipe(tmin ~.) %>%
step_corr(all_numeric()) %>%
step_normalize(all_numeric(), -all_outcomes())
#%>%
# step_smote(tmin)
base_recipe
## Data Recipe
##
## Inputs:
##
## role #variables
## outcome 1
## predictor 6
##
## Operations:
##
## Correlation filter on all_numeric()
## Centering and scaling for all_numeric(), -all_outcomes()
# model
base_rf_spec <- rand_forest() %>%
set_engine("ranger") %>%
set_mode("classification")
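This specification uses ranger's defaults for mtry, trees and min_n. If hyperparameter tuning were desired, the same spec could expose them instead (a sketch only; tuning is not pursued here):
rand_forest(mtry = tune(), trees = 1000, min_n = tune()) %>%
  set_engine("ranger") %>%
  set_mode("classification")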
#random forest fit
set.seed(4848)
base_rf_model <- base_rf_spec %>%
fit_resamples(tmin ~ .,
resamples = base_folds)
base_rf_model %>% collect_metrics()
## # A tibble: 2 x 5
## .metric .estimator mean n std_err
## <chr> <chr> <dbl> <int> <dbl>
## 1 accuracy binary 0.937 10 0.00348
## 2 roc_auc binary 0.967 10 0.00299
show_best(base_rf_model, metric = "accuracy")
## # A tibble: 1 x 5
## .metric .estimator mean n std_err
## <chr> <chr> <dbl> <int> <dbl>
## 1 accuracy binary 0.937 10 0.00348
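collect_metrics() averages over the 10 folds; the per-fold values are also available if the fold-to-fold variability is of interest (a sketch):
base_rf_model %>%
  collect_metrics(summarize = FALSE) %>%
  filter(.metric == "accuracy")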
base_split <- data_split(data_clasificacion)
base_recipe <- receta_sin_smote(base_split)
wf <- workflow() %>%
  add_recipe(base_recipe) %>% # add the recipe
  add_model(base_rf_spec)     # add the model
final_fit <- last_fit(wf,split = base_split)
final_fit %>%
collect_metrics()
## # A tibble: 2 x 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 accuracy binary 0.921
## 2 roc_auc binary 0.937
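Accuracy alone is optimistic here because frost days are rare, so before extracting recall and precision it helps to look at the confusion matrix on the test set (a sketch):
final_fit %>%
  collect_predictions() %>%
  conf_mat(truth = tmin, estimate = .pred_class)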
accuracy <- final_fit %>%
  collect_metrics() %>%
  filter(.metric == "accuracy") %>%
  pull(.estimate) %>% round(4)
roc_auc <- final_fit %>%
  collect_metrics() %>%
  filter(.metric == "roc_auc") %>%
  pull(.estimate) %>% round(4)
recall <- final_fit$.predictions[[1]] %>%
  recall(truth = tmin, .pred_class) %>%
  pull(.estimate) %>% round(4)
precision <- final_fit$.predictions[[1]] %>%
  precision(truth = tmin, .pred_class) %>%
  pull(.estimate) %>% round(4)
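Note that yardstick treats the first factor level ("helada") as the event of interest by default, so the recall above is the fraction of frost days that are actually caught. In recent yardstick versions this can be made explicit (a sketch):
final_fit$.predictions[[1]] %>%
  recall(truth = tmin, .pred_class, event_level = "first")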
config1 <- c(estacion,"sin-smote",accuracy,roc_auc,recall,precision)
# pr curve
pr_curve_sin_smote <- final_fit$.predictions[[1]] %>% pr_curve(truth=tmin,.pred_helada)
# roc curve
roc_curve_sin_smote <- final_fit$.predictions[[1]] %>% roc_curve(truth=tmin,.pred_helada)
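Each curve object can also be plotted on its own with yardstick's autoplot() method before comparing the two configurations (a sketch):
autoplot(pr_curve_sin_smote)
autoplot(roc_curve_sin_smote)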
base_split <- data_split(data_clasificacion)
base_recipe <- receta_smote(base_split)
wf <- workflow() %>%
  add_recipe(base_recipe) %>% # add the recipe
  add_model(base_rf_spec)     # add the model
final_fit <- last_fit(wf,split = base_split)
final_fit %>%
collect_metrics()
## # A tibble: 2 x 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 accuracy binary 0.899
## 2 roc_auc binary 0.936
final_fit$.predictions[[1]] %>% recall(truth= tmin, .pred_class)
## # A tibble: 1 x 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 recall binary 0.734
accuracy <- final_fit %>%
  collect_metrics() %>%
  filter(.metric == "accuracy") %>%
  pull(.estimate) %>% round(4)
roc_auc <- final_fit %>%
  collect_metrics() %>%
  filter(.metric == "roc_auc") %>%
  pull(.estimate) %>% round(4)
recall <- final_fit$.predictions[[1]] %>%
  recall(truth = tmin, .pred_class) %>%
  pull(.estimate) %>% round(4)
precision <- final_fit$.predictions[[1]] %>%
  precision(truth = tmin, .pred_class) %>%
  pull(.estimate) %>% round(4)
config2 <- c(estacion,"smote",accuracy,roc_auc,recall,precision)
# pr curve
pr_curve_smote <- final_fit$.predictions[[1]] %>% pr_curve(truth=tmin,.pred_helada)
# roc curve
roc_curve_smote <- final_fit$.predictions[[1]] %>% roc_curve(truth=tmin,.pred_helada)
df <- data.frame(rbind(config1,config2))
colnames(df) <- c("estacion","config","accuracy","roc_auc","recall","precision")
df
## estacion config accuracy roc_auc recall precision
## config1 junin sin-smote 0.921 0.9373 0.5875 0.6861
## config2 junin smote 0.8992 0.936 0.7338 0.5355
tablita <- pr_curve_sin_smote %>% add_column(configuracion = "Without SMOTE")
pr_curve_smote %>%
add_column(configuracion="SMOTE") %>%
bind_rows(tablita) %>%
ggplot(aes(x=recall,y=precision,color=configuracion)) +
geom_line()
tablita <- roc_curve_sin_smote %>% add_column(configuracion = "Without SMOTE")
roc_curve_smote %>%
add_column(configuracion="SMOTE") %>%
bind_rows(tablita) %>%
ggplot(aes(x=(1-specificity),y=sensitivity,color=configuracion)) +
geom_line()
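To summarize each precision-recall curve with a single number, the area under it can be computed from the same predictions; here final_fit holds the SMOTE configuration, and the same call right after the first last_fit() would give the value without SMOTE (a sketch, values not reported here):
final_fit$.predictions[[1]] %>%
  pr_auc(truth = tmin, .pred_helada)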