Libraries

library(tidyverse)
library(tidymodels)
library(glmnet)
library(vip)
library(themis)
library(doParallel)

Regression example

Initial data

df_turistas <- read_csv("Train.csv") %>% 
  select(-country)
df_turistas %>% head()
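
The recipe below imputes medians and modes, which suggests the raw data contains missing values; a quick per-column count shows where imputation will matter (a minimal check, assuming the columns as read from Train.csv):

df_turistas %>% 
  summarise(across(everything(), ~ sum(is.na(.x))))  # NA count per column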

Train - Test

set.seed(2022)
particion_inicial <- initial_split(data = df_turistas, prop = 0.8, strata = total_cost)
train <- training(particion_inicial)
test <- testing(particion_inicial)

Cross-validation (k-fold)

set.seed(2022)
submuestras_reg <- vfold_cv(data = train, v = 5, repeats = 1, strata = total_cost)

Regularized regression

modelo_regresion <-
  linear_reg(mixture = tune(), penalty = tune()) %>%
  set_mode("regression") %>%
  set_engine("glmnet")

Recipe

receta_regresion <- recipe(total_cost ~ ., data = train) %>% 
  # keep ID in the data but exclude it from the predictors
  update_role(ID, new_role = "ID") %>% 
  # impute missing values: median for numeric, mode for categorical
  step_impute_median(all_numeric_predictors()) %>% 
  step_impute_mode(all_nominal_predictors()) %>% 
  # quadratic polynomial and natural-spline basis expansions
  step_poly(night_zanzibar, degree = 2) %>% 
  step_ns(night_mainland, deg_free = 2) %>% 
  step_dummy(all_nominal_predictors(), one_hot = TRUE) %>% 
  # center and scale so the glmnet penalty treats predictors comparably
  step_normalize(all_numeric_predictors())
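
The recipe can be sanity-checked before tuning: prep() estimates the steps on the training data and bake(new_data = NULL) returns the processed predictors, making it easy to verify the polynomial, spline, and dummy columns (a quick inspection, not part of the pipeline):

receta_regresion %>% 
  prep() %>% 
  bake(new_data = NULL) %>% 
  glimpse()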

Workflow

flujo_regresion <- 
  workflow() %>% 
  add_recipe(receta_regresion) %>% 
  add_model(modelo_regresion)

Search grid

dials provides several constructors for the candidate grid:

  • grid_regular(): evenly spaced values across each parameter's range
  • grid_random(): independent uniform random draws
  • grid_latin_hypercube(): a space-filling Latin hypercube design
  • grid_max_entropy(): a space-filling design that maximizes coverage of the parameter space

set.seed(2022)
grid_regresion <- 
  grid_max_entropy(penalty(), mixture(), size = 20)
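
Plotting the candidates is a quick way to see how well the max-entropy design covers the (penalty, mixture) space; penalty() is generated on a log10 scale, hence the log axis:

grid_regresion %>% 
  ggplot(aes(x = penalty, y = mixture)) +
  geom_point(size = 3) +
  scale_x_log10()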

Hyperparameter optimization

Tuning

registerDoParallel()  # evaluate the grid across resamples in parallel
tune_regresion <-
  tune_grid(flujo_regresion,
            resamples = submuestras_reg,
            grid = grid_regresion)
stopImplicitCluster()  # release the parallel workers

Hyperparameter results

tune_regresion %>% 
  collect_metrics() %>% 
  filter(.metric == "rmse") %>% 
  ggplot(aes(x = penalty, y = mixture, color = mean)) +
  geom_point(size = 3) +
  scale_x_log10() +
  scale_color_viridis_c()
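
tune also ships an autoplot() method for tuning results that gives a comparable overview without manual ggplot code:

autoplot(tune_regresion)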

Top 5 results

tune_regresion %>% 
  show_best(metric = "rmse")

Best result

mejor_regresion <-
  tune_regresion %>%
  select_best(metric = "rmse")
mejor_regresion

Finalizing the workflow

final_wf_regresion <- flujo_regresion %>% 
  finalize_workflow(mejor_regresion)
final_wf_regresion
## == Workflow ====================================================================
## Preprocessor: Recipe
## Model: linear_reg()
## 
## -- Preprocessor ----------------------------------------------------------------
## 6 Recipe Steps
## 
## * step_impute_median()
## * step_impute_mode()
## * step_poly()
## * step_ns()
## * step_dummy()
## * step_normalize()
## 
## -- Model -----------------------------------------------------------------------
## Linear Regression Model Specification (regression)
## 
## Main Arguments:
##   penalty = 0.632285841990838
##   mixture = 0.00163340591825545
## 
## Computational engine: glmnet

Optimized model

Final fit

registerDoParallel()

ajuste_final_regresion <- 
  last_fit(final_wf_regresion, particion_inicial)

stopImplicitCluster()

Test-set metrics

ajuste_final_regresion %>% 
  collect_metrics()
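
last_fit() also stores the test-set predictions, so an observed-versus-predicted plot comes almost for free (.pred is the standard parsnip prediction column):

ajuste_final_regresion %>% 
  collect_predictions() %>% 
  ggplot(aes(x = total_cost, y = .pred)) +
  geom_point(alpha = 0.5) +
  geom_abline(linetype = "dashed")  # perfect-prediction reference line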

Variable importance

# Fit the finalized workflow (the tunable flujo_regresion still contains
# tune() placeholders, so it cannot be fit directly)
final_wf_regresion %>% 
  fit(train) %>% 
  extract_fit_parsnip() %>%  # extract the underlying parsnip model
  vi() %>% 
  slice(1:20) %>% 
  mutate(
    Importance = abs(Importance),
    Variable = fct_reorder(Variable, Importance)
  ) %>%
  ggplot(aes(x = Importance, y = Variable, fill = Sign)) +
  geom_col() +
  scale_x_continuous(expand = c(0, 0)) +
  labs(y = NULL)
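
Alternatively, since last_fit() already fit the finalized workflow, the coefficients can be read from it directly without refitting (extract_fit_parsnip() also works on last_fit() results):

ajuste_final_regresion %>% 
  extract_fit_parsnip() %>% 
  tidy()  # glmnet coefficients at the selected penalty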

Classification example

Initial data

df_cardio <-
  read_csv("heart_failure_clinical_records_dataset.csv") %>%
  mutate(across(
    c(anaemia, diabetes, high_blood_pressure, sex, smoking, DEATH_EVENT),
    as.factor
  ))

df_cardio
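
The recipe below upsamples the minority class, so it is worth checking how unbalanced DEATH_EVENT actually is:

df_cardio %>% 
  count(DEATH_EVENT) %>% 
  mutate(prop = n / sum(n))  # class proportions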

Train - Test

set.seed(2022)
particion_inicial_c <- initial_split(data = df_cardio, prop = 0.8, strata = DEATH_EVENT)
train_c <- training(particion_inicial_c)
test_c <- testing(particion_inicial_c)

Cross-validation (k-fold)

set.seed(2022)
submuestras_kfold_c <- vfold_cv(data = train_c, v = 5, repeats = 1, strata = DEATH_EVENT)

Regularized regression

modelo_clasificacion  <-
  logistic_reg(mixture = tune(), penalty = tune()) %>%
  set_mode("classification") %>%
  set_engine("glmnet")

Recipe

receta_clasificacion <- recipe(DEATH_EVENT ~ ., data = train_c) %>%
  # reduce skewness in the numeric predictors
  step_YeoJohnson(all_numeric_predictors()) %>%
  step_dummy(all_nominal_predictors(), one_hot = TRUE) %>%
  # center and scale so the glmnet penalty treats predictors comparably
  step_normalize(all_numeric_predictors()) %>%
  # balance the classes by upsampling (applied to training data only)
  step_upsample(DEATH_EVENT)
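
step_upsample() is skipped when new data are baked, so the test set stays untouched; prepping the recipe and baking the training data shows the balanced outcome the model will actually see (a quick check, not part of the pipeline):

receta_clasificacion %>% 
  prep() %>% 
  bake(new_data = NULL) %>% 
  count(DEATH_EVENT)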

Workflow

flujo_clasificacion <-
  workflow() %>% 
  add_recipe(receta_clasificacion) %>% 
  add_model(modelo_clasificacion)

Search grid

set.seed(2022)
grid_clasificacion <- 
  grid_latin_hypercube(penalty(), mixture(), size = 20)

Hyperparameter optimization

Tuning

registerDoParallel()
tune_clasificacion <-
  tune_grid(flujo_clasificacion,
            resamples = submuestras_kfold_c,
            grid = grid_clasificacion)
stopImplicitCluster()

Hyperparameter results

tune_clasificacion %>% 
  collect_metrics() %>% 
  filter(.metric == "roc_auc") %>% 
  ggplot(aes(x = penalty, y = mixture, color = mean)) +
  geom_point(size = 3) +
  scale_x_log10() +
  scale_color_viridis_c()

Top 5 results

tune_clasificacion %>% 
  show_best(metric = "roc_auc")

Best result

mejor_clasificacion <-
  tune_clasificacion %>%
  select_best(metric = "roc_auc")
mejor_clasificacion

Finalizing the workflow

final_wf_clasificacion <- flujo_clasificacion %>% 
  finalize_workflow(mejor_clasificacion)
final_wf_clasificacion
## == Workflow ====================================================================
## Preprocessor: Recipe
## Model: logistic_reg()
## 
## -- Preprocessor ----------------------------------------------------------------
## 4 Recipe Steps
## 
## * step_YeoJohnson()
## * step_dummy()
## * step_normalize()
## * step_upsample()
## 
## -- Model -----------------------------------------------------------------------
## Logistic Regression Model Specification (classification)
## 
## Main Arguments:
##   penalty = 0.188068715459359
##   mixture = 0.439757207746152
## 
## Computational engine: glmnet

Optimized model

Final fit

registerDoParallel()

ajuste_final_clasificacion <- 
  last_fit(final_wf_clasificacion, particion_inicial_c)

stopImplicitCluster()

Test-set metrics

ajuste_final_clasificacion %>% 
  collect_metrics()
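
The stored test predictions also support a confusion matrix and an ROC curve. A minimal sketch, where preds_c is just a helper name; .pred_0 assumes the first factor level (0) is treated as the event, which is yardstick's default:

preds_c <- ajuste_final_clasificacion %>% 
  collect_predictions()

preds_c %>% 
  conf_mat(truth = DEATH_EVENT, estimate = .pred_class)

preds_c %>% 
  roc_curve(truth = DEATH_EVENT, .pred_0) %>% 
  autoplot()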

Variable importance

# As before, fit the finalized workflow (the tunable flujo_clasificacion
# still contains tune() placeholders, so it cannot be fit directly)
final_wf_clasificacion %>% 
  fit(train_c) %>% 
  extract_fit_parsnip() %>%  # extract the underlying parsnip model
  vi() %>% 
  slice(1:20) %>% 
  mutate(
    Importance = abs(Importance),
    Variable = fct_reorder(Variable, Importance)
  ) %>%
  ggplot(aes(x = Importance, y = Variable, fill = Sign)) +
  geom_col() +
  scale_x_continuous(expand = c(0, 0)) +
  labs(y = NULL)