Libraries
library(tidyverse)   # data wrangling and plotting
library(tidymodels)  # rsample, recipes, parsnip, tune, yardstick, ...
library(glmnet)      # regularized (elastic net) regression engine
library(vip)         # variable importance
library(themis)      # recipe steps for class imbalance (step_upsample)
library(doParallel)  # parallel backend for tuning
Regression example
Initial data
df_turistas <- read_csv("Train.csv") %>%
  select(-country)
df_turistas %>% head()
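Before splitting, it is worth checking for missing values, since the recipe below imputes medians and modes. A minimal sketch over the columns loaded above:

df_turistas %>%
  summarise(across(everything(), ~ sum(is.na(.x))))  # NA count per column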
Train-test split
set.seed(2022)
particion_inicial <- initial_split(data = df_turistas, prop = 0.8, strata = total_cost)
train <- training(particion_inicial)
test <- testing(particion_inicial)
Cross-validation (k-fold)
set.seed(2022)
submuestras_reg <- vfold_cv(data = train, v = 5, repeats = 1, strata = total_cost)
Regularized regression
modelo_regresion <-
  linear_reg(mixture = tune(), penalty = tune()) %>%
  set_mode("regression") %>%
  set_engine("glmnet")
Recipe
receta_regresion <- recipe(total_cost ~ ., data = train) %>%
  update_role(ID, new_role = "ID") %>%
  step_impute_median(all_numeric_predictors()) %>%
  step_impute_mode(all_nominal_predictors()) %>%
  step_poly(night_zanzibar, degree = 2) %>%
  step_ns(night_mainland, deg_free = 2) %>%
  step_dummy(all_nominal_predictors(), one_hot = TRUE) %>%
  step_normalize(all_numeric_predictors())
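To inspect what the recipe actually produces, prep() estimates the steps on the training data and bake(new_data = NULL) returns the processed training set. A sketch for a quick look:

receta_regresion %>%
  prep() %>%
  bake(new_data = NULL) %>%
  glimpse()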
Workflow
flujo_regresion <-
  workflow() %>%
  add_recipe(receta_regresion) %>%
  add_model(modelo_regresion)
Search grid
dials offers several grid constructors: grid_regular(), grid_random(), grid_latin_hypercube(), and grid_max_entropy(). Here we use a max-entropy (space-filling) design; a regular-grid alternative is sketched after the code.
set.seed(2022)
grid_regresion <-
  grid_max_entropy(penalty(), mixture(), size = 20)
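For comparison, a hypothetical regular grid with 5 levels per parameter (25 candidates); for the same budget, space-filling designs usually cover the parameter space more evenly:

grid_regular(penalty(), mixture(), levels = 5)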
Hyperparameter optimization
Tuning
registerDoParallel()
tune_regresion <-
  tune_grid(flujo_regresion,
            resamples = submuestras_reg,
            grid = grid_regresion)
stopImplicitCluster()
Hyperparameter results
tune_regresion %>%
  collect_metrics() %>%
  filter(.metric == "rmse") %>%
  ggplot(aes(x = penalty, y = mixture, color = mean)) +
  geom_point(size = 3) +
  scale_x_log10() +
  scale_color_viridis_c()
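tune also ships an autoplot() method for tuning results that produces a similar diagnostic with less code:

autoplot(tune_regresion)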

Top 5 results
tune_regresion %>%
  show_best(metric = "rmse")
Best result
mejor_regresion <-
  tune_regresion %>%
  select_best(metric = "rmse")
mejor_regresion
Finalizing the workflow
final_wf_regresion <- flujo_regresion %>%
  finalize_workflow(mejor_regresion)
final_wf_regresion
## == Workflow ====================================================================
## Preprocessor: Recipe
## Model: linear_reg()
##
## -- Preprocessor ----------------------------------------------------------------
## 6 Recipe Steps
##
## * step_impute_median()
## * step_impute_mode()
## * step_poly()
## * step_ns()
## * step_dummy()
## * step_normalize()
##
## -- Model -----------------------------------------------------------------------
## Linear Regression Model Specification (regression)
##
## Main Arguments:
## penalty = 0.632285841990838
## mixture = 0.00163340591825545
##
## Computational engine: glmnet
Optimized model
Final fit
registerDoParallel()
ajuste_final_regresion <-
  last_fit(final_wf_regresion, particion_inicial)
stopImplicitCluster()
Test metrics
ajuste_final_regresion %>%
  collect_metrics()
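collect_predictions() returns the held-out test predictions from last_fit(). A minimal sketch of a predicted-vs-observed plot, assuming the outcome column is total_cost as above:

ajuste_final_regresion %>%
  collect_predictions() %>%
  ggplot(aes(x = total_cost, y = .pred)) +
  geom_point(alpha = 0.5) +
  geom_abline(linetype = "dashed") +  # perfect-prediction reference
  labs(x = "Observed total_cost", y = "Predicted total_cost")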
Variable importance
final_wf_regresion %>%              # use the finalized workflow; the original
  fit(train) %>%                    # tune() placeholders cannot be fit directly
  extract_fit_parsnip() %>%
  vi(lambda = mejor_regresion$penalty) %>%  # glmnet importance at the tuned penalty
  slice(1:20) %>%
  mutate(
    Importance = abs(Importance),
    Variable = fct_reorder(Variable, Importance)
  ) %>%
  ggplot(aes(x = Importance, y = Variable, fill = Sign)) +
  geom_col() +
  scale_x_continuous(expand = c(0, 0)) +
  labs(y = NULL)
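The coefficients behind the plot can also be read directly: tidy() on the fitted parsnip model returns one row per term at the tuned penalty.

final_wf_regresion %>%
  fit(train) %>%
  extract_fit_parsnip() %>%
  tidy()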

Classification example
Initial data
df_cardio <-
  read_csv("heart_failure_clinical_records_dataset.csv") %>%
  mutate(across(
    c(anaemia, diabetes, high_blood_pressure, sex, smoking, DEATH_EVENT),
    as.factor
  ))
df_cardio
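DEATH_EVENT is typically imbalanced in this dataset, which is what motivates the upsampling step in the recipe below. A quick check of the class counts:

df_cardio %>% count(DEATH_EVENT)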
Train-test split
set.seed(2022)
particion_inicial_c <- initial_split(data = df_cardio, prop = 0.8, strata = DEATH_EVENT)
train_c <- training(particion_inicial_c)
test_c <- testing(particion_inicial_c)
Cross-validation (k-fold)
set.seed(2022)
submuestras_kfold_c <- vfold_cv(data = train_c, v = 5, repeats = 1, strata = DEATH_EVENT)
Regularized logistic regression
modelo_clasificacion <-
  logistic_reg(mixture = tune(), penalty = tune()) %>%
  set_mode("classification") %>%
  set_engine("glmnet")
Recipe
receta_clasificacion <- recipe(DEATH_EVENT ~ ., data = train_c) %>%
  step_YeoJohnson(all_numeric_predictors()) %>%
  step_dummy(all_nominal_predictors(), one_hot = TRUE) %>%
  step_normalize(all_numeric_predictors()) %>%
  step_upsample(DEATH_EVENT)
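step_upsample() is applied when the recipe is prepped on the training data (and skipped when predicting on new data), so the processed training set should come out balanced. A quick verification sketch:

receta_clasificacion %>%
  prep() %>%
  bake(new_data = NULL) %>%  # processed training data
  count(DEATH_EVENT)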
Workflow
flujo_clasificacion <-
  workflow() %>%
  add_recipe(receta_clasificacion) %>%
  add_model(modelo_clasificacion)
Search grid
set.seed(2022)
grid_clasificacion <-
  grid_latin_hypercube(penalty(), mixture(), size = 20)
Hyperparameter optimization
Tuning
registerDoParallel()
tune_clasificacion <-
  tune_grid(flujo_clasificacion,
            resamples = submuestras_kfold_c,
            grid = grid_clasificacion)
stopImplicitCluster()
Hyperparameter results
tune_clasificacion %>%
  collect_metrics() %>%
  filter(.metric == "roc_auc") %>%
  ggplot(aes(x = penalty, y = mixture, color = mean)) +
  geom_point(size = 3) +
  scale_x_log10() +
  scale_color_viridis_c()

Top 5 results
tune_clasificacion %>%
  show_best(metric = "roc_auc")
Best result
mejor_clasificacion <-
  tune_clasificacion %>%
  select_best(metric = "roc_auc")
mejor_clasificacion
Finalizing the workflow
final_wf_clasificacion <- flujo_clasificacion %>%
  finalize_workflow(mejor_clasificacion)
final_wf_clasificacion
## == Workflow ====================================================================
## Preprocessor: Recipe
## Model: logistic_reg()
##
## -- Preprocessor ----------------------------------------------------------------
## 4 Recipe Steps
##
## * step_YeoJohnson()
## * step_dummy()
## * step_normalize()
## * step_upsample()
##
## -- Model -----------------------------------------------------------------------
## Logistic Regression Model Specification (classification)
##
## Main Arguments:
## penalty = 0.188068715459359
## mixture = 0.439757207746152
##
## Computational engine: glmnet
Optimized model
Final fit
registerDoParallel()
ajuste_final_clasificacion <-
  last_fit(final_wf_clasificacion, particion_inicial_c)
stopImplicitCluster()
Test metrics
ajuste_final_clasificacion %>%
  collect_metrics()
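The test-set predictions also support a confusion matrix and a ROC curve. A minimal sketch, assuming DEATH_EVENT kept its original 0/1 factor levels (so .pred_0 is the probability of the first level); preds_c is just a hypothetical helper name:

preds_c <- ajuste_final_clasificacion %>%
  collect_predictions()
preds_c %>%
  conf_mat(truth = DEATH_EVENT, estimate = .pred_class)
preds_c %>%
  roc_curve(truth = DEATH_EVENT, .pred_0) %>%
  autoplot()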
Variable importance
final_wf_clasificacion %>%          # use the finalized workflow; the original
  fit(train_c) %>%                  # tune() placeholders cannot be fit directly
  extract_fit_parsnip() %>%
  vi(lambda = mejor_clasificacion$penalty) %>%  # importance at the tuned penalty
  slice(1:20) %>%
  mutate(
    Importance = abs(Importance),
    Variable = fct_reorder(Variable, Importance)
  ) %>%
  ggplot(aes(x = Importance, y = Variable, fill = Sign)) +
  geom_col() +
  scale_x_continuous(expand = c(0, 0)) +
  labs(y = NULL)
