library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.0     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.1.1 ──
## ✔ broom        1.0.5      ✔ rsample      1.2.1 
## ✔ dials        1.2.1      ✔ tune         1.1.2 
## ✔ infer        1.0.6      ✔ workflows    1.1.4 
## ✔ modeldata    1.3.0      ✔ workflowsets 1.0.1 
## ✔ parsnip      1.2.0      ✔ yardstick    1.3.0 
## ✔ recipes      1.0.10     
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter()   masks stats::filter()
## ✖ recipes::fixed()  masks stringr::fixed()
## ✖ dplyr::lag()      masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step()   masks stats::step()
## • Use suppressPackageStartupMessages() to eliminate package startup messages

Air quality assessment with the AirQuality dataset

The Air Quality dataset from the UCI repository (https://archive.ics.uci.edu/dataset/360/air+quality) contains hourly air-quality measurements recorded by a monitoring station in a suburban area of Milan, Italy, during 2004. Missing values are tagged with -200 in the raw file, which is why that code is recoded as NA when the data are read below. The dataset is widely used for time-series analysis, forecasting, and regression problems.

Dataset variables: the columns, listed by names(data) right after reading the file, comprise the reference concentrations (CO.GT., NMHC.GT., C6H6.GT., NOx.GT., NO2.GT.), the sensor responses (PT08.S1 to PT08.S5), temperature (T), relative humidity (RH), and absolute humidity (AH).

# Read the UCI file: ;-separated fields, comma decimals, missing values coded as -200
data <- read.csv("https://raw.githubusercontent.com/data-datum/datasets_ml/refs/heads/main/AirQualityUCI.csv", sep = ";", dec = ",", na.strings = c("NA", "-200"))
# Check the column names
names(data)
##  [1] "Date"          "Time"          "CO.GT."        "PT08.S1.CO."  
##  [5] "NMHC.GT."      "C6H6.GT."      "PT08.S2.NMHC." "NOx.GT."      
##  [9] "PT08.S3.NOx."  "NO2.GT."       "PT08.S4.NO2."  "PT08.S5.O3."  
## [13] "T"             "RH"            "AH"            "X"            
## [17] "X.1"
# Select the variables of interest and drop rows with missing values
data <- data %>%
  select(PT08.S5.O3., T, RH, NOx.GT., NO2.GT., NMHC.GT.) %>%
  drop_na()
set.seed(123)  # for reproducibility
p_split <- initial_split(data, prop = 0.8, strata = PT08.S5.O3.)
train_data <- training(p_split)
test_data <- testing(p_split)
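
Before resampling, it is worth confirming how many rows survive the missing-value filter and how the split turned out. A quick check (nothing here is required for the rest of the workflow):

# Rows/columns left after drop_na(), size of each partition, and range of the outcome
dim(data)
nrow(train_data)
nrow(test_data)
summary(data$PT08.S5.O3.)  # ozone sensor response, the variable we will predict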

I will split the TRAIN data into 3 folds for cross-validation.

p_folds <- vfold_cv(train_data, v = 3)
# Create the preprocessing recipe
rf_recipe <- recipe(PT08.S5.O3. ~ T + RH + NOx.GT. + NO2.GT. + NMHC.GT., data = train_data) %>%
  step_normalize(all_predictors())  # standardize the predictors
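
To see exactly what the recipe will feed the model, it can be prepped on the training set and baked; this is only a sanity check and is not needed before tuning:

# Estimate the normalization statistics and inspect the processed predictors
rf_recipe %>%
  prep() %>%
  bake(new_data = NULL) %>%
  head()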

XGBoost model

xgb_spec <- boost_tree(
  trees = 1000,
  tree_depth = tune(), min_n = tune(),
  loss_reduction = tune(),                     ## first three: model complexity
  sample_size = tune(), mtry = tune(),         ## randomness
  learn_rate = tune()                          ## step size
) %>%
  set_engine("xgboost") %>% 
  set_mode("regression")

xgb_spec
## Boosted Tree Model Specification (regression)
## 
## Main Arguments:
##   mtry = tune()
##   trees = 1000
##   min_n = tune()
##   tree_depth = tune()
##   learn_rate = tune()
##   loss_reduction = tune()
##   sample_size = tune()
## 
## Computational engine: xgboost
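
The placeholders marked with tune() can also be listed programmatically, which is handy if you prefer to derive the tuning grid from the specification instead of spelling the parameters out by hand:

# One row per parameter flagged for tuning in xgb_spec
xgb_spec %>%
  extract_parameter_set_dials()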

XGBoost tuning grid

xgb_grid <- grid_latin_hypercube(
  tree_depth(),
  min_n(),
  loss_reduction(),
  sample_size = sample_prop(),
  finalize(mtry(), train_data),   # mtry's upper bound depends on the number of columns, so it is finalized against the data
  learn_rate(),
  size = 30
)

xgb_grid
## # A tibble: 30 × 6
##    tree_depth min_n loss_reduction sample_size  mtry learn_rate
##         <int> <int>          <dbl>       <dbl> <int>      <dbl>
##  1         14    21       2.66e- 5       0.840     6   1.35e- 7
##  2         10    26       5.59e-10       0.566     2   1.13e- 8
##  3          4    25       6.96e- 4       0.295     3   1.03e-10
##  4          3    13       2.29e- 8       0.451     5   2.21e- 9
##  5          3    10       1.79e- 2       1.00      4   7.68e- 8
##  6          8    23       1.95e+ 0       0.638     3   4.64e- 4
##  7          1    27       4.93e- 8       0.874     4   3.12e- 6
##  8         12    30       8.63e- 1       0.507     2   8.99e- 3
##  9          6    32       6.88e- 7       0.587     5   7.84e- 5
## 10         13     4       3.04e- 1       0.715     4   1.13e- 5
## # ℹ 20 more rows
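
Since the grid is sampled with a Latin hypercube rather than exhaustively, a quick range check confirms it spans the intended values, including the finalized mtry upper bound:

# Minimum and maximum of each sampled hyperparameter
xgb_grid %>%
  summarise(across(everything(), list(min = min, max = max)))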

Workflow

xgb_wf <- workflow() %>%
  add_recipe(rf_recipe) %>%
  add_model(xgb_spec)

xgb_wf
## ══ Workflow ════════════════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: boost_tree()
## 
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 1 Recipe Step
## 
## • step_normalize()
## 
## ── Model ───────────────────────────────────────────────────────────────────────
## Boosted Tree Model Specification (regression)
## 
## Main Arguments:
##   mtry = tune()
##   trees = 1000
##   min_n = tune()
##   tree_depth = tune()
##   learn_rate = tune()
##   loss_reduction = tune()
##   sample_size = tune()
## 
## Computational engine: xgboost

TRAINING the model

# Define the metric set, making sure "mae" is included
metrics <- metric_set(mae, rmse, mape)
doParallel::registerDoParallel()  # run the resamples in parallel

set.seed(234)
xgb_res <- tune_grid(
  xgb_wf,
  resamples = p_folds,
  grid = xgb_grid,
  metrics = metrics,
  control = control_grid(save_pred = TRUE)
)

xgb_res
## # Tuning results
## # 3-fold cross-validation 
## # A tibble: 3 × 5
##   splits            id    .metrics           .notes           .predictions
##   <list>            <chr> <list>             <list>           <list>      
## 1 <split [452/227]> Fold1 <tibble [90 × 10]> <tibble [0 × 3]> <tibble>    
## 2 <split [453/226]> Fold2 <tibble [90 × 10]> <tibble [0 × 3]> <tibble>    
## 3 <split [453/226]> Fold3 <tibble [90 × 10]> <tibble [0 × 3]> <tibble>
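
The empty .notes tibbles above mean that every candidate trained on every fold without captured warnings or errors; had something failed, the messages could be pulled out directly:

collect_notes(xgb_res)  # one row per warning or error captured during tuning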

Results

collect_metrics(xgb_res)
## # A tibble: 90 × 12
##     mtry min_n tree_depth learn_rate loss_reduction sample_size .metric
##    <int> <int>      <int>      <dbl>          <dbl>       <dbl> <chr>  
##  1     6    21         14   1.35e- 7       2.66e- 5       0.840 mae    
##  2     6    21         14   1.35e- 7       2.66e- 5       0.840 mape   
##  3     6    21         14   1.35e- 7       2.66e- 5       0.840 rmse   
##  4     2    26         10   1.13e- 8       5.59e-10       0.566 mae    
##  5     2    26         10   1.13e- 8       5.59e-10       0.566 mape   
##  6     2    26         10   1.13e- 8       5.59e-10       0.566 rmse   
##  7     3    25          4   1.03e-10       6.96e- 4       0.295 mae    
##  8     3    25          4   1.03e-10       6.96e- 4       0.295 mape   
##  9     3    25          4   1.03e-10       6.96e- 4       0.295 rmse   
## 10     5    13          3   2.21e- 9       2.29e- 8       0.451 mae    
## # ℹ 80 more rows
## # ℹ 5 more variables: .estimator <chr>, mean <dbl>, n <int>, std_err <dbl>,
## #   .config <chr>
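
tune also ships a default visualization of the same tuning results; it is a quick one-line alternative to the custom plot built in the next section:

autoplot(xgb_res, metric = "mae")  # marginal plot of MAE against each tuned parameter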

Let's look at the hyperparameters

xgb_res %>%
  collect_metrics() %>%
  filter(.metric == "mae") %>% 
  dplyr::select(mtry:sample_size, mean) %>%
  pivot_longer(mtry:sample_size,
               values_to = "value",
               names_to = "parameter"
  ) %>%
  ggplot(aes(value, mean, color = parameter)) +
  geom_point(alpha = 0.8, show.legend = FALSE) +
  facet_wrap(~parameter, scales = "free_x") +
  labs(x = NULL, y = "mae")

show_best(xgb_res, metric = "mae")
## # A tibble: 5 × 12
##    mtry min_n tree_depth learn_rate loss_reduction sample_size .metric
##   <int> <int>      <int>      <dbl>          <dbl>       <dbl> <chr>  
## 1     4    12         14    0.0152   0.000000145         0.794 mae    
## 2     2     8          6    0.0449   0.00000000433       0.898 mae    
## 3     6    21         15    0.0682   0.0000000167        0.540 mae    
## 4     2    30         12    0.00899  0.863               0.507 mae    
## 5     2    18          5    0.00196  0.00000000130       0.268 mae    
## # ℹ 5 more variables: .estimator <chr>, mean <dbl>, n <int>, std_err <dbl>,
## #   .config <chr>

Best model.

best_mae <- select_best(xgb_res, metric = "mae")
best_mae
## # A tibble: 1 × 7
##    mtry min_n tree_depth learn_rate loss_reduction sample_size .config          
##   <int> <int>      <int>      <dbl>          <dbl>       <dbl> <chr>            
## 1     4    12         14     0.0152    0.000000145       0.794 Preprocessor1_Mo…
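
select_best() simply takes the candidate with the lowest mean MAE. If a slightly simpler model is acceptable, tune can instead pick the least complex candidate within one standard error of the best; here I sort by tree depth as one possible notion of simplicity:

# Simplest candidate (by tree depth) within one standard error of the best MAE
select_by_one_std_err(xgb_res, tree_depth, metric = "mae")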

Finalize the workflow

final_xgb <- finalize_workflow(
  xgb_wf,
  best_mae
)

final_xgb
## ══ Workflow ════════════════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: boost_tree()
## 
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 1 Recipe Step
## 
## • step_normalize()
## 
## ── Model ───────────────────────────────────────────────────────────────────────
## Boosted Tree Model Specification (regression)
## 
## Main Arguments:
##   mtry = 4
##   trees = 1000
##   min_n = 12
##   tree_depth = 14
##   learn_rate = 0.0152390892809597
##   loss_reduction = 1.44641089477552e-07
##   sample_size = 0.793916898444295
## 
## Computational engine: xgboost

Variable importance


library(vip)
## Warning: package 'vip' was built under R version 4.3.3
## 
## Attaching package: 'vip'
## The following object is masked from 'package:utils':
## 
##     vi
final_xgb %>%
  fit(data = train_data) %>%
  extract_fit_parsnip() %>%   # replaces the deprecated pull_workflow_fit()
  vip(geom = "point")
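
The same importances are available as a table straight from xgboost by pulling the fitted booster out of the workflow. A quick sketch (it refits the workflow, since the fit above was not stored in an object):

fitted_wf <- fit(final_xgb, data = train_data)
xgboost::xgb.importance(model = extract_fit_engine(fitted_wf))  # gain, cover, frequency per feature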

Prediction on the TEST set

final_res <- last_fit(final_xgb, p_split, metrics=metrics)
## Warning: package 'xgboost' was built under R version 4.3.3
collect_metrics(final_res)
## # A tibble: 3 × 4
##   .metric .estimator .estimate .config             
##   <chr>   <chr>          <dbl> <chr>               
## 1 mae     standard        96.8 Preprocessor1_Model1
## 2 rmse    standard       122.  Preprocessor1_Model1
## 3 mape    standard        10.7 Preprocessor1_Model1
collect_predictions(final_res) %>%
  ggplot(aes(PT08.S5.O3., .pred)) +
  geom_abline(lty = 2, color = "gray50") +
  geom_point(alpha = 0.5, color = "midnightblue") +
  coord_fixed()
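
Finally, last_fit() also returns the workflow refitted on the full training set, which is the object to keep for scoring new measurements. A minimal sketch, where new_hours stands for a hypothetical data frame with the same predictor columns:

final_model <- extract_workflow(final_res)    # workflow trained on all of train_data
# predict(final_model, new_data = new_hours)  # new_hours: hypothetical new sensor readings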