library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.0 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.1.1 ──
## ✔ broom 1.0.5 ✔ rsample 1.2.1
## ✔ dials 1.2.1 ✔ tune 1.1.2
## ✔ infer 1.0.6 ✔ workflows 1.1.4
## ✔ modeldata 1.3.0 ✔ workflowsets 1.0.1
## ✔ parsnip 1.2.0 ✔ yardstick 1.3.0
## ✔ recipes 1.0.10
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter() masks stats::filter()
## ✖ recipes::fixed() masks stringr::fixed()
## ✖ dplyr::lag() masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step() masks stats::step()
## • Use suppressPackageStartupMessages() to eliminate package startup messages
The Air Quality dataset from the UCI repository (https://archive.ics.uci.edu/dataset/360/air+quality) contains hourly air-quality measurements recorded by a multisensor device deployed at road level in a significantly polluted area of an Italian city, from March 2004 to February 2005. It is widely used for time-series analysis, forecasting, and regression experiments.
Variables in the dataset:
Date: the date of the measurement, in DD/MM/YYYY format.
Time: the time of the measurement, in 24-hour HH.MM.SS format.
CO(GT): carbon monoxide (CO) concentration in mg/m³, measured by the certified reference analyzer (GT stands for "ground truth").
PT08.S1(CO): signal of sensor 1, nominally targeted at CO, in arbitrary sensor units.
NMHC(GT): non-methanic hydrocarbons (NMHC) in micrograms per cubic meter (µg/m³), from the reference analyzer.
C6H6(GT): benzene concentration in micrograms per cubic meter (µg/m³), from the reference analyzer.
PT08.S2(NMHC): signal of sensor 2, nominally targeted at non-methanic hydrocarbons.
NOx(GT): nitrogen oxides (NOx) concentration in parts per billion (ppb), from the reference analyzer.
PT08.S3(NOx): signal of sensor 3, nominally targeted at nitrogen oxides.
NO2(GT): nitrogen dioxide (NO₂) concentration in parts per billion (ppb), from the reference analyzer.
PT08.S4(NO2): signal of sensor 4, nominally targeted at nitrogen dioxide.
PT08.S5(O3): signal of sensor 5, nominally targeted at ozone (O₃).
T: ambient temperature in degrees Celsius (°C).
RH: relative humidity in percent (%).
AH: absolute humidity in grams per cubic meter (g/m³).
# -200 is this dataset's missing-value sentinel, so map it to NA on read
data <- read.csv(
  "https://raw.githubusercontent.com/data-datum/datasets_ml/refs/heads/main/AirQualityUCI.csv",
  sep = ";", dec = ",", na.strings = c("NA", "-200")
)
# Check the column names
names(data)
## [1] "Date" "Time" "CO.GT." "PT08.S1.CO."
## [5] "NMHC.GT." "C6H6.GT." "PT08.S2.NMHC." "NOx.GT."
## [9] "PT08.S3.NOx." "NO2.GT." "PT08.S4.NO2." "PT08.S5.O3."
## [13] "T" "RH" "AH" "X"
## [17] "X.1"
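Date and Time arrive as character columns. We drop them below, but as an illustrative sketch (not used afterwards) they could be combined into a single timestamp with lubridate, whose parsers are tolerant of the "/" and "." separators used in this file:
# Illustrative only: build a datetime column from Date (DD/MM/YYYY)
# and Time (HH.MM.SS); dmy_hms() is flexible about separators
data %>%
  mutate(datetime = dmy_hms(paste(Date, Time))) %>%
  select(datetime, everything()) %>%
  head()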
# Select the variables of interest (this also drops the empty X / X.1
# columns) and remove rows with missing values
data <- data %>%
  select(PT08.S5.O3., T, RH, NOx.GT., NO2.GT., NMHC.GT.) %>%
  drop_na()
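Note that drop_na() is aggressive here: the reference NMHC analyzer was offline for most of the recording period, so NMHC.GT. alone discards most of the hourly series (the fold sizes shown later imply roughly 850 surviving rows). A minimal check, re-reading the raw file; the == -200 guard also catches any sentinel values written with a decimal part, which na.strings would miss:
# Per-column missingness before drop_na() (sketch)
raw <- read.csv(
  "https://raw.githubusercontent.com/data-datum/datasets_ml/refs/heads/main/AirQualityUCI.csv",
  sep = ";", dec = ",", na.strings = c("NA", "-200")
)
vars <- c("PT08.S5.O3.", "T", "RH", "NOx.GT.", "NO2.GT.", "NMHC.GT.")
colSums(is.na(raw[vars]) | raw[vars] == -200)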
set.seed(123)  # for reproducibility
p_split <- initial_split(data, prop = 0.8, strata = PT08.S5.O3.)
train_data <- training(p_split)
test_data <- testing(p_split)
I'll split the TRAIN data into 3 folds for cross-validation:
p_folds <- vfold_cv(train_data, v = 3)
# Create the preprocessing recipe
rf_recipe <- recipe(PT08.S5.O3. ~ T + RH + NOx.GT. + NO2.GT. + NMHC.GT., data = train_data) %>%
  step_normalize(all_predictors())  # standardize the predictors
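An optional sanity check (a sketch; nothing below depends on it): prep() the recipe and bake() it on the training set to confirm the predictors come out centered and scaled.
# Each predictor should show mean ~0 and sd ~1 after the recipe is applied
rf_recipe %>%
  prep() %>%
  bake(new_data = NULL) %>%
  summarise(across(-PT08.S5.O3., list(mean = mean, sd = sd)))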
xgb_spec <- boost_tree(
  trees = 1000,
  tree_depth = tune(), min_n = tune(),
  loss_reduction = tune(),             ## first three: model complexity
  sample_size = tune(), mtry = tune(), ## randomness
  learn_rate = tune()                  ## step size
) %>%
  set_engine("xgboost") %>%
  set_mode("regression")
xgb_spec
## Boosted Tree Model Specification (regression)
##
## Main Arguments:
## mtry = tune()
## trees = 1000
## min_n = tune()
## tree_depth = tune()
## learn_rate = tune()
## loss_reduction = tune()
## sample_size = tune()
##
## Computational engine: xgboost
Tuning grid for XGBoost, sampled with a Latin hypercube:
xgb_grid <- grid_latin_hypercube(
  tree_depth(),
  min_n(),
  loss_reduction(),
  sample_size = sample_prop(),
  finalize(mtry(), train_data),
  learn_rate(),
  size = 30
)
xgb_grid
## # A tibble: 30 × 6
## tree_depth min_n loss_reduction sample_size mtry learn_rate
## <int> <int> <dbl> <dbl> <int> <dbl>
## 1 14 21 2.66e- 5 0.840 6 1.35e- 7
## 2 10 26 5.59e-10 0.566 2 1.13e- 8
## 3 4 25 6.96e- 4 0.295 3 1.03e-10
## 4 3 13 2.29e- 8 0.451 5 2.21e- 9
## 5 3 10 1.79e- 2 1.00 4 7.68e- 8
## 6 8 23 1.95e+ 0 0.638 3 4.64e- 4
## 7 1 27 4.93e- 8 0.874 4 3.12e- 6
## 8 12 30 8.63e- 1 0.507 2 8.99e- 3
## 9 6 32 6.88e- 7 0.587 5 7.84e- 5
## 10 13 4 3.04e- 1 0.715 4 1.13e- 5
## # ℹ 20 more rows
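An aside: in dials releases newer than the 1.2.1 attached above, grid_latin_hypercube() has been superseded by grid_space_filling(). A roughly equivalent call under that newer API would look like the sketch below (the name xgb_grid_sf is just a placeholder; not run here):
# Requires dials >= 1.3.0; shown for reference only, not executed here
xgb_grid_sf <- grid_space_filling(
  tree_depth(),
  min_n(),
  loss_reduction(),
  sample_size = sample_prop(),
  finalize(mtry(), train_data),
  learn_rate(),
  size = 30
)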
xgb_wf <- workflow() %>%
  add_recipe(rf_recipe) %>%
  add_model(xgb_spec)
xgb_wf
## ══ Workflow ════════════════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: boost_tree()
##
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 1 Recipe Step
##
## • step_normalize()
##
## ── Model ───────────────────────────────────────────────────────────────────────
## Boosted Tree Model Specification (regression)
##
## Main Arguments:
## mtry = tune()
## trees = 1000
## min_n = tune()
## tree_depth = tune()
## learn_rate = tune()
## loss_reduction = tune()
## sample_size = tune()
##
## Computational engine: xgboost
# Define the metrics, ensuring "mae" is included
metrics <- metric_set(mae, rmse, mape)
doParallel::registerDoParallel()
set.seed(234)
xgb_res <- tune_grid(
  xgb_wf,
  resamples = p_folds,
  grid = xgb_grid,
  metrics = metrics,
  control = control_grid(save_pred = TRUE)
)
xgb_res
## # Tuning results
## # 3-fold cross-validation
## # A tibble: 3 × 5
## splits id .metrics .notes .predictions
## <list> <chr> <list> <list> <list>
## 1 <split [452/227]> Fold1 <tibble [90 × 10]> <tibble [0 × 3]> <tibble>
## 2 <split [453/226]> Fold2 <tibble [90 × 10]> <tibble [0 × 3]> <tibble>
## 3 <split [453/226]> Fold3 <tibble [90 × 10]> <tibble [0 × 3]> <tibble>
collect_metrics(xgb_res)
## # A tibble: 90 × 12
## mtry min_n tree_depth learn_rate loss_reduction sample_size .metric
## <int> <int> <int> <dbl> <dbl> <dbl> <chr>
## 1 6 21 14 1.35e- 7 2.66e- 5 0.840 mae
## 2 6 21 14 1.35e- 7 2.66e- 5 0.840 mape
## 3 6 21 14 1.35e- 7 2.66e- 5 0.840 rmse
## 4 2 26 10 1.13e- 8 5.59e-10 0.566 mae
## 5 2 26 10 1.13e- 8 5.59e-10 0.566 mape
## 6 2 26 10 1.13e- 8 5.59e-10 0.566 rmse
## 7 3 25 4 1.03e-10 6.96e- 4 0.295 mae
## 8 3 25 4 1.03e-10 6.96e- 4 0.295 mape
## 9 3 25 4 1.03e-10 6.96e- 4 0.295 rmse
## 10 5 13 3 2.21e- 9 2.29e- 8 0.451 mae
## # ℹ 80 more rows
## # ℹ 5 more variables: .estimator <chr>, mean <dbl>, n <int>, std_err <dbl>,
## # .config <chr>
xgb_res %>%
  collect_metrics() %>%
  filter(.metric == "mae") %>%
  dplyr::select(mtry:sample_size, mean) %>%
  pivot_longer(mtry:sample_size,
               values_to = "value",
               names_to = "parameter") %>%
  ggplot(aes(value, mean, color = parameter)) +
  geom_point(alpha = 0.8, show.legend = FALSE) +
  facet_wrap(~parameter, scales = "free_x") +
  labs(x = NULL, y = "mae")
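tune also ships a one-line shortcut that produces essentially the same picture:
# Built-in alternative to the manual ggplot above
autoplot(xgb_res, metric = "mae")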
show_best(xgb_res, metric = "mae")
## # A tibble: 5 × 12
## mtry min_n tree_depth learn_rate loss_reduction sample_size .metric
## <int> <int> <int> <dbl> <dbl> <dbl> <chr>
## 1 4 12 14 0.0152 0.000000145 0.794 mae
## 2 2 8 6 0.0449 0.00000000433 0.898 mae
## 3 6 21 15 0.0682 0.0000000167 0.540 mae
## 4 2 30 12 0.00899 0.863 0.507 mae
## 5 2 18 5 0.00196 0.00000000130 0.268 mae
## # ℹ 5 more variables: .estimator <chr>, mean <dbl>, n <int>, std_err <dbl>,
## # .config <chr>
best_mae <- select_best(xgb_res, metric = "mae")
best_mae
## # A tibble: 1 × 7
## mtry min_n tree_depth learn_rate loss_reduction sample_size .config
## <int> <int> <int> <dbl> <dbl> <dbl> <chr>
## 1 4 12 14 0.0152 0.000000145 0.794 Preprocessor1_Mo…
final_xgb <- finalize_workflow(
  xgb_wf,
  best_mae
)
final_xgb
## ══ Workflow ════════════════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: boost_tree()
##
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 1 Recipe Step
##
## • step_normalize()
##
## ── Model ───────────────────────────────────────────────────────────────────────
## Boosted Tree Model Specification (regression)
##
## Main Arguments:
## mtry = 4
## trees = 1000
## min_n = 12
## tree_depth = 14
## learn_rate = 0.0152390892809597
## loss_reduction = 1.44641089477552e-07
## sample_size = 0.793916898444295
##
## Computational engine: xgboost
library(vip)
## Warning: package 'vip' was built under R version 4.3.3
##
## Attaching package: 'vip'
## The following object is masked from 'package:utils':
##
## vi
final_xgb %>%
  fit(data = train_data) %>%
  extract_fit_parsnip() %>%  # pull_workflow_fit() is deprecated in workflows >= 0.2.3
  vip(geom = "point")
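If a table is preferable to a plot, vip::vi() returns the same importances as a tibble:
# Same importance scores, as a tibble rather than a plot
final_xgb %>%
  fit(data = train_data) %>%
  extract_fit_parsnip() %>%
  vi()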
final_res <- last_fit(final_xgb, p_split, metrics = metrics)
## Warning: package 'xgboost' was built under R version 4.3.3
collect_metrics(final_res)
## # A tibble: 3 × 4
## .metric .estimator .estimate .config
## <chr> <chr> <dbl> <chr>
## 1 mae standard 96.8 Preprocessor1_Model1
## 2 rmse standard 122. Preprocessor1_Model1
## 3 mape standard 10.7 Preprocessor1_Model1
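The same numbers can be recomputed directly from the saved test-set predictions, a handy consistency check (metrics here is the metric_set defined earlier):
# Recompute the test metrics from the predictions; should match the above
collect_predictions(final_res) %>%
  metrics(truth = PT08.S5.O3., estimate = .pred)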
collect_predictions(final_res) %>%
  ggplot(aes(PT08.S5.O3., .pred)) +
  geom_abline(lty = 2, color = "gray50") +
  geom_point(alpha = 0.5, color = "midnightblue") +
  coord_fixed()
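Finally, the workflow that last_fit() trained on the training set can be extracted for reuse on new data; a minimal sketch (the object and file names are just placeholders):
# Extract the fitted workflow for downstream use
final_fitted <- extract_workflow(final_res)
predict(final_fitted, new_data = head(test_data))
# saveRDS(final_fitted, "air_quality_xgb.rds")  # optional: persist to disk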