library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.0 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.1.1 ──
## ✔ broom 1.0.5 ✔ rsample 1.2.1
## ✔ dials 1.2.1 ✔ tune 1.1.2
## ✔ infer 1.0.6 ✔ workflows 1.1.4
## ✔ modeldata 1.3.0 ✔ workflowsets 1.0.1
## ✔ parsnip 1.2.0 ✔ yardstick 1.3.0
## ✔ recipes 1.0.10
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter() masks stats::filter()
## ✖ recipes::fixed() masks stringr::fixed()
## ✖ dplyr::lag() masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step() masks stats::step()
## • Search for functions across packages at https://www.tidymodels.org/find/
Dataset disponible en: https://archive.ics.uci.edu/dataset/360/air+quality
# Cargar el dataset
data <- read.csv("https://raw.githubusercontent.com/data-datum/datasets_ml/refs/heads/main/AirQualityUCI.csv", sep = ";", dec = ",", na.strings = c("NA", "-200"))
# Verificar nombres de columnas
names(data)
## [1] "Date" "Time" "CO.GT." "PT08.S1.CO."
## [5] "NMHC.GT." "C6H6.GT." "PT08.S2.NMHC." "NOx.GT."
## [9] "PT08.S3.NOx." "NO2.GT." "PT08.S4.NO2." "PT08.S5.O3."
## [13] "T" "RH" "AH" "X"
## [17] "X.1"
# Seleccionar las variables de interés y eliminar valores faltantes
data <- data %>%
select(PT08.S5.O3., T, RH, NOx.GT., NO2.GT., NMHC.GT.) %>%
drop_na()
set.seed(123) # Para reproducibilidad
split <- initial_split(data, prop = 0.8, strata = PT08.S5.O3.)
train_data <- training(split)
test_data <- testing(split)
# Crear la receta de preprocesamiento
rf_recipe <- recipe(PT08.S5.O3. ~ T + RH + NOx.GT. + NO2.GT. + NMHC.GT., data = train_data) %>%
step_normalize(all_predictors()) # Estandarizar las variables predictoras
# Definir el modelo de árbol de decisión
tree_model <- decision_tree(
cost_complexity = tune(), # Hiperparámetro para ajustar
tree_depth = tune(), # Hiperparámetro para ajustar
min_n = tune() # Hiperparámetro para ajustar
) %>%
set_engine("rpart") %>%
set_mode("regression")
# Crear un workflow que combine la receta y el modelo
workflow_tree <- workflow() %>%
add_recipe(rf_recipe) %>%
add_model(tree_model)
# Definir una cuadrícula para los hiperparámetros a explorar
tree_grid <- grid_regular(
cost_complexity(),
tree_depth(),
min_n(),
levels = 5 # Ajusta el número de niveles según tus necesidades
)
# Configurar la validación cruzada
set.seed(123)
folds <- vfold_cv(train_data, v = 5, strata = PT08.S5.O3.)
# Realizar el ajuste de hiperparámetros
tune_results <- tune_grid(
workflow_tree,
resamples = folds,
grid = tree_grid,
metrics = metric_set(rmse, rsq)
)
# Ver los resultados
collect_metrics(tune_results)
## # A tibble: 250 × 9
## cost_complexity tree_depth min_n .metric .estimator mean n std_err
## <dbl> <int> <int> <chr> <chr> <dbl> <int> <dbl>
## 1 0.0000000001 1 2 rmse standard 268. 5 8.71
## 2 0.0000000001 1 2 rsq standard 0.556 5 0.00811
## 3 0.0000000178 1 2 rmse standard 268. 5 8.71
## 4 0.0000000178 1 2 rsq standard 0.556 5 0.00811
## 5 0.00000316 1 2 rmse standard 268. 5 8.71
## 6 0.00000316 1 2 rsq standard 0.556 5 0.00811
## 7 0.000562 1 2 rmse standard 268. 5 8.71
## 8 0.000562 1 2 rsq standard 0.556 5 0.00811
## 9 0.1 1 2 rmse standard 268. 5 8.71
## 10 0.1 1 2 rsq standard 0.556 5 0.00811
## # ℹ 240 more rows
## # ℹ 1 more variable: .config <chr>
# Seleccionar el mejor conjunto de hiperparámetros
best_params <- select_best(tune_results, "rmse")
# Finalizar el modelo con los mejores hiperparámetros
final_workflow <- finalize_workflow(workflow_tree, best_params)
# Entrenar el modelo final en el conjunto de entrenamiento completo
final_model <- fit(final_workflow, data = train_data)
# Evaluar el modelo en el conjunto de prueba
predictions <- predict(final_model, new_data = test_data)
predictions %>%
head()
## # A tibble: 6 × 1
## .pred
## <dbl>
## 1 1168.
## 2 758
## 3 798.
## 4 1487.
## 5 1285.
## 6 798.
test_data_with_predictions <- bind_cols(test_data, predictions)
test_data_with_predictions
## PT08.S5.O3. T RH NOx.GT. NO2.GT. NMHC.GT. .pred
## 1 1268 13.6 48.9 166 113 150 1168.4615
## 2 972 13.3 47.7 103 92 112 758.0000
## 3 620 10.7 59.7 45 60 24 797.8824
## 4 1409 10.3 64.2 281 151 307 1486.9231
## 5 1285 9.1 64.0 240 136 197 1284.7500
## 6 552 8.2 60.8 47 53 26 797.8824
## 7 384 6.1 65.9 21 32 10 491.8333
## 8 748 6.4 65.1 109 104 33 873.2500
## 9 1061 16.3 35.7 163 123 169 1056.6667
## 10 1139 15.8 37.0 190 126 185 1487.2000
## 11 1112 15.9 37.2 178 120 165 1251.5000
## 12 1448 14.4 43.4 202 145 242 1487.2000
## 13 1144 9.1 63.9 139 97 67 873.2500
## 14 1192 6.5 71.6 256 96 132 1383.6250
## 15 1583 15.6 42.2 296 158 283 1842.7059
## 16 850 18.0 34.8 118 102 112 775.2857
## 17 947 18.4 33.6 119 116 108 839.5714
## 18 1612 15.8 42.4 277 165 258 1842.7059
## 19 1206 21.3 30.8 180 128 156 1251.5000
## 20 1905 17.6 46.1 325 173 341 1842.7059
## 21 1648 16.7 49.6 217 146 214 1487.2000
## 22 1235 11.3 70.2 228 89 107 1174.9333
## 23 1680 12.4 63.9 360 114 336 1757.5000
## 24 2122 20.4 42.5 404 187 685 2065.0000
## 25 1494 15.7 60.2 193 125 224 1487.2000
## 26 1272 14.1 65.7 127 103 155 975.2222
## 27 1040 14.8 60.6 79 88 49 1110.0769
## 28 1296 23.9 25.7 184 139 237 1251.5000
## 29 1432 21.3 34.8 181 137 261 1251.5000
## 30 1432 20.4 36.7 166 143 230 1056.6667
## 31 1362 16.7 48.9 130 117 88 1187.0667
## 32 808 12.7 57.9 54 60 17 639.7500
## 33 1734 17.7 40.1 280 134 368 1673.5714
## 34 1334 25.9 25.9 161 123 152 1184.1250
## 35 1569 25.9 16.0 299 158 386 1842.7059
## 36 828 12.0 58.9 37 57 20 639.7500
## 37 795 14.5 49.4 65 80 89 650.4615
## 38 735 13.8 51.4 53 72 73 650.4615
## 39 684 14.0 51.3 47 64 48 650.4615
## 40 832 10.8 62.5 76 79 74 698.4167
## 41 612 10.8 57.2 43 57 32 639.7500
## 42 872 16.6 35.6 126 106 159 901.6000
## 43 680 11.3 63.9 100 91 122 973.8182
## 44 1159 11.7 69.1 195 130 320 1174.9333
## 45 631 10.3 59.9 80 89 81 698.4167
## 46 479 9.1 68.1 38 54 38 664.0588
## 47 923 9.7 69.7 133 99 300 873.2500
## 48 819 14.2 48.4 132 96 211 1017.3636
## 49 1488 12.6 52.5 231 133 698 1284.7500
## 50 924 8.0 71.8 103 78 64 698.4167
## 51 974 8.5 66.3 132 90 88 873.2500
## 52 777 13.4 41.0 112 87 143 775.2857
## 53 944 10.4 48.0 120 98 267 1017.3636
## 54 778 9.8 50.8 99 91 97 758.0000
## 55 646 8.6 57.8 70 79 57 834.9444
## 56 1016 15.5 36.1 129 93 184 901.6000
## 57 766 19.1 26.8 106 96 232 630.5882
## 58 765 17.8 28.4 113 101 150 775.2857
## 59 1056 12.5 48.1 140 107 231 1111.3846
## 60 1003 11.8 50.8 102 96 125 758.0000
## 61 1073 12.5 50.1 133 83 127 1017.3636
## 62 417 10.6 48.0 46 58 40 361.1500
## 63 648 10.1 51.2 55 70 33 650.4615
## 64 705 11.8 37.6 118 98 147 775.2857
## 65 525 18.0 27.1 94 90 128 630.5882
## 66 802 16.1 31.3 126 114 299 987.1176
## 67 693 12.2 37.2 41 69 29 516.2727
## 68 1013 11.3 40.3 111 98 86 775.2857
## 69 1311 11.9 38.4 191 115 294 1284.7500
## 70 1689 16.3 28.8 283 150 695 1842.7059
## 71 1548 18.3 26.6 249 146 649 1647.9500
## 72 1354 22.4 20.0 200 128 546 1368.0000
## 73 926 17.3 34.7 112 99 151 775.2857
## 74 764 14.9 43.5 80 91 101 926.5455
## 75 678 14.8 40.8 47 70 66 516.2727
## 76 1004 12.2 70.0 140 109 156 975.2222
## 77 1054 13.6 69.2 171 114 295 1037.4211
## 78 1192 14.9 59.3 195 119 386 1421.0000
## 79 1282 19.2 39.6 192 113 375 1421.0000
## 80 1200 15.8 48.0 127 113 188 1187.0667
## 81 1067 14.1 54.4 126 104 157 1111.3846
## 82 737 11.3 72.9 56 66 51 664.0588
## 83 1510 13.6 65.3 249 139 798 1560.6667
## 84 1347 22.9 33.8 167 135 454 1056.6667
## 85 1138 24.7 30.5 152 126 391 1184.1250
## 86 1486 24.3 30.1 223 165 721 1532.0000
## 87 1399 18.5 43.0 180 146 415 1251.5000
## 88 1270 17.1 51.8 98 106 139 1110.0769
## 89 891 14.5 58.0 60 74 57 639.7500
## 90 1451 15.6 57.4 236 124 424 1421.0000
## 91 1353 19.7 45.4 194 132 710 1532.0000
## 92 1079 17.5 50.9 109 100 178 1187.0667
## 93 951 17.0 53.9 90 92 177 1110.0769
## 94 659 14.6 58.8 73 64 62 1110.0769
## 95 1806 16.2 52.5 302 129 1084 1673.5714
## 96 1064 20.4 34.7 167 116 405 1056.6667
## 97 684 18.6 36.9 115 113 220 908.6250
## 98 710 18.8 36.8 109 98 226 908.6250
## 99 923 17.5 40.6 143 116 456 987.1176
## 100 800 15.7 49.4 91 90 227 1110.0769
## 101 830 14.3 58.2 152 111 277 1168.4615
## 102 589 16.4 47.7 65 69 120 650.4615
## 103 518 9.8 71.8 62 71 62 664.0588
## 104 971 14.1 46.0 95 99 92 758.0000
## 105 975 12.0 66.9 83 79 77 698.4167
## 106 807 10.8 70.3 46 52 69 664.0588
## 107 842 10.2 73.4 81 55 89 895.2000
## 108 1098 12.4 64.0 138 76 167 1017.3636
## 109 498 19.8 23.1 90 81 108 630.5882
## 110 540 21.4 22.7 89 89 98 761.5385
## 111 740 18.3 31.6 122 111 165 839.5714
## 112 712 7.8 70.4 38 52 49 664.0588
## 113 368 22.2 18.9 26 35 54 436.0000
## 114 369 22.0 19.0 42 46 48 436.0000
## 115 498 15.6 37.5 63 78 65 516.2727
## 116 366 13.7 38.3 41 58 31 361.1500
## 117 274 13.9 30.9 16 25 9 361.1500
## 118 263 13.5 37.9 16 26 14 361.1500
## 119 367 13.5 50.1 39 53 36 361.1500
## 120 537 10.3 65.2 88 77 67 698.4167
## 121 697 10.6 59.5 127 93 79 1017.3636
## 122 474 10.3 58.0 61 68 49 639.7500
## 123 360 10.5 60.8 18 29 29 491.8333
## 124 873 10.0 68.7 149 99 163 1037.4211
## 125 911 12.6 50.8 133 93 144 1017.3636
## 126 791 15.1 42.7 116 84 140 775.2857
## 127 966 15.8 40.5 145 103 227 987.1176
## 128 998 10.3 65.6 113 71 116 975.2222
## 129 1638 13.5 53.4 309 114 536 1560.6667
## 130 948 11.9 74.8 63 69 70 664.0588
## 131 1188 15.3 51.4 186 105 251 1419.1765
## 132 1373 15.7 62.5 230 149 535 1368.0000
## 133 1105 12.8 81.0 171 99 171 1037.4211
## 134 1895 19.1 42.1 358 142 759 2065.0000
## 135 1684 23.7 31.0 245 142 394 1647.9500
## 136 1731 26.9 25.7 240 169 639 1532.0000
## 137 1402 29.7 20.6 179 142 541 1251.5000
## 138 1246 29.2 21.5 160 136 468 1184.1250
## 139 1757 24.9 30.8 250 164 872 1647.9500
## 140 1703 21.5 40.8 230 146 692 1532.0000
## 141 939 16.9 49.8 45 60 55 436.0000
## 142 1019 14.0 59.7 75 64 119 973.8182
## 143 1291 26.8 29.5 139 114 358 1080.3846
## 144 972 26.9 32.1 84 74 238 761.5385
## 145 880 25.6 36.0 91 89 192 761.5385
## 146 909 27.0 31.8 99 95 205 761.5385
## 147 1248 24.9 38.1 147 123 283 1080.3846
## 148 873 15.6 73.5 88 79 202 1110.0769
## 149 528 12.9 79.7 53 54 64 874.1000
## 150 403 16.6 38.6 61 56 44 436.0000
## 151 474 20.2 24.1 83 74 64 761.5385
## 152 955 14.8 39.5 116 93 347 908.6250
## 153 761 14.5 42.0 102 79 252 926.5455
## 154 517 14.0 43.8 70 72 60 965.7143
## 155 844 15.0 41.7 134 89 235 987.1176
## 156 720 16.3 38.4 113 84 89 775.2857
## 157 886 18.6 35.4 119 101 171 839.5714
## 158 1703 15.9 48.5 314 122 584 1673.5714
## 159 1109 28.2 20.4 150 107 411 1184.1250
## 160 1011 25.7 23.8 149 110 277 1184.1250
## 161 1155 24.5 27.8 172 127 377 1251.5000
## 162 1359 21.5 34.5 181 137 555 1251.5000
## 163 1692 16.1 54.8 274 100 588 1673.5714
## 164 1219 28.1 26.9 119 108 293 908.6250
## 165 1300 27.3 30.0 178 145 509 1251.5000
## 166 1716 23.1 42.2 222 155 816 1532.0000
## 167 1339 19.8 46.8 151 129 332 1244.2308
## 168 1204 23.8 30.2 145 118 220 1080.3846
## 169 1411 17.0 70.7 172 97 174 1419.1765
## 170 1318 22.7 45.8 185 121 265 1419.1765
## 171 1623 20.8 54.4 215 122 460 1421.0000
## 172 1789 17.8 66.8 185 110 275 1419.1765
metrics(test_data_with_predictions, truth = PT08.S5.O3., estimate = .pred)
## # A tibble: 3 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 rmse standard 167.
## 2 rsq standard 0.822
## 3 mae standard 133.
library(ggplot2)
# Crear el gráfico de valores reales vs valores predichos
ggplot(test_data_with_predictions, aes(x = PT08.S5.O3., y = .pred)) +
geom_point(color = "blue", alpha = 0.6) + # Puntos azules para las observaciones
geom_abline(slope = 1, intercept = 0, color = "red", linetype = "dashed") + # Línea de igualdad
labs(
title = "Predicted vs Real Values of PT08.S5(O3)",
x = "Real Values (PT08.S5(O3))",
y = "Predicted Values"
) +
theme_minimal()