Evaluación de la calidad del aire con dataset AirQuality

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.0     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.1.1 ──
## ✔ broom        1.0.5      ✔ rsample      1.2.1 
## ✔ dials        1.2.1      ✔ tune         1.1.2 
## ✔ infer        1.0.6      ✔ workflows    1.1.4 
## ✔ modeldata    1.3.0      ✔ workflowsets 1.0.1 
## ✔ parsnip      1.2.0      ✔ yardstick    1.3.0 
## ✔ recipes      1.0.10     
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter()   masks stats::filter()
## ✖ recipes::fixed()  masks stringr::fixed()
## ✖ dplyr::lag()      masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step()   masks stats::step()
## • Search for functions across packages at https://www.tidymodels.org/find/

Dataset disponible en: https://archive.ics.uci.edu/dataset/360/air+quality

# Cargar el dataset
data <- read.csv("https://raw.githubusercontent.com/data-datum/datasets_ml/refs/heads/main/AirQualityUCI.csv", sep = ";", dec = ",", na.strings = c("NA", "-200"))
# Verificar nombres de columnas
names(data)
##  [1] "Date"          "Time"          "CO.GT."        "PT08.S1.CO."  
##  [5] "NMHC.GT."      "C6H6.GT."      "PT08.S2.NMHC." "NOx.GT."      
##  [9] "PT08.S3.NOx."  "NO2.GT."       "PT08.S4.NO2."  "PT08.S5.O3."  
## [13] "T"             "RH"            "AH"            "X"            
## [17] "X.1"
# Seleccionar las variables de interés y eliminar valores faltantes
data <- data %>%
  select(PT08.S5.O3., T, RH, NOx.GT., NO2.GT., NMHC.GT.) %>%
  drop_na()
set.seed(123)  # Para reproducibilidad
split <- initial_split(data, prop = 0.8, strata = PT08.S5.O3.)
train_data <- training(split)
test_data <- testing(split)
# Crear la receta de preprocesamiento
rf_recipe <- recipe(PT08.S5.O3. ~ T + RH + NOx.GT. + NO2.GT. + NMHC.GT., data = train_data) %>%
  step_normalize(all_predictors())  # Estandarizar las variables predictoras
# Definir el modelo de árbol de decisión
tree_model <- decision_tree(
  cost_complexity = tune(),      # Hiperparámetro para ajustar
  tree_depth = tune(),            # Hiperparámetro para ajustar
  min_n = tune()                  # Hiperparámetro para ajustar
) %>%
  set_engine("rpart") %>%
  set_mode("regression")
# Crear un workflow que combine la receta y el modelo
workflow_tree <- workflow() %>%
  add_recipe(rf_recipe) %>%
  add_model(tree_model)
# Definir una cuadrícula para los hiperparámetros a explorar
tree_grid <- grid_regular(
  cost_complexity(),
  tree_depth(),
  min_n(),
  levels = 5  # Ajusta el número de niveles según tus necesidades
)
# Configurar la validación cruzada
set.seed(123)
folds <- vfold_cv(train_data, v = 5, strata = PT08.S5.O3.)
# Realizar el ajuste de hiperparámetros
tune_results <- tune_grid(
  workflow_tree,
  resamples = folds,
  grid = tree_grid,
  metrics = metric_set(rmse, rsq)
)
# Ver los resultados
collect_metrics(tune_results)
## # A tibble: 250 × 9
##    cost_complexity tree_depth min_n .metric .estimator    mean     n std_err
##              <dbl>      <int> <int> <chr>   <chr>        <dbl> <int>   <dbl>
##  1    0.0000000001          1     2 rmse    standard   268.        5 8.71   
##  2    0.0000000001          1     2 rsq     standard     0.556     5 0.00811
##  3    0.0000000178          1     2 rmse    standard   268.        5 8.71   
##  4    0.0000000178          1     2 rsq     standard     0.556     5 0.00811
##  5    0.00000316            1     2 rmse    standard   268.        5 8.71   
##  6    0.00000316            1     2 rsq     standard     0.556     5 0.00811
##  7    0.000562              1     2 rmse    standard   268.        5 8.71   
##  8    0.000562              1     2 rsq     standard     0.556     5 0.00811
##  9    0.1                   1     2 rmse    standard   268.        5 8.71   
## 10    0.1                   1     2 rsq     standard     0.556     5 0.00811
## # ℹ 240 more rows
## # ℹ 1 more variable: .config <chr>
# Seleccionar el mejor conjunto de hiperparámetros
best_params <- select_best(tune_results, "rmse")

# Finalizar el modelo con los mejores hiperparámetros
final_workflow <- finalize_workflow(workflow_tree, best_params)

# Entrenar el modelo final en el conjunto de entrenamiento completo
final_model <- fit(final_workflow, data = train_data)
# Evaluar el modelo en el conjunto de prueba
predictions <- predict(final_model, new_data = test_data)
predictions %>%
  head()
## # A tibble: 6 × 1
##   .pred
##   <dbl>
## 1 1168.
## 2  758 
## 3  798.
## 4 1487.
## 5 1285.
## 6  798.
test_data_with_predictions <- bind_cols(test_data, predictions)
test_data_with_predictions
##     PT08.S5.O3.    T   RH NOx.GT. NO2.GT. NMHC.GT.     .pred
## 1          1268 13.6 48.9     166     113      150 1168.4615
## 2           972 13.3 47.7     103      92      112  758.0000
## 3           620 10.7 59.7      45      60       24  797.8824
## 4          1409 10.3 64.2     281     151      307 1486.9231
## 5          1285  9.1 64.0     240     136      197 1284.7500
## 6           552  8.2 60.8      47      53       26  797.8824
## 7           384  6.1 65.9      21      32       10  491.8333
## 8           748  6.4 65.1     109     104       33  873.2500
## 9          1061 16.3 35.7     163     123      169 1056.6667
## 10         1139 15.8 37.0     190     126      185 1487.2000
## 11         1112 15.9 37.2     178     120      165 1251.5000
## 12         1448 14.4 43.4     202     145      242 1487.2000
## 13         1144  9.1 63.9     139      97       67  873.2500
## 14         1192  6.5 71.6     256      96      132 1383.6250
## 15         1583 15.6 42.2     296     158      283 1842.7059
## 16          850 18.0 34.8     118     102      112  775.2857
## 17          947 18.4 33.6     119     116      108  839.5714
## 18         1612 15.8 42.4     277     165      258 1842.7059
## 19         1206 21.3 30.8     180     128      156 1251.5000
## 20         1905 17.6 46.1     325     173      341 1842.7059
## 21         1648 16.7 49.6     217     146      214 1487.2000
## 22         1235 11.3 70.2     228      89      107 1174.9333
## 23         1680 12.4 63.9     360     114      336 1757.5000
## 24         2122 20.4 42.5     404     187      685 2065.0000
## 25         1494 15.7 60.2     193     125      224 1487.2000
## 26         1272 14.1 65.7     127     103      155  975.2222
## 27         1040 14.8 60.6      79      88       49 1110.0769
## 28         1296 23.9 25.7     184     139      237 1251.5000
## 29         1432 21.3 34.8     181     137      261 1251.5000
## 30         1432 20.4 36.7     166     143      230 1056.6667
## 31         1362 16.7 48.9     130     117       88 1187.0667
## 32          808 12.7 57.9      54      60       17  639.7500
## 33         1734 17.7 40.1     280     134      368 1673.5714
## 34         1334 25.9 25.9     161     123      152 1184.1250
## 35         1569 25.9 16.0     299     158      386 1842.7059
## 36          828 12.0 58.9      37      57       20  639.7500
## 37          795 14.5 49.4      65      80       89  650.4615
## 38          735 13.8 51.4      53      72       73  650.4615
## 39          684 14.0 51.3      47      64       48  650.4615
## 40          832 10.8 62.5      76      79       74  698.4167
## 41          612 10.8 57.2      43      57       32  639.7500
## 42          872 16.6 35.6     126     106      159  901.6000
## 43          680 11.3 63.9     100      91      122  973.8182
## 44         1159 11.7 69.1     195     130      320 1174.9333
## 45          631 10.3 59.9      80      89       81  698.4167
## 46          479  9.1 68.1      38      54       38  664.0588
## 47          923  9.7 69.7     133      99      300  873.2500
## 48          819 14.2 48.4     132      96      211 1017.3636
## 49         1488 12.6 52.5     231     133      698 1284.7500
## 50          924  8.0 71.8     103      78       64  698.4167
## 51          974  8.5 66.3     132      90       88  873.2500
## 52          777 13.4 41.0     112      87      143  775.2857
## 53          944 10.4 48.0     120      98      267 1017.3636
## 54          778  9.8 50.8      99      91       97  758.0000
## 55          646  8.6 57.8      70      79       57  834.9444
## 56         1016 15.5 36.1     129      93      184  901.6000
## 57          766 19.1 26.8     106      96      232  630.5882
## 58          765 17.8 28.4     113     101      150  775.2857
## 59         1056 12.5 48.1     140     107      231 1111.3846
## 60         1003 11.8 50.8     102      96      125  758.0000
## 61         1073 12.5 50.1     133      83      127 1017.3636
## 62          417 10.6 48.0      46      58       40  361.1500
## 63          648 10.1 51.2      55      70       33  650.4615
## 64          705 11.8 37.6     118      98      147  775.2857
## 65          525 18.0 27.1      94      90      128  630.5882
## 66          802 16.1 31.3     126     114      299  987.1176
## 67          693 12.2 37.2      41      69       29  516.2727
## 68         1013 11.3 40.3     111      98       86  775.2857
## 69         1311 11.9 38.4     191     115      294 1284.7500
## 70         1689 16.3 28.8     283     150      695 1842.7059
## 71         1548 18.3 26.6     249     146      649 1647.9500
## 72         1354 22.4 20.0     200     128      546 1368.0000
## 73          926 17.3 34.7     112      99      151  775.2857
## 74          764 14.9 43.5      80      91      101  926.5455
## 75          678 14.8 40.8      47      70       66  516.2727
## 76         1004 12.2 70.0     140     109      156  975.2222
## 77         1054 13.6 69.2     171     114      295 1037.4211
## 78         1192 14.9 59.3     195     119      386 1421.0000
## 79         1282 19.2 39.6     192     113      375 1421.0000
## 80         1200 15.8 48.0     127     113      188 1187.0667
## 81         1067 14.1 54.4     126     104      157 1111.3846
## 82          737 11.3 72.9      56      66       51  664.0588
## 83         1510 13.6 65.3     249     139      798 1560.6667
## 84         1347 22.9 33.8     167     135      454 1056.6667
## 85         1138 24.7 30.5     152     126      391 1184.1250
## 86         1486 24.3 30.1     223     165      721 1532.0000
## 87         1399 18.5 43.0     180     146      415 1251.5000
## 88         1270 17.1 51.8      98     106      139 1110.0769
## 89          891 14.5 58.0      60      74       57  639.7500
## 90         1451 15.6 57.4     236     124      424 1421.0000
## 91         1353 19.7 45.4     194     132      710 1532.0000
## 92         1079 17.5 50.9     109     100      178 1187.0667
## 93          951 17.0 53.9      90      92      177 1110.0769
## 94          659 14.6 58.8      73      64       62 1110.0769
## 95         1806 16.2 52.5     302     129     1084 1673.5714
## 96         1064 20.4 34.7     167     116      405 1056.6667
## 97          684 18.6 36.9     115     113      220  908.6250
## 98          710 18.8 36.8     109      98      226  908.6250
## 99          923 17.5 40.6     143     116      456  987.1176
## 100         800 15.7 49.4      91      90      227 1110.0769
## 101         830 14.3 58.2     152     111      277 1168.4615
## 102         589 16.4 47.7      65      69      120  650.4615
## 103         518  9.8 71.8      62      71       62  664.0588
## 104         971 14.1 46.0      95      99       92  758.0000
## 105         975 12.0 66.9      83      79       77  698.4167
## 106         807 10.8 70.3      46      52       69  664.0588
## 107         842 10.2 73.4      81      55       89  895.2000
## 108        1098 12.4 64.0     138      76      167 1017.3636
## 109         498 19.8 23.1      90      81      108  630.5882
## 110         540 21.4 22.7      89      89       98  761.5385
## 111         740 18.3 31.6     122     111      165  839.5714
## 112         712  7.8 70.4      38      52       49  664.0588
## 113         368 22.2 18.9      26      35       54  436.0000
## 114         369 22.0 19.0      42      46       48  436.0000
## 115         498 15.6 37.5      63      78       65  516.2727
## 116         366 13.7 38.3      41      58       31  361.1500
## 117         274 13.9 30.9      16      25        9  361.1500
## 118         263 13.5 37.9      16      26       14  361.1500
## 119         367 13.5 50.1      39      53       36  361.1500
## 120         537 10.3 65.2      88      77       67  698.4167
## 121         697 10.6 59.5     127      93       79 1017.3636
## 122         474 10.3 58.0      61      68       49  639.7500
## 123         360 10.5 60.8      18      29       29  491.8333
## 124         873 10.0 68.7     149      99      163 1037.4211
## 125         911 12.6 50.8     133      93      144 1017.3636
## 126         791 15.1 42.7     116      84      140  775.2857
## 127         966 15.8 40.5     145     103      227  987.1176
## 128         998 10.3 65.6     113      71      116  975.2222
## 129        1638 13.5 53.4     309     114      536 1560.6667
## 130         948 11.9 74.8      63      69       70  664.0588
## 131        1188 15.3 51.4     186     105      251 1419.1765
## 132        1373 15.7 62.5     230     149      535 1368.0000
## 133        1105 12.8 81.0     171      99      171 1037.4211
## 134        1895 19.1 42.1     358     142      759 2065.0000
## 135        1684 23.7 31.0     245     142      394 1647.9500
## 136        1731 26.9 25.7     240     169      639 1532.0000
## 137        1402 29.7 20.6     179     142      541 1251.5000
## 138        1246 29.2 21.5     160     136      468 1184.1250
## 139        1757 24.9 30.8     250     164      872 1647.9500
## 140        1703 21.5 40.8     230     146      692 1532.0000
## 141         939 16.9 49.8      45      60       55  436.0000
## 142        1019 14.0 59.7      75      64      119  973.8182
## 143        1291 26.8 29.5     139     114      358 1080.3846
## 144         972 26.9 32.1      84      74      238  761.5385
## 145         880 25.6 36.0      91      89      192  761.5385
## 146         909 27.0 31.8      99      95      205  761.5385
## 147        1248 24.9 38.1     147     123      283 1080.3846
## 148         873 15.6 73.5      88      79      202 1110.0769
## 149         528 12.9 79.7      53      54       64  874.1000
## 150         403 16.6 38.6      61      56       44  436.0000
## 151         474 20.2 24.1      83      74       64  761.5385
## 152         955 14.8 39.5     116      93      347  908.6250
## 153         761 14.5 42.0     102      79      252  926.5455
## 154         517 14.0 43.8      70      72       60  965.7143
## 155         844 15.0 41.7     134      89      235  987.1176
## 156         720 16.3 38.4     113      84       89  775.2857
## 157         886 18.6 35.4     119     101      171  839.5714
## 158        1703 15.9 48.5     314     122      584 1673.5714
## 159        1109 28.2 20.4     150     107      411 1184.1250
## 160        1011 25.7 23.8     149     110      277 1184.1250
## 161        1155 24.5 27.8     172     127      377 1251.5000
## 162        1359 21.5 34.5     181     137      555 1251.5000
## 163        1692 16.1 54.8     274     100      588 1673.5714
## 164        1219 28.1 26.9     119     108      293  908.6250
## 165        1300 27.3 30.0     178     145      509 1251.5000
## 166        1716 23.1 42.2     222     155      816 1532.0000
## 167        1339 19.8 46.8     151     129      332 1244.2308
## 168        1204 23.8 30.2     145     118      220 1080.3846
## 169        1411 17.0 70.7     172      97      174 1419.1765
## 170        1318 22.7 45.8     185     121      265 1419.1765
## 171        1623 20.8 54.4     215     122      460 1421.0000
## 172        1789 17.8 66.8     185     110      275 1419.1765
metrics(test_data_with_predictions, truth = PT08.S5.O3., estimate = .pred)
## # A tibble: 3 × 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rmse    standard     167.   
## 2 rsq     standard       0.822
## 3 mae     standard     133.
library(ggplot2)

# Crear el gráfico de valores reales vs valores predichos
ggplot(test_data_with_predictions, aes(x = PT08.S5.O3., y = .pred)) +
  geom_point(color = "blue", alpha = 0.6) +  # Puntos azules para las observaciones
  geom_abline(slope = 1, intercept = 0, color = "red", linetype = "dashed") +  # Línea de igualdad
  labs(
    title = "Predicted vs Real Values of PT08.S5(O3)",
    x = "Real Values (PT08.S5(O3))",
    y = "Predicted Values"
  ) +
  theme_minimal()