Objetivo

Versión rápida del informe XGBoost para publicar en RPubs con menor tiempo de cómputo.

Paquetes

if (!require("pacman")) install.packages("pacman")
## Cargando paquete requerido: pacman
pacman::p_load(
  tidyverse,
  lubridate,
  xgboost,
  ggplot2
)
## Installing package into 'C:/Users/lucas/AppData/Local/R/win-library/4.5'
## (as 'lib' is unspecified)
## package 'xgboost' successfully unpacked and MD5 sums checked
## 
## The downloaded binary packages are in
##  C:\Users\lucas\AppData\Local\Temp\RtmpWEc4Rk\downloaded_packages
## 
## xgboost installed

Datos

url <- "https://raw.githubusercontent.com/vneumannufprbr/TrabajosRStudio/main/energy_dataset.csv"

data <- read.csv(url, stringsAsFactors = FALSE) %>%
  mutate(time = lubridate::ymd_hms(time)) %>%
  arrange(time) %>%
  select(time,
         generation.solar,
         generation.wind.onshore,
         total.load.actual) %>%
  na.omit()

Parámetros

targets <- c("generation.solar", "generation.wind.onshore", "total.load.actual")
window_size <- 24
test_size <- 24
forecast_horizon <- 24

Funciones

create_features <- function(serie, window) {
  n <- length(serie)
  stopifnot(n > window)
  X <- matrix(NA_real_, nrow = n - window, ncol = window)
  for (i in 1:window) X[, i] <- serie[i:(n - window + i - 1)]
  y <- serie[(window + 1):n]
  colnames(X) <- paste0("X", seq_len(window))
  as.data.frame(cbind(X, target = y))
}

safe_calculate_metrics <- function(actual, predicted) {
  if (length(actual) != length(predicted) || length(actual) < 2) {
    return(tibble::tibble(R2 = NA_real_, RMSE = NA_real_, MAE = NA_real_))
  }
  valid <- is.finite(actual) & is.finite(predicted)
  if (sum(valid) < 2) return(tibble::tibble(R2 = NA_real_, RMSE = NA_real_, MAE = NA_real_))
  a <- actual[valid]; p <- predicted[valid]
  rmse <- sqrt(mean((a - p)^2))
  mae  <- mean(abs(a - p))
  ss_res <- sum((a - p)^2)
  ss_tot <- sum((a - mean(a))^2)
  r2 <- ifelse(ss_tot < .Machine$double.eps, NA_real_, 1 - ss_res/ss_tot)
  tibble::tibble(R2 = r2, RMSE = rmse, MAE = mae)
}

Entrenamiento y pronóstico (rápido)

set.seed(1912)

results_forecast <- list()
results_metrics  <- list()

for (target_var in targets) {
  serie <- data[[target_var]] %>% as.numeric()
  n <- length(serie)
  train_series <- serie[1:(n - test_size)]
  test_series  <- serie[(n - test_size + 1):n]

  train_data <- create_features(train_series, window_size)
  test_data  <- create_features(c(tail(train_series, window_size), test_series), window_size)

  dtrain <- xgboost::xgb.DMatrix(as.matrix(dplyr::select(train_data, -target)),
                                 label = train_data$target)
  dtest  <- xgboost::xgb.DMatrix(as.matrix(dplyr::select(test_data, -target)),
                                 label = test_data$target)

  params <- list(
    objective = "reg:squarederror",
    eval_metric = "rmse",
    eta = 0.1,
    max_depth = 4,
    subsample = 0.9,
    colsample_bytree = 0.9,
    min_child_weight = 1
  )

  watchlist <- list(train = dtrain, eval = dtest)

  xgb_model <- xgboost::xgb.train(
    params = params,
    data = dtrain,
    nrounds = 400,
    watchlist = watchlist,
    early_stopping_rounds = 20,
    verbose = 0
  )

  test_preds <- predict(xgb_model, dtest)
  results_metrics[[target_var]] <- safe_calculate_metrics(test_data$target, test_preds) %>%
    dplyr::mutate(Variable = target_var, Conjunto = "Test", Modelo = "XGBoost (Fast)")

  full_data <- create_features(serie, window_size)
  dfull <- xgboost::xgb.DMatrix(as.matrix(dplyr::select(full_data, -target)),
                                label = full_data$target)

  xgb_full <- xgboost::xgb.train(
    params = params,
    data = dfull,
    nrounds = xgb_model$best_iteration,
    verbose = 0
  )

  last_window <- tail(serie, window_size)
  future_preds <- numeric(forecast_horizon)
  for (i in 1:forecast_horizon) {
    input <- matrix(last_window, nrow = 1)
    colnames(input) <- paste0("X", seq_len(window_size))
    dinput <- xgboost::xgb.DMatrix(as.matrix(input))
    pred <- predict(xgb_full, dinput)
    future_preds[i] <- as.numeric(pred)
    last_window <- c(last_window[-1], pred)
  }
  results_forecast[[target_var]] <- future_preds
}

metrics_df <- dplyr::bind_rows(results_metrics) %>%
  dplyr::select(Modelo, Variable, Conjunto, R2, RMSE, MAE) %>%
  dplyr::mutate(
    Variable = dplyr::recode(Variable,
      "generation.solar" = "Solar",
      "generation.wind.onshore" = "Eólica",
      "total.load.actual" = "Carga"
    )
  )

metrics_df

Pronóstico 24h

last_time <- max(data$time)
future_dates <- seq(from = last_time + lubridate::hours(1),
                    by = "1 hour",
                    length.out = forecast_horizon)

forecast_df <- tibble::tibble(
  time = rep(future_dates, times = length(targets)),
  variable = rep(c("Solar", "Eólica", "Carga"), each = forecast_horizon),
  value = c(results_forecast[["generation.solar"]],
            results_forecast[["generation.wind.onshore"]],
            results_forecast[["total.load.actual"]])
)

ggplot(forecast_df, aes(x = time, y = value, color = variable)) +
  geom_line(linewidth = 0.9) +
  facet_wrap(~variable, scales = "free_y", ncol = 1) +
  labs(title = "Pronóstico 24 horas con XGBoost (Versión Rápida)",
       x = "Tiempo", y = "Valor") +
  theme_minimal(base_size = 13) +
  theme(legend.position = "none")

Métricas (rápido)

metrics_long <- tidyr::pivot_longer(metrics_df, c(R2, RMSE, MAE),
                                    names_to = "Métrica", values_to = "Valor")
ggplot(metrics_long, aes(x = Variable, y = Valor, fill = Métrica)) +
  geom_col(position = "dodge") +
  facet_wrap(~Métrica, scales = "free_y") +
  labs(title = "Métricas en Test - XGBoost (Versión Rápida)", x = NULL, y = NULL) +
  theme_minimal(base_size = 13)