Versión rápida del informe XGBoost para publicar en RPubs con menor tiempo de cómputo.
if (!require("pacman")) install.packages("pacman")
## Cargando paquete requerido: pacman
pacman::p_load(
tidyverse,
lubridate,
xgboost,
ggplot2
)
## Installing package into 'C:/Users/lucas/AppData/Local/R/win-library/4.5'
## (as 'lib' is unspecified)
## package 'xgboost' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\lucas\AppData\Local\Temp\RtmpWEc4Rk\downloaded_packages
##
## xgboost installed
url <- "https://raw.githubusercontent.com/vneumannufprbr/TrabajosRStudio/main/energy_dataset.csv"
data <- read.csv(url, stringsAsFactors = FALSE) %>%
mutate(time = lubridate::ymd_hms(time)) %>%
arrange(time) %>%
select(time,
generation.solar,
generation.wind.onshore,
total.load.actual) %>%
na.omit()
targets <- c("generation.solar", "generation.wind.onshore", "total.load.actual")
window_size <- 24
test_size <- 24
forecast_horizon <- 24
create_features <- function(serie, window) {
n <- length(serie)
stopifnot(n > window)
X <- matrix(NA_real_, nrow = n - window, ncol = window)
for (i in 1:window) X[, i] <- serie[i:(n - window + i - 1)]
y <- serie[(window + 1):n]
colnames(X) <- paste0("X", seq_len(window))
as.data.frame(cbind(X, target = y))
}
safe_calculate_metrics <- function(actual, predicted) {
if (length(actual) != length(predicted) || length(actual) < 2) {
return(tibble::tibble(R2 = NA_real_, RMSE = NA_real_, MAE = NA_real_))
}
valid <- is.finite(actual) & is.finite(predicted)
if (sum(valid) < 2) return(tibble::tibble(R2 = NA_real_, RMSE = NA_real_, MAE = NA_real_))
a <- actual[valid]; p <- predicted[valid]
rmse <- sqrt(mean((a - p)^2))
mae <- mean(abs(a - p))
ss_res <- sum((a - p)^2)
ss_tot <- sum((a - mean(a))^2)
r2 <- ifelse(ss_tot < .Machine$double.eps, NA_real_, 1 - ss_res/ss_tot)
tibble::tibble(R2 = r2, RMSE = rmse, MAE = mae)
}
set.seed(1912)
results_forecast <- list()
results_metrics <- list()
for (target_var in targets) {
serie <- data[[target_var]] %>% as.numeric()
n <- length(serie)
train_series <- serie[1:(n - test_size)]
test_series <- serie[(n - test_size + 1):n]
train_data <- create_features(train_series, window_size)
test_data <- create_features(c(tail(train_series, window_size), test_series), window_size)
dtrain <- xgboost::xgb.DMatrix(as.matrix(dplyr::select(train_data, -target)),
label = train_data$target)
dtest <- xgboost::xgb.DMatrix(as.matrix(dplyr::select(test_data, -target)),
label = test_data$target)
params <- list(
objective = "reg:squarederror",
eval_metric = "rmse",
eta = 0.1,
max_depth = 4,
subsample = 0.9,
colsample_bytree = 0.9,
min_child_weight = 1
)
watchlist <- list(train = dtrain, eval = dtest)
xgb_model <- xgboost::xgb.train(
params = params,
data = dtrain,
nrounds = 400,
watchlist = watchlist,
early_stopping_rounds = 20,
verbose = 0
)
test_preds <- predict(xgb_model, dtest)
results_metrics[[target_var]] <- safe_calculate_metrics(test_data$target, test_preds) %>%
dplyr::mutate(Variable = target_var, Conjunto = "Test", Modelo = "XGBoost (Fast)")
full_data <- create_features(serie, window_size)
dfull <- xgboost::xgb.DMatrix(as.matrix(dplyr::select(full_data, -target)),
label = full_data$target)
xgb_full <- xgboost::xgb.train(
params = params,
data = dfull,
nrounds = xgb_model$best_iteration,
verbose = 0
)
last_window <- tail(serie, window_size)
future_preds <- numeric(forecast_horizon)
for (i in 1:forecast_horizon) {
input <- matrix(last_window, nrow = 1)
colnames(input) <- paste0("X", seq_len(window_size))
dinput <- xgboost::xgb.DMatrix(as.matrix(input))
pred <- predict(xgb_full, dinput)
future_preds[i] <- as.numeric(pred)
last_window <- c(last_window[-1], pred)
}
results_forecast[[target_var]] <- future_preds
}
metrics_df <- dplyr::bind_rows(results_metrics) %>%
dplyr::select(Modelo, Variable, Conjunto, R2, RMSE, MAE) %>%
dplyr::mutate(
Variable = dplyr::recode(Variable,
"generation.solar" = "Solar",
"generation.wind.onshore" = "Eólica",
"total.load.actual" = "Carga"
)
)
metrics_df
last_time <- max(data$time)
future_dates <- seq(from = last_time + lubridate::hours(1),
by = "1 hour",
length.out = forecast_horizon)
forecast_df <- tibble::tibble(
time = rep(future_dates, times = length(targets)),
variable = rep(c("Solar", "Eólica", "Carga"), each = forecast_horizon),
value = c(results_forecast[["generation.solar"]],
results_forecast[["generation.wind.onshore"]],
results_forecast[["total.load.actual"]])
)
ggplot(forecast_df, aes(x = time, y = value, color = variable)) +
geom_line(linewidth = 0.9) +
facet_wrap(~variable, scales = "free_y", ncol = 1) +
labs(title = "Pronóstico 24 horas con XGBoost (Versión Rápida)",
x = "Tiempo", y = "Valor") +
theme_minimal(base_size = 13) +
theme(legend.position = "none")
metrics_long <- tidyr::pivot_longer(metrics_df, c(R2, RMSE, MAE),
names_to = "Métrica", values_to = "Valor")
ggplot(metrics_long, aes(x = Variable, y = Valor, fill = Métrica)) +
geom_col(position = "dodge") +
facet_wrap(~Métrica, scales = "free_y") +
labs(title = "Métricas en Test - XGBoost (Versión Rápida)", x = NULL, y = NULL) +
theme_minimal(base_size = 13)