Objetivo

Versión rápida del informe RF: mismos resultados esperados (R², RMSE, MAE, pronóstico 24h), pero con menos costo computacional para publicar en RPubs.

Paquetes

if (!require("pacman")) install.packages("pacman")
## Cargando paquete requerido: pacman
pacman::p_load(
  tidyverse,
  lubridate,
  caret,
  randomForest,
  ggplot2
)

Datos

url <- "https://raw.githubusercontent.com/vneumannufprbr/TrabajosRStudio/main/energy_dataset.csv"

data <- read.csv(url, stringsAsFactors = FALSE) %>%
  mutate(time = lubridate::ymd_hms(time)) %>%
  arrange(time) %>%
  select(time,
         generation.solar,
         generation.wind.onshore,
         total.load.actual) %>%
  na.omit()

Parámetros

targets <- c("generation.solar", "generation.wind.onshore", "total.load.actual")
window_size <- 24
test_size <- 24
forecast_horizon <- 24

Funciones

create_features <- function(serie, window) {
  n <- length(serie)
  stopifnot(n > window)
  X <- matrix(NA_real_, nrow = n - window, ncol = window)
  for (i in 1:window) X[, i] <- serie[i:(n - window + i - 1)]
  y <- serie[(window + 1):n]
  colnames(X) <- paste0("X", seq_len(window))
  as.data.frame(cbind(X, target = y))
}

safe_calculate_metrics <- function(actual, predicted) {
  if (length(actual) != length(predicted) || length(actual) < 2) {
    return(tibble::tibble(R2 = NA_real_, RMSE = NA_real_, MAE = NA_real_))
  }
  valid <- is.finite(actual) & is.finite(predicted)
  if (sum(valid) < 2) return(tibble::tibble(R2 = NA_real_, RMSE = NA_real_, MAE = NA_real_))
  a <- actual[valid]; p <- predicted[valid]
  rmse <- sqrt(mean((a - p)^2))
  mae  <- mean(abs(a - p))
  ss_res <- sum((a - p)^2)
  ss_tot <- sum((a - mean(a))^2)
  r2 <- ifelse(ss_tot < .Machine$double.eps, NA_real_, 1 - ss_res/ss_tot)
  tibble::tibble(R2 = r2, RMSE = rmse, MAE = mae)
}

Entrenamiento y pronóstico (rápido)

set.seed(1912)

# CV más liviana
train_control <- trainControl(
  method = "cv",
  number = 3,
  allowParallel = TRUE
)

results_forecast <- list()
results_metrics  <- list()

for (target_var in targets) {
  serie <- data[[target_var]] %>% as.numeric()
  n <- length(serie)
  train_series <- serie[1:(n - test_size)]
  test_series  <- serie[(n - test_size + 1):n]

  train_data <- create_features(train_series, window_size)
  test_data  <- create_features(c(tail(train_series, window_size), test_series), window_size)

  # ntree más bajo
  rf_model <- train(
    target ~ .,
    data = train_data,
    method = "rf",
    trControl = train_control,
    ntree = 100
  )

  test_preds <- predict(rf_model, newdata = dplyr::select(test_data, -target))
  results_metrics[[target_var]] <- safe_calculate_metrics(test_data$target, test_preds) %>%
    dplyr::mutate(Variable = target_var, Conjunto = "Test", Modelo = "RandomForest (Fast)")

  full_data <- create_features(serie, window_size)
  rf_full <- randomForest::randomForest(
    x = dplyr::select(full_data, -target),
    y = full_data$target,
    ntree = 100,
    mtry = rf_model$bestTune$mtry
  )

  last_window <- tail(serie, window_size)
  future_preds <- numeric(forecast_horizon)
  for (i in 1:forecast_horizon) {
    input <- as.data.frame(t(last_window))
    colnames(input) <- paste0("X", seq_len(window_size))
    pred <- predict(rf_full, newdata = input)
    future_preds[i] <- as.numeric(pred)
    last_window <- c(last_window[-1], pred)
  }
  results_forecast[[target_var]] <- future_preds
}

metrics_df <- dplyr::bind_rows(results_metrics) %>%
  dplyr::select(Modelo, Variable, Conjunto, R2, RMSE, MAE) %>%
  dplyr::mutate(
    Variable = dplyr::recode(Variable,
      "generation.solar" = "Solar",
      "generation.wind.onshore" = "Eólica",
      "total.load.actual" = "Carga"
    )
  )

metrics_df

Pronóstico 24h

last_time <- max(data$time)
future_dates <- seq(from = last_time + lubridate::hours(1),
                    by = "1 hour",
                    length.out = forecast_horizon)

forecast_df <- tibble::tibble(
  time = rep(future_dates, times = length(targets)),
  variable = rep(c("Solar", "Eólica", "Carga"), each = forecast_horizon),
  value = c(results_forecast[["generation.solar"]],
            results_forecast[["generation.wind.onshore"]],
            results_forecast[["total.load.actual"]])
)

ggplot(forecast_df, aes(x = time, y = value, color = variable)) +
  geom_line(linewidth = 0.9) +
  facet_wrap(~variable, scales = "free_y", ncol = 1) +
  labs(title = "Pronóstico 24 horas con Random Forest (Versión Rápida)",
       x = "Tiempo", y = "Valor") +
  theme_minimal(base_size = 13) +
  theme(legend.position = "none")

Métricas (rápido)

metrics_long <- tidyr::pivot_longer(metrics_df, c(R2, RMSE, MAE),
                                    names_to = "Métrica", values_to = "Valor")
ggplot(metrics_long, aes(x = Variable, y = Valor, fill = Métrica)) +
  geom_col(position = "dodge") +
  facet_wrap(~Métrica, scales = "free_y") +
  labs(title = "Métricas en Test - RF (Versión Rápida)", x = NULL, y = NULL) +
  theme_minimal(base_size = 13)