Oficina de R

previsão de séries temporais usando algoritmo Boosting

Autor

Hudson Torrent

Data de Publicação

10 de abril de 2023

Método Boosting

Vamos considerar aqui o método Component-wise Boosting.
Esse método pode ser interessante em problemas com um grande número de regressores e/ou com um número não tão grande de observações.
O algoritmo pode ser descrito (Lindenmeyer, Skorin, e Torrent (2021)), no caso particular de regressão linear, da seguinte forma:

Vale ressaltar que \hat{f} poderia ser um estimador não-linear, por exemplo, splines.
Note que, não necessariamente, todos os regressores disponíveis serão selecionados.
O parâmetro v, geralmente, é igual a 0.1. Esse parâmetro controla o incremento a cada iteração.
No Boosting desejamos utilizar um weak learner. O parâmetro v tem papel central nesse ponto.

Escolha de m

Um outro ponto importante é a escolha do número de iterações, m.
No contexto de séries temporais, é indicado que se use o AIC ou BIC.
Aqui, vamos usar o AIC corrigido:

O pacote `mboost`

Vamos analisar uma função que implementa o algoritmo Boosting para o caso de regressão.

library(mboost)

f_boost_simple <- function(
  x, # matriz dos regressores
  y, # vetor com a variável dependente
  eta = 0.1, # parâmetro de amortecimento
  num_ite = 100 # número de iterações
) {
  require(mboost)

  x <- as.matrix(x)

  reg_opt <- glmboost(
    y = as.vector(y),
    x = x,
    offset = 0,
    center = TRUE,
    control = boost_control(
      mstop = num_ite,
      nu = eta
    )
  )

  aic <- AIC(reg_opt, method = "corrected")
  aic_seq <- attributes(aic)$AIC
  m_opt <- min(
    c(
      which(diff(aic_seq) > 0)[1],
      which.min(aic_seq)
    ),
    na.rm = TRUE
  )
  reg_opt[m_opt]

  df_opt <- attributes(aic)$df[m_opt]

  coef_opt_aux <- mboost::extract(reg_opt, what = "coefficients")
  coef_opt <- rep(0, ncol(x))
  names(coef_opt) <- colnames(x)
  coef_opt[names(coef_opt_aux)] <- coef_opt_aux

  return(
    list(
      reg_opt = reg_opt,
      coef_opt = coef_opt,
      df_opt = df_opt,
      m_opt = m_opt,
      aic_opt = min(aic_seq)
    )
  )
}

Previsão da Inflação Americana

Vamos implementar um exercício empírico apresentado em Medeiros et al. (2021).

O problema

Os códigos

Podemos encontrar os códigos em

https://github.com/gabrielrvsc/ForecastingInflation.git

Medidas de performance

Colocando a mão na massa

Preparando os dados

Os autores do artigo Medeiros et al. (2021) disponibilizam, no GitHub, uma rotina para captação dos dados.
Esses dados estão armazenados no arquivo data.rda.

rm(list = ls())

library(tidyverse)

load("teste/data.rda")

# dependent variable: `CPIAUCSL`

row.names(data) <- data$date
data_1 <- select(data, -"date")

data_2 <- cbind(
  data_1,
  factors = princomp(scale(data_1))$scores[, 1:4]
)
dados_cpi <- data_2$CPIAUCSL

tempo <- seq(
  from = as.Date(row.names(data)[1]),
  to = as.Date(tail(row.names(data), 1)),
  by = "months"
)
plot(tempo, dados_cpi, type = "l")

dados_y <- data.frame(y = dados_cpi)
dados_x <- select(data_2, -"CPIAUCSL")
y_vec <- dados_y$y

Por enquanto, o banco de dados dos regressores possui a seguinte dimensão:

dim(dados_x)

[1] 757 110

Mas queremos incluir quatro defasagens das variáveis regressoras e da própria variável dependente.
Vamos fazer duas funções com esse objetivo:

f_add_lags_aux <- function(
  x, # vetor com os valores numéricos
  lag_max, # lag máx. desejado
  name # nome a ser colocado no vetor resultante
) {
  result <- embed(x, lag_max) %>% data.frame()
  colnames(result) <- c(
      paste(name, "_t_", 1:lag_max, sep = "")
  )

  return(result)
}

f_add_lags <- function(
  x, # data.frame com colunas nomeadas
  lag_max # lag máx. desejado
) {

  if (!is.data.frame(x)) {
      stop(
          "x must be a data.frame"
      )
  }

  dados <- data.frame(
      temp = rep(1, nrow(x) - lag_max + 1)
  )
  name_aux <- colnames(x)

  for (i in seq_len(ncol(x))) {
      dados <- data.frame(
          dados,
          f_add_lags_aux(
              x = x[, i],
              lag_max = lag_max,
              name = name_aux[i]
          )
      )
  }
  dados_aux <- data.frame(dados[, -1])
  colnames(dados_aux) <- colnames(dados)[-1]
  return(dados_aux)
}

Vejamos um exemplo:

a <- 1:10
dados_a <- data.frame(y = a)

new_dados_a <- f_add_lags(dados_a, 4)

print(new_dados_a)

  y_t_1 y_t_2 y_t_3 y_t_4
1     4     3     2     1
2     5     4     3     2
3     6     5     4     3
4     7     6     5     4
5     8     7     6     5
6     9     8     7     6
7    10     9     8     7

Queremos considerar como modelo concorrente (benchmark), o modelo AR(4), seguindo os autores do artigo.
Para isso, vamos construir a seguinte função:

f_run_ar <- function(
  x, y, x_out, type = "fixed"
) {

  d1 <- x[, "dummy_var"]
  x_ar <- x[, colnames(x) != "dummy_var"]

  if (type == "fixed") {
      reg_aux <- lm(y ~ 1 + x_ar + d1)
      coef_aux <- reg_aux$coefficients
      coef_aux[is.na(coef_aux)] <- 0
      coef_aux[length(coef_aux)] <- 0 # eliminar o efeito. Segui o github
      coef_opt <- coef_aux
  } else {
      coef_list <- list()
      bic_sel <- rep(NA, ncol(x_ar))

      for (i in seq_len(ncol(x_ar))) {
          reg_aux <- lm(y ~ 1 + x_ar[, 1:i] + d1)
          coef_aux <- reg_aux$coefficients
          coef_aux[is.na(coef_aux)] <- 0
          coef_aux[length(coef_aux)] <- 0 # eliminar o efeito. Segui o github
          coef_list[[i]] <- coef_aux
          bic_sel[i] <- BIC(reg_aux)
      }

      coef_opt <- coef_list[[which.min(bic_sel)]]
  }

  pred <- sum(x_out * coef_opt[-1]) + coef_opt[1]

  return(pred)
}

Exercício de previsão

A ideia é implementar o seguinte (não sei se é boa ideia rodar durante a oficina):

## basic parameters

n_available <- 672
for_set <- seq(from = 361, to = n_available)

for_hor <- 12

n_train <- for_set[1] - for_hor

y_for_real <- y_vec[for_set]
y_for_rw <- y_vec[for_set - for_hor]

## training set

train_set <- seq(from = 1, to = n_train)

y_for_boost <- y_for_ar <- rep(NA, length(for_set))
var_imp_boost <- list()

for (i in seq_along(for_set)) {

    # Preparando os dados

    data_for <- f_for_data(
        dados_x = dados_x,
        dados_y = dados_y,
        train_set = train_set,
        for_hor = for_hor,
        lag_max = 4
    )

    y_train <- data_for$y_train
    x_mat_train_1 <- data_for$x_mat_train_1
    x_mat_train_2 <- data_for$x_mat_train_2
    x_for_set_1 <- data_for$x_for_set_1
    x_for_set_2 <- data_for$x_for_set_2

    ## mboost regression 2

    boost_aux <- f_run_boost(
        x_mat_train = x_mat_train_2,
        y_train = y_train,
        x_for_set = x_for_set_2,
        num_ite = 200
    )

    y_for_boost[i] <- boost_aux$prediction

    var_imp_boost[[i]] <- boost_aux$var_imp

    ## AR

    y_for_ar[i] <- f_run_ar(
        x = x_mat_train_1,
        y = y_train,
        x_out = x_for_set_1,
        type = "fixed"
    )

    ## updating index

    train_set <- train_set + 1

    print(i / length(for_set))
}

## results

file_name <- paste(
    "cpi_forecast_horizon_",
    for_hor,
    ".RData",
    sep = ""
)

save(
    y_for_real,
    y_for_rw,
    y_for_ar,
    y_for_boost,
    var_imp_boost,
    for_hor,
    file = file_name
)

Detalhando as funções

Para a construção dos dados temos:

f_for_data <- function(
    dados_x,
    dados_y,
    train_set,
    for_hor,
    lag_max = 4
) {
    dados_x_train <- data.frame(dados_x[train_set, ])
    dados_y_train <- data.frame(y = dados_y$y[train_set])

    dados_lag_1 <- f_add_lags(
        x = dados_y_train,
        lag_max = 4
    )

    dummy_var <- dados_lag_1$y_t_1 * 0
    dummy_var[which(rownames(dados_lag_1) == "2008-11-01")] <- 1

    dados_lag_1 <- data.frame(
        dados_lag_1,
        dummy_var = dummy_var
    )

    dados_lag_2 <- cbind(
        dados_lag_1,
        f_add_lags(
            x = dados_x_train,
            lag_max = 4
        )
    )

    x_mat_train_1 <- as.matrix(dados_lag_1) %>% head(-for_hor)
    x_mat_train_2 <- as.matrix(dados_lag_2) %>% head(-for_hor)
    y_train <- tail(dados_y_train$y, nrow(x_mat_train_1))

    x_for_set_1 <- as.matrix(dados_lag_1) %>% tail(1)
    x_for_set_2 <- as.matrix(dados_lag_2) %>% tail(1)

    return(
        list(
            y_train = y_train,
            x_mat_train_1 = x_mat_train_1,
            x_mat_train_2 = x_mat_train_2,
            x_for_set_1 = x_for_set_1,
            x_for_set_2 = x_for_set_2
        )
    )
}

Para rodar o algoritmo de boosting temos:

f_run_boost <- function(
    x_mat_train,
    y_train,
    x_for_set,
    num_ite = 100
) {

    ## mboost regression 1

    reg <- f_boost_simple(
        x = x_mat_train,
        y = y_train,
        num_ite = num_ite
    )

    ## mboost prediction

    x_mean_train <- as.vector(
        apply(x_mat_train, 2, mean)
    )

    y_mean_train <- mean(y_train)

    coef_train <- reg$coef_opt %>% as.vector()

    y_for_hat <- sum(
        (x_for_set - x_mean_train) * coef_train
    ) + y_mean_train

    var_imp <- varimp(reg$reg_opt)

    return(
        list(
            prediction = y_for_hat,
            regression = reg,
            var_imp = var_imp
        )
    )
}

De fato, um arquivo com toda a rotina está no arquivo file_1_estimation.r.

Analisando os resultados

rm(list = ls())

library(ggplot2)
library(gridExtra)

f_rmse <- function(x, y) {
    res <- sqrt(
        cumsum(
            (x - y)^2
        )
    )

    return(res)
}

f_var_imp <- function(x) {
    var_names <- attr(x, "names")

    var_imp_mat <- matrix(
        data = NA,
        nrow = length(var_names),
        ncol = length(x)
    )

    rownames(var_imp_mat) <- var_names

    for (i in seq_along(x)) {
        var_imp_mat[, i] <- round(attr(x, "selfreqs"), 2)
    }

    pos_in <- apply(var_imp_mat, 1, function(x) sum(x) > 0.025)

    return(
        list(
            var_imp = var_imp_mat,
            var_imp_zoom = var_imp_mat[pos_in, ]
        )
    )
}

for_hor_sel <- c(1, 3, 6, 9, 12)

rmse_table <- matrix(
    data = NA,
    nrow = 2,
    ncol = length(for_hor_sel)
)

rownames(rmse_table) <- c("boost", "AR")

for (i in seq_along(for_hor_sel)) {

    file_name <- paste(
        "cpi_forecast_horizon_",
        for_hor_sel[i],
        ".RData",
        sep = ""
    )
    load(file_name)

    rmse_rw <- f_rmse(y_for_rw, y_for_real)
    rmse_ar <- f_rmse(y_for_ar, y_for_real)
    rmse_boost <- f_rmse(y_for_boost, y_for_real)

    # RMSE table

    rmse_table["AR", i] <- tail(rmse_ar, 1) / tail(rmse_rw, 1)
    rmse_table["boost", i] <- tail(rmse_boost, 1) / tail(rmse_rw, 1)

    # Forecast plot

    # plot.ts(
    #     x = y_for_real,
    #     col = 1,
    #     ylim = c(-2, 2),
    #     main = paste(
    #         "h = ",
    #         for_hor,
    #         sep = ""
    #     )
    # )
    # lines(y_for_rw, col = 3)
    # lines(y_for_ar, col = 2)
    # lines(y_for_boost, col = 4)
    # legend(
    #     legend = c(
    #         "Real",
    #         "RW",
    #         "AR",
    #         "Boost"
    #     ),
    #     col = c(1, 3, 2, 4),
    #     lty = 1,
    #     "topright"
    # )

    data_plot_for <- data.frame(
        x = seq(
          from = as.Date("1990-01-01"),
          to = as.Date("2015-12-01"),
          by = "months"
        ),
        y_for_real,
        y_for_rw,
        y_for_ar,
        y_for_boost
    )

    g1 <- ggplot(
        data = data_plot_for
    ) +
    geom_line(
        mapping = aes(x = x, y = y_for_real, colour = "Real")
    ) +
    geom_line(
        mapping = aes(x = x, y = y_for_rw, colour = "RW")
    ) +
    geom_line(
        mapping = aes(x = x, y = y_for_ar, colour = "AR")
    ) +
    geom_line(
        mapping = aes(x = x, y = y_for_boost, colour = "Boost")
    ) +
    scale_color_manual(
        values = c(
            "red",
            "blue",
            "black",
            "green"
        )
    ) +
    labs(
        title = paste(
            "h = ",
            for_hor,
            sep = ""
        )
    )

    g1$labels$colour <- c(" ")

    # RMSE plot

    # plot.ts(
    #     x = rmse_ar / rmse_rw,
    #     col = 2,
    #     ylim = c(0.7, 1.1),
    #     main = paste(
    #         "h = ",
    #         for_hor,
    #         sep = ""
    #     )
    # )
    # lines(rmse_boost / rmse_rw, col = 4)
    # abline(h = 1, col = 1)
    # legend(
    #     legend = c("AR / RW", "Boost / RW"),
    #     col = c(2, 4),
    #     lty = 1,
    #     "topright"
    # )

    data_plot_for_g2 <- data.frame(
        x = seq(
          from = as.Date("1990-01-01"),
          to = as.Date("2015-12-01"),
          by = "months"
        ),
        rmse_rw,
        rmse_ar,
        rmse_boost
    )

    g2 <- ggplot(
        data = data_plot_for_g2
    ) +
    ylim(c(0.6, 1.1)) +
    geom_line(
        mapping = aes(x = x, y = rmse_ar / rmse_rw, colour = "AR / RW")
    ) +
    geom_line(
        mapping = aes(x = x, y = rmse_boost / rmse_rw, colour = "Boost / RW")
    ) +
    geom_abline(intercept = 1, slope = 0) +
    scale_color_manual(
        values = c("red", "blue")
    ) +
    labs(
        title = paste(
            "h = ",
            for_hor,
            sep = ""
        )
    )

    g2$labels$colour <- c(" ")
    grid.arrange(g1, g2)

    # Var. Importance

    var_imp_aux <- f_var_imp(var_imp_boost[[i]])

    heatmap(
        x = var_imp_aux$var_imp,
        Colv = NA,
        Rowv = NA,
        scale = "none",
        main = paste(
            "h = ",
            for_hor,
            sep = ""
        )
    )

    # Var. Importance (zoom)

    heatmap(
        x = var_imp_aux$var_imp_zoom,
        Colv = NA,
        Rowv = NA,
        scale = "none",
        main = paste(
            "h = ",
            for_hor,
            " (zoom)",
            sep = ""
        )
    )

}

colnames(rmse_table) <- paste("h = ", for_hor_sel, sep = "")
print(rmse_table)

          h = 1     h = 3     h = 6     h = 9    h = 12
boost 0.8103255 0.7436870 0.7660924 0.7448355 0.7158788
AR    0.8873490 0.7828632 0.7671603 0.7590787 0.7376245

Podemos comparar com os resultados reportados em Medeiros et al. (2021)

Referências

Lindenmeyer, Guilherme, Pedro Pablo Skorin, e Hudson da Silva Torrent. 2021. “Using boosting for forecasting electric energy consumption during a recession: a case study for the Brazilian State Rio Grande do Sul”. Letters in Spatial and Resource Sciences 14 (2): 111–28. https://doi.org/10.1007/s12076-021-00268-3.

Medeiros, Marcelo C., Gabriel F. R. Vasconcelos, Álvaro Veiga, e Eduardo Zilberman. 2021. “Forecasting Inflation in a Data-Rich Environment: The Benefits of Machine Learning Methods”. Journal of Business & Economic Statistics 39 (1): 98–119. https://doi.org/10.1080/07350015.2019.1637745.