On going IBM

Leitura e Pre-processamento dos Dados (IBM_data.csv)

dt.intra <- read.csv(
  "IBM_data.csv",
  header           = TRUE,
  stringsAsFactors = FALSE,
  sep              = ",",
  dec              = "."
)

dt.intra$Period <- mdy_hm(paste(dt.intra$Date, dt.intra$Time))

dt1 <- as_tibble(dt.intra) %>%
  arrange(Period) %>%
  filter(!(hour(Period) == 9 & minute(Period) < 40)) %>%
  filter(hour(Period) < 16) %>%
  filter(!(hour(Period) == 15 & minute(Period) >= 56)) %>%
  group_by(date(Period)) %>%
  filter(n() > 100) %>%
  ungroup() %>%
  arrange(Period) %>%
  mutate(
    Ret.1min   = (Close - lag(Close)) / lag(Close),
    Close.1min = Close
  ) %>%
  filter(!is.na(Ret.1min))

ret_sd       <- sd(dt1$Ret.1min, na.rm = TRUE)
ret_mean     <- mean(dt1$Ret.1min, na.rm = TRUE)
outlier_mask <- abs(dt1$Ret.1min - ret_mean) > 10 * ret_sd
dt1$Ret.1min[outlier_mask] <- NA
dt1$Ret.1min <- zoo::na.locf(dt1$Ret.1min, na.rm = FALSE)

ret.1min <- as.xts(dt1$Ret.1min, order.by = dt1$Period)

cat("Total de observacoes apos limpeza:", nrow(dt1), "\n")
Total de observacoes apos limpeza: 2644941 
cat("Periodo:", format(min(dt1$Period)), "a", format(max(dt1$Period)), "\n")
Periodo: 1998-01-02 09:41:00 a 2026-02-20 15:55:00 

Observacao Inicial

plot(dt1$Close.1min,
     main = "IBM - Precos de Fechamento (1 min) - Serie Completa",
     col  = "black", type = "l",
     xlab = "Indice", ylab = "Preco")

boxplot(as.double(ret.1min),
        main = "Boxplot dos Retornos 1min - IBM (Serie Completa)",
        ylab = "Retorno")

Preparacao da Serie Diaria de Retornos

PRE_START_OFFSET  <- -10000
PRE_END_OFFSET    <-      0
DUR_START_OFFSET  <-      0
DUR_END_OFFSET    <-  10000
POST_START_OFFSET <-  10000
POST_END_OFFSET   <-  20000
MIN_FUTURE_IDX    <-  20000   # minimo de obs apos X para manter o ponto

n_total   <- nrow(dt1)
rets_full <- dt1$Ret.1min


daily_close <- dt1 %>%
  mutate(trade_date = as.Date(Period)) %>%
  group_by(trade_date) %>%
  summarise(close_last = last(Close.1min), .groups = "drop") %>%
  arrange(trade_date) %>%
  mutate(
    close_prev = lag(close_last),
    close_diff = close_last - close_prev,
    close_pct  = abs(close_diff / close_prev)
  )

dt1_indexed <- dt1 %>%
  mutate(global_idx = row_number(), trade_date = as.Date(Period))

last_idx_per_day <- dt1_indexed %>%
  group_by(trade_date) %>%
  summarise(last_global_idx = max(global_idx), .groups = "drop")

daily_close <- daily_close %>%
  left_join(last_idx_per_day, by = "trade_date")

dc_clean <- daily_close %>%
  filter(!is.na(close_diff), !is.na(last_global_idx)) %>%
  mutate(
    close_prev2 = close_last - close_diff,
    daily_ret   = ifelse(close_prev2 > 0, close_diff / close_prev2, NA_real_)
  ) %>%
  filter(!is.na(daily_ret))

ret_daily <- dc_clean$daily_ret
n_daily   <- length(ret_daily)
log_price <- log(dc_clean$close_last)

cat(sprintf("Serie diaria: %d obs  |  %s  a  %s\n",
            n_daily,
            format(min(dc_clean$trade_date)),
            format(max(dc_clean$trade_date))))
Serie diaria: 7076 obs  |  1998-01-05  a  2026-02-20
# ---- funcao auxiliar: linha diaria -> indice global intradiario ----
map_to_global <- function(row_idx_vec) {
  g_vec      <- dc_clean$last_global_idx[row_idx_vec]
  has_future <- !is.na(g_vec) & ((n_total - g_vec) >= MIN_FUTURE_IDX)
  sort(as.integer(g_vec[has_future]))
}

# ---- janela segura ----
safe_window <- function(start, end, n_max) {
  s <- max(1L, as.integer(start))
  e <- min(n_max, as.integer(end))
  if (s > e) return(NULL)
  s:e
}

Bai-Perron: Quebras na Media dos Retornos Diarios

Bai-Perron (1998, 2003) — Multiplas Quebras na Media

Generaliza o Teste de Chow para m quebras simultaneas desconhecidas. O modelo linear com \(m\) quebras e: \[y_t = \mathbf{x}_t' \boldsymbol{\beta}_j + \varepsilon_t, \quad t = T_{j-1}+1, \ldots, T_j, \quad j = 1,\ldots,m+1\] onde os pontos \(T_1 < T_2 < \cdots < T_m\) sao estimados conjuntamente por: \[\{\hat{T}_1, \ldots, \hat{T}_m\} = \arg\min_{T_1,\ldots,T_m} \sum_{j=1}^{m+1} \sum_{t=T_{j-1}+1}^{T_j} (y_t - \bar{y}_j)^2\]

O numero otimo de quebras \(m^*\) e selecionado pelo criterio BIC ou pelo teste sequencial de Bai-Perron, que controla a probabilidade de detectar quebras espurias. A busca e feita por programacao dinamica em \(O(n^2)\).

Entrada: \(\log(r_t^2)\) (log do quadrado dos retornos diarios), que aproxima a serie a uma distribuicao normal e e estacionario, tornando o procedimento valido. A transformacao \(\log(r_t^2) = \log(\sigma_t^2) + \log(z_t^2)\) lineariza a volatilidade, com o segundo termo atuando como ruido quasi-gaussiano de media conhecida.

O que detecta: mudancas persistentes e significativas no nivel medio de log-volatilidade ao longo de multiplos periodos sem datas pre-fixadas. Tres execucoes com \(h \in \{0.05,\, 0.10,\, 0.01\}\) permitem comparar a sensibilidade do detector ao tamanho minimo do segmento.

# Transformacao: log(r_t^2) aproxima a distribuicao a normal
# e lineariza a estrutura de volatilidade.
log_ret2 <- log(ret_daily^2 + 1e-10)   # pequeno offset para evitar log(0)

# ---- Tres execucoes com diferentes h ----
run_bp <- function(series, h_val, max_breaks = 4) {
  fit     <- strucchange::breakpoints(series ~ 1, breaks = max_breaks, h = h_val)
  bp_idx  <- fit$breakpoints
  bp_idx  <- bp_idx[!is.na(bp_idx)]
  list(fit = fit, bp_idx = bp_idx, h = h_val)
}

bp_h05 <- run_bp(log_ret2, h_val = 0.05)
bp_h10 <- run_bp(log_ret2, h_val = 0.10)
bp_h01 <- run_bp(log_ret2, h_val = 0.01)

# ---- Mapeamento para indices globais intradiarios ----
# Usa h = 0.05 como referencia principal para analises subsequentes
crash_idx_bp    <- map_to_global(bp_h05$bp_idx)
crash_idx_bp_h10 <- map_to_global(bp_h10$bp_idx)
crash_idx_bp_h01 <- map_to_global(bp_h01$bp_idx)

# ---- Tabelas-resumo para cada h ----
make_bp_summary <- function(bp_obj, label) {
  idx <- bp_obj$bp_idx
  if (length(idx) == 0) return(data.frame())
  data.frame(
    h          = label,
    rank       = seq_along(idx),
    trade_date = as.character(dc_clean$trade_date[idx]),
    log_r2     = round(log_ret2[idx], 4),
    global_idx = dc_clean$last_global_idx[idx]
  )
}

bp_all_summary <- bind_rows(
  make_bp_summary(bp_h05, "h = 0.05"),
  make_bp_summary(bp_h10, "h = 0.10"),
  make_bp_summary(bp_h01, "h = 0.01")
)

cat("Bai-Perron — quebras na media de log(r_t^2)\n")
Bai-Perron — quebras na media de log(r_t^2)
knitr::kable(bp_all_summary,
             col.names = c("h","Rank","Data","log(r_t^2)","Idx global"),
             caption   = "Bai-Perron: quebras estruturais em log(r_t^2) — tres valores de h")
Bai-Perron: quebras estruturais em log(r_t^2) — tres valores de h
h Rank Data log(r_t^2) Idx global
h = 0.05 1 2003-07-30 -8.8983 520910
h = 0.05 2 2007-10-15 -17.2661 917935
h = 0.05 3 2009-06-03 -9.3474 1071588
h = 0.05 4 2018-09-28 -11.8943 1950768
h = 0.10 1 2003-07-30 -8.8983 520910
h = 0.10 2 2007-04-10 -14.7849 868858
h = 0.10 3 2010-02-02 -9.8196 1134402
h = 0.10 4 2018-09-28 -11.8943 1950768
h = 0.01 1 2003-07-30 -8.8983 520910
h = 0.01 2 2007-10-15 -17.2661 917935
h = 0.01 3 2009-06-03 -9.3474 1071588
h = 0.01 4 2018-09-28 -11.8943 1950768
# --- Grafico: log(r_t^2) com os tres conjuntos de breakpoints ---
bp_plot_df <- bind_rows(
  if (length(bp_h05$bp_idx) > 0)
    data.frame(x = bp_h05$bp_idx,
               label = paste0("h05_", seq_along(bp_h05$bp_idx)),
               h = "h = 0.05"),
  if (length(bp_h10$bp_idx) > 0)
    data.frame(x = bp_h10$bp_idx,
               label = paste0("h10_", seq_along(bp_h10$bp_idx)),
               h = "h = 0.10"),
  if (length(bp_h01$bp_idx) > 0)
    data.frame(x = bp_h01$bp_idx,
               label = paste0("h01_", seq_along(bp_h01$bp_idx)),
               h = "h = 0.01")
)

color_h <- c("h = 0.05" = "#2980B9", "h = 0.10" = "#E67E22", "h = 0.01" = "#27AE60")
lty_h   <- c("h = 0.05" = "solid",   "h = 0.10" = "dashed",  "h = 0.01" = "dotted")

ggplot(data.frame(t = seq_len(n_daily), y = log_ret2),
       aes(x = t, y = y)) +
  geom_line(color = "grey40", linewidth = 0.3) +
  geom_vline(data = bp_plot_df,
             aes(xintercept = x, color = h, linetype = h),
             linewidth = 0.8) +
  geom_label(data = bp_plot_df,
             aes(x = x, y = max(log_ret2, na.rm = TRUE) * 0.92,
                 label = label, color = h),
             size = 2.5, show.legend = FALSE) +
  scale_color_manual(values = color_h) +
  scale_linetype_manual(values = lty_h) +
  labs(title    = "Bai-Perron: Quebras Estruturais na Media de log(r_t^2)",
       subtitle = "Tres execucoes com diferentes tamanhos minimos de segmento (h)",
       x = "Dia (indice)", y = "log(r_t^2)",
       color = "h", linetype = "h") +
  theme_minimal(base_size = 12) +
  theme(plot.title    = element_text(face = "bold"),
        legend.position = "bottom")

Teste ARCH-LM: Clustering de Volatilidade nos Breakpoints

Teste ARCH-LM (Engle, 1982) — Clustering de Volatilidade

O Multiplicador de Lagrange de Engle testa se os quadrados dos residuos \(\hat{\varepsilon}_t^2\) possuem autocorrelacao, indicando heterocedasticidade condicional (variancia nao constante ao longo do tempo).

O modelo ARCH(\(q\)) postula: \[\sigma_t^2 = \omega + \sum_{i=1}^{q} \alpha_i\,\varepsilon_{t-i}^2\]

O teste LM regride \(\hat{\varepsilon}_t^2\) em suas \(q\) defasagens: \[LM = n \cdot R^2 \;\sim\; \chi^2(q)\]

Hipoteses:

  • \(H_0\): \(\alpha_1 = \cdots = \alpha_q = 0\) — sem efeito ARCH (variancia constante)
  • \(H_1\): pelo menos um \(\alpha_i \neq 0\) — volatilidade depende do passado

Calibracao: defasagens fixadas em \(q = 5\) (5 minutos de retornos intradiarios), capturando clustering de curto prazo tipico em mercados de alta frequencia. Janelas de 10.000 observacoes (~26 dias de pregao) antes e depois de cada breakpoint. Nivel de significancia \(\alpha = 0.05\).

O que detecta: rejeitar \(H_0\) confirma que a volatilidade e heterocedástica ao redor do breakpoint — movimentos grandes tendem a ser seguidos de movimentos grandes — validando a relevancia economica do ponto de quebra. O teste e aplicado separadamente para os breakpoints de cada valor de \(h\).

run_arch_lm <- function(r, lag = 5) {
  r <- as.numeric(na.omit(r))
  if (length(r) < lag + 10) return(list(stat = NA_real_, pval = NA_real_))
  tryCatch({
    fit <- FinTS::ArchTest(r, lags = lag)
    list(stat = unname(fit$statistic), pval = fit$p.value)
  }, error = function(e) list(stat = NA_real_, pval = NA_real_))
}

# ---- Constroi linhas ARCH para um conjunto de breakpoints ----
make_arch_rows <- function(gidx_vec, prefix) {
  do.call(rbind, lapply(seq_along(gidx_vec), function(k) {
    X      <- gidx_vec[k]
    r_pre  <- rets_full[safe_window(X + PRE_START_OFFSET,  X + PRE_END_OFFSET,  n_total)]
    r_post <- rets_full[safe_window(X + POST_START_OFFSET, X + POST_END_OFFSET, n_total)]
    a_pre  <- run_arch_lm(r_pre)
    a_post <- run_arch_lm(r_post)
    data.frame(
      h          = prefix,
      evento     = paste0(prefix, "_", k),
      crash_idx  = X,
      trade_date = format(dt1$Period[min(X, n_total)], "%Y-%m-%d"),
      LM_pre     = round(a_pre$stat,  4),
      p_pre      = round(a_pre$pval,  6),
      LM_post    = round(a_post$stat, 4),
      p_post     = round(a_post$pval, 6),
      ARCH_pre   = ifelse(!is.na(a_pre$pval)  & a_pre$pval  < 0.05, "Sim", "Nao"),
      ARCH_post  = ifelse(!is.na(a_post$pval) & a_post$pval < 0.05, "Sim", "Nao")
    )
  }))
}

arch_tbl <- bind_rows(
  make_arch_rows(crash_idx_bp,      "BP h=0.05"),
  make_arch_rows(crash_idx_bp_h10,  "BP h=0.10"),
  make_arch_rows(crash_idx_bp_h01,  "BP h=0.01")
) %>% arrange(h, crash_idx)

cat("=== Teste ARCH-LM (lags=5) — pre vs pos breakpoint ===\n")
=== Teste ARCH-LM (lags=5) — pre vs pos breakpoint ===
cat("H0: sem efeito ARCH (variancia constante)  |  Rejeitar H0 se p < 0.05\n\n")
H0: sem efeito ARCH (variancia constante)  |  Rejeitar H0 se p < 0.05
knitr::kable(arch_tbl,
             col.names = c("h","Evento","Idx","Data",
                           "LM (pre)","p (pre)","LM (pos)","p (pos)",
                           "ARCH pre?","ARCH pos?"),
             caption = "Teste ARCH-LM(5): clustering de volatilidade pre vs pos breakpoint — tres valores de h")
Teste ARCH-LM(5): clustering de volatilidade pre vs pos breakpoint — tres valores de h
h Evento Idx Data LM (pre) p (pre) LM (pos) p (pos) ARCH pre? ARCH pos?
Chi-squared…1 BP h=0.01 BP h=0.01_1 520910 2003-07-30 23.1063 0.000322 21.3652 0.000691 Sim Sim
Chi-squared1…2 BP h=0.01 BP h=0.01_2 917935 2007-10-15 51.6307 0.000000 470.7777 0.000000 Sim Sim
Chi-squared2…3 BP h=0.01 BP h=0.01_3 1071588 2009-06-03 539.6291 0.000000 41.8826 0.000000 Sim Sim
Chi-squared3…4 BP h=0.01 BP h=0.01_4 1950768 2018-09-28 9.8096 0.080813 51.4496 0.000000 Nao Sim
Chi-squared…5 BP h=0.05 BP h=0.05_1 520910 2003-07-30 23.1063 0.000322 21.3652 0.000691 Sim Sim
Chi-squared1…6 BP h=0.05 BP h=0.05_2 917935 2007-10-15 51.6307 0.000000 470.7777 0.000000 Sim Sim
Chi-squared2…7 BP h=0.05 BP h=0.05_3 1071588 2009-06-03 539.6291 0.000000 41.8826 0.000000 Sim Sim
Chi-squared3…8 BP h=0.05 BP h=0.05_4 1950768 2018-09-28 9.8096 0.080813 51.4496 0.000000 Nao Sim
Chi-squared…9 BP h=0.10 BP h=0.10_1 520910 2003-07-30 23.1063 0.000322 21.3652 0.000691 Sim Sim
Chi-squared1…10 BP h=0.10 BP h=0.10_2 868858 2007-04-10 61.5429 0.000000 68.0399 0.000000 Sim Sim
Chi-squared2…11 BP h=0.10 BP h=0.10_3 1134402 2010-02-02 98.7151 0.000000 17.4014 0.003798 Sim Sim
Chi-squared3…12 BP h=0.10 BP h=0.10_4 1950768 2018-09-28 9.8096 0.080813 51.4496 0.000000 Nao Sim
# ---- Grafico: LM-stat pre vs pos por h ----
chi_crit <- qchisq(0.95, df = 5)

arch_long <- arch_tbl %>%
  select(h, evento, crash_idx, LM_pre, LM_post) %>%
  pivot_longer(cols = c(LM_pre, LM_post),
               names_to  = "fase",
               values_to = "LM_stat") %>%
  mutate(fase = ifelse(fase == "LM_pre", "pre", "post"),
         fase = factor(fase, levels = c("pre", "post")))

print(
  ggplot(arch_long %>% filter(!is.na(LM_stat)),
         aes(x = reorder(evento, crash_idx), y = LM_stat, fill = fase)) +
    geom_col(position = "dodge", width = 0.65, alpha = 0.85) +
    geom_hline(yintercept = chi_crit, linetype = "dashed",
               color = "red", linewidth = 0.7) +
    facet_wrap(~ h, scales = "free_x", ncol = 1) +
    scale_fill_manual(values = c(pre = "#2ECC71", post = "#E74C3C")) +
    labs(title    = "ARCH-LM (q=5) — Estatistica LM: pre vs pos breakpoint",
         subtitle = paste0("Linha vermelha = chi^2(5) critico (p=0.05, val=",
                           round(chi_crit, 2), ") | Facetas = valor de h"),
         x = "Evento", y = "LM-estatistica", fill = "Fase") +
    theme_minimal(base_size = 12) +
    theme(axis.text.x = element_text(angle = 35, hjust = 1),
          plot.title  = element_text(face = "bold"),
          strip.text  = element_text(face = "bold"))
)

Visualizacao Comparativa: Bai-Perron — Comparacao entre h

price_max <- max(dt1$Close.1min, na.rm = TRUE)

# ---- Monta data.frame com todos os breakpoints dos tres h ----
make_marks <- function(gidx_vec, h_label, y_frac) {
  if (length(gidx_vec) == 0) return(NULL)
  data.frame(
    x      = gidx_vec,
    h      = h_label,
    label  = paste0(gsub("h = ", "h", h_label), "_", seq_along(gidx_vec)),
    y_pos  = price_max * y_frac,
    stringsAsFactors = FALSE
  )
}

all_marks <- bind_rows(
  make_marks(crash_idx_bp,     "h = 0.05", 0.89),
  make_marks(crash_idx_bp_h10, "h = 0.10", 0.76),
  make_marks(crash_idx_bp_h01, "h = 0.01", 0.63)
) %>%
  mutate(h = factor(h, levels = c("h = 0.05", "h = 0.10", "h = 0.01")))

color_h  <- c("h = 0.05" = "#2980B9", "h = 0.10" = "#E67E22", "h = 0.01" = "#27AE60")
lty_h    <- c("h = 0.05" = "solid",   "h = 0.10" = "dashed",  "h = 0.01" = "dotted")

print(
  ggplot(data.frame(index = seq_len(n_total), price = dt1$Close.1min),
         aes(x = index, y = price)) +
    geom_line(color = "black", linewidth = 0.3) +
    geom_vline(data = all_marks,
               aes(xintercept = x, color = h, linetype = h),
               linewidth = 0.75, alpha = 0.85) +
    geom_label(data = all_marks,
               aes(x = x, y = y_pos, label = label, color = h),
               size = 2.6, show.legend = FALSE) +
    scale_color_manual(values = color_h) +
    scale_linetype_manual(values = lty_h) +
    labs(
      title    = "IBM - Bai-Perron: Comparacao entre valores de h",
      subtitle = "Aplicado sobre log(r_t^2) | h controla o tamanho minimo do segmento",
      x = "Indice (observacao)", y = "Preco de Fechamento",
      color = "h", linetype = "h"
    ) +
    theme_minimal(base_size = 12) +
    theme(plot.title = element_text(face = "bold"), legend.position = "bottom")
)

Visualizacao: Janelamentos dos Breakpoints

fill_colors_seg <- c(pre = "#E8FA22", during = "#FC886D", post = "#6DD6FC")

# ---- Funcao auxiliar: plota janelas para um conjunto de breakpoints ----
plot_windows_for_h <- function(gidx_vec, h_label, line_color) {
  top5_vec <- head(gidx_vec, 5)
  if (length(top5_vec) == 0) {
    cat(sprintf("[%s] Nenhum breakpoint disponivel.\n", h_label))
    return(invisible(NULL))
  }

  for (k in seq_along(top5_vec)) {
    X <- top5_vec[k]

    segs_df <- do.call(rbind, lapply(c("pre", "during", "post"), function(nm) {
      win <- switch(nm,
        pre    = safe_window(X - 10000, X,         n_total),
        during = safe_window(X,         X + 10000, n_total),
        post   = safe_window(X + 10000, X + 20000, n_total)
      )
      if (is.null(win)) return(NULL)
      data.frame(janela = nm, xmin = win[1] - 0.5, xmax = tail(win, 1) + 0.5)
    }))

    zoom_s  <- max(1L,      X - 22000L)
    zoom_e  <- min(n_total, X + 32000L)
    plot_df <- data.frame(index = seq_len(n_total),
                          price = dt1$Close.1min)[zoom_s:zoom_e, ]

    crash_date <- dt1$Period[min(X, n_total)]
    crash_ret  <- rets_full[min(X, n_total)]

    print(
      ggplot(plot_df, aes(x = index, y = price)) +
        geom_rect(data = segs_df,
                  aes(xmin = xmin, xmax = xmax, ymin = -Inf, ymax = Inf, fill = janela),
                  inherit.aes = FALSE, alpha = 0.30) +
        geom_line(color = "black", linewidth = 0.35) +
        geom_vline(xintercept = X, color = line_color,
                   linewidth = 0.9, linetype = "solid") +
        scale_fill_manual(name = "Janela", values = fill_colors_seg) +
        labs(
          title    = paste0("[", h_label, "] Evento_", k,
                            " — Janelamentos sobre Preco IBM"),
          subtitle = paste0(
            "Linha = breakpoint (", format(crash_date, "%Y-%m-%d"),
            ") | Ret: ", round(crash_ret * 100, 3), "%",
            "\npre=[X-10k, X]  |  during=[X, X+10k]  |  post=[X+10k, X+20k]"
          ),
          x = "Indice (observacao global)", y = "Preco de Fechamento"
        ) +
        theme_minimal(base_size = 11) +
        theme(plot.title = element_text(face = "bold"), legend.position = "top") +
        coord_cartesian(expand = FALSE)
    )
  }
}

# ---- Plota para cada valor de h ----
plot_windows_for_h(crash_idx_bp,     "h = 0.05", "#2980B9")

plot_windows_for_h(crash_idx_bp_h10, "h = 0.10", "#E67E22")

plot_windows_for_h(crash_idx_bp_h01, "h = 0.01", "#27AE60")