getwd()
## [1] "C:/Users/clin0/OneDrive/Desktop/DSE Applied stastics/Projects/Group_project/datasets"
list.files()
##  [1] "BAMLC0A1CAAAEY.csv"                  
##  [2] "Data_refund.xlsx"                    
##  [3] "DFF.csv"                             
##  [4] "DGS10.csv"                           
##  [5] "EXPINF30YR.csv"                      
##  [6] "FEDFUNDS.csv"                        
##  [7] "GS2.csv"                             
##  [8] "INDPRO.csv"                          
##  [9] "Monthly_Earliest_Index_1919-2025.csv"
## [10] "Monthly_Joined_Index_2003-2025.csv"  
## [11] "UNRATE.csv"                          
## [12] "VIXCLS.csv"
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.1     ✔ stringr   1.5.1
## ✔ lubridate 1.9.4     ✔ tibble    3.3.0
## ✔ purrr     1.1.0     ✔ tidyr     1.3.1
## ✔ readr     2.1.5
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tidyr)
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.5.2
## corrplot 0.95 loaded
library(vars)
## Warning: package 'vars' was built under R version 4.5.2
## Loading required package: MASS
## 
## Attaching package: 'MASS'
## 
## The following object is masked from 'package:dplyr':
## 
##     select
## 
## Loading required package: strucchange
## Warning: package 'strucchange' was built under R version 4.5.2
## Loading required package: zoo
## Warning: package 'zoo' was built under R version 4.5.2
## 
## Attaching package: 'zoo'
## 
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## 
## Loading required package: sandwich
## Warning: package 'sandwich' was built under R version 4.5.2
## 
## Attaching package: 'strucchange'
## 
## The following object is masked from 'package:stringr':
## 
##     boundary
## 
## Loading required package: urca
## Warning: package 'urca' was built under R version 4.5.2
## Loading required package: lmtest
## Warning: package 'lmtest' was built under R version 4.5.2
library(lmtest)
library(tsibble)
## Warning: package 'tsibble' was built under R version 4.5.2
## Registered S3 method overwritten by 'tsibble':
##   method               from 
##   as_tibble.grouped_df dplyr
## 
## Attaching package: 'tsibble'
## 
## The following object is masked from 'package:zoo':
## 
##     index
## 
## The following object is masked from 'package:lubridate':
## 
##     interval
## 
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, union
library(lubridate)
library(urca)
library(tseries)
## Warning: package 'tseries' was built under R version 4.5.2
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
library(patchwork)
## 
## Attaching package: 'patchwork'
## 
## The following object is masked from 'package:MASS':
## 
##     area
library(forecast)
## Warning: package 'forecast' was built under R version 4.5.2
library(tsDyn)
## Warning: package 'tsDyn' was built under R version 4.5.2
library(lubridate)
df <- read.csv("Monthly_Earliest_Index_1919-2025.csv")

df <- df %>%
  mutate(date = as.Date(date)) %>%
  as_tsibble(index = date)

head(df)
select
## function (obj) 
## UseMethod("select")
## <bytecode: 0x0000013f248d5ce8>
## <environment: namespace:MASS>
df_BOF <- df %>%
  dplyr::select(
    date,
    market_yield_val,
    unemploy_val,
    ind_prod_val,
    volatility_val,
    aaa_corp_bond_val,
    bbb_corp_bond_val
  ) %>%
  mutate(date = as.Date(date)) %>%
  filter(date >= as.Date("1997-01-01")) %>%
  filter(!(date >= as.Date("2020-01-01") & date <= as.Date("2021-12-31")))


head(df_BOF)
nrow(df_BOF)
## [1] 320

——————————————————————-


Here is what we should focus on below:

#evaluate

Row_vars <- names(df_BOF)[names(df_BOF) != "date"]

for (v in Row_vars) {

  ts_raw <- ts(df_BOF[[v]], start = c(1997,1), frequency = 12)

  # ACF plot
  p_acf <- ggAcf(ts_raw,lag.max = 40) +
    ggtitle(paste("ACF of", v)) +
    theme_minimal(base_size = 14)

  # PACF plot
  p_pacf <- ggPacf(ts_raw,lag.max = 40) +
    ggtitle(paste("PACF of", v)) +
    theme_minimal(base_size = 14)

  print(p_acf / p_pacf)
}

library(ggplotify)
## Warning: package 'ggplotify' was built under R version 4.5.2
vars <- names(df_BOF)[names(df_BOF) != "date"]

for (v in vars) {
  
  cat("========================================\n")
  cat("variable:", v, "\n")
  cat("========================================\n\n")
  
  x <- ts(df_BOF[[v]], start=c(1997,1), frequency=12)
  dx <- diff(x)

  # row data
  df_raw <- data.frame(
    time = as.numeric(time(x)),
    value = as.numeric(x)
  )
  
  p1 <- ggplot(df_raw, aes(time, value)) +
    geom_line() +
    ggtitle(paste(v, "- raw data"))

  # diff once
  df_dx <- data.frame(time = as.numeric(time(dx)), value = as.numeric(dx))
  p2 <- ggplot(df_dx, aes(time, value)) +
    geom_line() +
    ggtitle(paste(v, "- diff once"))

  # ACF
  acf_obj <- acf(x, plot = FALSE, lag.max = 40)
  acf_df <- data.frame(
    lag = acf_obj$lag[-1],
    acf = acf_obj$acf[-1]
  )
  
  Tn <- length(x)

  bart_var <- sapply(1:nrow(acf_df), function(k) {
    if (k == 1) return(1/Tn)
    return( (1/Tn) * (1 + 2 * sum(acf_df$acf[1:(k-1)]^2)) )
  })

  acf_df$upper <-  1.96 * sqrt(bart_var)
  acf_df$lower <- -1.96 * sqrt(bart_var)

  p3 <- ggplot(acf_df, aes(lag, acf)) +
    geom_ribbon(aes(ymin = lower, ymax = upper), 
                fill = "skyblue", alpha = 0.3) +
    geom_segment(aes(xend = lag, yend = 0)) +
    geom_point(size = 1.8) +
    geom_hline(yintercept = 0) +
    ggtitle(paste(v, "- ACF (Bartlett CI fan shape)"))

  # PACF
  p4 <- as.ggplot(function() {
    pacf(x, lag.max = 40, main = paste(v, "- PACF"))
  })

  #Distribution
  df_res <- data.frame(x = as.numeric(x))
  p5 <- ggplot(df_res, aes(x)) +
    geom_histogram(aes(y = ..density..), bins = 30, fill="skyblue") +
    geom_density(color="red") +
    ggtitle(paste(v, "- Distribution"))

  # rolling
  roll_mean <- zoo::rollmean(x, k = 12, fill = NA)
  df_roll <- data.frame(
    time = as.numeric(time(x)),
    x = as.numeric(x),
    rm = as.numeric(roll_mean)
  )

  p6 <- ggplot(df_roll, aes(time)) +
    geom_line(aes(y=x), color="black") +
    geom_line(aes(y=rm), color="red") +
    ggtitle(paste(v, "- Rolling Mean (12M)"))

  #------------------------------------------------
  print((p1 | p2) / (p3 | p4) / (p5 | p6))
}
## ========================================
## variable: market_yield_val 
## ========================================
## Warning: The dot-dot notation (`..density..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(density)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: Removed 11 rows containing missing values or values outside the scale range
## (`geom_line()`).
## ========================================
## variable: unemploy_val 
## ========================================

## Warning: Removed 11 rows containing missing values or values outside the scale range
## (`geom_line()`).
## ========================================
## variable: ind_prod_val 
## ========================================

## Warning: Removed 11 rows containing missing values or values outside the scale range
## (`geom_line()`).
## ========================================
## variable: volatility_val 
## ========================================

## Warning: Removed 11 rows containing missing values or values outside the scale range
## (`geom_line()`).
## ========================================
## variable: aaa_corp_bond_val 
## ========================================

## Warning: Removed 11 rows containing missing values or values outside the scale range
## (`geom_line()`).
## ========================================
## variable: bbb_corp_bond_val 
## ========================================

## Warning: Removed 11 rows containing missing values or values outside the scale range
## (`geom_line()`).

ADF test and Difference

Before we go forward to models, we have to keep them stationary.

ADF Test: H0: This is not stationary H1: This is stationary

Lag selection:

test <- ur.df(df_BOF$market_yield_val, type="trend", lags=1, selectlags="Fixed")

sapply(slotNames(test), function(s) class(slot(test, s)))
## $y
## [1] "numeric"
## 
## $model
## [1] "character"
## 
## $lags
## [1] "integer"
## 
## $cval
## [1] "matrix" "array" 
## 
## $res
## [1] "numeric"
## 
## $teststat
## [1] "matrix" "array" 
## 
## $testreg
## [1] "summary.lm"
## 
## $test.name
## [1] "character"

ADF equation: \[ \Delta y_t = \alpha + \beta t + \gamma y_{t-1} + \sum_{i=1}^{p} \delta_i \Delta y_{t-i} + \varepsilon_t . \] Using AIC/BIC to evaluate.

\[ AIC=−2log(L)+2k \] \[ BIC=−2log(L)+klog(n) \]

Set the equation manually to find the lag number:

Setting ADF:

ts_data <- ts(df_BOF, start = c(1997,1),frequency = 12)

adf.test(ts_data[,"market_yield_val"],k=1)
## 
##  Augmented Dickey-Fuller Test
## 
## data:  ts_data[, "market_yield_val"]
## Dickey-Fuller = -2.1582, Lag order = 1, p-value = 0.5098
## alternative hypothesis: stationary
adf.test(ts_data[,"unemploy_val"],k=1)
## 
##  Augmented Dickey-Fuller Test
## 
## data:  ts_data[, "unemploy_val"]
## Dickey-Fuller = -0.84513, Lag order = 1, p-value = 0.9569
## alternative hypothesis: stationary
adf.test(ts_data[,"ind_prod_val"],k=1)
## 
##  Augmented Dickey-Fuller Test
## 
## data:  ts_data[, "ind_prod_val"]
## Dickey-Fuller = -2.6268, Lag order = 1, p-value = 0.3122
## alternative hypothesis: stationary
adf.test(ts_data[,"volatility_val"],k=1)
## Warning in adf.test(ts_data[, "volatility_val"], k = 1): p-value smaller than
## printed p-value
## 
##  Augmented Dickey-Fuller Test
## 
## data:  ts_data[, "volatility_val"]
## Dickey-Fuller = -5.6181, Lag order = 1, p-value = 0.01
## alternative hypothesis: stationary
adf.test(ts_data[,"aaa_corp_bond_val"],k=1)
## 
##  Augmented Dickey-Fuller Test
## 
## data:  ts_data[, "aaa_corp_bond_val"]
## Dickey-Fuller = -1.7299, Lag order = 1, p-value = 0.6904
## alternative hypothesis: stationary
adf.test(ts_data[,"bbb_corp_bond_val"],k=1)
## 
##  Augmented Dickey-Fuller Test
## 
## data:  ts_data[, "bbb_corp_bond_val"]
## Dickey-Fuller = -2.6058, Lag order = 1, p-value = 0.3211
## alternative hypothesis: stationary

P-value > 0.05, which means Null hypothesis is true.

But, is it true?

Here is Kpss test:

H0: stationary H1: none stationary

# Level
kpss.test(ts_data[,"unemploy_val"],null = "Level")
## Warning in kpss.test(ts_data[, "unemploy_val"], null = "Level"): p-value
## smaller than printed p-value
## 
##  KPSS Test for Level Stationarity
## 
## data:  ts_data[, "unemploy_val"]
## KPSS Level = 0.92066, Truncation lag parameter = 5, p-value = 0.01
kpss.test(ts_data[,"ind_prod_val"],null = "Level")
## Warning in kpss.test(ts_data[, "ind_prod_val"], null = "Level"): p-value
## smaller than printed p-value
## 
##  KPSS Test for Level Stationarity
## 
## data:  ts_data[, "ind_prod_val"]
## KPSS Level = 3.7314, Truncation lag parameter = 5, p-value = 0.01
kpss.test(ts_data[,"volatility_val"],null = "Level")
## Warning in kpss.test(ts_data[, "volatility_val"], null = "Level"): p-value
## smaller than printed p-value
## 
##  KPSS Test for Level Stationarity
## 
## data:  ts_data[, "volatility_val"]
## KPSS Level = 0.81377, Truncation lag parameter = 5, p-value = 0.01
kpss.test(ts_data[,"unemploy_val"],null = "Trend")
## Warning in kpss.test(ts_data[, "unemploy_val"], null = "Trend"): p-value
## smaller than printed p-value
## 
##  KPSS Test for Trend Stationarity
## 
## data:  ts_data[, "unemploy_val"]
## KPSS Trend = 0.87405, Truncation lag parameter = 5, p-value = 0.01
kpss.test(ts_data[,"ind_prod_val"],null = "Trend")
## 
##  KPSS Test for Trend Stationarity
## 
## data:  ts_data[, "ind_prod_val"]
## KPSS Trend = 0.20896, Truncation lag parameter = 5, p-value = 0.01264
kpss.test(ts_data[,"volatility_val"],null = "Trend")
## Warning in kpss.test(ts_data[, "volatility_val"], null = "Trend"): p-value
## greater than printed p-value
## 
##  KPSS Test for Trend Stationarity
## 
## data:  ts_data[, "volatility_val"]
## KPSS Trend = 0.11294, Truncation lag parameter = 5, p-value = 0.1

Lets look at the box plots:

df_long <- df_BOF %>% 
  dplyr::select(date, market_yield_val, ind_prod_val, volatility_val, unemploy_val, aaa_corp_bond_val, bbb_corp_bond_val) %>%
  pivot_longer(
    cols = -date,
    names_to = "variable",
    values_to = "value"
  )

ggplot(df_long, aes(x = variable, y = value, fill = variable)) +
  geom_boxplot() +
  theme_minimal() +
  labs(
    title = "Boxplots of Differenced Series",
    x = "Variable",
    y = "Value"
  ) +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    legend.position = "none"
  )

Here is simply to confirm whether the data is truly stable by ur.df methods.

compare the lag I selected to auto select lag number.

summary(ur.df(ts_data[,"unemploy_val"], type = "trend", lags = 1))
## 
## ############################################### 
## # Augmented Dickey-Fuller Test Unit Root Test # 
## ############################################### 
## 
## Test regression trend 
## 
## 
## Call:
## lm(formula = z.diff ~ z.lag.1 + 1 + tt + z.diff.lag)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.53020 -0.10276 -0.00045  0.09881  0.46831 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)  
## (Intercept)  3.298e-02  3.272e-02   1.008   0.3142  
## z.lag.1     -4.086e-03  4.835e-03  -0.845   0.3987  
## tt          -8.148e-05  9.226e-05  -0.883   0.3779  
## z.diff.lag   1.272e-01  5.599e-02   2.271   0.0238 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.15 on 314 degrees of freedom
## Multiple R-squared:  0.02039,    Adjusted R-squared:  0.01103 
## F-statistic: 2.179 on 3 and 314 DF,  p-value: 0.09046
## 
## 
## Value of test-statistic is: -0.8451 0.4764 0.6743 
## 
## Critical values for test statistics: 
##       1pct  5pct 10pct
## tau3 -3.98 -3.42 -3.13
## phi2  6.15  4.71  4.05
## phi3  8.34  6.30  5.36
summary(ur.df(ts_data[,"unemploy_val"], type = "trend", selectlags = "AIC"))
## 
## ############################################### 
## # Augmented Dickey-Fuller Test Unit Root Test # 
## ############################################### 
## 
## Test regression trend 
## 
## 
## Call:
## lm(formula = z.diff ~ z.lag.1 + 1 + tt + z.diff.lag)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.53020 -0.10276 -0.00045  0.09881  0.46831 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)  
## (Intercept)  3.298e-02  3.272e-02   1.008   0.3142  
## z.lag.1     -4.086e-03  4.835e-03  -0.845   0.3987  
## tt          -8.148e-05  9.226e-05  -0.883   0.3779  
## z.diff.lag   1.272e-01  5.599e-02   2.271   0.0238 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.15 on 314 degrees of freedom
## Multiple R-squared:  0.02039,    Adjusted R-squared:  0.01103 
## F-statistic: 2.179 on 3 and 314 DF,  p-value: 0.09046
## 
## 
## Value of test-statistic is: -0.8451 0.4764 0.6743 
## 
## Critical values for test statistics: 
##       1pct  5pct 10pct
## tau3 -3.98 -3.42 -3.13
## phi2  6.15  4.71  4.05
## phi3  8.34  6.30  5.36
summary(ur.df(ts_data[,"ind_prod_val"], type = "trend", lags = 1))
## 
## ############################################### 
## # Augmented Dickey-Fuller Test Unit Root Test # 
## ############################################### 
## 
## Test regression trend 
## 
## 
## Call:
## lm(formula = z.diff ~ z.lag.1 + 1 + tt + z.diff.lag)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.0558 -0.3459  0.0220  0.3401  1.5586 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)   
## (Intercept)  2.3048370  0.8244363   2.796  0.00550 **
## z.lag.1     -0.0248042  0.0094428  -2.627  0.00904 **
## tt           0.0009083  0.0006447   1.409  0.15990   
## z.diff.lag   0.1532115  0.0550711   2.782  0.00573 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5919 on 314 degrees of freedom
## Multiple R-squared:  0.05133,    Adjusted R-squared:  0.04227 
## F-statistic: 5.664 on 3 and 314 DF,  p-value: 0.0008648
## 
## 
## Value of test-statistic is: -2.6268 4.2478 4.3631 
## 
## Critical values for test statistics: 
##       1pct  5pct 10pct
## tau3 -3.98 -3.42 -3.13
## phi2  6.15  4.71  4.05
## phi3  8.34  6.30  5.36
summary(ur.df(ts_data[,"ind_prod_val"], type = "trend", selectlags = "AIC"))
## 
## ############################################### 
## # Augmented Dickey-Fuller Test Unit Root Test # 
## ############################################### 
## 
## Test regression trend 
## 
## 
## Call:
## lm(formula = z.diff ~ z.lag.1 + 1 + tt + z.diff.lag)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.0558 -0.3459  0.0220  0.3401  1.5586 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)   
## (Intercept)  2.3048370  0.8244363   2.796  0.00550 **
## z.lag.1     -0.0248042  0.0094428  -2.627  0.00904 **
## tt           0.0009083  0.0006447   1.409  0.15990   
## z.diff.lag   0.1532115  0.0550711   2.782  0.00573 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5919 on 314 degrees of freedom
## Multiple R-squared:  0.05133,    Adjusted R-squared:  0.04227 
## F-statistic: 5.664 on 3 and 314 DF,  p-value: 0.0008648
## 
## 
## Value of test-statistic is: -2.6268 4.2478 4.3631 
## 
## Critical values for test statistics: 
##       1pct  5pct 10pct
## tau3 -3.98 -3.42 -3.13
## phi2  6.15  4.71  4.05
## phi3  8.34  6.30  5.36
summary(ur.df(ts_data[,"volatility_val"], type = "trend", lags = 1))
## 
## ############################################### 
## # Augmented Dickey-Fuller Test Unit Root Test # 
## ############################################### 
## 
## Test regression trend 
## 
## 
## Call:
## lm(formula = z.diff ~ z.lag.1 + 1 + tt + z.diff.lag)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -10.3531  -1.9823  -0.4082   0.8580  31.0944 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  4.107624   0.841337   4.882 1.67e-06 ***
## z.lag.1     -0.168961   0.030074  -5.618 4.26e-08 ***
## tt          -0.004586   0.002418  -1.896  0.05882 .  
## z.diff.lag   0.156614   0.055702   2.812  0.00524 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.75 on 314 degrees of freedom
## Multiple R-squared:  0.09615,    Adjusted R-squared:  0.08751 
## F-statistic: 11.13 on 3 and 314 DF,  p-value: 5.791e-07
## 
## 
## Value of test-statistic is: -5.6181 10.5257 15.7866 
## 
## Critical values for test statistics: 
##       1pct  5pct 10pct
## tau3 -3.98 -3.42 -3.13
## phi2  6.15  4.71  4.05
## phi3  8.34  6.30  5.36
summary(ur.df(ts_data[,"volatility_val"], type = "trend", selectlags = "AIC"))
## 
## ############################################### 
## # Augmented Dickey-Fuller Test Unit Root Test # 
## ############################################### 
## 
## Test regression trend 
## 
## 
## Call:
## lm(formula = z.diff ~ z.lag.1 + 1 + tt + z.diff.lag)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -10.3531  -1.9823  -0.4082   0.8580  31.0944 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  4.107624   0.841337   4.882 1.67e-06 ***
## z.lag.1     -0.168961   0.030074  -5.618 4.26e-08 ***
## tt          -0.004586   0.002418  -1.896  0.05882 .  
## z.diff.lag   0.156614   0.055702   2.812  0.00524 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.75 on 314 degrees of freedom
## Multiple R-squared:  0.09615,    Adjusted R-squared:  0.08751 
## F-statistic: 11.13 on 3 and 314 DF,  p-value: 5.791e-07
## 
## 
## Value of test-statistic is: -5.6181 10.5257 15.7866 
## 
## Critical values for test statistics: 
##       1pct  5pct 10pct
## tau3 -3.98 -3.42 -3.13
## phi2  6.15  4.71  4.05
## phi3  8.34  6.30  5.36

How to know if they are stable, look at the A = tau3(5pct): -3.42 and B = The first number of Value of test-statistic: -5.9771 And, if B < A, which means it is stationary data. I(0), so I found out vol and ind are I(0).

But why ur.df shows they are stable?

Here is the data after diff once

df_BOF_diff <- df_BOF %>%
  mutate(across(
    .cols = -date,
    .fns  = ~ c(NA, diff(.)),
    .names = "d_{.col}"
  )) %>%
  slice(-1)

orig_vars <- names(df_BOF)[names(df_BOF) != "date"]

head(df_BOF_diff)

Using for loop to make their ACF and PACF plots

for (v in orig_vars) {

  # raw and diff series
  raw_series  <- ts(df_BOF[[v]], start = c(1997,1), frequency = 12)
  diff_series <- ts(df_BOF_diff[[paste0("d_", v)]], start = c(1997,2), frequency = 12)

  # ACF
  p_raw_acf <- ggAcf(raw_series) +
    ggtitle(paste("ACF of", v)) +
    theme_minimal(base_size = 14)

  p_diff_acf <- ggAcf(diff_series) +
    ggtitle(paste("ACF of", paste0("d_", v))) +
    theme_minimal(base_size = 14)

  # PACF
  p_raw_pacf <- ggPacf(raw_series) +
    ggtitle(paste("PACF of", v)) +
    theme_minimal(base_size = 14)

  p_diff_pacf <- ggPacf(diff_series) +
    ggtitle(paste("PACF of", paste0("d_", v))) +
    theme_minimal(base_size = 14)

  # Combine into 2×2 layout
  full_plot <- (p_raw_acf | p_diff_acf) /
               (p_raw_pacf | p_diff_pacf)

  print(full_plot)
}

ts_data_diff <- ts(df_BOF_diff, start = c(1997,1),frequency = 12)

summary(ur.df(ts_data_diff[,"d_ind_prod_val"], type = "trend", selectlags = "AIC"))
## 
## ############################################### 
## # Augmented Dickey-Fuller Test Unit Root Test # 
## ############################################### 
## 
## Test regression trend 
## 
## 
## Call:
## lm(formula = z.diff ~ z.lag.1 + 1 + tt + z.diff.lag)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.0765 -0.3225  0.0058  0.3193  1.7690 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.1132855  0.0677311   1.673  0.09541 .  
## z.lag.1     -0.6863009  0.0721736  -9.509  < 2e-16 ***
## tt          -0.0003772  0.0003634  -1.038  0.30006    
## z.diff.lag  -0.1940835  0.0553455  -3.507  0.00052 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.5877 on 313 degrees of freedom
## Multiple R-squared:  0.4474, Adjusted R-squared:  0.4421 
## F-statistic: 84.47 on 3 and 313 DF,  p-value: < 2.2e-16
## 
## 
## Value of test-statistic is: -9.509 30.1478 45.2162 
## 
## Critical values for test statistics: 
##       1pct  5pct 10pct
## tau3 -3.98 -3.42 -3.13
## phi2  6.15  4.71  4.05
## phi3  8.34  6.30  5.36

here is what we should do next? ______________________________________________________ ______________________________________________________ we have I(1), we can do cointegration test such as Johansen

if it passes, then VAR, if not, maybe ARIMA, test white noise

df_aaa <- df_BOF %>%
 dplyr::select(
    aaa_corp_bond_val,
    market_yield_val,
    unemploy_val,
    ind_prod_val
  ) %>%
  mutate(across(everything(), as.numeric)) %>% 
  na.omit()

lag_sel2 <- VARselect(df_aaa, lag.max = 12, type = "trend")
lag_sel2$selection
## AIC(n)  HQ(n)  SC(n) FPE(n) 
##      2      1      1      2
p_opt2 <- as.integer(lag_sel2$selection["AIC(n)"])
p_opt2
## [1] 2
df_bbb <- df_BOF %>%
 dplyr::select(
    bbb_corp_bond_val,
    market_yield_val,
    unemploy_val,
    ind_prod_val
  ) %>%
  mutate(across(everything(), as.numeric)) %>% 
  na.omit()

lag_sel1 <- VARselect(df_bbb, lag.max = 12, type = "trend")
lag_sel1$selection
## AIC(n)  HQ(n)  SC(n) FPE(n) 
##      2      2      1      2
p_opt1 <- as.integer(lag_sel1$selection["AIC(n)"])
p_opt1
## [1] 2

Using Johensen Test aaa_df’s r.

ts_aaa <- ts(df_aaa, start = c(1997,1), frequency = 12)

johansen_test_aaa <- ca.jo(
  ts_aaa,
  type = "trace",
  ecdet = "trend",
  K = 2
)

summary(johansen_test_aaa)
## 
## ###################### 
## # Johansen-Procedure # 
## ###################### 
## 
## Test type: trace statistic , with linear trend in cointegration 
## 
## Eigenvalues (lambda):
## [1] 2.400208e-01 1.379271e-01 6.320180e-02 3.132269e-02 1.224493e-02
## [6] 4.422176e-18
## 
## Values of teststatistic and critical values of test:
## 
##            test 10pct  5pct  1pct
## r <= 4 |   3.92 10.49 12.25 16.26
## r <= 3 |  14.04 22.76 25.32 30.45
## r <= 2 |  34.80 39.06 42.44 48.45
## r <= 1 |  82.00 59.14 62.99 70.05
## r = 0  | 169.27 83.20 87.31 96.58
## 
## Eigenvectors, normalised to first column:
## (These are the cointegration relations)
## 
##                      aaa_corp_bond_val.l2 market_yield_val.l2 unemploy_val.l2
## aaa_corp_bond_val.l2         1.0000000000          1.00000000      1.00000000
## market_yield_val.l2         -1.0581162059          4.87815265      2.15116153
## unemploy_val.l2              0.1276958944          1.44673308      0.70683847
## ind_prod_val.l2              0.0196228591         -0.65204040      0.54536008
## date.l2                      0.0005503576         -0.01439076     -0.01392828
## trend.l2                    -0.0187902000          0.53552580      0.45218521
##                      ind_prod_val.l2     date.l2    trend.l2
## aaa_corp_bond_val.l2      1.00000000  1.00000000  1.00000000
## market_yield_val.l2      -0.44346498 -2.32761730 -5.11868397
## unemploy_val.l2          -3.49978207  9.51337415 -0.81148946
## ind_prod_val.l2          -3.42700642  1.66831341  0.14915215
## date.l2                  -0.07471344  0.02201836 -0.01020533
## trend.l2                  2.66234422 -0.77160029  0.21053422
## 
## Weights W:
## (This is the loading matrix)
## 
##                     aaa_corp_bond_val.l2 market_yield_val.l2 unemploy_val.l2
## aaa_corp_bond_val.d          -0.07502961        -0.011725149   -0.0084967278
## market_yield_val.d            0.01952314        -0.009364052   -0.0155255346
## unemploy_val.d                0.12374997         0.003499374   -0.0008210305
## ind_prod_val.d               -0.42477885         0.025905937   -0.0261285170
## date.d                       -2.47893387        -0.478592344    0.2355113376
##                     ind_prod_val.l2       date.l2      trend.l2
## aaa_corp_bond_val.d   -1.397907e-03 -6.569503e-04 -4.431480e-16
## market_yield_val.d    -1.416843e-04  3.109457e-05 -5.875155e-16
## unemploy_val.d        -3.273902e-04 -6.391295e-04 -9.382984e-17
## ind_prod_val.d         2.640552e-05 -2.792741e-05  5.772250e-16
## date.d                 3.165729e-01 -2.198676e-01  4.759284e-14

models and prediction

For aaa’s vector prediction (same as bbb): \[ Y_t = \begin{pmatrix} aaa_t \\ market\_yield_t \\ ind\_prod_t \\ unemploy_t \end{pmatrix}. \]

Johansen’s test indicates one cointegration relation \((r = 1)\).
Thus, the VECM is: \[ \Delta Y_t = \alpha \beta' Y_{t-1} + \sum_{i=1}^{p-1} \Gamma_i \Delta Y_{t-i} + u_t. \]

The long-run equilibrium (cointegration vector) is: \[ \beta' Y_{t-1} = aaa_{t-1} - \beta_1 \cdot market\_yield_{t-1} - \beta_2 \cdot ind\_prod_{t-1} - \beta_3 \cdot unemploy_{t-1}. \]

Using Time Series cross-Validation : Setting cross parameters

start_year <- 1997
end_year <- 2025
initial_train_end <- 2005

all_pred <- data.frame(
  date = as.Date(character()),
  real = numeric(),
  pred = numeric()
)
get_rank <- function(jo, cval_level = "5pct") {

  tst <- jo@teststat
  if (is.matrix(tst)) {

    tst_vec <- as.numeric(tst[, 1])
  } else {
    tst_vec <- as.numeric(tst)
  }
  
  cv <- jo@cval
  if (is.matrix(cv)) {

    cn <- colnames(cv)
    pick_col <- which(grepl("5", cn, ignore.case = TRUE)[1])
    if (is.na(pick_col) || length(pick_col) == 0) pick_col <- 1
    cval_vec <- as.numeric(cv[, pick_col])
  } else {
    cval_vec <- as.numeric(cv)
  }
  
  n <- min(length(tst_vec), length(cval_vec))
  tst_vec <- tst_vec[1:n]
  cval_vec <- cval_vec[1:n]
  
  # rank = number of trace stats > critical value
  r <- sum(tst_vec > cval_vec)
  return(r)
}
library(vars)
library(urca)
library(lubridate)
library(ggplot2)


start_year <- 1997
end_year <- 2025
initial_train_end <- 2004
max_lag_try <- 8   

df_aaa$date <- as.Date(df_aaa$date)
dates <- df_aaa$date

ts_aaa <- df_aaa[, c(
  "aaa_corp_bond_val",
  "market_yield_val",
  "unemploy_val",
  "ind_prod_val"
)]
get_rank <- function(jo, cval_level = "5pct") {
  tst <- jo@teststat
  tst_vec <- if (is.matrix(tst)) as.numeric(tst[, 1]) else as.numeric(tst)
  
  cv <- jo@cval
  if (is.matrix(cv)) {
    cn <- colnames(cv)
    pick_col <- which(grepl("5", cn, ignore.case = TRUE)[1])
    if (is.na(pick_col) || length(pick_col) == 0) pick_col <- 1
    cval_vec <- as.numeric(cv[, pick_col])
  } else {
    cval_vec <- as.numeric(cv)
  }
  
  n <- min(length(tst_vec), length(cval_vec))
  tst_vec <- tst_vec[1:n]
  cval_vec <- cval_vec[1:n]
  
  r <- sum(tst_vec > cval_vec)
  return(r)
}


all_pred <- data.frame(
  date = as.Date(character()),
  real = numeric(),
  pred = numeric(),
  lower = numeric(),
  upper = numeric(),
  method = character(),
  stringsAsFactors = FALSE
)

suppress_warns <- FALSE



for (yr in initial_train_end:(end_year - 1)) {
  
 
  train_idx <- which(year(dates) <= yr)
  test_idx_all <- which(year(dates) == yr + 1)
  if (length(test_idx_all) == 0) next
  test_idx <- test_idx_all[1]
  
  train_data <- ts_aaa[train_idx, , drop = FALSE]
  test_data  <- ts_aaa[test_idx, , drop = FALSE]
  
 
  if (nrow(train_data) <= max_lag_try + 1) {
    p_opt <- max(1, floor(nrow(train_data)/5))
  } else {
    vs <- VARselect(train_data, lag.max = max_lag_try, type = "trend")
    if (!is.null(vs$selection) && "AIC(n)" %in% names(vs$selection)) {
      p_opt <- as.integer(vs$selection["AIC(n)"])
    } else if (!is.null(vs$criteria)) {
      p_opt <- which.min(vs$criteria["AIC", ])
    } else {
      p_opt <- 2
    }
    if (is.na(p_opt) || p_opt < 1) p_opt <- 1
  }
  

  ca_call <- function() ca.jo(train_data, type = "trace", ecdet = "trend", K = p_opt)
  jo_aaa <- if (suppress_warns) suppressWarnings(ca_call()) else ca_call()
  
  r <- tryCatch(get_rank(jo_aaa), error = function(e) 0)
  

  if (r == 0) {

    var_model <- if (suppress_warns) suppressWarnings(VAR(train_data, p = p_opt, type = "const")) else VAR(train_data, p = p_opt, type = "trend")
    fc <- predict(var_model, n.ahead = 12)
    
    pred_val   <- as.numeric(fc$fcst[[1]][1, "fcst"])
    pred_lower <- as.numeric(fc$fcst[[1]][1, "lower"])
    pred_upper <- as.numeric(fc$fcst[[1]][1, "upper"])
    method_used <- "VAR"
    
  } else {
    # VECM -> VAR
    var_model_from_jo <- if (suppress_warns) suppressWarnings(vec2var(jo_aaa, r = r)) else vec2var(jo_aaa, r = r)
    pred_list <- predict(var_model_from_jo, n.ahead = 1)
    
    first_var_name <- colnames(train_data)[1]
    

    if (!is.null(pred_list$fcst) && first_var_name %in% names(pred_list$fcst)) {
      pred_val   <- as.numeric(pred_list$fcst[[first_var_name]][1, "fcst"])
      pred_lower <- tryCatch(as.numeric(pred_list$fcst[[first_var_name]][1, "lower"]), error = function(e) NA)
      pred_upper <- tryCatch(as.numeric(pred_list$fcst[[first_var_name]][1, "upper"]), error = function(e) NA)
    } else {
      pred_val   <- as.numeric(pred_list$fcst[[1]][1, 1])
      pred_lower <- NA
      pred_upper <- NA
    }
    
    method_used <- paste0("VECM_r", r)
  }
  
  real_val <- as.numeric(test_data[[1]])
  

  all_pred <- rbind(all_pred, data.frame(
    date  = dates[test_idx],
    real  = real_val,
    pred  = pred_val,
    lower = pred_lower,
    upper = pred_upper,
    method = method_used,
    stringsAsFactors = FALSE
  ))
  
  cat("Year", yr, ": p_opt =", p_opt, ", r =", r, ", method =", method_used, "\n")
}
## Year 2004 : p_opt = 2 , r = 0 , method = VAR 
## Year 2005 : p_opt = 2 , r = 0 , method = VAR 
## Year 2006 : p_opt = 2 , r = 0 , method = VAR 
## Year 2007 : p_opt = 2 , r = 0 , method = VAR 
## Year 2008 : p_opt = 2 , r = 1 , method = VECM_r1 
## Year 2009 : p_opt = 7 , r = 0 , method = VAR 
## Year 2010 : p_opt = 8 , r = 0 , method = VAR 
## Year 2011 : p_opt = 6 , r = 1 , method = VECM_r1 
## Year 2012 : p_opt = 6 , r = 1 , method = VECM_r1 
## Year 2013 : p_opt = 6 , r = 1 , method = VECM_r1 
## Year 2014 : p_opt = 6 , r = 1 , method = VECM_r1 
## Year 2015 : p_opt = 7 , r = 2 , method = VECM_r2 
## Year 2016 : p_opt = 7 , r = 2 , method = VECM_r2 
## Year 2017 : p_opt = 7 , r = 2 , method = VECM_r2 
## Year 2018 : p_opt = 7 , r = 2 , method = VECM_r2 
## Year 2021 : p_opt = 7 , r = 2 , method = VECM_r2 
## Year 2022 : p_opt = 7 , r = 2 , method = VECM_r2 
## Year 2023 : p_opt = 7 , r = 2 , method = VECM_r2 
## Year 2024 : p_opt = 7 , r = 2 , method = VECM_r2

monthly:

dates <- df_aaa$date
ts_aaa <- df_aaa[, c("aaa_corp_bond_val", "market_yield_val", "unemploy_val", "ind_prod_val")]


initial_train_end_idx <- which(year(dates) == initial_train_end & month(dates) == 12)
if (length(initial_train_end_idx) == 0) {
  stop("initial_train_end must match some date in df_aaa.")
}

all_pred <- data.frame(
  date = as.Date(character()),
  real = numeric(),
  pred = numeric(),
  lower = numeric(),
  upper = numeric(),
  method = character(),
  stringsAsFactors = FALSE
)

for (i in seq(from = initial_train_end_idx, to = (nrow(ts_aaa) - 1))) {
  
  train_idx <- 1:i
  test_idx  <- i + 1
  
  train_data <- ts_aaa[train_idx, , drop = FALSE]
  test_data  <- ts_aaa[test_idx, , drop = FALSE]

  # lag selection
  if (nrow(train_data) <= max_lag_try + 1) {
    p_opt <- max(1, floor(nrow(train_data)/5))
  } else {
    vs <- VARselect(train_data, lag.max = max_lag_try, type = "trend")
    if (!is.null(vs$selection) && "AIC(n)" %in% names(vs$selection)) {
      p_opt <- as.integer(vs$selection["AIC(n)"])
    } else if (!is.null(vs$criteria)) {
      p_opt <- which.min(vs$criteria["AIC", ])
    } else {
      p_opt <- 2
    }
    if (is.na(p_opt) || p_opt < 1) p_opt <- 1
  }

  # Johansen
  ca_call <- function() ca.jo(train_data, type = "trace", ecdet = "trend", K = p_opt)
  jo_aaa <- ca_call()
  
  r <- tryCatch(get_rank(jo_aaa), error = function(e) 0)

  # Model: VAR or VECM
  if (r == 0) {
    var_model <- VAR(train_data, p = p_opt, type = "trend")
    
    fc <- predict(var_model, n.ahead = 1)  # monthly forecast: only next month
    
    pred_val   <- as.numeric(fc$fcst[[1]][1, "fcst"])
    pred_lower <- as.numeric(fc$fcst[[1]][1, "lower"])
    pred_upper <- as.numeric(fc$fcst[[1]][1, "upper"])
    method_used <- "VAR"
    
  } else {
    var_model_from_jo <- vec2var(jo_aaa, r = r)
    pred_list <- predict(var_model_from_jo, n.ahead = 1)

    first_var_name <- colnames(train_data)[1]

    if (first_var_name %in% names(pred_list$fcst)) {
      pred_val   <- as.numeric(pred_list$fcst[[first_var_name]][1, "fcst"])
      pred_lower <- tryCatch(as.numeric(pred_list$fcst[[first_var_name]][1, "lower"]), error = function(e) NA)
      pred_upper <- tryCatch(as.numeric(pred_list$fcst[[first_var_name]][1, "upper"]), error = function(e) NA)
    } else {
      pred_val <- as.numeric(pred_list$fcst[[1]][1, 1])
      pred_lower <- NA
      pred_upper <- NA
    }

    method_used <- paste0("VECM_r", r)
  }

  real_val <- as.numeric(test_data[[1]])

  all_pred <- rbind(all_pred, data.frame(
    date  = dates[test_idx],
    real  = real_val,
    pred  = pred_val,
    lower = pred_lower,
    upper = pred_upper,
    method = method_used,
    stringsAsFactors = FALSE
  ))

}
all_pred <- na.omit(all_pred)


rmse_val <- sqrt(mean((all_pred$real - all_pred$pred)^2, na.rm = TRUE))
cat("RMSE:", rmse_val, "\n")
## RMSE: 0.4178355
a2 <- ggplot(all_pred, aes(x = date)) +
  geom_line(aes(y = real), size = 0.8, color = "red") +
  geom_line(aes(y = pred), linetype = "dashed", size = 0.8, color = "blue") +
  geom_ribbon(aes(ymin = lower, ymax = upper), fill = "lightblue", alpha = 0.2) +
  labs(title = "Actual vs Predicted (rolling forecast)",
       subtitle = paste0("RMSE = ", round(rmse_val, 6)),
       y = "aaa_corp_bond_val") +
  theme_minimal()
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
df_aaa_filtered <- df_aaa[df_aaa$date >= as.Date("2004-02-01"), ]

a1 <- ggplot(df_aaa_filtered, aes(x = date)) +
  geom_line(aes(y = aaa_corp_bond_val), size = 0.8) +
  theme_minimal()

a1/a2

all_pred$residual <- all_pred$real - all_pred$pred

b1 <- ggplot(all_pred, aes(x = pred, y = residual)) +
  geom_point(alpha = 0.6) +
  geom_hline(yintercept = 0, linetype = "dashed") +
  labs(title = "Residual vs Predicted") +
  theme_minimal()

b2 <- ggplot(all_pred, aes(x = residual)) +
  geom_density(fill = "skyblue", alpha = 0.6) +
  labs(title = "Residual KDE Density",
       x = "Residual",
       y = "Density") +
  theme_minimal()

b1/b2