getwd()
## [1] "C:/Users/clin0/OneDrive/Desktop/DSE Applied stastics/Projects/Group_project/datasets"
list.files()
## [1] "BAMLC0A1CAAAEY.csv"
## [2] "Data_refund.xlsx"
## [3] "DFF.csv"
## [4] "DGS10.csv"
## [5] "EXPINF30YR.csv"
## [6] "FEDFUNDS.csv"
## [7] "GS2.csv"
## [8] "INDPRO.csv"
## [9] "Monthly_Earliest_Index_1919-2025.csv"
## [10] "Monthly_Joined_Index_2003-2025.csv"
## [11] "UNRATE.csv"
## [12] "VIXCLS.csv"
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.1 ✔ stringr 1.5.1
## ✔ lubridate 1.9.4 ✔ tibble 3.3.0
## ✔ purrr 1.1.0 ✔ tidyr 1.3.1
## ✔ readr 2.1.5
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tidyr)
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.5.2
## corrplot 0.95 loaded
library(vars)
## Warning: package 'vars' was built under R version 4.5.2
## Loading required package: MASS
##
## Attaching package: 'MASS'
##
## The following object is masked from 'package:dplyr':
##
## select
##
## Loading required package: strucchange
## Warning: package 'strucchange' was built under R version 4.5.2
## Loading required package: zoo
## Warning: package 'zoo' was built under R version 4.5.2
##
## Attaching package: 'zoo'
##
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
##
## Loading required package: sandwich
## Warning: package 'sandwich' was built under R version 4.5.2
##
## Attaching package: 'strucchange'
##
## The following object is masked from 'package:stringr':
##
## boundary
##
## Loading required package: urca
## Warning: package 'urca' was built under R version 4.5.2
## Loading required package: lmtest
## Warning: package 'lmtest' was built under R version 4.5.2
library(lmtest)
library(tsibble)
## Warning: package 'tsibble' was built under R version 4.5.2
## Registered S3 method overwritten by 'tsibble':
## method from
## as_tibble.grouped_df dplyr
##
## Attaching package: 'tsibble'
##
## The following object is masked from 'package:zoo':
##
## index
##
## The following object is masked from 'package:lubridate':
##
## interval
##
## The following objects are masked from 'package:base':
##
## intersect, setdiff, union
library(lubridate)
library(urca)
library(tseries)
## Warning: package 'tseries' was built under R version 4.5.2
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
library(patchwork)
##
## Attaching package: 'patchwork'
##
## The following object is masked from 'package:MASS':
##
## area
library(forecast)
## Warning: package 'forecast' was built under R version 4.5.2
library(tsDyn)
## Warning: package 'tsDyn' was built under R version 4.5.2
library(lubridate)
df <- read.csv("Monthly_Earliest_Index_1919-2025.csv")
df <- df %>%
mutate(date = as.Date(date)) %>%
as_tsibble(index = date)
head(df)
select
## function (obj)
## UseMethod("select")
## <bytecode: 0x0000013f248d5ce8>
## <environment: namespace:MASS>
df_BOF <- df %>%
dplyr::select(
date,
market_yield_val,
unemploy_val,
ind_prod_val,
volatility_val,
aaa_corp_bond_val,
bbb_corp_bond_val
) %>%
mutate(date = as.Date(date)) %>%
filter(date >= as.Date("1997-01-01")) %>%
filter(!(date >= as.Date("2020-01-01") & date <= as.Date("2021-12-31")))
head(df_BOF)
nrow(df_BOF)
## [1] 320
Here is what we should focus on below:
#evaluate
Row_vars <- names(df_BOF)[names(df_BOF) != "date"]
for (v in Row_vars) {
ts_raw <- ts(df_BOF[[v]], start = c(1997,1), frequency = 12)
# ACF plot
p_acf <- ggAcf(ts_raw,lag.max = 40) +
ggtitle(paste("ACF of", v)) +
theme_minimal(base_size = 14)
# PACF plot
p_pacf <- ggPacf(ts_raw,lag.max = 40) +
ggtitle(paste("PACF of", v)) +
theme_minimal(base_size = 14)
print(p_acf / p_pacf)
}
library(ggplotify)
## Warning: package 'ggplotify' was built under R version 4.5.2
vars <- names(df_BOF)[names(df_BOF) != "date"]
for (v in vars) {
cat("========================================\n")
cat("variable:", v, "\n")
cat("========================================\n\n")
x <- ts(df_BOF[[v]], start=c(1997,1), frequency=12)
dx <- diff(x)
# row data
df_raw <- data.frame(
time = as.numeric(time(x)),
value = as.numeric(x)
)
p1 <- ggplot(df_raw, aes(time, value)) +
geom_line() +
ggtitle(paste(v, "- raw data"))
# diff once
df_dx <- data.frame(time = as.numeric(time(dx)), value = as.numeric(dx))
p2 <- ggplot(df_dx, aes(time, value)) +
geom_line() +
ggtitle(paste(v, "- diff once"))
# ACF
acf_obj <- acf(x, plot = FALSE, lag.max = 40)
acf_df <- data.frame(
lag = acf_obj$lag[-1],
acf = acf_obj$acf[-1]
)
Tn <- length(x)
bart_var <- sapply(1:nrow(acf_df), function(k) {
if (k == 1) return(1/Tn)
return( (1/Tn) * (1 + 2 * sum(acf_df$acf[1:(k-1)]^2)) )
})
acf_df$upper <- 1.96 * sqrt(bart_var)
acf_df$lower <- -1.96 * sqrt(bart_var)
p3 <- ggplot(acf_df, aes(lag, acf)) +
geom_ribbon(aes(ymin = lower, ymax = upper),
fill = "skyblue", alpha = 0.3) +
geom_segment(aes(xend = lag, yend = 0)) +
geom_point(size = 1.8) +
geom_hline(yintercept = 0) +
ggtitle(paste(v, "- ACF (Bartlett CI fan shape)"))
# PACF
p4 <- as.ggplot(function() {
pacf(x, lag.max = 40, main = paste(v, "- PACF"))
})
#Distribution
df_res <- data.frame(x = as.numeric(x))
p5 <- ggplot(df_res, aes(x)) +
geom_histogram(aes(y = ..density..), bins = 30, fill="skyblue") +
geom_density(color="red") +
ggtitle(paste(v, "- Distribution"))
# rolling
roll_mean <- zoo::rollmean(x, k = 12, fill = NA)
df_roll <- data.frame(
time = as.numeric(time(x)),
x = as.numeric(x),
rm = as.numeric(roll_mean)
)
p6 <- ggplot(df_roll, aes(time)) +
geom_line(aes(y=x), color="black") +
geom_line(aes(y=rm), color="red") +
ggtitle(paste(v, "- Rolling Mean (12M)"))
#------------------------------------------------
print((p1 | p2) / (p3 | p4) / (p5 | p6))
}
## ========================================
## variable: market_yield_val
## ========================================
## Warning: The dot-dot notation (`..density..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(density)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: Removed 11 rows containing missing values or values outside the scale range
## (`geom_line()`).
## ========================================
## variable: unemploy_val
## ========================================
## Warning: Removed 11 rows containing missing values or values outside the scale range
## (`geom_line()`).
## ========================================
## variable: ind_prod_val
## ========================================
## Warning: Removed 11 rows containing missing values or values outside the scale range
## (`geom_line()`).
## ========================================
## variable: volatility_val
## ========================================
## Warning: Removed 11 rows containing missing values or values outside the scale range
## (`geom_line()`).
## ========================================
## variable: aaa_corp_bond_val
## ========================================
## Warning: Removed 11 rows containing missing values or values outside the scale range
## (`geom_line()`).
## ========================================
## variable: bbb_corp_bond_val
## ========================================
## Warning: Removed 11 rows containing missing values or values outside the scale range
## (`geom_line()`).
Before we go forward to models, we have to keep them stationary.
ADF Test: H0: This is not stationary H1: This is stationary
Lag selection:
test <- ur.df(df_BOF$market_yield_val, type="trend", lags=1, selectlags="Fixed")
sapply(slotNames(test), function(s) class(slot(test, s)))
## $y
## [1] "numeric"
##
## $model
## [1] "character"
##
## $lags
## [1] "integer"
##
## $cval
## [1] "matrix" "array"
##
## $res
## [1] "numeric"
##
## $teststat
## [1] "matrix" "array"
##
## $testreg
## [1] "summary.lm"
##
## $test.name
## [1] "character"
ADF equation: \[ \Delta y_t = \alpha + \beta t + \gamma y_{t-1} + \sum_{i=1}^{p} \delta_i \Delta y_{t-i} + \varepsilon_t . \] Using AIC/BIC to evaluate.
\[ AIC=−2log(L)+2k \] \[ BIC=−2log(L)+klog(n) \]
Set the equation manually to find the lag number:
Setting ADF:
ts_data <- ts(df_BOF, start = c(1997,1),frequency = 12)
adf.test(ts_data[,"market_yield_val"],k=1)
##
## Augmented Dickey-Fuller Test
##
## data: ts_data[, "market_yield_val"]
## Dickey-Fuller = -2.1582, Lag order = 1, p-value = 0.5098
## alternative hypothesis: stationary
adf.test(ts_data[,"unemploy_val"],k=1)
##
## Augmented Dickey-Fuller Test
##
## data: ts_data[, "unemploy_val"]
## Dickey-Fuller = -0.84513, Lag order = 1, p-value = 0.9569
## alternative hypothesis: stationary
adf.test(ts_data[,"ind_prod_val"],k=1)
##
## Augmented Dickey-Fuller Test
##
## data: ts_data[, "ind_prod_val"]
## Dickey-Fuller = -2.6268, Lag order = 1, p-value = 0.3122
## alternative hypothesis: stationary
adf.test(ts_data[,"volatility_val"],k=1)
## Warning in adf.test(ts_data[, "volatility_val"], k = 1): p-value smaller than
## printed p-value
##
## Augmented Dickey-Fuller Test
##
## data: ts_data[, "volatility_val"]
## Dickey-Fuller = -5.6181, Lag order = 1, p-value = 0.01
## alternative hypothesis: stationary
adf.test(ts_data[,"aaa_corp_bond_val"],k=1)
##
## Augmented Dickey-Fuller Test
##
## data: ts_data[, "aaa_corp_bond_val"]
## Dickey-Fuller = -1.7299, Lag order = 1, p-value = 0.6904
## alternative hypothesis: stationary
adf.test(ts_data[,"bbb_corp_bond_val"],k=1)
##
## Augmented Dickey-Fuller Test
##
## data: ts_data[, "bbb_corp_bond_val"]
## Dickey-Fuller = -2.6058, Lag order = 1, p-value = 0.3211
## alternative hypothesis: stationary
P-value > 0.05, which means Null hypothesis is true.
But, is it true?
Here is Kpss test:
H0: stationary H1: none stationary
# Level
kpss.test(ts_data[,"unemploy_val"],null = "Level")
## Warning in kpss.test(ts_data[, "unemploy_val"], null = "Level"): p-value
## smaller than printed p-value
##
## KPSS Test for Level Stationarity
##
## data: ts_data[, "unemploy_val"]
## KPSS Level = 0.92066, Truncation lag parameter = 5, p-value = 0.01
kpss.test(ts_data[,"ind_prod_val"],null = "Level")
## Warning in kpss.test(ts_data[, "ind_prod_val"], null = "Level"): p-value
## smaller than printed p-value
##
## KPSS Test for Level Stationarity
##
## data: ts_data[, "ind_prod_val"]
## KPSS Level = 3.7314, Truncation lag parameter = 5, p-value = 0.01
kpss.test(ts_data[,"volatility_val"],null = "Level")
## Warning in kpss.test(ts_data[, "volatility_val"], null = "Level"): p-value
## smaller than printed p-value
##
## KPSS Test for Level Stationarity
##
## data: ts_data[, "volatility_val"]
## KPSS Level = 0.81377, Truncation lag parameter = 5, p-value = 0.01
kpss.test(ts_data[,"unemploy_val"],null = "Trend")
## Warning in kpss.test(ts_data[, "unemploy_val"], null = "Trend"): p-value
## smaller than printed p-value
##
## KPSS Test for Trend Stationarity
##
## data: ts_data[, "unemploy_val"]
## KPSS Trend = 0.87405, Truncation lag parameter = 5, p-value = 0.01
kpss.test(ts_data[,"ind_prod_val"],null = "Trend")
##
## KPSS Test for Trend Stationarity
##
## data: ts_data[, "ind_prod_val"]
## KPSS Trend = 0.20896, Truncation lag parameter = 5, p-value = 0.01264
kpss.test(ts_data[,"volatility_val"],null = "Trend")
## Warning in kpss.test(ts_data[, "volatility_val"], null = "Trend"): p-value
## greater than printed p-value
##
## KPSS Test for Trend Stationarity
##
## data: ts_data[, "volatility_val"]
## KPSS Trend = 0.11294, Truncation lag parameter = 5, p-value = 0.1
Lets look at the box plots:
df_long <- df_BOF %>%
dplyr::select(date, market_yield_val, ind_prod_val, volatility_val, unemploy_val, aaa_corp_bond_val, bbb_corp_bond_val) %>%
pivot_longer(
cols = -date,
names_to = "variable",
values_to = "value"
)
ggplot(df_long, aes(x = variable, y = value, fill = variable)) +
geom_boxplot() +
theme_minimal() +
labs(
title = "Boxplots of Differenced Series",
x = "Variable",
y = "Value"
) +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
legend.position = "none"
)
Here is simply to confirm whether the data is truly stable by ur.df methods.
compare the lag I selected to auto select lag number.
summary(ur.df(ts_data[,"unemploy_val"], type = "trend", lags = 1))
##
## ###############################################
## # Augmented Dickey-Fuller Test Unit Root Test #
## ###############################################
##
## Test regression trend
##
##
## Call:
## lm(formula = z.diff ~ z.lag.1 + 1 + tt + z.diff.lag)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.53020 -0.10276 -0.00045 0.09881 0.46831
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.298e-02 3.272e-02 1.008 0.3142
## z.lag.1 -4.086e-03 4.835e-03 -0.845 0.3987
## tt -8.148e-05 9.226e-05 -0.883 0.3779
## z.diff.lag 1.272e-01 5.599e-02 2.271 0.0238 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.15 on 314 degrees of freedom
## Multiple R-squared: 0.02039, Adjusted R-squared: 0.01103
## F-statistic: 2.179 on 3 and 314 DF, p-value: 0.09046
##
##
## Value of test-statistic is: -0.8451 0.4764 0.6743
##
## Critical values for test statistics:
## 1pct 5pct 10pct
## tau3 -3.98 -3.42 -3.13
## phi2 6.15 4.71 4.05
## phi3 8.34 6.30 5.36
summary(ur.df(ts_data[,"unemploy_val"], type = "trend", selectlags = "AIC"))
##
## ###############################################
## # Augmented Dickey-Fuller Test Unit Root Test #
## ###############################################
##
## Test regression trend
##
##
## Call:
## lm(formula = z.diff ~ z.lag.1 + 1 + tt + z.diff.lag)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.53020 -0.10276 -0.00045 0.09881 0.46831
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.298e-02 3.272e-02 1.008 0.3142
## z.lag.1 -4.086e-03 4.835e-03 -0.845 0.3987
## tt -8.148e-05 9.226e-05 -0.883 0.3779
## z.diff.lag 1.272e-01 5.599e-02 2.271 0.0238 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.15 on 314 degrees of freedom
## Multiple R-squared: 0.02039, Adjusted R-squared: 0.01103
## F-statistic: 2.179 on 3 and 314 DF, p-value: 0.09046
##
##
## Value of test-statistic is: -0.8451 0.4764 0.6743
##
## Critical values for test statistics:
## 1pct 5pct 10pct
## tau3 -3.98 -3.42 -3.13
## phi2 6.15 4.71 4.05
## phi3 8.34 6.30 5.36
summary(ur.df(ts_data[,"ind_prod_val"], type = "trend", lags = 1))
##
## ###############################################
## # Augmented Dickey-Fuller Test Unit Root Test #
## ###############################################
##
## Test regression trend
##
##
## Call:
## lm(formula = z.diff ~ z.lag.1 + 1 + tt + z.diff.lag)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.0558 -0.3459 0.0220 0.3401 1.5586
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.3048370 0.8244363 2.796 0.00550 **
## z.lag.1 -0.0248042 0.0094428 -2.627 0.00904 **
## tt 0.0009083 0.0006447 1.409 0.15990
## z.diff.lag 0.1532115 0.0550711 2.782 0.00573 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5919 on 314 degrees of freedom
## Multiple R-squared: 0.05133, Adjusted R-squared: 0.04227
## F-statistic: 5.664 on 3 and 314 DF, p-value: 0.0008648
##
##
## Value of test-statistic is: -2.6268 4.2478 4.3631
##
## Critical values for test statistics:
## 1pct 5pct 10pct
## tau3 -3.98 -3.42 -3.13
## phi2 6.15 4.71 4.05
## phi3 8.34 6.30 5.36
summary(ur.df(ts_data[,"ind_prod_val"], type = "trend", selectlags = "AIC"))
##
## ###############################################
## # Augmented Dickey-Fuller Test Unit Root Test #
## ###############################################
##
## Test regression trend
##
##
## Call:
## lm(formula = z.diff ~ z.lag.1 + 1 + tt + z.diff.lag)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.0558 -0.3459 0.0220 0.3401 1.5586
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.3048370 0.8244363 2.796 0.00550 **
## z.lag.1 -0.0248042 0.0094428 -2.627 0.00904 **
## tt 0.0009083 0.0006447 1.409 0.15990
## z.diff.lag 0.1532115 0.0550711 2.782 0.00573 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5919 on 314 degrees of freedom
## Multiple R-squared: 0.05133, Adjusted R-squared: 0.04227
## F-statistic: 5.664 on 3 and 314 DF, p-value: 0.0008648
##
##
## Value of test-statistic is: -2.6268 4.2478 4.3631
##
## Critical values for test statistics:
## 1pct 5pct 10pct
## tau3 -3.98 -3.42 -3.13
## phi2 6.15 4.71 4.05
## phi3 8.34 6.30 5.36
summary(ur.df(ts_data[,"volatility_val"], type = "trend", lags = 1))
##
## ###############################################
## # Augmented Dickey-Fuller Test Unit Root Test #
## ###############################################
##
## Test regression trend
##
##
## Call:
## lm(formula = z.diff ~ z.lag.1 + 1 + tt + z.diff.lag)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10.3531 -1.9823 -0.4082 0.8580 31.0944
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.107624 0.841337 4.882 1.67e-06 ***
## z.lag.1 -0.168961 0.030074 -5.618 4.26e-08 ***
## tt -0.004586 0.002418 -1.896 0.05882 .
## z.diff.lag 0.156614 0.055702 2.812 0.00524 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.75 on 314 degrees of freedom
## Multiple R-squared: 0.09615, Adjusted R-squared: 0.08751
## F-statistic: 11.13 on 3 and 314 DF, p-value: 5.791e-07
##
##
## Value of test-statistic is: -5.6181 10.5257 15.7866
##
## Critical values for test statistics:
## 1pct 5pct 10pct
## tau3 -3.98 -3.42 -3.13
## phi2 6.15 4.71 4.05
## phi3 8.34 6.30 5.36
summary(ur.df(ts_data[,"volatility_val"], type = "trend", selectlags = "AIC"))
##
## ###############################################
## # Augmented Dickey-Fuller Test Unit Root Test #
## ###############################################
##
## Test regression trend
##
##
## Call:
## lm(formula = z.diff ~ z.lag.1 + 1 + tt + z.diff.lag)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10.3531 -1.9823 -0.4082 0.8580 31.0944
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.107624 0.841337 4.882 1.67e-06 ***
## z.lag.1 -0.168961 0.030074 -5.618 4.26e-08 ***
## tt -0.004586 0.002418 -1.896 0.05882 .
## z.diff.lag 0.156614 0.055702 2.812 0.00524 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.75 on 314 degrees of freedom
## Multiple R-squared: 0.09615, Adjusted R-squared: 0.08751
## F-statistic: 11.13 on 3 and 314 DF, p-value: 5.791e-07
##
##
## Value of test-statistic is: -5.6181 10.5257 15.7866
##
## Critical values for test statistics:
## 1pct 5pct 10pct
## tau3 -3.98 -3.42 -3.13
## phi2 6.15 4.71 4.05
## phi3 8.34 6.30 5.36
How to know if they are stable, look at the A = tau3(5pct): -3.42 and B = The first number of Value of test-statistic: -5.9771 And, if B < A, which means it is stationary data. I(0), so I found out vol and ind are I(0).
But why ur.df shows they are stable?
Here is the data after diff once
df_BOF_diff <- df_BOF %>%
mutate(across(
.cols = -date,
.fns = ~ c(NA, diff(.)),
.names = "d_{.col}"
)) %>%
slice(-1)
orig_vars <- names(df_BOF)[names(df_BOF) != "date"]
head(df_BOF_diff)
Using for loop to make their ACF and PACF plots
for (v in orig_vars) {
# raw and diff series
raw_series <- ts(df_BOF[[v]], start = c(1997,1), frequency = 12)
diff_series <- ts(df_BOF_diff[[paste0("d_", v)]], start = c(1997,2), frequency = 12)
# ACF
p_raw_acf <- ggAcf(raw_series) +
ggtitle(paste("ACF of", v)) +
theme_minimal(base_size = 14)
p_diff_acf <- ggAcf(diff_series) +
ggtitle(paste("ACF of", paste0("d_", v))) +
theme_minimal(base_size = 14)
# PACF
p_raw_pacf <- ggPacf(raw_series) +
ggtitle(paste("PACF of", v)) +
theme_minimal(base_size = 14)
p_diff_pacf <- ggPacf(diff_series) +
ggtitle(paste("PACF of", paste0("d_", v))) +
theme_minimal(base_size = 14)
# Combine into 2×2 layout
full_plot <- (p_raw_acf | p_diff_acf) /
(p_raw_pacf | p_diff_pacf)
print(full_plot)
}
ts_data_diff <- ts(df_BOF_diff, start = c(1997,1),frequency = 12)
summary(ur.df(ts_data_diff[,"d_ind_prod_val"], type = "trend", selectlags = "AIC"))
##
## ###############################################
## # Augmented Dickey-Fuller Test Unit Root Test #
## ###############################################
##
## Test regression trend
##
##
## Call:
## lm(formula = z.diff ~ z.lag.1 + 1 + tt + z.diff.lag)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.0765 -0.3225 0.0058 0.3193 1.7690
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.1132855 0.0677311 1.673 0.09541 .
## z.lag.1 -0.6863009 0.0721736 -9.509 < 2e-16 ***
## tt -0.0003772 0.0003634 -1.038 0.30006
## z.diff.lag -0.1940835 0.0553455 -3.507 0.00052 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5877 on 313 degrees of freedom
## Multiple R-squared: 0.4474, Adjusted R-squared: 0.4421
## F-statistic: 84.47 on 3 and 313 DF, p-value: < 2.2e-16
##
##
## Value of test-statistic is: -9.509 30.1478 45.2162
##
## Critical values for test statistics:
## 1pct 5pct 10pct
## tau3 -3.98 -3.42 -3.13
## phi2 6.15 4.71 4.05
## phi3 8.34 6.30 5.36
here is what we should do next? ______________________________________________________ ______________________________________________________ we have I(1), we can do cointegration test such as Johansen
if it passes, then VAR, if not, maybe ARIMA, test white noise
df_aaa <- df_BOF %>%
dplyr::select(
aaa_corp_bond_val,
market_yield_val,
unemploy_val,
ind_prod_val
) %>%
mutate(across(everything(), as.numeric)) %>%
na.omit()
lag_sel2 <- VARselect(df_aaa, lag.max = 12, type = "trend")
lag_sel2$selection
## AIC(n) HQ(n) SC(n) FPE(n)
## 2 1 1 2
p_opt2 <- as.integer(lag_sel2$selection["AIC(n)"])
p_opt2
## [1] 2
df_bbb <- df_BOF %>%
dplyr::select(
bbb_corp_bond_val,
market_yield_val,
unemploy_val,
ind_prod_val
) %>%
mutate(across(everything(), as.numeric)) %>%
na.omit()
lag_sel1 <- VARselect(df_bbb, lag.max = 12, type = "trend")
lag_sel1$selection
## AIC(n) HQ(n) SC(n) FPE(n)
## 2 2 1 2
p_opt1 <- as.integer(lag_sel1$selection["AIC(n)"])
p_opt1
## [1] 2
Using Johensen Test aaa_df’s r.
ts_aaa <- ts(df_aaa, start = c(1997,1), frequency = 12)
johansen_test_aaa <- ca.jo(
ts_aaa,
type = "trace",
ecdet = "trend",
K = 2
)
summary(johansen_test_aaa)
##
## ######################
## # Johansen-Procedure #
## ######################
##
## Test type: trace statistic , with linear trend in cointegration
##
## Eigenvalues (lambda):
## [1] 2.400208e-01 1.379271e-01 6.320180e-02 3.132269e-02 1.224493e-02
## [6] 4.422176e-18
##
## Values of teststatistic and critical values of test:
##
## test 10pct 5pct 1pct
## r <= 4 | 3.92 10.49 12.25 16.26
## r <= 3 | 14.04 22.76 25.32 30.45
## r <= 2 | 34.80 39.06 42.44 48.45
## r <= 1 | 82.00 59.14 62.99 70.05
## r = 0 | 169.27 83.20 87.31 96.58
##
## Eigenvectors, normalised to first column:
## (These are the cointegration relations)
##
## aaa_corp_bond_val.l2 market_yield_val.l2 unemploy_val.l2
## aaa_corp_bond_val.l2 1.0000000000 1.00000000 1.00000000
## market_yield_val.l2 -1.0581162059 4.87815265 2.15116153
## unemploy_val.l2 0.1276958944 1.44673308 0.70683847
## ind_prod_val.l2 0.0196228591 -0.65204040 0.54536008
## date.l2 0.0005503576 -0.01439076 -0.01392828
## trend.l2 -0.0187902000 0.53552580 0.45218521
## ind_prod_val.l2 date.l2 trend.l2
## aaa_corp_bond_val.l2 1.00000000 1.00000000 1.00000000
## market_yield_val.l2 -0.44346498 -2.32761730 -5.11868397
## unemploy_val.l2 -3.49978207 9.51337415 -0.81148946
## ind_prod_val.l2 -3.42700642 1.66831341 0.14915215
## date.l2 -0.07471344 0.02201836 -0.01020533
## trend.l2 2.66234422 -0.77160029 0.21053422
##
## Weights W:
## (This is the loading matrix)
##
## aaa_corp_bond_val.l2 market_yield_val.l2 unemploy_val.l2
## aaa_corp_bond_val.d -0.07502961 -0.011725149 -0.0084967278
## market_yield_val.d 0.01952314 -0.009364052 -0.0155255346
## unemploy_val.d 0.12374997 0.003499374 -0.0008210305
## ind_prod_val.d -0.42477885 0.025905937 -0.0261285170
## date.d -2.47893387 -0.478592344 0.2355113376
## ind_prod_val.l2 date.l2 trend.l2
## aaa_corp_bond_val.d -1.397907e-03 -6.569503e-04 -4.431480e-16
## market_yield_val.d -1.416843e-04 3.109457e-05 -5.875155e-16
## unemploy_val.d -3.273902e-04 -6.391295e-04 -9.382984e-17
## ind_prod_val.d 2.640552e-05 -2.792741e-05 5.772250e-16
## date.d 3.165729e-01 -2.198676e-01 4.759284e-14
For aaa’s vector prediction (same as bbb): \[ Y_t = \begin{pmatrix} aaa_t \\ market\_yield_t \\ ind\_prod_t \\ unemploy_t \end{pmatrix}. \]
Johansen’s test indicates one cointegration relation \((r = 1)\).
Thus, the VECM is: \[
\Delta Y_t
= \alpha \beta' Y_{t-1}
+ \sum_{i=1}^{p-1} \Gamma_i \Delta Y_{t-i}
+ u_t.
\]
The long-run equilibrium (cointegration vector) is: \[ \beta' Y_{t-1} = aaa_{t-1} - \beta_1 \cdot market\_yield_{t-1} - \beta_2 \cdot ind\_prod_{t-1} - \beta_3 \cdot unemploy_{t-1}. \]
Using Time Series cross-Validation : Setting cross parameters
start_year <- 1997
end_year <- 2025
initial_train_end <- 2005
all_pred <- data.frame(
date = as.Date(character()),
real = numeric(),
pred = numeric()
)
get_rank <- function(jo, cval_level = "5pct") {
tst <- jo@teststat
if (is.matrix(tst)) {
tst_vec <- as.numeric(tst[, 1])
} else {
tst_vec <- as.numeric(tst)
}
cv <- jo@cval
if (is.matrix(cv)) {
cn <- colnames(cv)
pick_col <- which(grepl("5", cn, ignore.case = TRUE)[1])
if (is.na(pick_col) || length(pick_col) == 0) pick_col <- 1
cval_vec <- as.numeric(cv[, pick_col])
} else {
cval_vec <- as.numeric(cv)
}
n <- min(length(tst_vec), length(cval_vec))
tst_vec <- tst_vec[1:n]
cval_vec <- cval_vec[1:n]
# rank = number of trace stats > critical value
r <- sum(tst_vec > cval_vec)
return(r)
}
library(vars)
library(urca)
library(lubridate)
library(ggplot2)
start_year <- 1997
end_year <- 2025
initial_train_end <- 2004
max_lag_try <- 8
df_aaa$date <- as.Date(df_aaa$date)
dates <- df_aaa$date
ts_aaa <- df_aaa[, c(
"aaa_corp_bond_val",
"market_yield_val",
"unemploy_val",
"ind_prod_val"
)]
get_rank <- function(jo, cval_level = "5pct") {
tst <- jo@teststat
tst_vec <- if (is.matrix(tst)) as.numeric(tst[, 1]) else as.numeric(tst)
cv <- jo@cval
if (is.matrix(cv)) {
cn <- colnames(cv)
pick_col <- which(grepl("5", cn, ignore.case = TRUE)[1])
if (is.na(pick_col) || length(pick_col) == 0) pick_col <- 1
cval_vec <- as.numeric(cv[, pick_col])
} else {
cval_vec <- as.numeric(cv)
}
n <- min(length(tst_vec), length(cval_vec))
tst_vec <- tst_vec[1:n]
cval_vec <- cval_vec[1:n]
r <- sum(tst_vec > cval_vec)
return(r)
}
all_pred <- data.frame(
date = as.Date(character()),
real = numeric(),
pred = numeric(),
lower = numeric(),
upper = numeric(),
method = character(),
stringsAsFactors = FALSE
)
suppress_warns <- FALSE
for (yr in initial_train_end:(end_year - 1)) {
train_idx <- which(year(dates) <= yr)
test_idx_all <- which(year(dates) == yr + 1)
if (length(test_idx_all) == 0) next
test_idx <- test_idx_all[1]
train_data <- ts_aaa[train_idx, , drop = FALSE]
test_data <- ts_aaa[test_idx, , drop = FALSE]
if (nrow(train_data) <= max_lag_try + 1) {
p_opt <- max(1, floor(nrow(train_data)/5))
} else {
vs <- VARselect(train_data, lag.max = max_lag_try, type = "trend")
if (!is.null(vs$selection) && "AIC(n)" %in% names(vs$selection)) {
p_opt <- as.integer(vs$selection["AIC(n)"])
} else if (!is.null(vs$criteria)) {
p_opt <- which.min(vs$criteria["AIC", ])
} else {
p_opt <- 2
}
if (is.na(p_opt) || p_opt < 1) p_opt <- 1
}
ca_call <- function() ca.jo(train_data, type = "trace", ecdet = "trend", K = p_opt)
jo_aaa <- if (suppress_warns) suppressWarnings(ca_call()) else ca_call()
r <- tryCatch(get_rank(jo_aaa), error = function(e) 0)
if (r == 0) {
var_model <- if (suppress_warns) suppressWarnings(VAR(train_data, p = p_opt, type = "const")) else VAR(train_data, p = p_opt, type = "trend")
fc <- predict(var_model, n.ahead = 12)
pred_val <- as.numeric(fc$fcst[[1]][1, "fcst"])
pred_lower <- as.numeric(fc$fcst[[1]][1, "lower"])
pred_upper <- as.numeric(fc$fcst[[1]][1, "upper"])
method_used <- "VAR"
} else {
# VECM -> VAR
var_model_from_jo <- if (suppress_warns) suppressWarnings(vec2var(jo_aaa, r = r)) else vec2var(jo_aaa, r = r)
pred_list <- predict(var_model_from_jo, n.ahead = 1)
first_var_name <- colnames(train_data)[1]
if (!is.null(pred_list$fcst) && first_var_name %in% names(pred_list$fcst)) {
pred_val <- as.numeric(pred_list$fcst[[first_var_name]][1, "fcst"])
pred_lower <- tryCatch(as.numeric(pred_list$fcst[[first_var_name]][1, "lower"]), error = function(e) NA)
pred_upper <- tryCatch(as.numeric(pred_list$fcst[[first_var_name]][1, "upper"]), error = function(e) NA)
} else {
pred_val <- as.numeric(pred_list$fcst[[1]][1, 1])
pred_lower <- NA
pred_upper <- NA
}
method_used <- paste0("VECM_r", r)
}
real_val <- as.numeric(test_data[[1]])
all_pred <- rbind(all_pred, data.frame(
date = dates[test_idx],
real = real_val,
pred = pred_val,
lower = pred_lower,
upper = pred_upper,
method = method_used,
stringsAsFactors = FALSE
))
cat("Year", yr, ": p_opt =", p_opt, ", r =", r, ", method =", method_used, "\n")
}
## Year 2004 : p_opt = 2 , r = 0 , method = VAR
## Year 2005 : p_opt = 2 , r = 0 , method = VAR
## Year 2006 : p_opt = 2 , r = 0 , method = VAR
## Year 2007 : p_opt = 2 , r = 0 , method = VAR
## Year 2008 : p_opt = 2 , r = 1 , method = VECM_r1
## Year 2009 : p_opt = 7 , r = 0 , method = VAR
## Year 2010 : p_opt = 8 , r = 0 , method = VAR
## Year 2011 : p_opt = 6 , r = 1 , method = VECM_r1
## Year 2012 : p_opt = 6 , r = 1 , method = VECM_r1
## Year 2013 : p_opt = 6 , r = 1 , method = VECM_r1
## Year 2014 : p_opt = 6 , r = 1 , method = VECM_r1
## Year 2015 : p_opt = 7 , r = 2 , method = VECM_r2
## Year 2016 : p_opt = 7 , r = 2 , method = VECM_r2
## Year 2017 : p_opt = 7 , r = 2 , method = VECM_r2
## Year 2018 : p_opt = 7 , r = 2 , method = VECM_r2
## Year 2021 : p_opt = 7 , r = 2 , method = VECM_r2
## Year 2022 : p_opt = 7 , r = 2 , method = VECM_r2
## Year 2023 : p_opt = 7 , r = 2 , method = VECM_r2
## Year 2024 : p_opt = 7 , r = 2 , method = VECM_r2
monthly:
dates <- df_aaa$date
ts_aaa <- df_aaa[, c("aaa_corp_bond_val", "market_yield_val", "unemploy_val", "ind_prod_val")]
initial_train_end_idx <- which(year(dates) == initial_train_end & month(dates) == 12)
if (length(initial_train_end_idx) == 0) {
stop("initial_train_end must match some date in df_aaa.")
}
all_pred <- data.frame(
date = as.Date(character()),
real = numeric(),
pred = numeric(),
lower = numeric(),
upper = numeric(),
method = character(),
stringsAsFactors = FALSE
)
for (i in seq(from = initial_train_end_idx, to = (nrow(ts_aaa) - 1))) {
train_idx <- 1:i
test_idx <- i + 1
train_data <- ts_aaa[train_idx, , drop = FALSE]
test_data <- ts_aaa[test_idx, , drop = FALSE]
# lag selection
if (nrow(train_data) <= max_lag_try + 1) {
p_opt <- max(1, floor(nrow(train_data)/5))
} else {
vs <- VARselect(train_data, lag.max = max_lag_try, type = "trend")
if (!is.null(vs$selection) && "AIC(n)" %in% names(vs$selection)) {
p_opt <- as.integer(vs$selection["AIC(n)"])
} else if (!is.null(vs$criteria)) {
p_opt <- which.min(vs$criteria["AIC", ])
} else {
p_opt <- 2
}
if (is.na(p_opt) || p_opt < 1) p_opt <- 1
}
# Johansen
ca_call <- function() ca.jo(train_data, type = "trace", ecdet = "trend", K = p_opt)
jo_aaa <- ca_call()
r <- tryCatch(get_rank(jo_aaa), error = function(e) 0)
# Model: VAR or VECM
if (r == 0) {
var_model <- VAR(train_data, p = p_opt, type = "trend")
fc <- predict(var_model, n.ahead = 1) # monthly forecast: only next month
pred_val <- as.numeric(fc$fcst[[1]][1, "fcst"])
pred_lower <- as.numeric(fc$fcst[[1]][1, "lower"])
pred_upper <- as.numeric(fc$fcst[[1]][1, "upper"])
method_used <- "VAR"
} else {
var_model_from_jo <- vec2var(jo_aaa, r = r)
pred_list <- predict(var_model_from_jo, n.ahead = 1)
first_var_name <- colnames(train_data)[1]
if (first_var_name %in% names(pred_list$fcst)) {
pred_val <- as.numeric(pred_list$fcst[[first_var_name]][1, "fcst"])
pred_lower <- tryCatch(as.numeric(pred_list$fcst[[first_var_name]][1, "lower"]), error = function(e) NA)
pred_upper <- tryCatch(as.numeric(pred_list$fcst[[first_var_name]][1, "upper"]), error = function(e) NA)
} else {
pred_val <- as.numeric(pred_list$fcst[[1]][1, 1])
pred_lower <- NA
pred_upper <- NA
}
method_used <- paste0("VECM_r", r)
}
real_val <- as.numeric(test_data[[1]])
all_pred <- rbind(all_pred, data.frame(
date = dates[test_idx],
real = real_val,
pred = pred_val,
lower = pred_lower,
upper = pred_upper,
method = method_used,
stringsAsFactors = FALSE
))
}
all_pred <- na.omit(all_pred)
rmse_val <- sqrt(mean((all_pred$real - all_pred$pred)^2, na.rm = TRUE))
cat("RMSE:", rmse_val, "\n")
## RMSE: 0.4178355
a2 <- ggplot(all_pred, aes(x = date)) +
geom_line(aes(y = real), size = 0.8, color = "red") +
geom_line(aes(y = pred), linetype = "dashed", size = 0.8, color = "blue") +
geom_ribbon(aes(ymin = lower, ymax = upper), fill = "lightblue", alpha = 0.2) +
labs(title = "Actual vs Predicted (rolling forecast)",
subtitle = paste0("RMSE = ", round(rmse_val, 6)),
y = "aaa_corp_bond_val") +
theme_minimal()
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
df_aaa_filtered <- df_aaa[df_aaa$date >= as.Date("2004-02-01"), ]
a1 <- ggplot(df_aaa_filtered, aes(x = date)) +
geom_line(aes(y = aaa_corp_bond_val), size = 0.8) +
theme_minimal()
a1/a2
all_pred$residual <- all_pred$real - all_pred$pred
b1 <- ggplot(all_pred, aes(x = pred, y = residual)) +
geom_point(alpha = 0.6) +
geom_hline(yintercept = 0, linetype = "dashed") +
labs(title = "Residual vs Predicted") +
theme_minimal()
b2 <- ggplot(all_pred, aes(x = residual)) +
geom_density(fill = "skyblue", alpha = 0.6) +
labs(title = "Residual KDE Density",
x = "Residual",
y = "Density") +
theme_minimal()
b1/b2