1. Import data and Description Data

rm(list = ls())
options(scipen = 999)
#Import Dataset
data <- read.csv("C:/Users/Nguyen Ba Trung/Downloads/data_CF_new.csv",
                 stringsAsFactors = FALSE)

psych::describe(data, fast = TRUE)
##       vars   n    mean      sd  median     min     max   range skew kurtosis
## wui      1 216    0.13    0.15    0.10    0.00    0.77    0.77 1.99     4.96
## wpui     2 216    0.03    0.06    0.00    0.00    0.34    0.34 2.52     6.82
## gepu     3 216  193.35   84.66  173.45   81.65  584.65  503.00 1.35     2.20
## cpi      4 216     NaN      NA      NA     Inf    -Inf    -Inf   NA       NA
## vn30     5 216     NaN      NA      NA     Inf    -Inf    -Inf   NA       NA
## gfc      6 216    0.11    0.31    0.00    0.00    1.00    1.00 2.46     4.06
## co19     7 216    0.17    0.37    0.00    0.00    1.00    1.00 1.78     1.16
## ex       8 216     NaN      NA      NA     Inf    -Inf    -Inf   NA       NA
## ir       9 216     NaN      NA      NA     Inf    -Inf    -Inf   NA       NA
## iip     10 216  150.48   32.55  146.74    0.00  267.16  267.16 0.69     3.83
## sp500   11 216 2749.46 1539.56 2258.85  735.09 6849.09 6114.00 0.85    -0.22
## month   12 216    6.50    3.46    6.50    1.00   12.00   11.00 0.00    -1.23
## year    13 216 2016.50    5.20 2016.50 2008.00 2025.00   17.00 0.00    -1.22
##           se
## wui     0.01
## wpui    0.00
## gepu    5.76
## cpi       NA
## vn30      NA
## gfc     0.02
## co19    0.03
## ex        NA
## ir        NA
## iip     2.21
## sp500 104.75
## month   0.24
## year    0.35
summary(data)
##       wui             wpui              gepu            cpi           
##  Min.   :0.000   Min.   :0.00000   Min.   : 81.65   Length:216        
##  1st Qu.:0.000   1st Qu.:0.00000   1st Qu.:127.64   Class :character  
##  Median :0.100   Median :0.00000   Median :173.45   Mode  :character  
##  Mean   :0.127   Mean   :0.02699   Mean   :193.35                     
##  3rd Qu.:0.190   3rd Qu.:0.00000   3rd Qu.:238.52                     
##  Max.   :0.770   Max.   :0.34000   Max.   :584.65                     
##      vn30                gfc              co19             ex           
##  Length:216         Min.   :0.0000   Min.   :0.0000   Length:216        
##  Class :character   1st Qu.:0.0000   1st Qu.:0.0000   Class :character  
##  Mode  :character   Median :0.0000   Median :0.0000   Mode  :character  
##                     Mean   :0.1111   Mean   :0.1667                     
##                     3rd Qu.:0.0000   3rd Qu.:0.0000                     
##                     Max.   :1.0000   Max.   :1.0000                     
##       ir                 iip            sp500            month      
##  Length:216         Min.   :  0.0   Min.   : 735.1   Min.   : 1.00  
##  Class :character   1st Qu.:129.9   1st Qu.:1405.0   1st Qu.: 3.75  
##  Mode  :character   Median :146.7   Median :2258.8   Median : 6.50  
##                     Mean   :150.5   Mean   :2749.5   Mean   : 6.50  
##                     3rd Qu.:167.0   3rd Qu.:3892.7   3rd Qu.: 9.25  
##                     Max.   :267.2   Max.   :6849.1   Max.   :12.00  
##       year     
##  Min.   :2008  
##  1st Qu.:2012  
##  Median :2016  
##  Mean   :2016  
##  3rd Qu.:2021  
##  Max.   :2025
library(plm)
library(glmnet)
library(MASS)
library(caret)
library(knitr)
library(broom)
library(ggplot2)
library(car)
library(scales)
library(tseries)
library(dplyr)
library(tidyr)
library(lmtest)
library(sandwich)
library(ARDL)
library(dLagM)
library(strucchange)
## 2. Transforming data
# As we observe in previous chart the variable's type were not settled in true form. Below we verify again to announce the form of each of them.
data_clean <- data %>%
  mutate(
    year  = as.character(year),
    month = as.character(month),
    cpi   = as.numeric(gsub("[^0-9.-]", "", cpi)),
    vn30  = as.numeric(gsub("[^0-9.-]", "", vn30)),
    gepu  = as.numeric(gsub("[^0-9.-]", "", gepu)),
    gfc   = as.numeric(gfc),
    co19  = as.numeric(co19),
    ex    = as.numeric(gsub("[^0-9.-]", "", ex)),
    ir    = as.numeric(gsub("[^0-9.-]", "", ir)),
    iip   = as.numeric(gsub("[^0-9.-]", "", iip)),
    sp500 = as.numeric(gsub("[^0-9.-]", "", sp500)),
    date  = as.Date(paste(year, month, "01", sep = "-"))
  ) %>%
  arrange(date)
# The database is raw, now we calculate several step to match the requirement of the test with lag. So that the change between two logarit variables is reasonable for ARDL test
data_desc <- data_clean %>%
  mutate(
    ret_vn      = 100*(log(vn30) - lag(log(vn30))),
    d_gepu     = gepu - lag(gepu),
    d_ex        = 100*(log(ex) - lag(log(ex))),
    d_ir        = ir - lag(ir),
    d_iip       = 100*(log(iip) - lag(log(iip))),
    inf         = 100*(log(cpi) - lag(log(cpi))),
    ret_sp500   = 100*(log(sp500) - lag(log(sp500)))
  ) %>% drop_na()
#Line chart illustrating the change monthly between the indexes of GEPU and VN30 scaled down 10 times (the index of GEPU has 2 digits while 2 with VN30). So that we scaled down the VN30 index to fit and visualize them on chart.
#Global EPU vs VN30_index chart
p1 <- ggplot(data_clean, aes(x = date)) +
  geom_line(aes(y = gepu, colour = "Global EPU"), linewidth = 0.7) +
  geom_line(aes(y = vn30/10, colour = "VN30 (÷10)"), linewidth = 0.7) +
  scale_x_date(date_breaks = "1 year", date_labels = "%Y") +
  labs(title = "Global GEPU và VN30-Index",
       y = "GEPU & VN30/10",
       colour = NULL) +
  theme_minimal() +
  theme(legend.position = "top")
p1

#Line chart of exchange rate, refinancing rate, Industrial Index Prodution
df_ts <- data_clean %>%
  select(date, ex, ir, iip) %>%
  pivot_longer(
    -date,
    names_to  = "variable",
    values_to = "value"
  )

p2 <- ggplot(df_ts, aes(date, value)) +
  geom_line(linewidth = 0.7) +
  facet_wrap(
    ~variable,
    scales = "free_y",
    ncol = 1,
    labeller = as_labeller(c(
      ex  = "Tỷ giá VND/USD",
      iip = "Chỉ số sản xuất công nghiệp (IIP)",
      ir  = "Lãi suất tái cấp vốn (%)"
    ))
  ) +
  scale_x_date(date_breaks = "1 year", date_labels = "%Y") +
  labs(
    title = "Diễn biến Tỷ giá – Chỉ số SX Công nghiệp – Lãi suất tái cấp vốn (tháng)",
    x = NULL,
    y = NULL
  ) +
  theme_minimal(base_size = 12)
p2

#Distribution Chart of VN30 return
ggplot(data_desc, aes(ret_vn)) +
  geom_histogram(aes(y = ..density..), bins = 30, alpha = 0.7) +
  geom_density(linewidth = 0.9) +
  labs(title = "Phân bố lợi suất VN30 (%)",
       x = "Return (%)", y = "Density") +
  theme_minimal()

#Correlation Matrix among Variables on chart
library(ggcorrplot)
cor_matrix <- data_clean %>%
  select(gepu, cpi, vn30, ex, ir, iip, sp500) %>%
  cor(use = "complete.obs")
ggcorrplot(cor_matrix,
           lab = TRUE,
           lab_size = 2.5,
           title = "Ma trận tương quan")

#VIF
initial_model_for_vif <- lm(vn30 ~ gepu + cpi + ex + ir + iip + sp500,
                            data = data_clean)
kable(vif(initial_model_for_vif),
      caption = "Hệ số VIF")
Hệ số VIF
x
gepu 2.403540
cpi 2.323131
ex 18.847623
ir 2.846974
iip 1.149279
sp500 21.752047
#ARDL test
#From this code we looking for the best model to test the data by max order. We can observe that the best model for this test will be 2,5,0,0,0
library(ARDL)
library(dLagM)
congthuc <- vn30 ~ gepu + cpi + ir + iip
ardl_auto <- auto_ardl(congthuc,
                       data = data_clean,
                       max_order = c(5,5,5,5,5),
                       selection = "BIC")
summary(ardl_auto)
##            Length Class      Mode   
## best_model 19     dynlm      list   
## best_order  5     -none-     numeric
## top_orders  6     data.frame list
ardl_auto
## $best_model
## 
## Time series regression with "ts" data:
## Start = 6, End = 216
## 
## Call:
## dynlm::dynlm(formula = full_formula, data = data, start = start, 
##     end = end)
## 
## Coefficients:
## (Intercept)   L(vn30, 1)   L(vn30, 2)         gepu   L(gepu, 1)   L(gepu, 2)  
##   91.203253     1.140955    -0.187490    -0.213782     0.101009     0.102232  
##  L(gepu, 3)   L(gepu, 4)   L(gepu, 5)          cpi           ir          iip  
##    0.020873    -0.014155     0.125607    -0.003366    -3.779068    -0.087158  
## 
## 
## $best_order
## vn30 gepu  cpi   ir  iip 
##    2    5    0    0    0 
## 
## $top_orders
##    vn30 gepu cpi ir iip      BIC
## 1     2    5   0  0   0 2227.125
## 2     2    4   0  0   0 2235.642
## 3     3    5   0  3   0 2238.060
## 4     2    4   0  0   1 2240.775
## 5     3    5   0  4   0 2241.410
## 6     3    5   0  3   1 2243.353
## 7     3    5   0  3   2 2243.390
## 8     2    4   0  1   1 2245.274
## 9     1    3   0  0   0 2246.122
## 10    3    5   0  4   1 2246.713
## 11    3    5   1  3   2 2246.844
## 12    3    5   1  3   1 2246.883
## 13    3    5   0  4   2 2246.938
## 14    2    4   1  1   1 2248.748
## 15    3    5   1  4   2 2250.497
## 16    4    5   4  3   0 2251.034
## 17    3    5   2  3   2 2252.192
## 18    2    3   1  1   1 2253.735
## 19    2    3   1  1   2 2254.673
## 20    4    5   4  4   0 2254.839
ardl <- ardl(congthuc,
             data = data_clean,
             order = c(2,5,0,0,0))
ardl
## 
## Time series regression with "ts" data:
## Start = 6, End = 216
## 
## Call:
## dynlm::dynlm(formula = full_formula, data = data, start = start, 
##     end = end)
## 
## Coefficients:
## (Intercept)   L(vn30, 1)   L(vn30, 2)         gepu   L(gepu, 1)   L(gepu, 2)  
##   91.203253     1.140955    -0.187490    -0.213782     0.101009     0.102232  
##  L(gepu, 3)   L(gepu, 4)   L(gepu, 5)          cpi           ir          iip  
##    0.020873    -0.014155     0.125607    -0.003366    -3.779068    -0.087158
##Unit root test
# Moving to this part. The requirement for ARDL test will be variables from database are mixed with I(0) and I(1) but no I(2). We using both methodology of Dickey Fuller (1981) and Phillips-Peron 
#In vn30 we reject Ho of ADF while rejecting it from PP (test for I(0)). Normally we can used the result from PP to make conclusion, however, we moved to next part of testing I(1) for both method to make guarantee that the vn30 variable is not I(2), and the result of both method rejected H0, we accept the test that vn30 is I(0). => Accepted vn30 is I(0)
adf.test(data_clean$vn30)
## 
##  Augmented Dickey-Fuller Test
## 
## data:  data_clean$vn30
## Dickey-Fuller = -2.7356, Lag order = 5, p-value = 0.2674
## alternative hypothesis: stationary
pp.test(data_clean$vn30)
## 
##  Phillips-Perron Unit Root Test
## 
## data:  data_clean$vn30
## Dickey-Fuller Z(alpha) = -21.868, Truncation lag parameter = 4, p-value
## = 0.04421
## alternative hypothesis: stationary
adf.test(diff(data_clean$cpi, na.rm = TRUE))
## 
##  Augmented Dickey-Fuller Test
## 
## data:  diff(data_clean$cpi, na.rm = TRUE)
## Dickey-Fuller = -6.1581, Lag order = 5, p-value = 0.01
## alternative hypothesis: stationary
pp.test(diff(data_clean$cpi, na.rm = TRUE))
## 
##  Phillips-Perron Unit Root Test
## 
## data:  diff(data_clean$cpi, na.rm = TRUE)
## Dickey-Fuller Z(alpha) = -213.53, Truncation lag parameter = 4, p-value
## = 0.01
## alternative hypothesis: stationary
#The same with GEPU => Accepted gepu is I(0)
adf.test(data_clean$gepu)
## 
##  Augmented Dickey-Fuller Test
## 
## data:  data_clean$gepu
## Dickey-Fuller = -3.1132, Lag order = 5, p-value = 0.109
## alternative hypothesis: stationary
pp.test(data_clean$gepu)
## 
##  Phillips-Perron Unit Root Test
## 
## data:  data_clean$gepu
## Dickey-Fuller Z(alpha) = -56.023, Truncation lag parameter = 4, p-value
## = 0.01
## alternative hypothesis: stationary
adf.test(diff(data_clean$gepu, na.rm = TRUE))
## 
##  Augmented Dickey-Fuller Test
## 
## data:  diff(data_clean$gepu, na.rm = TRUE)
## Dickey-Fuller = -7.5249, Lag order = 5, p-value = 0.01
## alternative hypothesis: stationary
pp.test(diff(data_clean$gepu, na.rm = TRUE))
## 
##  Phillips-Perron Unit Root Test
## 
## data:  diff(data_clean$gepu, na.rm = TRUE)
## Dickey-Fuller Z(alpha) = -229.38, Truncation lag parameter = 4, p-value
## = 0.01
## alternative hypothesis: stationary
# With test of CPI accepting H0 in both method, so that we test I(1) for both method to guarantee the cpi is only I(1) not I(2)
adf.test(data_clean$cpi)
## 
##  Augmented Dickey-Fuller Test
## 
## data:  data_clean$cpi
## Dickey-Fuller = -0.73009, Lag order = 5, p-value = 0.9665
## alternative hypothesis: stationary
pp.test(data_clean$cpi)
## 
##  Phillips-Perron Unit Root Test
## 
## data:  data_clean$cpi
## Dickey-Fuller Z(alpha) = -4.503, Truncation lag parameter = 4, p-value
## = 0.8573
## alternative hypothesis: stationary
adf.test(diff(data_clean$cpi, na.rm = TRUE))
## 
##  Augmented Dickey-Fuller Test
## 
## data:  diff(data_clean$cpi, na.rm = TRUE)
## Dickey-Fuller = -6.1581, Lag order = 5, p-value = 0.01
## alternative hypothesis: stationary
pp.test(diff(data_clean$cpi, na.rm = TRUE))
## 
##  Phillips-Perron Unit Root Test
## 
## data:  diff(data_clean$cpi, na.rm = TRUE)
## Dickey-Fuller Z(alpha) = -213.53, Truncation lag parameter = 4, p-value
## = 0.01
## alternative hypothesis: stationary
# IR test rejected H0 in both method ==> I(0)
adf.test(data_clean$ir)
## 
##  Augmented Dickey-Fuller Test
## 
## data:  data_clean$ir
## Dickey-Fuller = -4.1226, Lag order = 5, p-value = 0.01
## alternative hypothesis: stationary
pp.test(data_clean$ir)
## 
##  Phillips-Perron Unit Root Test
## 
## data:  data_clean$ir
## Dickey-Fuller Z(alpha) = -23.217, Truncation lag parameter = 4, p-value
## = 0.03316
## alternative hypothesis: stationary
#IIP test rejected H0 in both method ==> I(0)
adf.test(data_clean$iip)
## 
##  Augmented Dickey-Fuller Test
## 
## data:  data_clean$iip
## Dickey-Fuller = -3.609, Lag order = 5, p-value = 0.03353
## alternative hypothesis: stationary
pp.test(data_clean$iip)
## 
##  Phillips-Perron Unit Root Test
## 
## data:  data_clean$iip
## Dickey-Fuller Z(alpha) = -50.231, Truncation lag parameter = 4, p-value
## = 0.01
## alternative hypothesis: stationary
#Bounds F and T test.
# The result from f-test figured out that F value = 3,6878  > upper I(0) at 10% sig.level whereas in t-test the F value fell between the range of I(0) and I(1) so that it had no statistic meaning
bounds_f_test(ardl, case = 3)
## 
##  Bounds F-test (Wald) for no cointegration
## 
## data:  d(vn30) ~ L(vn30, 1) + L(gepu, 1) + cpi + ir + iip + d(L(vn30,     1)) + d(gepu) + d(L(gepu, 1)) + d(L(gepu, 2)) + d(L(gepu,     3)) + d(L(gepu, 4))
## F = 3.6878, p-value = 0.08135
## alternative hypothesis: Possible cointegration
## null values:
##    k    T 
##    4 1000
bounds_t_test(ardl, case = 3)
## 
##  Bounds t-test for no cointegration
## 
## data:  d(vn30) ~ L(vn30, 1) + L(gepu, 1) + cpi + ir + iip + d(L(vn30,     1)) + d(gepu) + d(L(gepu, 1)) + d(L(gepu, 2)) + d(L(gepu,     3)) + d(L(gepu, 4))
## t = -3.0624, p-value = 0.2707
## alternative hypothesis: Possible cointegration
## null values:
##    k    T 
##    4 1000
#ECM test
uecm <- uecm(ardl)
test_uecm <- coint_eq(ardl, case = 3)
uecm
## 
## Time series regression with "ts" data:
## Start = 6, End = 216
## 
## Call:
## dynlm::dynlm(formula = full_formula, data = data, start = start, 
##     end = end)
## 
## Coefficients:
##   (Intercept)     L(vn30, 1)     L(gepu, 1)            cpi             ir  
##     91.203253      -0.046535       0.121783      -0.003366      -3.779068  
##           iip  d(L(vn30, 1))        d(gepu)  d(L(gepu, 1))  d(L(gepu, 2))  
##     -0.087158       0.187490      -0.213782      -0.234556      -0.132325  
## d(L(gepu, 3))  d(L(gepu, 4))  
##     -0.111452      -0.125607
test_uecm
## Time Series:
## Start = 1 
## End = 216 
## Frequency = 1 
##   [1] -1167.58325 -1307.27328 -1258.00601 -1336.45554 -1847.43054 -1968.92040
##   [7] -1919.42225 -1942.31326 -1770.17596 -1555.12263 -1651.59590 -1366.75805
##  [13] -1336.74981 -1234.82446 -1272.38717 -1277.04150 -1292.77068 -1219.24371
##  [19] -1260.17022 -1281.22861 -1259.13834 -1315.30411 -1292.37163 -1398.49786
##  [25] -1352.19012 -1300.55016 -1346.15237 -1369.94827 -1294.03870 -1296.21811
##  [31] -1291.89026 -1356.95366 -1341.99248 -1370.74042 -1451.85613 -1415.00122
##  [37] -1349.84646 -1566.24479 -1584.39734 -1686.15339 -1842.43927 -1741.15966
##  [43] -1660.33492 -1502.46121 -1544.64749 -1697.42497 -1650.66828 -1674.18562
##  [49] -1803.29813 -1866.52414 -1805.20738 -1741.81892 -1598.09656 -1440.70788
##  [55] -1458.86967 -1559.39747 -1464.75243 -1484.45629 -1472.72418 -1375.93582
##  [61] -1253.47341 -1307.47113 -1212.77277 -1228.01887 -1232.21533 -1170.89942
##  [67] -1232.32939 -1190.69032 -1148.57575 -1091.70050 -1249.25160 -1208.00452
##  [73] -1203.25802 -1225.14168 -1178.53977 -1180.77012 -1201.06008 -1238.56886
##  [79] -1227.99979 -1218.25212 -1158.18504 -1207.39158 -1173.99718 -1224.76538
##  [85] -1204.46336 -1169.72240 -1251.23594 -1293.91100 -1297.71511 -1254.20406
##  [91] -1243.10468 -1261.13355 -1165.03085 -1280.28696 -1304.14101 -1290.28590
##  [97] -1242.88965 -1231.36186 -1256.62167 -1317.17367 -1377.38419 -1109.87704
## [103] -1195.54740 -1362.94276 -1379.96890 -1429.76013 -1185.18091 -1264.46119
## [109]  -823.57694  -946.77355  -835.30607 -1019.90381 -1114.92503 -1028.76532
## [115] -1019.28231 -1115.79751 -1054.89782 -1098.69796 -1055.93158 -1092.84178
## [121] -1009.98155 -1092.70416 -1002.15505 -1025.17731 -1038.82468 -1054.29356
## [127]  -866.86549 -1032.76761  -970.74863  -924.25575  -805.09648  -929.64853
## [133]  -840.80831  -970.10543  -818.61187  -944.82328  -866.58268  -561.31120
## [139]  -813.23279  -741.12617  -835.70336  -954.41023  -813.59851  -856.61854
## [145]  -928.53279  -932.35655  -566.56539  -459.75622  -223.33921  -505.06332
## [151]  -559.62830  -577.94937  -485.33157  -530.71652  -247.76090  -224.24740
## [157]  -595.77678  -759.51366  -735.42992  -775.16882  -851.42044  -844.59816
## [163]  -821.90616  -757.71325  -733.21282  -818.21354  -735.55946  -656.35725
## [169]  -736.61781  -832.14418  -562.39957  -612.35090  -617.39517  -681.78840
## [175]  -575.90188  -742.78856  -789.05270  -836.72044  -676.98578  -857.50260
## [181]  -854.23227  -772.97000  -591.33496  -733.34958  -843.27089  -742.11120
## [187]  -842.30148  -893.51689  -823.31857  -871.53583  -815.76660  -739.69955
## [193]  -694.27224  -860.27906  -881.38885  -919.55325  -897.46908  -945.84076
## [199]  -768.91407  -880.26856  -846.63718  -849.87086  -677.29198  -537.78910
## [205]   146.68320   180.54298   507.19360   828.09243   549.88016   216.18385
## [211]   314.87739   141.96661    53.51152   326.80032   299.37093   291.30345
#Diagnostic Test after ARDL
bgtest(ardl)
## 
##  Breusch-Godfrey test for serial correlation of order up to 1
## 
## data:  ardl
## LM test = 0.21445, df = 1, p-value = 0.6433
bptest(ardl)
## 
##  studentized Breusch-Pagan test
## 
## data:  ardl
## BP = 20.893, df = 11, p-value = 0.0345
coeftest(
  ardl,
  vcov = NeweyWest(ardl, lag = 4, prewhite = FALSE)
)
## 
## t test of coefficients:
## 
##               Estimate Std. Error t value              Pr(>|t|)    
## (Intercept) 91.2032535 31.7046473  2.8767              0.004457 ** 
## L(vn30, 1)   1.1409551  0.0657043 17.3650 < 0.00000000000000022 ***
## L(vn30, 2)  -0.1874900  0.0638975 -2.9342              0.003737 ** 
## gepu        -0.2137816  0.0733693 -2.9138              0.003979 ** 
## L(gepu, 1)   0.1010085  0.0753664  1.3402              0.181698    
## L(gepu, 2)   0.1022315  0.0830309  1.2312              0.219684    
## L(gepu, 3)   0.0208729  0.1135010  0.1839              0.854278    
## L(gepu, 4)  -0.0141550  0.0771226 -0.1835              0.854562    
## L(gepu, 5)   0.1256067  0.0702001  1.7893              0.075093 .  
## cpi         -0.0033660  0.0013355 -2.5204              0.012508 *  
## ir          -3.7790681  1.1660288 -3.2410              0.001396 ** 
## iip         -0.0871580  0.0636214 -1.3699              0.172247    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#Stability Test
library(strucchange)
resid_ardl <- residuals(ardl)
plot(efp(resid_ardl ~ 1, type = "Rec-CUSUM"))

plot(efp(resid_ardl ~ 1, type = "Rec-MOSUM"))