1. Import data and Description Data
rm(list = ls())
options(scipen = 999)
#Import Dataset
data <- read.csv("C:/Users/Nguyen Ba Trung/Downloads/data_CF_new.csv",
stringsAsFactors = FALSE)
psych::describe(data, fast = TRUE)
## vars n mean sd median min max range skew kurtosis
## wui 1 216 0.13 0.15 0.10 0.00 0.77 0.77 1.99 4.96
## wpui 2 216 0.03 0.06 0.00 0.00 0.34 0.34 2.52 6.82
## gepu 3 216 193.35 84.66 173.45 81.65 584.65 503.00 1.35 2.20
## cpi 4 216 NaN NA NA Inf -Inf -Inf NA NA
## vn30 5 216 NaN NA NA Inf -Inf -Inf NA NA
## gfc 6 216 0.11 0.31 0.00 0.00 1.00 1.00 2.46 4.06
## co19 7 216 0.17 0.37 0.00 0.00 1.00 1.00 1.78 1.16
## ex 8 216 NaN NA NA Inf -Inf -Inf NA NA
## ir 9 216 NaN NA NA Inf -Inf -Inf NA NA
## iip 10 216 150.48 32.55 146.74 0.00 267.16 267.16 0.69 3.83
## sp500 11 216 2749.46 1539.56 2258.85 735.09 6849.09 6114.00 0.85 -0.22
## month 12 216 6.50 3.46 6.50 1.00 12.00 11.00 0.00 -1.23
## year 13 216 2016.50 5.20 2016.50 2008.00 2025.00 17.00 0.00 -1.22
## se
## wui 0.01
## wpui 0.00
## gepu 5.76
## cpi NA
## vn30 NA
## gfc 0.02
## co19 0.03
## ex NA
## ir NA
## iip 2.21
## sp500 104.75
## month 0.24
## year 0.35
summary(data)
## wui wpui gepu cpi
## Min. :0.000 Min. :0.00000 Min. : 81.65 Length:216
## 1st Qu.:0.000 1st Qu.:0.00000 1st Qu.:127.64 Class :character
## Median :0.100 Median :0.00000 Median :173.45 Mode :character
## Mean :0.127 Mean :0.02699 Mean :193.35
## 3rd Qu.:0.190 3rd Qu.:0.00000 3rd Qu.:238.52
## Max. :0.770 Max. :0.34000 Max. :584.65
## vn30 gfc co19 ex
## Length:216 Min. :0.0000 Min. :0.0000 Length:216
## Class :character 1st Qu.:0.0000 1st Qu.:0.0000 Class :character
## Mode :character Median :0.0000 Median :0.0000 Mode :character
## Mean :0.1111 Mean :0.1667
## 3rd Qu.:0.0000 3rd Qu.:0.0000
## Max. :1.0000 Max. :1.0000
## ir iip sp500 month
## Length:216 Min. : 0.0 Min. : 735.1 Min. : 1.00
## Class :character 1st Qu.:129.9 1st Qu.:1405.0 1st Qu.: 3.75
## Mode :character Median :146.7 Median :2258.8 Median : 6.50
## Mean :150.5 Mean :2749.5 Mean : 6.50
## 3rd Qu.:167.0 3rd Qu.:3892.7 3rd Qu.: 9.25
## Max. :267.2 Max. :6849.1 Max. :12.00
## year
## Min. :2008
## 1st Qu.:2012
## Median :2016
## Mean :2016
## 3rd Qu.:2021
## Max. :2025
library(plm)
library(glmnet)
library(MASS)
library(caret)
library(knitr)
library(broom)
library(ggplot2)
library(car)
library(scales)
library(tseries)
library(dplyr)
library(tidyr)
library(lmtest)
library(sandwich)
library(ARDL)
library(dLagM)
library(strucchange)
## 2. Transforming data
# As we observe in previous chart the variable's type were not settled in true form. Below we verify again to announce the form of each of them.
data_clean <- data %>%
mutate(
year = as.character(year),
month = as.character(month),
cpi = as.numeric(gsub("[^0-9.-]", "", cpi)),
vn30 = as.numeric(gsub("[^0-9.-]", "", vn30)),
gepu = as.numeric(gsub("[^0-9.-]", "", gepu)),
gfc = as.numeric(gfc),
co19 = as.numeric(co19),
ex = as.numeric(gsub("[^0-9.-]", "", ex)),
ir = as.numeric(gsub("[^0-9.-]", "", ir)),
iip = as.numeric(gsub("[^0-9.-]", "", iip)),
sp500 = as.numeric(gsub("[^0-9.-]", "", sp500)),
date = as.Date(paste(year, month, "01", sep = "-"))
) %>%
arrange(date)
# The database is raw, now we calculate several step to match the requirement of the test with lag. So that the change between two logarit variables is reasonable for ARDL test
data_desc <- data_clean %>%
mutate(
ret_vn = 100*(log(vn30) - lag(log(vn30))),
d_gepu = gepu - lag(gepu),
d_ex = 100*(log(ex) - lag(log(ex))),
d_ir = ir - lag(ir),
d_iip = 100*(log(iip) - lag(log(iip))),
inf = 100*(log(cpi) - lag(log(cpi))),
ret_sp500 = 100*(log(sp500) - lag(log(sp500)))
) %>% drop_na()
#Line chart illustrating the change monthly between the indexes of GEPU and VN30 scaled down 10 times (the index of GEPU has 2 digits while 2 with VN30). So that we scaled down the VN30 index to fit and visualize them on chart.
#Global EPU vs VN30_index chart
p1 <- ggplot(data_clean, aes(x = date)) +
geom_line(aes(y = gepu, colour = "Global EPU"), linewidth = 0.7) +
geom_line(aes(y = vn30/10, colour = "VN30 (÷10)"), linewidth = 0.7) +
scale_x_date(date_breaks = "1 year", date_labels = "%Y") +
labs(title = "Global GEPU và VN30-Index",
y = "GEPU & VN30/10",
colour = NULL) +
theme_minimal() +
theme(legend.position = "top")
p1

#Line chart of exchange rate, refinancing rate, Industrial Index Prodution
df_ts <- data_clean %>%
select(date, ex, ir, iip) %>%
pivot_longer(
-date,
names_to = "variable",
values_to = "value"
)
p2 <- ggplot(df_ts, aes(date, value)) +
geom_line(linewidth = 0.7) +
facet_wrap(
~variable,
scales = "free_y",
ncol = 1,
labeller = as_labeller(c(
ex = "Tỷ giá VND/USD",
iip = "Chỉ số sản xuất công nghiệp (IIP)",
ir = "Lãi suất tái cấp vốn (%)"
))
) +
scale_x_date(date_breaks = "1 year", date_labels = "%Y") +
labs(
title = "Diễn biến Tỷ giá – Chỉ số SX Công nghiệp – Lãi suất tái cấp vốn (tháng)",
x = NULL,
y = NULL
) +
theme_minimal(base_size = 12)
p2

#Distribution Chart of VN30 return
ggplot(data_desc, aes(ret_vn)) +
geom_histogram(aes(y = ..density..), bins = 30, alpha = 0.7) +
geom_density(linewidth = 0.9) +
labs(title = "Phân bố lợi suất VN30 (%)",
x = "Return (%)", y = "Density") +
theme_minimal()

#Correlation Matrix among Variables on chart
library(ggcorrplot)
cor_matrix <- data_clean %>%
select(gepu, cpi, vn30, ex, ir, iip, sp500) %>%
cor(use = "complete.obs")
ggcorrplot(cor_matrix,
lab = TRUE,
lab_size = 2.5,
title = "Ma trận tương quan")

#VIF
initial_model_for_vif <- lm(vn30 ~ gepu + cpi + ex + ir + iip + sp500,
data = data_clean)
kable(vif(initial_model_for_vif),
caption = "Hệ số VIF")
Hệ số VIF
| gepu |
2.403540 |
| cpi |
2.323131 |
| ex |
18.847623 |
| ir |
2.846974 |
| iip |
1.149279 |
| sp500 |
21.752047 |
#ARDL test
#From this code we looking for the best model to test the data by max order. We can observe that the best model for this test will be 2,5,0,0,0
library(ARDL)
library(dLagM)
congthuc <- vn30 ~ gepu + cpi + ir + iip
ardl_auto <- auto_ardl(congthuc,
data = data_clean,
max_order = c(5,5,5,5,5),
selection = "BIC")
summary(ardl_auto)
## Length Class Mode
## best_model 19 dynlm list
## best_order 5 -none- numeric
## top_orders 6 data.frame list
ardl_auto
## $best_model
##
## Time series regression with "ts" data:
## Start = 6, End = 216
##
## Call:
## dynlm::dynlm(formula = full_formula, data = data, start = start,
## end = end)
##
## Coefficients:
## (Intercept) L(vn30, 1) L(vn30, 2) gepu L(gepu, 1) L(gepu, 2)
## 91.203253 1.140955 -0.187490 -0.213782 0.101009 0.102232
## L(gepu, 3) L(gepu, 4) L(gepu, 5) cpi ir iip
## 0.020873 -0.014155 0.125607 -0.003366 -3.779068 -0.087158
##
##
## $best_order
## vn30 gepu cpi ir iip
## 2 5 0 0 0
##
## $top_orders
## vn30 gepu cpi ir iip BIC
## 1 2 5 0 0 0 2227.125
## 2 2 4 0 0 0 2235.642
## 3 3 5 0 3 0 2238.060
## 4 2 4 0 0 1 2240.775
## 5 3 5 0 4 0 2241.410
## 6 3 5 0 3 1 2243.353
## 7 3 5 0 3 2 2243.390
## 8 2 4 0 1 1 2245.274
## 9 1 3 0 0 0 2246.122
## 10 3 5 0 4 1 2246.713
## 11 3 5 1 3 2 2246.844
## 12 3 5 1 3 1 2246.883
## 13 3 5 0 4 2 2246.938
## 14 2 4 1 1 1 2248.748
## 15 3 5 1 4 2 2250.497
## 16 4 5 4 3 0 2251.034
## 17 3 5 2 3 2 2252.192
## 18 2 3 1 1 1 2253.735
## 19 2 3 1 1 2 2254.673
## 20 4 5 4 4 0 2254.839
ardl <- ardl(congthuc,
data = data_clean,
order = c(2,5,0,0,0))
ardl
##
## Time series regression with "ts" data:
## Start = 6, End = 216
##
## Call:
## dynlm::dynlm(formula = full_formula, data = data, start = start,
## end = end)
##
## Coefficients:
## (Intercept) L(vn30, 1) L(vn30, 2) gepu L(gepu, 1) L(gepu, 2)
## 91.203253 1.140955 -0.187490 -0.213782 0.101009 0.102232
## L(gepu, 3) L(gepu, 4) L(gepu, 5) cpi ir iip
## 0.020873 -0.014155 0.125607 -0.003366 -3.779068 -0.087158
##Unit root test
# Moving to this part. The requirement for ARDL test will be variables from database are mixed with I(0) and I(1) but no I(2). We using both methodology of Dickey Fuller (1981) and Phillips-Peron
#In vn30 we reject Ho of ADF while rejecting it from PP (test for I(0)). Normally we can used the result from PP to make conclusion, however, we moved to next part of testing I(1) for both method to make guarantee that the vn30 variable is not I(2), and the result of both method rejected H0, we accept the test that vn30 is I(0). => Accepted vn30 is I(0)
adf.test(data_clean$vn30)
##
## Augmented Dickey-Fuller Test
##
## data: data_clean$vn30
## Dickey-Fuller = -2.7356, Lag order = 5, p-value = 0.2674
## alternative hypothesis: stationary
pp.test(data_clean$vn30)
##
## Phillips-Perron Unit Root Test
##
## data: data_clean$vn30
## Dickey-Fuller Z(alpha) = -21.868, Truncation lag parameter = 4, p-value
## = 0.04421
## alternative hypothesis: stationary
adf.test(diff(data_clean$cpi, na.rm = TRUE))
##
## Augmented Dickey-Fuller Test
##
## data: diff(data_clean$cpi, na.rm = TRUE)
## Dickey-Fuller = -6.1581, Lag order = 5, p-value = 0.01
## alternative hypothesis: stationary
pp.test(diff(data_clean$cpi, na.rm = TRUE))
##
## Phillips-Perron Unit Root Test
##
## data: diff(data_clean$cpi, na.rm = TRUE)
## Dickey-Fuller Z(alpha) = -213.53, Truncation lag parameter = 4, p-value
## = 0.01
## alternative hypothesis: stationary
#The same with GEPU => Accepted gepu is I(0)
adf.test(data_clean$gepu)
##
## Augmented Dickey-Fuller Test
##
## data: data_clean$gepu
## Dickey-Fuller = -3.1132, Lag order = 5, p-value = 0.109
## alternative hypothesis: stationary
pp.test(data_clean$gepu)
##
## Phillips-Perron Unit Root Test
##
## data: data_clean$gepu
## Dickey-Fuller Z(alpha) = -56.023, Truncation lag parameter = 4, p-value
## = 0.01
## alternative hypothesis: stationary
adf.test(diff(data_clean$gepu, na.rm = TRUE))
##
## Augmented Dickey-Fuller Test
##
## data: diff(data_clean$gepu, na.rm = TRUE)
## Dickey-Fuller = -7.5249, Lag order = 5, p-value = 0.01
## alternative hypothesis: stationary
pp.test(diff(data_clean$gepu, na.rm = TRUE))
##
## Phillips-Perron Unit Root Test
##
## data: diff(data_clean$gepu, na.rm = TRUE)
## Dickey-Fuller Z(alpha) = -229.38, Truncation lag parameter = 4, p-value
## = 0.01
## alternative hypothesis: stationary
# With test of CPI accepting H0 in both method, so that we test I(1) for both method to guarantee the cpi is only I(1) not I(2)
adf.test(data_clean$cpi)
##
## Augmented Dickey-Fuller Test
##
## data: data_clean$cpi
## Dickey-Fuller = -0.73009, Lag order = 5, p-value = 0.9665
## alternative hypothesis: stationary
pp.test(data_clean$cpi)
##
## Phillips-Perron Unit Root Test
##
## data: data_clean$cpi
## Dickey-Fuller Z(alpha) = -4.503, Truncation lag parameter = 4, p-value
## = 0.8573
## alternative hypothesis: stationary
adf.test(diff(data_clean$cpi, na.rm = TRUE))
##
## Augmented Dickey-Fuller Test
##
## data: diff(data_clean$cpi, na.rm = TRUE)
## Dickey-Fuller = -6.1581, Lag order = 5, p-value = 0.01
## alternative hypothesis: stationary
pp.test(diff(data_clean$cpi, na.rm = TRUE))
##
## Phillips-Perron Unit Root Test
##
## data: diff(data_clean$cpi, na.rm = TRUE)
## Dickey-Fuller Z(alpha) = -213.53, Truncation lag parameter = 4, p-value
## = 0.01
## alternative hypothesis: stationary
# IR test rejected H0 in both method ==> I(0)
adf.test(data_clean$ir)
##
## Augmented Dickey-Fuller Test
##
## data: data_clean$ir
## Dickey-Fuller = -4.1226, Lag order = 5, p-value = 0.01
## alternative hypothesis: stationary
pp.test(data_clean$ir)
##
## Phillips-Perron Unit Root Test
##
## data: data_clean$ir
## Dickey-Fuller Z(alpha) = -23.217, Truncation lag parameter = 4, p-value
## = 0.03316
## alternative hypothesis: stationary
#IIP test rejected H0 in both method ==> I(0)
adf.test(data_clean$iip)
##
## Augmented Dickey-Fuller Test
##
## data: data_clean$iip
## Dickey-Fuller = -3.609, Lag order = 5, p-value = 0.03353
## alternative hypothesis: stationary
pp.test(data_clean$iip)
##
## Phillips-Perron Unit Root Test
##
## data: data_clean$iip
## Dickey-Fuller Z(alpha) = -50.231, Truncation lag parameter = 4, p-value
## = 0.01
## alternative hypothesis: stationary
#Bounds F and T test.
# The result from f-test figured out that F value = 3,6878 > upper I(0) at 10% sig.level whereas in t-test the F value fell between the range of I(0) and I(1) so that it had no statistic meaning
bounds_f_test(ardl, case = 3)
##
## Bounds F-test (Wald) for no cointegration
##
## data: d(vn30) ~ L(vn30, 1) + L(gepu, 1) + cpi + ir + iip + d(L(vn30, 1)) + d(gepu) + d(L(gepu, 1)) + d(L(gepu, 2)) + d(L(gepu, 3)) + d(L(gepu, 4))
## F = 3.6878, p-value = 0.08135
## alternative hypothesis: Possible cointegration
## null values:
## k T
## 4 1000
bounds_t_test(ardl, case = 3)
##
## Bounds t-test for no cointegration
##
## data: d(vn30) ~ L(vn30, 1) + L(gepu, 1) + cpi + ir + iip + d(L(vn30, 1)) + d(gepu) + d(L(gepu, 1)) + d(L(gepu, 2)) + d(L(gepu, 3)) + d(L(gepu, 4))
## t = -3.0624, p-value = 0.2707
## alternative hypothesis: Possible cointegration
## null values:
## k T
## 4 1000
#ECM test
uecm <- uecm(ardl)
test_uecm <- coint_eq(ardl, case = 3)
uecm
##
## Time series regression with "ts" data:
## Start = 6, End = 216
##
## Call:
## dynlm::dynlm(formula = full_formula, data = data, start = start,
## end = end)
##
## Coefficients:
## (Intercept) L(vn30, 1) L(gepu, 1) cpi ir
## 91.203253 -0.046535 0.121783 -0.003366 -3.779068
## iip d(L(vn30, 1)) d(gepu) d(L(gepu, 1)) d(L(gepu, 2))
## -0.087158 0.187490 -0.213782 -0.234556 -0.132325
## d(L(gepu, 3)) d(L(gepu, 4))
## -0.111452 -0.125607
test_uecm
## Time Series:
## Start = 1
## End = 216
## Frequency = 1
## [1] -1167.58325 -1307.27328 -1258.00601 -1336.45554 -1847.43054 -1968.92040
## [7] -1919.42225 -1942.31326 -1770.17596 -1555.12263 -1651.59590 -1366.75805
## [13] -1336.74981 -1234.82446 -1272.38717 -1277.04150 -1292.77068 -1219.24371
## [19] -1260.17022 -1281.22861 -1259.13834 -1315.30411 -1292.37163 -1398.49786
## [25] -1352.19012 -1300.55016 -1346.15237 -1369.94827 -1294.03870 -1296.21811
## [31] -1291.89026 -1356.95366 -1341.99248 -1370.74042 -1451.85613 -1415.00122
## [37] -1349.84646 -1566.24479 -1584.39734 -1686.15339 -1842.43927 -1741.15966
## [43] -1660.33492 -1502.46121 -1544.64749 -1697.42497 -1650.66828 -1674.18562
## [49] -1803.29813 -1866.52414 -1805.20738 -1741.81892 -1598.09656 -1440.70788
## [55] -1458.86967 -1559.39747 -1464.75243 -1484.45629 -1472.72418 -1375.93582
## [61] -1253.47341 -1307.47113 -1212.77277 -1228.01887 -1232.21533 -1170.89942
## [67] -1232.32939 -1190.69032 -1148.57575 -1091.70050 -1249.25160 -1208.00452
## [73] -1203.25802 -1225.14168 -1178.53977 -1180.77012 -1201.06008 -1238.56886
## [79] -1227.99979 -1218.25212 -1158.18504 -1207.39158 -1173.99718 -1224.76538
## [85] -1204.46336 -1169.72240 -1251.23594 -1293.91100 -1297.71511 -1254.20406
## [91] -1243.10468 -1261.13355 -1165.03085 -1280.28696 -1304.14101 -1290.28590
## [97] -1242.88965 -1231.36186 -1256.62167 -1317.17367 -1377.38419 -1109.87704
## [103] -1195.54740 -1362.94276 -1379.96890 -1429.76013 -1185.18091 -1264.46119
## [109] -823.57694 -946.77355 -835.30607 -1019.90381 -1114.92503 -1028.76532
## [115] -1019.28231 -1115.79751 -1054.89782 -1098.69796 -1055.93158 -1092.84178
## [121] -1009.98155 -1092.70416 -1002.15505 -1025.17731 -1038.82468 -1054.29356
## [127] -866.86549 -1032.76761 -970.74863 -924.25575 -805.09648 -929.64853
## [133] -840.80831 -970.10543 -818.61187 -944.82328 -866.58268 -561.31120
## [139] -813.23279 -741.12617 -835.70336 -954.41023 -813.59851 -856.61854
## [145] -928.53279 -932.35655 -566.56539 -459.75622 -223.33921 -505.06332
## [151] -559.62830 -577.94937 -485.33157 -530.71652 -247.76090 -224.24740
## [157] -595.77678 -759.51366 -735.42992 -775.16882 -851.42044 -844.59816
## [163] -821.90616 -757.71325 -733.21282 -818.21354 -735.55946 -656.35725
## [169] -736.61781 -832.14418 -562.39957 -612.35090 -617.39517 -681.78840
## [175] -575.90188 -742.78856 -789.05270 -836.72044 -676.98578 -857.50260
## [181] -854.23227 -772.97000 -591.33496 -733.34958 -843.27089 -742.11120
## [187] -842.30148 -893.51689 -823.31857 -871.53583 -815.76660 -739.69955
## [193] -694.27224 -860.27906 -881.38885 -919.55325 -897.46908 -945.84076
## [199] -768.91407 -880.26856 -846.63718 -849.87086 -677.29198 -537.78910
## [205] 146.68320 180.54298 507.19360 828.09243 549.88016 216.18385
## [211] 314.87739 141.96661 53.51152 326.80032 299.37093 291.30345
#Diagnostic Test after ARDL
bgtest(ardl)
##
## Breusch-Godfrey test for serial correlation of order up to 1
##
## data: ardl
## LM test = 0.21445, df = 1, p-value = 0.6433
bptest(ardl)
##
## studentized Breusch-Pagan test
##
## data: ardl
## BP = 20.893, df = 11, p-value = 0.0345
coeftest(
ardl,
vcov = NeweyWest(ardl, lag = 4, prewhite = FALSE)
)
##
## t test of coefficients:
##
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 91.2032535 31.7046473 2.8767 0.004457 **
## L(vn30, 1) 1.1409551 0.0657043 17.3650 < 0.00000000000000022 ***
## L(vn30, 2) -0.1874900 0.0638975 -2.9342 0.003737 **
## gepu -0.2137816 0.0733693 -2.9138 0.003979 **
## L(gepu, 1) 0.1010085 0.0753664 1.3402 0.181698
## L(gepu, 2) 0.1022315 0.0830309 1.2312 0.219684
## L(gepu, 3) 0.0208729 0.1135010 0.1839 0.854278
## L(gepu, 4) -0.0141550 0.0771226 -0.1835 0.854562
## L(gepu, 5) 0.1256067 0.0702001 1.7893 0.075093 .
## cpi -0.0033660 0.0013355 -2.5204 0.012508 *
## ir -3.7790681 1.1660288 -3.2410 0.001396 **
## iip -0.0871580 0.0636214 -1.3699 0.172247
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#Stability Test
library(strucchange)
resid_ardl <- residuals(ardl)
plot(efp(resid_ardl ~ 1, type = "Rec-CUSUM"))

plot(efp(resid_ardl ~ 1, type = "Rec-MOSUM"))
