#Building a VAR Model #Vector Autoregression (VAR) is a multivariate forecasting #algorithm that is used when two or more time series influence each other
#1.Analyze the time series characteristics #2.Test for causation amongst the time series #3.Test for stationarity #4.Transform the series to make it stationary, if needed #5.Find optimal order (p) #6.Prepare training and test datasets #7.Train the model #8.Roll back the transformations, if any. #9.Evaluate the model using test set #10.Forecast to future
#loading data
data <- read.csv("E:/PHU/Techcombank/MAF.csv")
data <- data.frame(data[1:45,])
data %>% head()
## AVG_BAL TXN
## 1 2.55134e+12 1674249
## 2 2.60509e+12 1473090
## 3 2.47542e+12 1606036
## 4 2.48724e+12 1671792
## 5 2.48210e+12 1621473
## 6 2.53384e+12 1669374
#Create date/month columns
dates <- seq(as.Date("2016-01-01"), as.Date("2019-09-01"), "1 months")
data <- cbind(data, dates)
#Visulize the data
plot(data$AVG_BAL, data$TXN)
ggplot(data, aes(x=dates, y = AVG_BAL)) + geom_line()+
geom_point()
#scatterplot
scatterplotMatrix(data[1:2])
#Calculate the Correlations
cor.test(data$AVG_BAL, data$TXN)
##
## Pearson's product-moment correlation
##
## data: data$AVG_BAL and data$TXN
## t = 30.617, df = 43, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.9597683 0.9878278
## sample estimates:
## cor
## 0.977825
#Test for stationary
###Augmented Dicker Fuller###
ur.df(data$AVG_BAL, type = "none") %>%
summary()
##
## ###############################################
## # Augmented Dickey-Fuller Test Unit Root Test #
## ###############################################
##
## Test regression none
##
##
## Call:
## lm(formula = z.diff ~ z.lag.1 - 1 + z.diff.lag)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.896e+11 -1.033e+11 -4.058e+10 9.926e+10 5.944e+11
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## z.lag.1 0.037321 0.008707 4.286 0.000107 ***
## z.diff.lag 0.060196 0.162928 0.369 0.713682
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.084e+11 on 41 degrees of freedom
## Multiple R-squared: 0.5101, Adjusted R-squared: 0.4862
## F-statistic: 21.34 on 2 and 41 DF, p-value: 4.446e-07
##
##
## Value of test-statistic is: 4.2864
##
## Critical values for test statistics:
## 1pct 5pct 10pct
## tau1 -2.62 -1.95 -1.61
ur.df(data$TXN, type = "none") %>%
summary()
##
## ###############################################
## # Augmented Dickey-Fuller Test Unit Root Test #
## ###############################################
##
## Test regression none
##
##
## Call:
## lm(formula = z.diff ~ z.lag.1 - 1 + z.diff.lag)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3483092 -132852 -59441 280592 2017209
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## z.lag.1 0.09073 0.02219 4.089 0.000197 ***
## z.diff.lag -0.64068 0.13941 -4.596 4.07e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 811500 on 41 degrees of freedom
## Multiple R-squared: 0.3872, Adjusted R-squared: 0.3573
## F-statistic: 12.95 on 2 and 41 DF, p-value: 4.367e-05
##
##
## Value of test-statistic is: 4.0894
##
## Critical values for test statistics:
## 1pct 5pct 10pct
## tau1 -2.62 -1.95 -1.61
###Time series are not stationary based on test-statistics is more than critical values
###Make differences
ur.df(diff(data$AVG_BAL, differences = 1), type = "none") %>%
summary()
##
## ###############################################
## # Augmented Dickey-Fuller Test Unit Root Test #
## ###############################################
##
## Test regression none
##
##
## Call:
## lm(formula = z.diff ~ z.lag.1 - 1 + z.diff.lag)
##
## Residuals:
## Min 1Q Median 3Q Max
## -6.145e+11 -1.008e+10 4.964e+10 1.580e+11 5.988e+11
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## z.lag.1 -0.2145 0.1466 -1.463 0.15136
## z.diff.lag -0.4365 0.1497 -2.916 0.00578 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.294e+11 on 40 degrees of freedom
## Multiple R-squared: 0.3409, Adjusted R-squared: 0.3079
## F-statistic: 10.34 on 2 and 40 DF, p-value: 0.0002396
##
##
## Value of test-statistic is: -1.4627
##
## Critical values for test statistics:
## 1pct 5pct 10pct
## tau1 -2.62 -1.95 -1.61
ur.df(diff(data$TXN, differences = 1), type = "none") %>%
summary()
##
## ###############################################
## # Augmented Dickey-Fuller Test Unit Root Test #
## ###############################################
##
## Test regression none
##
##
## Call:
## lm(formula = z.diff ~ z.lag.1 - 1 + z.diff.lag)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3288784 31094 198741 543306 2398697
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## z.lag.1 -1.1853 0.2606 -4.548 4.94e-05 ***
## z.diff.lag -0.1371 0.1577 -0.869 0.39
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 965800 on 40 degrees of freedom
## Multiple R-squared: 0.6887, Adjusted R-squared: 0.6732
## F-statistic: 44.25 on 2 and 40 DF, p-value: 7.291e-11
##
##
## Value of test-statistic is: -4.5479
##
## Critical values for test statistics:
## 1pct 5pct 10pct
## tau1 -2.62 -1.95 -1.61
dif.bal <- diff(data$AVG_BAL, differences = 1)
dif.txn <- diff(data$TXN, differences = 1)
df <- data.frame(dif.bal, dif.txn)
df$date <- data$dates[2:45]
#Visualize
df %>% ggplot(aes(date, dif.bal)) +geom_line()+
geom_point()
df %>% ggplot(aes(date, dif.txn)) +geom_line()+
geom_point()
#####Define optimal lags
VARselect(df$dif.bal, lag.max = 5, type = "const")
## $selection
## AIC(n) HQ(n) SC(n) FPE(n)
## 2 2 1 2
##
## $criteria
## 1 2 3 4 5
## AIC(n) 5.239306e+01 5.236641e+01 5.240183e+01 5.244421e+01 5.248766e+01
## HQ(n) 5.242367e+01 5.241233e+01 5.246305e+01 5.252073e+01 5.257949e+01
## SC(n) 5.247837e+01 5.249438e+01 5.257245e+01 5.265749e+01 5.274359e+01
## FPE(n) 5.676189e+22 5.528110e+22 5.729793e+22 5.982018e+22 6.254198e+22
VARselect(df$dif.txn, lag.max = 5, type = "const")
## $selection
## AIC(n) HQ(n) SC(n) FPE(n)
## 1 1 1 1
##
## $criteria
## 1 2 3 4 5
## AIC(n) 2.745946e+01 2.749744e+01 2.754016e+01 2.755695e+01 2.755305e+01
## HQ(n) 2.749007e+01 2.754335e+01 2.760138e+01 2.763347e+01 2.764488e+01
## SC(n) 2.754477e+01 2.762540e+01 2.771078e+01 2.777023e+01 2.780898e+01
## FPE(n) 8.424223e+11 8.752193e+11 9.138087e+11 9.299266e+11 9.272719e+11
###plot ACF/PACF
par(mfrow = c(1,2))
acf(df$dif.bal)
pacf(df$dif.bal)
##Make a time series
df1 <- cbind(df$dif.bal, df$dif.txn)
df$time <- ts(start = c(2016,2), end = c(2019,9), frequency = 12)
###Estimate VAR
varp <- VAR(df1, p =1, type = "const", season = 4)
## Warning in VAR(df1, p = 1, type = "const", season = 4): No column names supplied in y, using: y1, y2 , instead.
varp
##
## VAR Estimation Results:
## =======================
##
## Estimated coefficients for equation y1:
## =======================================
## Call:
## y1 = y1.l1 + y2.l1 + const + sd1 + sd2 + sd3
##
## y1.l1 y2.l1 const sd1 sd2
## 3.886074e-01 8.988778e+04 8.501168e+10 3.011770e+09 -1.113438e+11
## sd3
## 7.971232e+10
##
##
## Estimated coefficients for equation y2:
## =======================================
## Call:
## y2 = y1.l1 + y2.l1 + const + sd1 + sd2 + sd3
##
## y1.l1 y2.l1 const sd1 sd2
## 9.769023e-07 -5.023989e-01 2.681538e+05 -5.701957e+05 -5.657797e+04
## sd3
## 3.518836e+05
lmp <- varp$varresult
summary(lmp)
## Length Class Mode
## y1 12 lm list
## y2 12 lm list
varp.f <- predict(varp, n.ahead = 5, ci = 0.95)
varp.f %>% head()
## $fcst
## $fcst$y1
## fcst lower upper CI
## [1,] 228374143967 -157816316194 614564604129 386190460162
## [2,] 130484497280 -310756434667 571725429226 441240931947
## [3,] 237232437766 -213696179174 688161054706 450928616940
## [4,] 250367774265 -203866892236 704602440766 454234666501
## [5,] 210422174602 -244447144663 665291493867 454869319265
##
## $fcst$y2
## fcst lower upper CI
## [1,] 677664.76 -937291.1 2292621 1614956
## [2,] 162939.47 -1675876.1 2001755 1838816
## [3,] 734369.90 -1184704.7 2653445 1919075
## [4,] 199682.52 -1743962.2 2143327 1943645
## [5,] -89054.86 -2041894.6 1863785 1952840
##
##
## $endog
## y1 y2
## [1,] 5.3750e+10 -201159
## [2,] -1.2967e+11 132946
## [3,] 1.1820e+10 65756
## [4,] -5.1400e+09 -50319
## [5,] 5.1740e+10 47901
## [6,] -1.0420e+10 34644
## [7,] 1.6610e+10 130190
## [8,] 1.1440e+10 -55277
## [9,] -3.5710e+10 92046
## [10,] 4.3430e+10 91720
## [11,] 1.9337e+11 522092
## [12,] 4.6534e+11 280437
## [13,] -1.2378e+11 -497823
## [14,] -5.1500e+10 390410
## [15,] 1.1185e+11 26089
## [16,] -2.9000e+08 126609
## [17,] 3.6060e+10 -30534
## [18,] 8.2520e+10 212344
## [19,] 6.0780e+10 232244
## [20,] 1.0420e+11 -25087
## [21,] 8.5680e+10 245927
## [22,] 1.0830e+11 95191
## [23,] 2.3255e+11 459412
## [24,] 3.2677e+11 116867
## [25,] 6.4825e+11 -168116
## [26,] -2.3719e+11 366759
## [27,] 3.3957e+11 274581
## [28,] 9.4920e+10 590084
## [29,] 2.4943e+11 246123
## [30,] 1.1059e+11 546265
## [31,] 1.7135e+11 632158
## [32,] 3.5080e+11 -166429
## [33,] 2.9973e+11 1496391
## [34,] 3.8495e+11 -368532
## [35,] 8.6405e+11 1562161
## [36,] 4.5082e+11 1844558
## [37,] 4.9129e+11 -3669316
## [38,] -2.4621e+11 2939356
## [39,] 1.8498e+11 272754
## [40,] 1.2229e+11 2134460
## [41,] 4.2676e+11 -628474
## [42,] 3.0697e+11 2315826
## [43,] 4.5169e+11 357662
## [44,] 5.2572e+11 -791019
##
## $model
##
## VAR Estimation Results:
## =======================
##
## Estimated coefficients for equation y1:
## =======================================
## Call:
## y1 = y1.l1 + y2.l1 + const + sd1 + sd2 + sd3
##
## y1.l1 y2.l1 const sd1 sd2
## 3.886074e-01 8.988778e+04 8.501168e+10 3.011770e+09 -1.113438e+11
## sd3
## 7.971232e+10
##
##
## Estimated coefficients for equation y2:
## =======================================
## Call:
## y2 = y1.l1 + y2.l1 + const + sd1 + sd2 + sd3
##
## y1.l1 y2.l1 const sd1 sd2
## 9.769023e-07 -5.023989e-01 2.681538e+05 -5.701957e+05 -5.657797e+04
## sd3
## 3.518836e+05
##
##
##
## $exo.fcst
## NULL
plot(varp.f, names = "y1")
fanchart(varp.f, names = "y1")
varp.f %>% head()
## $fcst
## $fcst$y1
## fcst lower upper CI
## [1,] 228374143967 -157816316194 614564604129 386190460162
## [2,] 130484497280 -310756434667 571725429226 441240931947
## [3,] 237232437766 -213696179174 688161054706 450928616940
## [4,] 250367774265 -203866892236 704602440766 454234666501
## [5,] 210422174602 -244447144663 665291493867 454869319265
##
## $fcst$y2
## fcst lower upper CI
## [1,] 677664.76 -937291.1 2292621 1614956
## [2,] 162939.47 -1675876.1 2001755 1838816
## [3,] 734369.90 -1184704.7 2653445 1919075
## [4,] 199682.52 -1743962.2 2143327 1943645
## [5,] -89054.86 -2041894.6 1863785 1952840
##
##
## $endog
## y1 y2
## [1,] 5.3750e+10 -201159
## [2,] -1.2967e+11 132946
## [3,] 1.1820e+10 65756
## [4,] -5.1400e+09 -50319
## [5,] 5.1740e+10 47901
## [6,] -1.0420e+10 34644
## [7,] 1.6610e+10 130190
## [8,] 1.1440e+10 -55277
## [9,] -3.5710e+10 92046
## [10,] 4.3430e+10 91720
## [11,] 1.9337e+11 522092
## [12,] 4.6534e+11 280437
## [13,] -1.2378e+11 -497823
## [14,] -5.1500e+10 390410
## [15,] 1.1185e+11 26089
## [16,] -2.9000e+08 126609
## [17,] 3.6060e+10 -30534
## [18,] 8.2520e+10 212344
## [19,] 6.0780e+10 232244
## [20,] 1.0420e+11 -25087
## [21,] 8.5680e+10 245927
## [22,] 1.0830e+11 95191
## [23,] 2.3255e+11 459412
## [24,] 3.2677e+11 116867
## [25,] 6.4825e+11 -168116
## [26,] -2.3719e+11 366759
## [27,] 3.3957e+11 274581
## [28,] 9.4920e+10 590084
## [29,] 2.4943e+11 246123
## [30,] 1.1059e+11 546265
## [31,] 1.7135e+11 632158
## [32,] 3.5080e+11 -166429
## [33,] 2.9973e+11 1496391
## [34,] 3.8495e+11 -368532
## [35,] 8.6405e+11 1562161
## [36,] 4.5082e+11 1844558
## [37,] 4.9129e+11 -3669316
## [38,] -2.4621e+11 2939356
## [39,] 1.8498e+11 272754
## [40,] 1.2229e+11 2134460
## [41,] 4.2676e+11 -628474
## [42,] 3.0697e+11 2315826
## [43,] 4.5169e+11 357662
## [44,] 5.2572e+11 -791019
##
## $model
##
## VAR Estimation Results:
## =======================
##
## Estimated coefficients for equation y1:
## =======================================
## Call:
## y1 = y1.l1 + y2.l1 + const + sd1 + sd2 + sd3
##
## y1.l1 y2.l1 const sd1 sd2
## 3.886074e-01 8.988778e+04 8.501168e+10 3.011770e+09 -1.113438e+11
## sd3
## 7.971232e+10
##
##
## Estimated coefficients for equation y2:
## =======================================
## Call:
## y2 = y1.l1 + y2.l1 + const + sd1 + sd2 + sd3
##
## y1.l1 y2.l1 const sd1 sd2
## 9.769023e-07 -5.023989e-01 2.681538e+05 -5.701957e+05 -5.657797e+04
## sd3
## 3.518836e+05
##
##
##
## $exo.fcst
## NULL
#Generate impulse funtion
irf.avgbal <- irf(varp, impulse = "y1", response = "y2",
n.ahead = 5, boot = TRUE)
plot(irf.avgbal, ylab = "avgbal", main = "Shock from SLGD")
#Define the final equation and Backtest in Excel file
### MAPE = 0.01 ###
Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.