R Markdown

#Building a VAR Model #Vector Autoregression (VAR) is a multivariate forecasting #algorithm that is used when two or more time series influence each other

#1.Analyze the time series characteristics #2.Test for causation amongst the time series #3.Test for stationarity #4.Transform the series to make it stationary, if needed #5.Find optimal order (p) #6.Prepare training and test datasets #7.Train the model #8.Roll back the transformations, if any. #9.Evaluate the model using test set #10.Forecast to future

#loading data

data <- read.csv("E:/PHU/Techcombank/MAF.csv")
data <- data.frame(data[1:45,])
data %>% head()
##       AVG_BAL     TXN
## 1 2.55134e+12 1674249
## 2 2.60509e+12 1473090
## 3 2.47542e+12 1606036
## 4 2.48724e+12 1671792
## 5 2.48210e+12 1621473
## 6 2.53384e+12 1669374
#Create date/month columns

dates <- seq(as.Date("2016-01-01"), as.Date("2019-09-01"), "1 months")

data <- cbind(data, dates)

#Visulize the data
plot(data$AVG_BAL, data$TXN)

ggplot(data, aes(x=dates, y = AVG_BAL)) + geom_line()+
  geom_point()

#scatterplot
scatterplotMatrix(data[1:2])

#Calculate the Correlations
cor.test(data$AVG_BAL, data$TXN)
## 
##  Pearson's product-moment correlation
## 
## data:  data$AVG_BAL and data$TXN
## t = 30.617, df = 43, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.9597683 0.9878278
## sample estimates:
##      cor 
## 0.977825
#Test for stationary
###Augmented Dicker Fuller###
ur.df(data$AVG_BAL, type = "none") %>%
  summary()
## 
## ############################################### 
## # Augmented Dickey-Fuller Test Unit Root Test # 
## ############################################### 
## 
## Test regression none 
## 
## 
## Call:
## lm(formula = z.diff ~ z.lag.1 - 1 + z.diff.lag)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -5.896e+11 -1.033e+11 -4.058e+10  9.926e+10  5.944e+11 
## 
## Coefficients:
##            Estimate Std. Error t value Pr(>|t|)    
## z.lag.1    0.037321   0.008707   4.286 0.000107 ***
## z.diff.lag 0.060196   0.162928   0.369 0.713682    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.084e+11 on 41 degrees of freedom
## Multiple R-squared:  0.5101, Adjusted R-squared:  0.4862 
## F-statistic: 21.34 on 2 and 41 DF,  p-value: 4.446e-07
## 
## 
## Value of test-statistic is: 4.2864 
## 
## Critical values for test statistics: 
##       1pct  5pct 10pct
## tau1 -2.62 -1.95 -1.61
ur.df(data$TXN, type = "none") %>%
  summary()
## 
## ############################################### 
## # Augmented Dickey-Fuller Test Unit Root Test # 
## ############################################### 
## 
## Test regression none 
## 
## 
## Call:
## lm(formula = z.diff ~ z.lag.1 - 1 + z.diff.lag)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -3483092  -132852   -59441   280592  2017209 
## 
## Coefficients:
##            Estimate Std. Error t value Pr(>|t|)    
## z.lag.1     0.09073    0.02219   4.089 0.000197 ***
## z.diff.lag -0.64068    0.13941  -4.596 4.07e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 811500 on 41 degrees of freedom
## Multiple R-squared:  0.3872, Adjusted R-squared:  0.3573 
## F-statistic: 12.95 on 2 and 41 DF,  p-value: 4.367e-05
## 
## 
## Value of test-statistic is: 4.0894 
## 
## Critical values for test statistics: 
##       1pct  5pct 10pct
## tau1 -2.62 -1.95 -1.61
###Time series are not stationary based on test-statistics is more than critical values
###Make differences

ur.df(diff(data$AVG_BAL, differences = 1), type = "none") %>%
  summary()
## 
## ############################################### 
## # Augmented Dickey-Fuller Test Unit Root Test # 
## ############################################### 
## 
## Test regression none 
## 
## 
## Call:
## lm(formula = z.diff ~ z.lag.1 - 1 + z.diff.lag)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -6.145e+11 -1.008e+10  4.964e+10  1.580e+11  5.988e+11 
## 
## Coefficients:
##            Estimate Std. Error t value Pr(>|t|)   
## z.lag.1     -0.2145     0.1466  -1.463  0.15136   
## z.diff.lag  -0.4365     0.1497  -2.916  0.00578 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.294e+11 on 40 degrees of freedom
## Multiple R-squared:  0.3409, Adjusted R-squared:  0.3079 
## F-statistic: 10.34 on 2 and 40 DF,  p-value: 0.0002396
## 
## 
## Value of test-statistic is: -1.4627 
## 
## Critical values for test statistics: 
##       1pct  5pct 10pct
## tau1 -2.62 -1.95 -1.61
ur.df(diff(data$TXN, differences = 1), type = "none") %>%
  summary()
## 
## ############################################### 
## # Augmented Dickey-Fuller Test Unit Root Test # 
## ############################################### 
## 
## Test regression none 
## 
## 
## Call:
## lm(formula = z.diff ~ z.lag.1 - 1 + z.diff.lag)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -3288784    31094   198741   543306  2398697 
## 
## Coefficients:
##            Estimate Std. Error t value Pr(>|t|)    
## z.lag.1     -1.1853     0.2606  -4.548 4.94e-05 ***
## z.diff.lag  -0.1371     0.1577  -0.869     0.39    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 965800 on 40 degrees of freedom
## Multiple R-squared:  0.6887, Adjusted R-squared:  0.6732 
## F-statistic: 44.25 on 2 and 40 DF,  p-value: 7.291e-11
## 
## 
## Value of test-statistic is: -4.5479 
## 
## Critical values for test statistics: 
##       1pct  5pct 10pct
## tau1 -2.62 -1.95 -1.61
dif.bal <- diff(data$AVG_BAL, differences = 1)
dif.txn <- diff(data$TXN, differences = 1)


df <- data.frame(dif.bal, dif.txn)
df$date <- data$dates[2:45]

#Visualize
df %>% ggplot(aes(date, dif.bal)) +geom_line()+
  geom_point()

df %>% ggplot(aes(date, dif.txn)) +geom_line()+
  geom_point()

#####Define optimal lags
VARselect(df$dif.bal, lag.max = 5, type = "const")
## $selection
## AIC(n)  HQ(n)  SC(n) FPE(n) 
##      2      2      1      2 
## 
## $criteria
##                   1            2            3            4            5
## AIC(n) 5.239306e+01 5.236641e+01 5.240183e+01 5.244421e+01 5.248766e+01
## HQ(n)  5.242367e+01 5.241233e+01 5.246305e+01 5.252073e+01 5.257949e+01
## SC(n)  5.247837e+01 5.249438e+01 5.257245e+01 5.265749e+01 5.274359e+01
## FPE(n) 5.676189e+22 5.528110e+22 5.729793e+22 5.982018e+22 6.254198e+22
VARselect(df$dif.txn, lag.max = 5, type = "const")
## $selection
## AIC(n)  HQ(n)  SC(n) FPE(n) 
##      1      1      1      1 
## 
## $criteria
##                   1            2            3            4            5
## AIC(n) 2.745946e+01 2.749744e+01 2.754016e+01 2.755695e+01 2.755305e+01
## HQ(n)  2.749007e+01 2.754335e+01 2.760138e+01 2.763347e+01 2.764488e+01
## SC(n)  2.754477e+01 2.762540e+01 2.771078e+01 2.777023e+01 2.780898e+01
## FPE(n) 8.424223e+11 8.752193e+11 9.138087e+11 9.299266e+11 9.272719e+11
###plot ACF/PACF
par(mfrow = c(1,2))
acf(df$dif.bal)
pacf(df$dif.bal)

##Make a time series
df1 <- cbind(df$dif.bal, df$dif.txn)
df$time <- ts(start = c(2016,2), end = c(2019,9), frequency = 12)
###Estimate VAR

varp <- VAR(df1, p =1, type = "const", season = 4)
## Warning in VAR(df1, p = 1, type = "const", season = 4): No column names supplied in y, using: y1, y2 , instead.
varp
## 
## VAR Estimation Results:
## ======================= 
## 
## Estimated coefficients for equation y1: 
## ======================================= 
## Call:
## y1 = y1.l1 + y2.l1 + const + sd1 + sd2 + sd3 
## 
##         y1.l1         y2.l1         const           sd1           sd2 
##  3.886074e-01  8.988778e+04  8.501168e+10  3.011770e+09 -1.113438e+11 
##           sd3 
##  7.971232e+10 
## 
## 
## Estimated coefficients for equation y2: 
## ======================================= 
## Call:
## y2 = y1.l1 + y2.l1 + const + sd1 + sd2 + sd3 
## 
##         y1.l1         y2.l1         const           sd1           sd2 
##  9.769023e-07 -5.023989e-01  2.681538e+05 -5.701957e+05 -5.657797e+04 
##           sd3 
##  3.518836e+05
lmp <- varp$varresult

summary(lmp)
##    Length Class Mode
## y1 12     lm    list
## y2 12     lm    list
varp.f <- predict(varp, n.ahead = 5, ci = 0.95)

varp.f %>% head()
## $fcst
## $fcst$y1
##              fcst         lower        upper           CI
## [1,] 228374143967 -157816316194 614564604129 386190460162
## [2,] 130484497280 -310756434667 571725429226 441240931947
## [3,] 237232437766 -213696179174 688161054706 450928616940
## [4,] 250367774265 -203866892236 704602440766 454234666501
## [5,] 210422174602 -244447144663 665291493867 454869319265
## 
## $fcst$y2
##           fcst      lower   upper      CI
## [1,] 677664.76  -937291.1 2292621 1614956
## [2,] 162939.47 -1675876.1 2001755 1838816
## [3,] 734369.90 -1184704.7 2653445 1919075
## [4,] 199682.52 -1743962.2 2143327 1943645
## [5,] -89054.86 -2041894.6 1863785 1952840
## 
## 
## $endog
##                y1       y2
##  [1,]  5.3750e+10  -201159
##  [2,] -1.2967e+11   132946
##  [3,]  1.1820e+10    65756
##  [4,] -5.1400e+09   -50319
##  [5,]  5.1740e+10    47901
##  [6,] -1.0420e+10    34644
##  [7,]  1.6610e+10   130190
##  [8,]  1.1440e+10   -55277
##  [9,] -3.5710e+10    92046
## [10,]  4.3430e+10    91720
## [11,]  1.9337e+11   522092
## [12,]  4.6534e+11   280437
## [13,] -1.2378e+11  -497823
## [14,] -5.1500e+10   390410
## [15,]  1.1185e+11    26089
## [16,] -2.9000e+08   126609
## [17,]  3.6060e+10   -30534
## [18,]  8.2520e+10   212344
## [19,]  6.0780e+10   232244
## [20,]  1.0420e+11   -25087
## [21,]  8.5680e+10   245927
## [22,]  1.0830e+11    95191
## [23,]  2.3255e+11   459412
## [24,]  3.2677e+11   116867
## [25,]  6.4825e+11  -168116
## [26,] -2.3719e+11   366759
## [27,]  3.3957e+11   274581
## [28,]  9.4920e+10   590084
## [29,]  2.4943e+11   246123
## [30,]  1.1059e+11   546265
## [31,]  1.7135e+11   632158
## [32,]  3.5080e+11  -166429
## [33,]  2.9973e+11  1496391
## [34,]  3.8495e+11  -368532
## [35,]  8.6405e+11  1562161
## [36,]  4.5082e+11  1844558
## [37,]  4.9129e+11 -3669316
## [38,] -2.4621e+11  2939356
## [39,]  1.8498e+11   272754
## [40,]  1.2229e+11  2134460
## [41,]  4.2676e+11  -628474
## [42,]  3.0697e+11  2315826
## [43,]  4.5169e+11   357662
## [44,]  5.2572e+11  -791019
## 
## $model
## 
## VAR Estimation Results:
## ======================= 
## 
## Estimated coefficients for equation y1: 
## ======================================= 
## Call:
## y1 = y1.l1 + y2.l1 + const + sd1 + sd2 + sd3 
## 
##         y1.l1         y2.l1         const           sd1           sd2 
##  3.886074e-01  8.988778e+04  8.501168e+10  3.011770e+09 -1.113438e+11 
##           sd3 
##  7.971232e+10 
## 
## 
## Estimated coefficients for equation y2: 
## ======================================= 
## Call:
## y2 = y1.l1 + y2.l1 + const + sd1 + sd2 + sd3 
## 
##         y1.l1         y2.l1         const           sd1           sd2 
##  9.769023e-07 -5.023989e-01  2.681538e+05 -5.701957e+05 -5.657797e+04 
##           sd3 
##  3.518836e+05 
## 
## 
## 
## $exo.fcst
## NULL
plot(varp.f, names = "y1")

fanchart(varp.f, names = "y1")

varp.f %>% head()
## $fcst
## $fcst$y1
##              fcst         lower        upper           CI
## [1,] 228374143967 -157816316194 614564604129 386190460162
## [2,] 130484497280 -310756434667 571725429226 441240931947
## [3,] 237232437766 -213696179174 688161054706 450928616940
## [4,] 250367774265 -203866892236 704602440766 454234666501
## [5,] 210422174602 -244447144663 665291493867 454869319265
## 
## $fcst$y2
##           fcst      lower   upper      CI
## [1,] 677664.76  -937291.1 2292621 1614956
## [2,] 162939.47 -1675876.1 2001755 1838816
## [3,] 734369.90 -1184704.7 2653445 1919075
## [4,] 199682.52 -1743962.2 2143327 1943645
## [5,] -89054.86 -2041894.6 1863785 1952840
## 
## 
## $endog
##                y1       y2
##  [1,]  5.3750e+10  -201159
##  [2,] -1.2967e+11   132946
##  [3,]  1.1820e+10    65756
##  [4,] -5.1400e+09   -50319
##  [5,]  5.1740e+10    47901
##  [6,] -1.0420e+10    34644
##  [7,]  1.6610e+10   130190
##  [8,]  1.1440e+10   -55277
##  [9,] -3.5710e+10    92046
## [10,]  4.3430e+10    91720
## [11,]  1.9337e+11   522092
## [12,]  4.6534e+11   280437
## [13,] -1.2378e+11  -497823
## [14,] -5.1500e+10   390410
## [15,]  1.1185e+11    26089
## [16,] -2.9000e+08   126609
## [17,]  3.6060e+10   -30534
## [18,]  8.2520e+10   212344
## [19,]  6.0780e+10   232244
## [20,]  1.0420e+11   -25087
## [21,]  8.5680e+10   245927
## [22,]  1.0830e+11    95191
## [23,]  2.3255e+11   459412
## [24,]  3.2677e+11   116867
## [25,]  6.4825e+11  -168116
## [26,] -2.3719e+11   366759
## [27,]  3.3957e+11   274581
## [28,]  9.4920e+10   590084
## [29,]  2.4943e+11   246123
## [30,]  1.1059e+11   546265
## [31,]  1.7135e+11   632158
## [32,]  3.5080e+11  -166429
## [33,]  2.9973e+11  1496391
## [34,]  3.8495e+11  -368532
## [35,]  8.6405e+11  1562161
## [36,]  4.5082e+11  1844558
## [37,]  4.9129e+11 -3669316
## [38,] -2.4621e+11  2939356
## [39,]  1.8498e+11   272754
## [40,]  1.2229e+11  2134460
## [41,]  4.2676e+11  -628474
## [42,]  3.0697e+11  2315826
## [43,]  4.5169e+11   357662
## [44,]  5.2572e+11  -791019
## 
## $model
## 
## VAR Estimation Results:
## ======================= 
## 
## Estimated coefficients for equation y1: 
## ======================================= 
## Call:
## y1 = y1.l1 + y2.l1 + const + sd1 + sd2 + sd3 
## 
##         y1.l1         y2.l1         const           sd1           sd2 
##  3.886074e-01  8.988778e+04  8.501168e+10  3.011770e+09 -1.113438e+11 
##           sd3 
##  7.971232e+10 
## 
## 
## Estimated coefficients for equation y2: 
## ======================================= 
## Call:
## y2 = y1.l1 + y2.l1 + const + sd1 + sd2 + sd3 
## 
##         y1.l1         y2.l1         const           sd1           sd2 
##  9.769023e-07 -5.023989e-01  2.681538e+05 -5.701957e+05 -5.657797e+04 
##           sd3 
##  3.518836e+05 
## 
## 
## 
## $exo.fcst
## NULL
#Generate impulse funtion
irf.avgbal <- irf(varp, impulse = "y1", response = "y2",
               n.ahead = 5, boot = TRUE)
plot(irf.avgbal, ylab = "avgbal", main = "Shock from SLGD")

#Define the final equation and Backtest in Excel file
### MAPE = 0.01 ###

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.