#Building a VAR Model #Vector Autoregression (VAR) is a multivariate forecasting #algorithm that is used when two or more time series influence each other
#1.Analyze the time series characteristics #2.Test for causation amongst the time series #3.Test for stationarity #4.Transform the series to make it stationary, if needed #5.Find optimal order (p) #6.Prepare training and test datasets #7.Train the model #8.Roll back the transformations, if any. #9.Evaluate the model using test set #10.Forecast to future
#loading data
data <- read.csv("E:/PHU/Tech/MASS.csv")
data <- data.frame(data[1:47,])
data %>% head()
## AVG.BAL TXN
## 1 2.96916e+12 1.988552
## 2 3.00892e+12 1.710431
## 3 2.80406e+12 1.661168
## 4 2.84398e+12 1.745906
## 5 2.96329e+12 1.616728
## 6 2.91128e+12 1.606698
#Create date/month columns
dates <- seq(as.Date("2016-01-01"), as.Date("2019-11-01"), "1 months")
data <- cbind(data, dates)
data %>% head(n=6)
## AVG.BAL TXN dates
## 1 2.96916e+12 1.988552 2016-01-01
## 2 3.00892e+12 1.710431 2016-02-01
## 3 2.80406e+12 1.661168 2016-03-01
## 4 2.84398e+12 1.745906 2016-04-01
## 5 2.96329e+12 1.616728 2016-05-01
## 6 2.91128e+12 1.606698 2016-06-01
#Visulize the data
plot(data$AVG.BAL, data$TXN)
ggplot(data, aes(x=dates, y = AVG.BAL)) + geom_line()+
geom_point()
#scatterplot
scatterplotMatrix(data[1:2])
#Calculate the Correlations
cor.test(data$AVG.BAL, data$TXN)
##
## Pearson's product-moment correlation
##
## data: data$AVG.BAL and data$TXN
## t = 13.123, df = 45, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.8104823 0.9377889
## sample estimates:
## cor
## 0.8904115
#Test for stationary
###Augmented Dicker Fuller###
ur.df(data$AVG.BAL, type = "none") %>%
summary()
##
## ###############################################
## # Augmented Dickey-Fuller Test Unit Root Test #
## ###############################################
##
## Test regression none
##
##
## Call:
## lm(formula = z.diff ~ z.lag.1 - 1 + z.diff.lag)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.123e+12 -1.894e+11 -5.889e+10 1.040e+11 1.169e+12
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## z.lag.1 0.03578 0.01634 2.190 0.0340 *
## z.diff.lag 0.37318 0.17075 2.186 0.0343 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.713e+11 on 43 degrees of freedom
## Multiple R-squared: 0.2869, Adjusted R-squared: 0.2537
## F-statistic: 8.65 on 2 and 43 DF, p-value: 0.0006962
##
##
## Value of test-statistic is: 2.1898
##
## Critical values for test statistics:
## 1pct 5pct 10pct
## tau1 -2.62 -1.95 -1.61
ur.df(data$TXN, type = "none") %>%
summary()
##
## ###############################################
## # Augmented Dickey-Fuller Test Unit Root Test #
## ###############################################
##
## Test regression none
##
##
## Call:
## lm(formula = z.diff ~ z.lag.1 - 1 + z.diff.lag)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.24738 -0.14067 -0.06732 0.05013 0.74996
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## z.lag.1 0.05749 0.01816 3.166 0.002843 **
## z.diff.lag -0.54438 0.14615 -3.725 0.000565 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.332 on 43 degrees of freedom
## Multiple R-squared: 0.2826, Adjusted R-squared: 0.2493
## F-statistic: 8.47 on 2 and 43 DF, p-value: 0.0007917
##
##
## Value of test-statistic is: 3.1656
##
## Critical values for test statistics:
## 1pct 5pct 10pct
## tau1 -2.62 -1.95 -1.61
###Time series are non- stationary based on test-statistics is more than critical values
###Make differences
ur.df(diff(data$AVG.BAL, differences = 1), type = "none") %>%
summary()
##
## ###############################################
## # Augmented Dickey-Fuller Test Unit Root Test #
## ###############################################
##
## Test regression none
##
##
## Call:
## lm(formula = z.diff ~ z.lag.1 - 1 + z.diff.lag)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.072e+12 -9.096e+10 6.938e+10 1.780e+11 1.059e+12
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## z.lag.1 -0.04516 0.24050 -0.188 0.852
## z.diff.lag -0.46599 0.20883 -2.231 0.031 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.73e+11 on 42 degrees of freedom
## Multiple R-squared: 0.2485, Adjusted R-squared: 0.2128
## F-statistic: 6.946 on 2 and 42 DF, p-value: 0.002477
##
##
## Value of test-statistic is: -0.1878
##
## Critical values for test statistics:
## 1pct 5pct 10pct
## tau1 -2.62 -1.95 -1.61
ur.df(diff(data$TXN, differences = 1), type = "none") %>%
summary()
##
## ###############################################
## # Augmented Dickey-Fuller Test Unit Root Test #
## ###############################################
##
## Test regression none
##
##
## Call:
## lm(formula = z.diff ~ z.lag.1 - 1 + z.diff.lag)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.10918 -0.01155 0.07518 0.27955 1.05530
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## z.lag.1 -1.32509 0.25961 -5.104 7.58e-06 ***
## z.diff.lag -0.02384 0.17082 -0.140 0.89
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3722 on 42 degrees of freedom
## Multiple R-squared: 0.6622, Adjusted R-squared: 0.6461
## F-statistic: 41.16 on 2 and 42 DF, p-value: 1.267e-10
##
##
## Value of test-statistic is: -5.1042
##
## Critical values for test statistics:
## 1pct 5pct 10pct
## tau1 -2.62 -1.95 -1.61
dif.bal <- diff(data$AVG.BAL, differences = 1)
dif.txn <- diff(data$TXN, differences = 1)
df <- data.frame(dif.bal, dif.txn)
df$date <- data$dates[2:47]
#Visualize
df %>% ggplot(aes(date, dif.bal)) +geom_line()+
geom_point()
df %>% ggplot(aes(date, dif.txn)) +geom_line()+
geom_point()
#####Define optimal lags
VARselect(df$dif.bal, lag.max = 5, type = "const")
## $selection
## AIC(n) HQ(n) SC(n) FPE(n)
## 4 4 4 4
##
## $criteria
## 1 2 3 4 5
## AIC(n) 5.345818e+01 5.341806e+01 5.342286e+01 5.328189e+01 5.331150e+01
## HQ(n) 5.348862e+01 5.346371e+01 5.348373e+01 5.335798e+01 5.340282e+01
## SC(n) 5.354177e+01 5.354344e+01 5.359003e+01 5.349086e+01 5.356227e+01
## FPE(n) 1.646750e+23 1.582270e+23 1.590456e+23 1.382164e+23 1.424983e+23
VARselect(df$dif.txn, lag.max = 5, type = "const")
## $selection
## AIC(n) HQ(n) SC(n) FPE(n)
## 1 1 1 1
##
## $criteria
## 1 2 3 4 5
## AIC(n) -2.027496 -1.9969536 -1.9607917 -1.9192476 -1.9457954
## HQ(n) -1.997058 -1.9512959 -1.8999148 -1.8431514 -1.8544800
## SC(n) -1.943907 -1.8715703 -1.7936139 -1.7102754 -1.6950287
## FPE(n) 0.131675 0.1357838 0.1408346 0.1468964 0.1431763
###plot ACF/PACF
par(mfrow = c(1,2))
acf(df$dif.bal)
pacf(df$dif.bal)
##Make a time series
df1 <- cbind(df$dif.bal, df$dif.txn)
df$time <- ts(start = c(2016,2), end = c(2019,11), frequency = 12)
VARselect(df1, lag.max = 5, type = "const")
## $selection
## AIC(n) HQ(n) SC(n) FPE(n)
## 4 4 1 4
##
## $criteria
## 1 2 3 4 5
## AIC(n) 5.138111e+01 5.131428e+01 5.131699e+01 5.115384e+01 5.121737e+01
## HQ(n) 5.147243e+01 5.146648e+01 5.153006e+01 5.142779e+01 5.155219e+01
## SC(n) 5.163188e+01 5.173223e+01 5.190212e+01 5.190614e+01 5.213684e+01
## FPE(n) 2.064249e+22 1.934505e+22 1.948140e+22 1.667786e+22 1.799334e+22
###Estimate VAR
varp <- VAR(df1, p = 1, type = "const", season = 4)
## Warning in VAR(df1, p = 1, type = "const", season = 4): No column names supplied in y, using: y1, y2 , instead.
varp
##
## VAR Estimation Results:
## =======================
##
## Estimated coefficients for equation y1:
## =======================================
## Call:
## y1 = y1.l1 + y2.l1 + const + sd1 + sd2 + sd3
##
## y1.l1 y2.l1 const sd1 sd2
## 4.495302e-01 2.550328e+11 6.673299e+10 -1.079420e+10 -2.202650e+10
## sd3
## 1.399945e+11
##
##
## Estimated coefficients for equation y2:
## =======================================
## Call:
## y2 = y1.l1 + y2.l1 + const + sd1 + sd2 + sd3
##
## y1.l1 y2.l1 const sd1 sd2
## 2.941404e-13 -5.872313e-01 1.188349e-01 -2.156508e-01 -1.690456e-01
## sd3
## 1.392416e-01
lmp <- varp$varresult
summary(lmp)
## Length Class Mode
## y1 12 lm list
## y2 12 lm list
varp.f <- predict(varp, n.ahead = 5, ci = 0.95)
varp.f %>% head()
## $fcst
## $fcst$y1
## fcst lower upper CI
## [1,] 717530418563 -27024287948 1.462085e+12 744554706511
## [2,] 654214107025 -190927452114 1.499356e+12 845141559140
## [3,] 251707923205 -617509960651 1.120926e+12 869217883857
## [4,] 213100626126 -663325111025 1.089526e+12 876425737151
## [5,] 249280681799 -629016177981 1.127578e+12 878296859780
##
## $fcst$y2
## fcst lower upper CI
## [1,] 1.1438645 0.5028472 1.7848818 0.6410173
## [2,] -0.2804597 -1.0345767 0.4736572 0.7541169
## [3,] 0.3216734 -0.4776374 1.1209842 0.7993108
## [4,] -0.1037063 -0.9205097 0.7130972 0.8168035
## [5,] 0.4430212 -0.3813934 1.2674359 0.8244147
##
##
## $endog
## y1 y2
## [1,] 3.97600e+10 -0.278121
## [2,] -2.04860e+11 -0.049263
## [3,] 3.99200e+10 0.084738
## [4,] 1.19310e+11 -0.129178
## [5,] -5.20100e+10 -0.010030
## [6,] -2.87300e+10 0.008057
## [7,] 8.06000e+09 0.042525
## [8,] 6.16310e+11 -0.016959
## [9,] -7.67330e+11 -0.022301
## [10,] 4.44900e+10 0.012702
## [11,] 4.00810e+11 0.565229
## [12,] 1.35650e+11 0.381467
## [13,] -2.14050e+11 -0.627536
## [14,] -6.82700e+10 0.188010
## [15,] 1.24820e+11 0.050653
## [16,] 1.54760e+11 -0.033953
## [17,] -1.87180e+11 -0.136411
## [18,] 1.48300e+10 0.122962
## [19,] -4.88900e+10 0.086225
## [20,] 5.38600e+10 -0.018530
## [21,] 7.14500e+10 0.030849
## [22,] -2.94500e+10 0.127999
## [23,] 9.65400e+10 0.050257
## [24,] 1.27510e+11 0.018923
## [25,] 5.51140e+11 0.054091
## [26,] -3.50890e+11 -0.026907
## [27,] 1.05910e+11 0.109898
## [28,] 5.96300e+10 0.138278
## [29,] -3.18500e+10 0.055875
## [30,] -8.94400e+10 0.136199
## [31,] 6.67100e+10 0.216041
## [32,] 1.08800e+10 -0.095604
## [33,] -1.92240e+11 0.380310
## [34,] -3.75000e+09 -0.262369
## [35,] 5.95320e+11 0.587429
## [36,] -8.36100e+10 0.451122
## [37,] 4.08830e+11 -1.252582
## [38,] -2.28160e+11 0.779318
## [39,] 1.04790e+11 0.074315
## [40,] 8.76200e+10 0.611348
## [41,] 2.95130e+11 -0.114036
## [42,] 8.97000e+10 0.669790
## [43,] 5.36360e+11 0.393071
## [44,] 6.46590e+11 -0.054888
## [45,] 1.62494e+12 1.083821
## [46,] 1.55150e+12 -0.626781
##
## $model
##
## VAR Estimation Results:
## =======================
##
## Estimated coefficients for equation y1:
## =======================================
## Call:
## y1 = y1.l1 + y2.l1 + const + sd1 + sd2 + sd3
##
## y1.l1 y2.l1 const sd1 sd2
## 4.495302e-01 2.550328e+11 6.673299e+10 -1.079420e+10 -2.202650e+10
## sd3
## 1.399945e+11
##
##
## Estimated coefficients for equation y2:
## =======================================
## Call:
## y2 = y1.l1 + y2.l1 + const + sd1 + sd2 + sd3
##
## y1.l1 y2.l1 const sd1 sd2
## 2.941404e-13 -5.872313e-01 1.188349e-01 -2.156508e-01 -1.690456e-01
## sd3
## 1.392416e-01
##
##
##
## $exo.fcst
## NULL
plot(varp.f, names = "y1")
fanchart(varp.f, names = "y1")
varp.f %>% head()
## $fcst
## $fcst$y1
## fcst lower upper CI
## [1,] 717530418563 -27024287948 1.462085e+12 744554706511
## [2,] 654214107025 -190927452114 1.499356e+12 845141559140
## [3,] 251707923205 -617509960651 1.120926e+12 869217883857
## [4,] 213100626126 -663325111025 1.089526e+12 876425737151
## [5,] 249280681799 -629016177981 1.127578e+12 878296859780
##
## $fcst$y2
## fcst lower upper CI
## [1,] 1.1438645 0.5028472 1.7848818 0.6410173
## [2,] -0.2804597 -1.0345767 0.4736572 0.7541169
## [3,] 0.3216734 -0.4776374 1.1209842 0.7993108
## [4,] -0.1037063 -0.9205097 0.7130972 0.8168035
## [5,] 0.4430212 -0.3813934 1.2674359 0.8244147
##
##
## $endog
## y1 y2
## [1,] 3.97600e+10 -0.278121
## [2,] -2.04860e+11 -0.049263
## [3,] 3.99200e+10 0.084738
## [4,] 1.19310e+11 -0.129178
## [5,] -5.20100e+10 -0.010030
## [6,] -2.87300e+10 0.008057
## [7,] 8.06000e+09 0.042525
## [8,] 6.16310e+11 -0.016959
## [9,] -7.67330e+11 -0.022301
## [10,] 4.44900e+10 0.012702
## [11,] 4.00810e+11 0.565229
## [12,] 1.35650e+11 0.381467
## [13,] -2.14050e+11 -0.627536
## [14,] -6.82700e+10 0.188010
## [15,] 1.24820e+11 0.050653
## [16,] 1.54760e+11 -0.033953
## [17,] -1.87180e+11 -0.136411
## [18,] 1.48300e+10 0.122962
## [19,] -4.88900e+10 0.086225
## [20,] 5.38600e+10 -0.018530
## [21,] 7.14500e+10 0.030849
## [22,] -2.94500e+10 0.127999
## [23,] 9.65400e+10 0.050257
## [24,] 1.27510e+11 0.018923
## [25,] 5.51140e+11 0.054091
## [26,] -3.50890e+11 -0.026907
## [27,] 1.05910e+11 0.109898
## [28,] 5.96300e+10 0.138278
## [29,] -3.18500e+10 0.055875
## [30,] -8.94400e+10 0.136199
## [31,] 6.67100e+10 0.216041
## [32,] 1.08800e+10 -0.095604
## [33,] -1.92240e+11 0.380310
## [34,] -3.75000e+09 -0.262369
## [35,] 5.95320e+11 0.587429
## [36,] -8.36100e+10 0.451122
## [37,] 4.08830e+11 -1.252582
## [38,] -2.28160e+11 0.779318
## [39,] 1.04790e+11 0.074315
## [40,] 8.76200e+10 0.611348
## [41,] 2.95130e+11 -0.114036
## [42,] 8.97000e+10 0.669790
## [43,] 5.36360e+11 0.393071
## [44,] 6.46590e+11 -0.054888
## [45,] 1.62494e+12 1.083821
## [46,] 1.55150e+12 -0.626781
##
## $model
##
## VAR Estimation Results:
## =======================
##
## Estimated coefficients for equation y1:
## =======================================
## Call:
## y1 = y1.l1 + y2.l1 + const + sd1 + sd2 + sd3
##
## y1.l1 y2.l1 const sd1 sd2
## 4.495302e-01 2.550328e+11 6.673299e+10 -1.079420e+10 -2.202650e+10
## sd3
## 1.399945e+11
##
##
## Estimated coefficients for equation y2:
## =======================================
## Call:
## y2 = y1.l1 + y2.l1 + const + sd1 + sd2 + sd3
##
## y1.l1 y2.l1 const sd1 sd2
## 2.941404e-13 -5.872313e-01 1.188349e-01 -2.156508e-01 -1.690456e-01
## sd3
## 1.392416e-01
##
##
##
## $exo.fcst
## NULL
#Generate impulse funtion
irf.avgbal <- irf(varp, impulse = "y1", response = "y2",
n.ahead = 5, boot = TRUE)
plot(irf.avgbal, ylab = "avgbal", main = "Shock from SLGD")
#Define the final equation and Backtest in Excel file
### MAPE = 0.01 ###
Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.