Libraries
require(fpp2)
## Loading required package: fpp2
## Warning: package 'fpp2' was built under R version 4.0.3
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
## -- Attaching packages ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- fpp2 2.4 --
## v ggplot2 3.3.2 v fma 2.4
## v forecast 8.13 v expsmooth 2.3
## Warning: package 'forecast' was built under R version 4.0.3
## Warning: package 'fma' was built under R version 4.0.3
## Warning: package 'expsmooth' was built under R version 4.0.3
##
require(forecast)
require(urca)
## Loading required package: urca
## Warning: package 'urca' was built under R version 4.0.3
require(zoo)
## Loading required package: zoo
## Warning: package 'zoo' was built under R version 4.0.3
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
require(vars)
## Loading required package: vars
## Warning: package 'vars' was built under R version 4.0.3
## Loading required package: MASS
##
## Attaching package: 'MASS'
## The following objects are masked from 'package:fma':
##
## cement, housing, petrol
## Loading required package: strucchange
## Warning: package 'strucchange' was built under R version 4.0.3
## Loading required package: sandwich
## Warning: package 'sandwich' was built under R version 4.0.3
## Loading required package: lmtest
## Warning: package 'lmtest' was built under R version 4.0.3
Import Retail employment import and plot
retail<-read.csv("D:/predictive analytics/assignments/retail_combined.csv")
retail.emp<-ts(retail[,2], frequency=12, start=c(1992,1))
autoplot(retail.emp)
Exploring other variables for xreg
#retail sales
retail.sales<-ts(retail[,3], frequency=12, start=c(1992,1))
retail.sales.diff<-diff(retail.sales)
autoplot(retail.sales.diff)
#offset
retail.sales.diff<-lag(retail.sales.diff, 1)
#personal consumption expenditures
pce<-ts(retail[,4], frequency=12, start=c(1992,1))
pce.diff<-diff(pce)
autoplot(pce.diff)
pce.diff<-lag(pce.diff, 1)
#disposable income
disp.inc<-ts(retail[,5], frequency=12, start=c(1992,1))
disp.inc.diff<-diff(disp.inc)
autoplot(disp.inc.diff)
disp.inc.diff<-lag(disp.inc.diff, 1)
decompose
retail.emp %>%
mstl(robust=TRUE) %>%
autoplot()
ggseasonplot(retail.emp, year.labels=TRUE, year.labels.left=TRUE) +
ylab("Employment") +
ggtitle("Seasonal plot: Retail Employment")
Data isn’t stationary and has strong seasonal component, so we should do some differencing and a box cox transformation I could not get the seasonality component to work, at all.
retail.emp.bc<-BoxCox(retail.emp, lambda = BoxCox.lambda(retail.emp))
autoplot(retail.emp.bc)
ndiffs(retail.emp.bc)
## [1] 1
#Differences
retail.emp.diff<-diff(retail.emp.bc)
autoplot(retail.emp.diff)
retail.emp.diff %>% ur.kpss() %>% summary()
##
## #######################
## # KPSS Unit Root Test #
## #######################
##
## Test is of type: mu with 5 lags.
##
## Value of test-statistic is: 0.0697
##
## Critical value for a significance level of:
## 10pct 5pct 2.5pct 1pct
## critical values 0.347 0.463 0.574 0.739
ggtsdisplay(retail.emp.diff)
Set Up train and test models
ret.data<-cbind(retail.emp.diff, retail.sales.diff, pce.diff, disp.inc.diff)
ret.data.train<-ts(ret.data[1:325,], frequency=12, start=c(1992,1))
ret.data.test<-ts(ret.data[325:347,], frequency=12, start=c(2019,1))
ret.data.test
## retail.emp.diff retail.sales.diff pce.diff disp.inc.diff
## Jan 2019 -280959.167 -14501 12.4 23.2
## Feb 2019 -96607.830 62667 133.9 -8.4
## Mar 2019 -1743.003 -3548 73.2 -38.5
## Apr 2019 23698.070 33507 49.8 -7.6
## May 2019 32061.316 -27224 62.8 17.2
## Jun 2019 40053.120 13918 56.1 13.5
## Jul 2019 5516.672 11377 39.0 72.1
## Aug 2019 -18346.456 -44238 22.3 21.2
## Sep 2019 -53122.075 27235 55.3 2.6
## Oct 2019 78355.751 12492 24.3 61.0
## Nov 2019 212885.630 53269 43.5 -37.0
## Dec 2019 39589.949 -105202 84.2 110.0
## Jan 2020 -289015.277 -3316 -3.1 92.4
## Feb 2020 -72112.410 16408 -998.9 -213.3
## Mar 2020 -28913.980 -53962 -1766.4 2337.8
## Apr 2020 -1008475.606 85690 1053.3 -833.6
## May 2020 204779.058 20017 849.2 -303.7
## Jun 2020 415414.725 12011 204.2 90.4
## Jul 2020 115981.413 -3474 175.3 -528.2
## Aug 2020 103901.087 -14710 175.5 91.6
## Sep 2020 -34157.607 20423 70.9 -120.7
## Oct 2020 116151.841 NA NA NA
## Nov 2020 145712.501 NA NA NA
ARIMA model
#fit arima models
fit<-auto.arima(ret.data.train[,1], xreg=ret.data.train[,2:4])
fit.fcst<-forecast(fit, xreg=ret.data.test[,2:4], h=24)
## Warning in forecast.forecast_ARIMA(fit, xreg = ret.data.test[, 2:4], h = 24):
## Upper prediction intervals are not finite.
fit.fcst
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## Feb 2019 -68692.726 -92896.330 -4.448912e+04 -105708.944 -31676.51
## Mar 2019 -7438.252 -32041.580 1.716508e+04 -45065.795 30189.29
## Apr 2019 34610.871 9683.062 5.953868e+04 -3512.923 72734.66
## May 2019 31540.831 6348.853 5.673281e+04 -6986.975 70068.64
## Jun 2019 32244.845 6837.305 5.765238e+04 -6612.635 71102.32
## Jul 2019 -2017.544 -27601.301 2.356621e+04 -41144.524 37109.44
## Aug 2019 -25734.050 -51462.069 -6.031509e+00 -65081.660 13613.56
## Sep 2019 -58400.067 -84246.325 -3.255381e+04 -97928.507 -18871.63
## Oct 2019 56378.867 30435.608 8.232213e+04 16702.076 96055.66
## Nov 2019 231509.320 205486.423 2.575322e+05 191710.733 271307.91
## Dec 2019 -20531.424 -46619.747 5.556898e+03 -60430.070 19367.22
## Jan 2020 -263215.163 -289357.259 -2.370731e+05 -303196.049 -223234.28
## Feb 2020 -70089.485 -100010.987 -4.016798e+04 -115850.473 -24328.50
## Mar 2020 15991.534 -14165.215 4.614828e+04 -30129.234 62112.30
## Apr 2020 80413.476 50064.297 1.107627e+05 33998.413 126828.54
## May 2020 3995.163 -26511.628 3.450195e+04 -42660.948 50651.27
## Jun 2020 10913.209 -19722.814 4.154923e+04 -35940.546 57766.96
## Jul 2020 -2366.788 -33108.866 2.837529e+04 -49382.740 44649.16
## Aug 2020 -26431.997 -57261.170 4.397177e+03 -73581.149 20717.16
## Sep 2020 -66447.968 -97348.706 -3.554723e+04 -113706.569 -19189.37
## Oct 2020 57533.316 26573.747 8.849289e+04 10184.741 104881.89
## Nov 2020 NA NA NA NA NA
## Dec 2020 NA NA NA NA NA
autoplot(fit.fcst)+autolayer(ret.data[,1])+coord_cartesian(xlim = c(2015, 2021))
## Warning: Removed 1 row(s) containing missing values (geom_path).
## Warning: Removed 1 row(s) containing missing values (geom_path).
Look, there was some really intense changes in retail employment that had little to do with normal fundamentals. Retail employment saw a sharp decline in its total in response to the pandemic, and then a large increase. While I was hoping I could predict some of that with consumption, income, and retail sales, but the crisis completely divorced retail employment from fundamentals, as income actually has grown at record rates from stimulus, so I am going to give up on trying that and see if I can forecsat accurately in a normal year
ret.data.train<-na.omit(ts(ret.data[1:312,], frequency=12, start=c(1992,1)))
ret.data.test<-na.omit(ts(ret.data[313:335,], frequency=12, start=c(2018,1)))
#fit arima models
fit<-auto.arima(ret.data.train[,1], xreg=ret.data.train[,2:4])
fit.fcst<-forecast(fit, xreg=ret.data.test[,2:4])
autoplot(fit.fcst)+autolayer(ret.data[,1])+coord_cartesian(xlim = c(2015, 2019.9))
## Warning: Removed 1 row(s) containing missing values (geom_path).
accuracy(fit.fcst, ret.data.test[,1])
## ME RMSE MAE MPE MAPE MASE
## Training set -125.1895 18328.66 12898.68 15.097613 100.9734 0.8040415
## Test set 199.9279 14709.44 11128.58 -7.268026 103.8393 0.6937022
## ACF1 Theil's U
## Training set 0.01157454 NA
## Test set -0.26094199 0.04469013
VAR Select
VARselect(ret.data.train, lag.max=24, type="const")[["selection"]]
## AIC(n) HQ(n) SC(n) FPE(n)
## 14 13 12 14
#AIC says 14 but BIC suggests 12
var1 <- VAR(ret.data.train, p=12, type="const")
serial.test(var1, lags.pt=24, type="PT.asymptotic")
##
## Portmanteau Test (asymptotic)
##
## data: Residuals of VAR object var1
## Chi-squared = 332.67, df = 192, p-value = 1.263e-09
var2 <- VAR(ret.data.train, p=14, type="const")
serial.test(var2, lags.pt=24, type="PT.asymptotic")
##
## Portmanteau Test (asymptotic)
##
## data: Residuals of VAR object var2
## Chi-squared = 284.64, df = 160, p-value = 4.909e-09
Using the BIC selection (var1) because p is lower than BIC, and Portmanteau Test was essentially 0 for both
forecast(var1) %>%
autoplot() + xlab("Year")
var.forecast<-forecast(var1)
p <- predict(var1, n.ahead=24)
res <- residuals(var.forecast)
fits <- fitted(var.forecast)
for(i in 1:4)
{
fc <- structure(list(mean=p$fcst[[i]][,"fcst"], x=ret.data.train[,i],
fitted=c(NA,NA,fits[,i])),class="forecast")
print(accuracy(fc,ret.data.test[,i]))
}
## Warning in `-.default`(dx, fits): longer object length is not a multiple of
## shorter object length
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set NaN NaN NaN NaN NaN NaN NA
## Test set 1502.997 16094.33 12122.63 4.382756 58.16897 0.7556663 -0.1698616
## Theil's U
## Training set NA
## Test set 0.04177614
## Warning in `-.default`(dx, fits): longer object length is not a multiple of
## shorter object length
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set NaN NaN NaN NaN NaN NaN NA
## Test set 1306.146 15723.76 12935.55 -17.21795 94.28542 2.146341 -0.5079881
## Theil's U
## Training set NA
## Test set 0.2609104
## Warning in `-.default`(dx, fits): longer object length is not a multiple of
## shorter object length
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set NaN NaN NaN NaN NaN NaN NA
## Test set 12.13128 49.27956 36.14489 -10.63968 87.97616 1.089678 -0.390449
## Theil's U
## Training set NA
## Test set 0.762286
## Warning in `-.default`(dx, fits): longer object length is not a multiple of
## shorter object length
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set NaN NaN NaN NaN NaN NaN NA
## Test set -1.554009 45.85776 35.89323 214.8928 231.7304 0.6061367 0.04363819
## Theil's U
## Training set NA
## Test set 0.5908297
So far the ARIMA has an RMSE of 14709, while the VAR model has a RMSE of 16094.33, so the ARIMA continues to be superior. For the last model, lets try neural net
fit <- nnetar(ret.data.train[,1], xreg=ret.data.train[,2:4])
fcst.nnet<-forecast(fit, xreg=ret.data.test[,2:4],h=24)
autoplot(fcst.nnet)+autolayer(ret.data.test[,1])
accuracy(fcst.nnet, ret.data.test[,1])
## ME RMSE MAE MPE MAPE MASE
## Training set 10.58578 11294.95 8442.264 -42.90548 149.8612 0.5262500
## Test set -6036.64321 20072.66 15984.964 48.91301 183.2795 0.9964255
## ACF1 Theil's U
## Training set 0.03211822 NA
## Test set -0.28686248 0.09046366
Arima has the best RMSE over both VAR and Neural Nets in the test data, so the ARIMA(1,0,1)(0,1,1)[12] is the best for modeling a differenced retail employment series transformed with box-cox