Libraries

require(fpp2)
## Loading required package: fpp2
## Warning: package 'fpp2' was built under R version 4.0.3
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
## -- Attaching packages ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- fpp2 2.4 --
## v ggplot2   3.3.2     v fma       2.4  
## v forecast  8.13      v expsmooth 2.3
## Warning: package 'forecast' was built under R version 4.0.3
## Warning: package 'fma' was built under R version 4.0.3
## Warning: package 'expsmooth' was built under R version 4.0.3
## 
require(forecast)
require(urca)
## Loading required package: urca
## Warning: package 'urca' was built under R version 4.0.3
require(zoo)
## Loading required package: zoo
## Warning: package 'zoo' was built under R version 4.0.3
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
require(vars)
## Loading required package: vars
## Warning: package 'vars' was built under R version 4.0.3
## Loading required package: MASS
## 
## Attaching package: 'MASS'
## The following objects are masked from 'package:fma':
## 
##     cement, housing, petrol
## Loading required package: strucchange
## Warning: package 'strucchange' was built under R version 4.0.3
## Loading required package: sandwich
## Warning: package 'sandwich' was built under R version 4.0.3
## Loading required package: lmtest
## Warning: package 'lmtest' was built under R version 4.0.3

Import Retail employment import and plot

retail<-read.csv("D:/predictive analytics/assignments/retail_combined.csv")
retail.emp<-ts(retail[,2], frequency=12, start=c(1992,1))
autoplot(retail.emp)

Exploring other variables for xreg

#retail sales
retail.sales<-ts(retail[,3], frequency=12, start=c(1992,1))
retail.sales.diff<-diff(retail.sales)
autoplot(retail.sales.diff)

#offset
retail.sales.diff<-lag(retail.sales.diff, 1)

#personal consumption expenditures
pce<-ts(retail[,4], frequency=12, start=c(1992,1))
pce.diff<-diff(pce)
autoplot(pce.diff)

pce.diff<-lag(pce.diff, 1)

#disposable income
disp.inc<-ts(retail[,5], frequency=12, start=c(1992,1))
disp.inc.diff<-diff(disp.inc)
autoplot(disp.inc.diff)

disp.inc.diff<-lag(disp.inc.diff, 1)

decompose

retail.emp %>%
  mstl(robust=TRUE) %>%
  autoplot()

ggseasonplot(retail.emp, year.labels=TRUE, year.labels.left=TRUE) +
  ylab("Employment") +
  ggtitle("Seasonal plot: Retail Employment")

Data isn’t stationary and has strong seasonal component, so we should do some differencing and a box cox transformation I could not get the seasonality component to work, at all.

retail.emp.bc<-BoxCox(retail.emp, lambda = BoxCox.lambda(retail.emp))
autoplot(retail.emp.bc)

ndiffs(retail.emp.bc)
## [1] 1
#Differences
retail.emp.diff<-diff(retail.emp.bc)
autoplot(retail.emp.diff)

retail.emp.diff %>% ur.kpss() %>% summary()
## 
## ####################### 
## # KPSS Unit Root Test # 
## ####################### 
## 
## Test is of type: mu with 5 lags. 
## 
## Value of test-statistic is: 0.0697 
## 
## Critical value for a significance level of: 
##                 10pct  5pct 2.5pct  1pct
## critical values 0.347 0.463  0.574 0.739
ggtsdisplay(retail.emp.diff)

Set Up train and test models

ret.data<-cbind(retail.emp.diff, retail.sales.diff, pce.diff, disp.inc.diff)



ret.data.train<-ts(ret.data[1:325,], frequency=12, start=c(1992,1))
ret.data.test<-ts(ret.data[325:347,], frequency=12, start=c(2019,1))
ret.data.test
##          retail.emp.diff retail.sales.diff pce.diff disp.inc.diff
## Jan 2019     -280959.167            -14501     12.4          23.2
## Feb 2019      -96607.830             62667    133.9          -8.4
## Mar 2019       -1743.003             -3548     73.2         -38.5
## Apr 2019       23698.070             33507     49.8          -7.6
## May 2019       32061.316            -27224     62.8          17.2
## Jun 2019       40053.120             13918     56.1          13.5
## Jul 2019        5516.672             11377     39.0          72.1
## Aug 2019      -18346.456            -44238     22.3          21.2
## Sep 2019      -53122.075             27235     55.3           2.6
## Oct 2019       78355.751             12492     24.3          61.0
## Nov 2019      212885.630             53269     43.5         -37.0
## Dec 2019       39589.949           -105202     84.2         110.0
## Jan 2020     -289015.277             -3316     -3.1          92.4
## Feb 2020      -72112.410             16408   -998.9        -213.3
## Mar 2020      -28913.980            -53962  -1766.4        2337.8
## Apr 2020    -1008475.606             85690   1053.3        -833.6
## May 2020      204779.058             20017    849.2        -303.7
## Jun 2020      415414.725             12011    204.2          90.4
## Jul 2020      115981.413             -3474    175.3        -528.2
## Aug 2020      103901.087            -14710    175.5          91.6
## Sep 2020      -34157.607             20423     70.9        -120.7
## Oct 2020      116151.841                NA       NA            NA
## Nov 2020      145712.501                NA       NA            NA

ARIMA model

#fit arima models
fit<-auto.arima(ret.data.train[,1], xreg=ret.data.train[,2:4])
fit.fcst<-forecast(fit, xreg=ret.data.test[,2:4], h=24)
## Warning in forecast.forecast_ARIMA(fit, xreg = ret.data.test[, 2:4], h = 24):
## Upper prediction intervals are not finite.
fit.fcst
##          Point Forecast       Lo 80         Hi 80       Lo 95      Hi 95
## Feb 2019     -68692.726  -92896.330 -4.448912e+04 -105708.944  -31676.51
## Mar 2019      -7438.252  -32041.580  1.716508e+04  -45065.795   30189.29
## Apr 2019      34610.871    9683.062  5.953868e+04   -3512.923   72734.66
## May 2019      31540.831    6348.853  5.673281e+04   -6986.975   70068.64
## Jun 2019      32244.845    6837.305  5.765238e+04   -6612.635   71102.32
## Jul 2019      -2017.544  -27601.301  2.356621e+04  -41144.524   37109.44
## Aug 2019     -25734.050  -51462.069 -6.031509e+00  -65081.660   13613.56
## Sep 2019     -58400.067  -84246.325 -3.255381e+04  -97928.507  -18871.63
## Oct 2019      56378.867   30435.608  8.232213e+04   16702.076   96055.66
## Nov 2019     231509.320  205486.423  2.575322e+05  191710.733  271307.91
## Dec 2019     -20531.424  -46619.747  5.556898e+03  -60430.070   19367.22
## Jan 2020    -263215.163 -289357.259 -2.370731e+05 -303196.049 -223234.28
## Feb 2020     -70089.485 -100010.987 -4.016798e+04 -115850.473  -24328.50
## Mar 2020      15991.534  -14165.215  4.614828e+04  -30129.234   62112.30
## Apr 2020      80413.476   50064.297  1.107627e+05   33998.413  126828.54
## May 2020       3995.163  -26511.628  3.450195e+04  -42660.948   50651.27
## Jun 2020      10913.209  -19722.814  4.154923e+04  -35940.546   57766.96
## Jul 2020      -2366.788  -33108.866  2.837529e+04  -49382.740   44649.16
## Aug 2020     -26431.997  -57261.170  4.397177e+03  -73581.149   20717.16
## Sep 2020     -66447.968  -97348.706 -3.554723e+04 -113706.569  -19189.37
## Oct 2020      57533.316   26573.747  8.849289e+04   10184.741  104881.89
## Nov 2020             NA          NA            NA          NA         NA
## Dec 2020             NA          NA            NA          NA         NA
autoplot(fit.fcst)+autolayer(ret.data[,1])+coord_cartesian(xlim = c(2015, 2021))
## Warning: Removed 1 row(s) containing missing values (geom_path).
## Warning: Removed 1 row(s) containing missing values (geom_path).

Look, there was some really intense changes in retail employment that had little to do with normal fundamentals. Retail employment saw a sharp decline in its total in response to the pandemic, and then a large increase. While I was hoping I could predict some of that with consumption, income, and retail sales, but the crisis completely divorced retail employment from fundamentals, as income actually has grown at record rates from stimulus, so I am going to give up on trying that and see if I can forecsat accurately in a normal year

ret.data.train<-na.omit(ts(ret.data[1:312,], frequency=12, start=c(1992,1)))
ret.data.test<-na.omit(ts(ret.data[313:335,], frequency=12, start=c(2018,1)))

#fit arima models
fit<-auto.arima(ret.data.train[,1], xreg=ret.data.train[,2:4])
fit.fcst<-forecast(fit, xreg=ret.data.test[,2:4])
autoplot(fit.fcst)+autolayer(ret.data[,1])+coord_cartesian(xlim = c(2015, 2019.9))
## Warning: Removed 1 row(s) containing missing values (geom_path).

accuracy(fit.fcst, ret.data.test[,1])
##                     ME     RMSE      MAE       MPE     MAPE      MASE
## Training set -125.1895 18328.66 12898.68 15.097613 100.9734 0.8040415
## Test set      199.9279 14709.44 11128.58 -7.268026 103.8393 0.6937022
##                     ACF1  Theil's U
## Training set  0.01157454         NA
## Test set     -0.26094199 0.04469013

VAR Select

VARselect(ret.data.train, lag.max=24, type="const")[["selection"]]
## AIC(n)  HQ(n)  SC(n) FPE(n) 
##     14     13     12     14
#AIC says 14 but BIC suggests 12
var1 <- VAR(ret.data.train, p=12, type="const")
serial.test(var1, lags.pt=24, type="PT.asymptotic")
## 
##  Portmanteau Test (asymptotic)
## 
## data:  Residuals of VAR object var1
## Chi-squared = 332.67, df = 192, p-value = 1.263e-09
var2 <- VAR(ret.data.train, p=14, type="const")
serial.test(var2, lags.pt=24, type="PT.asymptotic")
## 
##  Portmanteau Test (asymptotic)
## 
## data:  Residuals of VAR object var2
## Chi-squared = 284.64, df = 160, p-value = 4.909e-09

Using the BIC selection (var1) because p is lower than BIC, and Portmanteau Test was essentially 0 for both

forecast(var1) %>%
  autoplot() + xlab("Year")

var.forecast<-forecast(var1)

p <- predict(var1, n.ahead=24)
res <- residuals(var.forecast)
fits <- fitted(var.forecast)
for(i in 1:4)
{
  fc <- structure(list(mean=p$fcst[[i]][,"fcst"], x=ret.data.train[,i],
     fitted=c(NA,NA,fits[,i])),class="forecast")
  print(accuracy(fc,ret.data.test[,i]))
}
## Warning in `-.default`(dx, fits): longer object length is not a multiple of
## shorter object length
##                    ME     RMSE      MAE      MPE     MAPE      MASE       ACF1
## Training set      NaN      NaN      NaN      NaN      NaN       NaN         NA
## Test set     1502.997 16094.33 12122.63 4.382756 58.16897 0.7556663 -0.1698616
##               Theil's U
## Training set         NA
## Test set     0.04177614
## Warning in `-.default`(dx, fits): longer object length is not a multiple of
## shorter object length
##                    ME     RMSE      MAE       MPE     MAPE     MASE       ACF1
## Training set      NaN      NaN      NaN       NaN      NaN      NaN         NA
## Test set     1306.146 15723.76 12935.55 -17.21795 94.28542 2.146341 -0.5079881
##              Theil's U
## Training set        NA
## Test set     0.2609104
## Warning in `-.default`(dx, fits): longer object length is not a multiple of
## shorter object length
##                    ME     RMSE      MAE       MPE     MAPE     MASE      ACF1
## Training set      NaN      NaN      NaN       NaN      NaN      NaN        NA
## Test set     12.13128 49.27956 36.14489 -10.63968 87.97616 1.089678 -0.390449
##              Theil's U
## Training set        NA
## Test set      0.762286
## Warning in `-.default`(dx, fits): longer object length is not a multiple of
## shorter object length
##                     ME     RMSE      MAE      MPE     MAPE      MASE       ACF1
## Training set       NaN      NaN      NaN      NaN      NaN       NaN         NA
## Test set     -1.554009 45.85776 35.89323 214.8928 231.7304 0.6061367 0.04363819
##              Theil's U
## Training set        NA
## Test set     0.5908297

So far the ARIMA has an RMSE of 14709, while the VAR model has a RMSE of 16094.33, so the ARIMA continues to be superior. For the last model, lets try neural net

fit <- nnetar(ret.data.train[,1], xreg=ret.data.train[,2:4])


fcst.nnet<-forecast(fit, xreg=ret.data.test[,2:4],h=24)
autoplot(fcst.nnet)+autolayer(ret.data.test[,1])

accuracy(fcst.nnet, ret.data.test[,1])
##                       ME     RMSE       MAE       MPE     MAPE      MASE
## Training set    10.58578 11294.95  8442.264 -42.90548 149.8612 0.5262500
## Test set     -6036.64321 20072.66 15984.964  48.91301 183.2795 0.9964255
##                     ACF1  Theil's U
## Training set  0.03211822         NA
## Test set     -0.28686248 0.09046366

Arima has the best RMSE over both VAR and Neural Nets in the test data, so the ARIMA(1,0,1)(0,1,1)[12] is the best for modeling a differenced retail employment series transformed with box-cox