Set-up

library(fpp3)
## Warning: package 'fpp3' was built under R version 4.1.3
## -- Attaching packages -------------------------------------------- fpp3 0.4.0 --
## v tibble      3.1.8     v tsibble     1.1.3
## v dplyr       1.0.7     v tsibbledata 0.4.1
## v tidyr       1.1.4     v feasts      0.3.0
## v lubridate   1.8.0     v fable       0.3.2
## v ggplot2     3.3.5
## Warning: package 'tibble' was built under R version 4.1.3
## Warning: package 'tsibble' was built under R version 4.1.3
## Warning: package 'tsibbledata' was built under R version 4.1.3
## Warning: package 'feasts' was built under R version 4.1.3
## Warning: package 'fabletools' was built under R version 4.1.3
## Warning: package 'fable' was built under R version 4.1.3
## -- Conflicts ------------------------------------------------- fpp3_conflicts --
## x lubridate::date()    masks base::date()
## x dplyr::filter()      masks stats::filter()
## x tsibble::intersect() masks base::intersect()
## x tsibble::interval()  masks lubridate::interval()
## x dplyr::lag()         masks stats::lag()
## x tsibble::setdiff()   masks base::setdiff()
## x tsibble::union()     masks base::union()
# Paths
projpath <- "C:/Users/dancu/Documents/Fall2022_ADEC743002"
rawdata <- file.path(projpath, "RawData")

Data: US Retail Employment with Consumer Sentiment

# Read in consumer sentiment index from St. Louis FRED
cons <- read.csv(file.path(rawdata, "UMCSENT.csv"))
names(cons)[names(cons) == "UMCSENT"] <- "cs"
cons$cs <- as.numeric(cons$cs)
## Warning: NAs introduced by coercion
consts <- cons %>%
  mutate(Month = yearmonth(DATE)) %>%
  select(-DATE) %>%
  as_tsibble(index = Month)
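
The coercion warning above reflects FRED's convention of marking missing observations with a "." placeholder in its CSV downloads, which as.numeric() turns into NA. A quick check of how many observations are affected (a sketch; the count is not used downstream):

# How many sentiment values were turned into NA by the coercion above?
sum(is.na(cons$cs))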

# Start at 1978 where consistent consumer sentiment begins
consts <- consts %>% filter(year(Month) >= 1978)
us_retail <- us_employment %>%
  filter(Title == "Retail Trade") %>%
  filter(year(Month) >= 1978)

# Merge data sets, create training set
# (base merge() returns a plain data frame, so coerce back to a tsibble afterwards)
us_retail_cs <- merge(us_retail, consts)

us_retail_cs <- us_retail_cs %>% as_tsibble(index = Month)
us_retail_cs
## # A tsibble: 501 x 5 [1M]
##       Month Series_ID     Title        Employed    cs
##       <mth> <chr>         <chr>           <dbl> <dbl>
##  1 1978 Jan CEU4200000001 Retail Trade    9545   83.7
##  2 1978 Feb CEU4200000001 Retail Trade    9426.  84.3
##  3 1978 Mar CEU4200000001 Retail Trade    9521.  78.8
##  4 1978 Apr CEU4200000001 Retail Trade    9696.  81.6
##  5 1978 May CEU4200000001 Retail Trade    9827.  82.9
##  6 1978 Jun CEU4200000001 Retail Trade    9922.  80  
##  7 1978 Jul CEU4200000001 Retail Trade    9919.  82.4
##  8 1978 Aug CEU4200000001 Retail Trade    9950.  78.4
##  9 1978 Sep CEU4200000001 Retail Trade   10019.  80.4
## 10 1978 Oct CEU4200000001 Retail Trade   10048.  79.3
## # ... with 491 more rows
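
An alternative to base merge() that keeps the tsibble class throughout is a keyed join (a sketch; it assumes Month is the only column the two series share, which is the case here):

# Join on the common Month index instead of using merge() + as_tsibble()
us_retail_cs2 <- us_retail %>%
  inner_join(consts, by = "Month")
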
autoplot(us_retail_cs, .vars = Employed)

# Hold out the last 24 months (rows 478-501) as a test set
train <- us_retail_cs[1:477, ]
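
Equivalently, the split can be written by date with tsibble's filter_index(), which makes the 24-month holdout explicit (a sketch; the cut-off dates follow from the forecast output further down):

# Same split expressed by date rather than row position
train_alt <- us_retail_cs %>% filter_index(~ "2017 Sep")
test_alt  <- us_retail_cs %>% filter_index("2017 Oct" ~ .)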

Consumer sentiment as a predictor with ARIMA errors

fit <- train %>%
  model(ARIMA(Employed ~ cs))
report(fit)
## Series: Employed 
## Model: LM w/ ARIMA(2,0,1)(0,1,2)[12] errors 
## 
## Coefficients:
##          ar1      ar2      ma1     sma1     sma2      cs  intercept
##       1.9136  -0.9177  -0.7782  -0.4028  -0.1242  0.6185   152.7061
## s.e.  0.0356   0.0352   0.0571   0.0489   0.0477  0.4073    47.6107
## 
## sigma^2 estimated as 1631:  log likelihood=-2379.18
## AIC=4774.36   AICc=4774.67   BIC=4807.49
# Assume future consumer sentiment equals the mean of the last 12 training months
us_future <- new_data(train, 24) %>%
  mutate(cs = mean(train[466:477,]$cs))
fc1 <- forecast(fit, new_data = us_future)
fc1 %>%
  autoplot(us_retail_cs) +
  labs(y = "retail employed")

augment(fit) %>% features(.innov, ljung_box, lag=24)
## # A tibble: 1 x 3
##   .model               lb_stat lb_pvalue
##   <chr>                  <dbl>     <dbl>
## 1 ARIMA(Employed ~ cs)    33.5    0.0941
# passes the Ljung-Box test at the 5% level (p = 0.094), but not by much
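
As an aside, the test above applies no degrees-of-freedom adjustment. feasts::ljung_box takes a dof argument, and a common choice is the number of estimated ARMA coefficients (five here: ar1, ar2, ma1, sma1, sma2), which makes the check a little stricter (a sketch):

augment(fit) %>% features(.innov, ljung_box, lag = 24, dof = 5)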

fc1
## # A fable: 24 x 5 [1M]
## # Key:     .model [1]
##    .model                  Month        Employed  .mean    cs
##    <chr>                   <mth>          <dist>  <dbl> <dbl>
##  1 ARIMA(Employed ~ cs) 2017 Oct  N(15822, 1631) 15822.  95.4
##  2 ARIMA(Employed ~ cs) 2017 Nov  N(16188, 3733) 16188.  95.4
##  3 ARIMA(Employed ~ cs) 2017 Dec  N(16314, 6301) 16314.  95.4
##  4 ARIMA(Employed ~ cs) 2018 Jan  N(15734, 9316) 15734.  95.4
##  5 ARIMA(Employed ~ cs) 2018 Feb N(15521, 12746) 15521.  95.4
##  6 ARIMA(Employed ~ cs) 2018 Mar N(15548, 16549) 15548.  95.4
##  7 ARIMA(Employed ~ cs) 2018 Apr N(15610, 20681) 15610.  95.4
##  8 ARIMA(Employed ~ cs) 2018 May N(15695, 25090) 15695.  95.4
##  9 ARIMA(Employed ~ cs) 2018 Jun N(15782, 29725) 15782.  95.4
## 10 ARIMA(Employed ~ cs) 2018 Jul N(15791, 34533) 15791.  95.4
## # ... with 14 more rows
accuracy(fc1, us_retail_cs)
## # A tibble: 1 x 10
##   .model               .type    ME  RMSE   MAE    MPE  MAPE  MASE RMSSE  ACF1
##   <chr>                <chr> <dbl> <dbl> <dbl>  <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ARIMA(Employed ~ cs) Test   9.20  66.1  54.2 0.0565 0.342 0.214 0.217 0.589

Try the same ARIMA orders in a model without the cs predictor

train %>% gg_tsdisplay(difference(Employed), plot_type='partial')
## Warning: Removed 1 row(s) containing missing values (geom_path).
## Warning: Removed 1 rows containing missing values (geom_point).

# The ACF plot suggests a seasonal MA term with Q = 2
# A case can be made for AR(p = 2), given the large spike at lag 2 of the PACF
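
The amount of differencing needed can also be cross-checked against the unit-root features from feasts (a sketch):

# How many seasonal and first differences do the unit-root tests suggest?
train %>% features(Employed, unitroot_nsdiffs)
train %>% features(Employed, unitroot_ndiffs)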

fit2 <- train %>%
  model(ARIMA(Employed ~ pdq(2, 0, 1) + PDQ(0, 1, 2)))
report(fit2)
## Series: Employed 
## Model: ARIMA(2,0,1)(0,1,2)[12] w/ drift 
## 
## Coefficients:
##          ar1     ar2      ma1     sma1    sma2  constant
##       1.9160  -0.920  -0.7829  -0.4093  -0.119    0.6154
## s.e.  0.0344   0.034   0.0553   0.0485   0.047    0.1911
## 
## sigma^2 estimated as 1635:  log likelihood=-2380.32
## AIC=4774.65   AICc=4774.89   BIC=4803.64
fc2 <- forecast(fit2, new_data = us_future)
fc2 %>%
  autoplot(us_retail_cs) +
  labs(y = "retail employed")

augment(fit2) %>% features(.innov, ljung_box, lag=24)
## # A tibble: 1 x 3
##   .model                                        lb_stat lb_pvalue
##   <chr>                                           <dbl>     <dbl>
## 1 ARIMA(Employed ~ pdq(2, 0, 1) + PDQ(0, 1, 2))    36.0    0.0544
# a close call: the p-value is only just above 0.05

accuracy(fc2, us_retail_cs)
## # A tibble: 1 x 10
##   .model                  .type    ME  RMSE   MAE    MPE  MAPE  MASE RMSSE  ACF1
##   <chr>                   <chr> <dbl> <dbl> <dbl>  <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ARIMA(Employed ~ pdq(2~ Test   7.29  65.3  53.7 0.0445 0.339 0.212 0.215 0.588
# The ARIMA without the external predictor is actually a little better on the test set
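
A tidier way to make this comparison is to fit both specifications in one model() call and put their in-sample information criteria side by side with glance() (a sketch; the model labels are arbitrary):

train %>%
  model(
    arimax = ARIMA(Employed ~ cs),
    arima  = ARIMA(Employed ~ pdq(2, 0, 1) + PDQ(0, 1, 2))
  ) %>%
  glance() %>%
  select(.model, AIC, AICc, BIC)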

Try consumer sentiment lagged by one month with ARIMA errors

# Blank out the first Employed value, for which no lagged cs exists;
# the remaining 476 observations are unchanged
train2 <- train %>% mutate(Employed = c(NA, Employed[2:477]))

fit3 <- train2 %>%
  model(ARIMA(Employed ~ pdq(d = 0) + cs + lag(cs)))
report(fit3)
## Series: Employed 
## Model: LM w/ ARIMA(4,0,0)(2,1,0)[12] errors 
## 
## Coefficients:
##          ar1      ar2      ar3      ar4     sar1     sar2      cs  lag(cs)
##       1.1827  -0.0373  -0.0259  -0.1291  -0.3448  -0.2199  0.8176   0.2752
## s.e.  0.0464   0.0721   0.0726   0.0463   0.0468   0.0459  0.3995   0.3988
## 
## sigma^2 estimated as 1729:  log likelihood=-2387.23
## AIC=4792.45   AICc=4792.85   BIC=4829.73
fc3 <- forecast(fit3, new_data = us_future)
fc3 %>%
  autoplot(us_retail_cs) +
  labs(y = "retail employed")

fc3
## # A fable: 24 x 5 [1M]
## # Key:     .model [1]
##    .model                                     Month        Employed  .mean    cs
##    <chr>                                      <mth>          <dist>  <dbl> <dbl>
##  1 ARIMA(Employed ~ pdq(d = 0) + cs + lag~ 2017 Oct  N(15829, 1729) 15829.  95.4
##  2 ARIMA(Employed ~ pdq(d = 0) + cs + lag~ 2017 Nov  N(16193, 4146) 16193.  95.4
##  3 ARIMA(Employed ~ pdq(d = 0) + cs + lag~ 2017 Dec  N(16319, 7350) 16319.  95.4
##  4 ARIMA(Employed ~ pdq(d = 0) + cs + lag~ 2018 Jan N(15746, 11450) 15746.  95.4
##  5 ARIMA(Employed ~ pdq(d = 0) + cs + lag~ 2018 Feb N(15540, 15935) 15540.  95.4
##  6 ARIMA(Employed ~ pdq(d = 0) + cs + lag~ 2018 Mar N(15569, 20695) 15569.  95.4
##  7 ARIMA(Employed ~ pdq(d = 0) + cs + lag~ 2018 Apr N(15630, 25613) 15630.  95.4
##  8 ARIMA(Employed ~ pdq(d = 0) + cs + lag~ 2018 May N(15717, 30565) 15717.  95.4
##  9 ARIMA(Employed ~ pdq(d = 0) + cs + lag~ 2018 Jun N(15804, 35488) 15804.  95.4
## 10 ARIMA(Employed ~ pdq(d = 0) + cs + lag~ 2018 Jul N(15813, 40336) 15813.  95.4
## # ... with 14 more rows
accuracy(fc3, us_retail_cs)
## # A tibble: 1 x 10
##   .model                  .type    ME  RMSE   MAE    MPE  MAPE  MASE RMSSE  ACF1
##   <chr>                   <chr> <dbl> <dbl> <dbl>  <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ARIMA(Employed ~ pdq(d~ Test   8.44  48.9  41.1 0.0519 0.259 0.162 0.161 0.386
gg_tsresiduals(fit3)
## Warning: Removed 1 row(s) containing missing values (geom_path).
## Warning: Removed 1 rows containing missing values (geom_point).
## Warning: Removed 1 rows containing non-finite values (stat_bin).

augment(fit3) %>% features(.innov, ljung_box, lag=24)
## # A tibble: 1 x 3
##   .model                                      lb_stat lb_pvalue
##   <chr>                                         <dbl>     <dbl>
## 1 ARIMA(Employed ~ pdq(d = 0) + cs + lag(cs))    39.9    0.0218
# actually does not pass the Ljung-Box test at the 5% level (p = 0.022)

Try auto ARIMA

fit4 <- train %>%
  model(ARIMA(Employed))
report(fit4)
## Series: Employed 
## Model: ARIMA(1,1,1)(0,1,2)[12] 
## 
## Coefficients:
##          ar1      ma1     sma1     sma2
##       0.8817  -0.7156  -0.4125  -0.1373
## s.e.  0.0490   0.0710   0.0476   0.0465
## 
## sigma^2 estimated as 1664:  log likelihood=-2378.54
## AIC=4767.08   AICc=4767.21   BIC=4787.78
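
ARIMA() searches over models stepwise with approximations by default. A broader but slower search can be requested and occasionally lands on a different specification (a sketch):

fit4_full <- train %>%
  model(ARIMA(Employed, stepwise = FALSE, approximation = FALSE))
report(fit4_full)
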
fc4 <- forecast(fit4, new_data = us_future)
fc4 %>%
  autoplot(us_retail_cs) +
  labs(y = "retail employed")

accuracy(fc4, us_retail_cs)
## # A tibble: 1 x 10
##   .model          .type    ME  RMSE   MAE   MPE  MAPE  MASE RMSSE  ACF1
##   <chr>           <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ARIMA(Employed) Test   50.9  65.7  54.0 0.321 0.341 0.213 0.216 0.187
augment(fit4) %>% features(.innov, ljung_box, lag=24)
## # A tibble: 1 x 3
##   .model          lb_stat lb_pvalue
##   <chr>             <dbl>     <dbl>
## 1 ARIMA(Employed)    33.8    0.0891
# barely passes at the 5% level (p = 0.089)

Review some accuracy measures

c(ARIMAx    = accuracy(fc1, us_retail_cs)["RMSE"],
  ARIMA     = accuracy(fc2, us_retail_cs)["RMSE"],
  ARIMAxlag = accuracy(fc3, us_retail_cs)["RMSE"],
  ARIMAauto = accuracy(fc4, us_retail_cs)["RMSE"])
## $ARIMAx.RMSE
## [1] 66.11605
## 
## $ARIMA.RMSE
## [1] 65.26866
## 
## $ARIMAxlag.RMSE
## [1] 48.8649
## 
## $ARIMAauto.RMSE
## [1] 65.73839
c(ARIMAx    = accuracy(fc1, us_retail_cs)["MASE"],
  ARIMA     = accuracy(fc2, us_retail_cs)["MASE"],
  ARIMAxlag = accuracy(fc3, us_retail_cs)["MASE"],
  ARIMAauto = accuracy(fc4, us_retail_cs)["MASE"])
## $ARIMAx.MASE
## [1] 0.2139277
## 
## $ARIMA.MASE
## [1] 0.2120622
## 
## $ARIMAxlag.MASE
## [1] 0.1621576
## 
## $ARIMAauto.MASE
## [1] 0.2133636
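
The same comparison reads more cleanly as a single table by stacking the accuracy results (a sketch):

bind_rows(
  accuracy(fc1, us_retail_cs),
  accuracy(fc2, us_retail_cs),
  accuracy(fc3, us_retail_cs),
  accuracy(fc4, us_retail_cs)
) %>%
  select(.model, RMSE, MASE)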

The model including the lagged predictor is the clear winner by these test-set metrics, with noticeably lower RMSE and MASE than the other three.