Homework #5

library(tidyverse)

## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.3.6     ✔ purrr   0.3.4
## ✔ tibble  3.1.8     ✔ dplyr   1.0.9
## ✔ tidyr   1.2.0     ✔ stringr 1.4.0
## ✔ readr   2.1.2     ✔ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()

library(fpp3)

## ── Attaching packages ──────────────────────────────────────────── fpp3 0.4.0 ──
## ✔ lubridate   1.8.0     ✔ feasts      0.2.2
## ✔ tsibble     1.1.2     ✔ fable       0.3.1
## ✔ tsibbledata 0.4.0     
## ── Conflicts ───────────────────────────────────────────────── fpp3_conflicts ──
## ✖ lubridate::date()    masks base::date()
## ✖ dplyr::filter()      masks stats::filter()
## ✖ tsibble::intersect() masks base::intersect()
## ✖ tsibble::interval()  masks lubridate::interval()
## ✖ dplyr::lag()         masks stats::lag()
## ✖ tsibble::setdiff()   masks base::setdiff()
## ✖ tsibble::union()     masks base::union()

Loading packages

Question 1

piggy <- aus_livestock %>% 
  filter(State=="Victoria") %>% 
  filter(Animal=="Pigs")
pigfit <- piggy %>% model(ses=ETS(Count ~ error("A") + trend("N") + season("N")))

piggy %>% 
  autoplot(Count)

pigfit %>% 
  components() %>% 
  autoplot()

## Warning: Removed 1 row(s) containing missing values (geom_path).

pigfit %>% 
  report()

## Series: Count 
## Model: ETS(A,N,N) 
##   Smoothing parameters:
##     alpha = 0.3221247 
## 
##   Initial states:
##      l[0]
##  100646.6
## 
##   sigma^2:  87480760
## 
##      AIC     AICc      BIC 
## 13737.10 13737.14 13750.07

Optimal value of alpha is .3221247 Optimal value of l0 is 100646.6

pigfc <- pigfit %>% 
  forecast(h=4)
pigfc %>% 
  autoplot(piggy %>% filter(year(Month)>2010))

Upper

95187+1.96*(87480760)^(1/2)

## [1] 113519.1

Lower

95187-1.96*(87480760)^(1/2)

## [1] 76854.89

hilo(pigfc)

## # A tsibble: 4 x 8 [1M]
## # Key:       Animal, State, .model [1]
##   Animal State   .model    Month             Count  .mean                  `80%`
##   <fct>  <fct>   <chr>     <mth>            <dist>  <dbl>                 <hilo>
## 1 Pigs   Victor… ses    2019 Jan N(95187, 8.7e+07) 95187. [83200.06, 107173.1]80
## 2 Pigs   Victor… ses    2019 Feb N(95187, 9.7e+07) 95187. [82593.52, 107779.6]80
## 3 Pigs   Victor… ses    2019 Mar N(95187, 1.1e+08) 95187. [82014.88, 108358.2]80
## 4 Pigs   Victor… ses    2019 Apr N(95187, 1.1e+08) 95187. [81460.61, 108912.5]80
## # … with 1 more variable: `95%` <hilo>
## # ℹ Use `colnames()` to see all variable names

They are almost the exact same

Question 2

ger <- global_economy %>% 
  filter(Country=="Germany")
gerexp <- ger %>% 
  select(Exports) %>% 
  na.omit()
gerexp %>% 
  autoplot(Exports)

There is a clear upward trend with something of a yearly seasonality. There is a downward dip likely due to the great recession

gerfit <- gerexp %>% model(ses=ETS(Exports ~ error("A") + trend("N") + season("N")))
gerfc <- gerfit %>% 
  forecast(h=10)
gerfc %>% 
  autoplot(gerexp)

This forecast completely ignores the upward trend

accuracy(gerfit)

## # A tibble: 1 × 10
##   .model .type       ME  RMSE   MAE   MPE  MAPE  MASE RMSSE     ACF1
##   <chr>  <chr>    <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>    <dbl>
## 1 ses    Training 0.672  1.76  1.31  2.21  4.73 0.982 0.990 -0.00124

gerfit2 <- gerexp %>% model(ses=ETS(Exports ~ error("A") + trend("A") + season("N")))
accuracy(gerfit2)

## # A tibble: 1 × 10
##   .model .type         ME  RMSE   MAE    MPE  MAPE  MASE RMSSE   ACF1
##   <chr>  <chr>      <dbl> <dbl> <dbl>  <dbl> <dbl> <dbl> <dbl>  <dbl>
## 1 ses    Training -0.0121  1.62  1.11 -0.571  4.14 0.832 0.914 0.0220

AAN includes the obvious upwards trend.

gerfc2 <- gerfit2 %>% 
  forecast(h=10)
gerfc2 %>% 
  autoplot(gerexp)

I believe the AAN model more because it is likely exports will continue to increase.

Question 3

china <- global_economy %>% 
  filter(Country=="China") %>% 
  select(GDP)
china %>% 
  autoplot(GDP)

lambdaval <- china %>% 
  features(GDP,features=guerrero) %>% 
  pull(lambda_guerrero)


chinafit <- china %>% 
  model(
    reg=ETS(GDP),
    damp=ETS(GDP~error("A")+trend("Ad")+season("N")),
    boxreg=ETS(box_cox(GDP,lambdaval)~error("A")+trend("N")+season("N"))
  )
chinafit %>% 
  report()

## Warning in report.mdl_df(.): Model reporting is only supported for individual
## models, so a glance will be shown. To see the report for a specific model, use
## `select()` and `filter()` to identify a single model.

## # A tibble: 3 × 9
##   .model   sigma2 log_lik   AIC  AICc    BIC      MSE     AMSE      MAE
##   <chr>     <dbl>   <dbl> <dbl> <dbl>  <dbl>    <dbl>    <dbl>    <dbl>
## 1 reg    1.08e- 2 -1546.  3102. 3103. 3112.  4.00e+22 1.61e+23 7.93e- 2
## 2 damp   3.96e+22 -1624.  3260. 3262. 3273.  3.62e+22 1.30e+23 9.49e+10
## 3 boxreg 2.72e- 3    54.5 -103. -103.  -96.9 2.63e- 3 9.76e- 3 4.51e- 2

chinafc <- chinafit %>% 
  forecast(h=20)
chinafc %>% 
  autoplot(china)

Question 4

gas <- aus_production %>% 
  select(Gas)
gas %>% 
  autoplot(Gas)

gasfit <- gas %>% 
  model(
    mod=ETS(Gas~error("M")+trend("A")+season("M"))
  )
gasfc <- gasfit %>% 
  forecast(h=36)
gasfc %>% 
  autoplot(gas)

Multiplicative is necessary here because the magnitude of the seasonality changes greatly.

gg_tsresiduals(gasfit)

Significant autocorrelation at many levels, along with heteroskedasticity.

gasfit2 <- gas %>% 
  model(
    mod2=ETS(Gas~error("M")+trend("Ad")+season("M"))
  )
gasfc2 <- gasfit2 %>%
  forecast(h=36)
gg_tsresiduals(gasfit2)

Not much of an improvement.

gasfc2 %>% 
  autoplot(gas)

gasfit %>% 
  report()

## Series: Gas 
## Model: ETS(M,A,M) 
##   Smoothing parameters:
##     alpha = 0.6528545 
##     beta  = 0.1441675 
##     gamma = 0.09784922 
## 
##   Initial states:
##      l[0]       b[0]      s[0]    s[-1]    s[-2]     s[-3]
##  5.945592 0.07062881 0.9309236 1.177883 1.074851 0.8163427
## 
##   sigma^2:  0.0032
## 
##      AIC     AICc      BIC 
## 1680.929 1681.794 1711.389

gasfit2 %>% 
  report()

## Series: Gas 
## Model: ETS(M,Ad,M) 
##   Smoothing parameters:
##     alpha = 0.6489044 
##     beta  = 0.1551275 
##     gamma = 0.09369372 
##     phi   = 0.98 
## 
##   Initial states:
##      l[0]       b[0]      s[0]    s[-1]   s[-2]     s[-3]
##  5.858941 0.09944006 0.9281912 1.177903 1.07678 0.8171255
## 
##   sigma^2:  0.0033
## 
##      AIC     AICc      BIC 
## 1684.028 1685.091 1717.873

The first one has a lower AICc value.

Question 5

set.seed(223465)
myseries <- aus_retail %>%
  filter(`Series ID` == sample(aus_retail$`Series ID`, 1))
myseries %>% autoplot(Turnover)

There is heteroskedasticity which necesitates multiplicative seasonality

myfit <- myseries %>% 
  model(
    modm=ETS(Turnover~trend("A")+season("M"))
  )
myfit2 <- myseries %>% 
  model(
    modm=ETS(Turnover~trend("Ad")+season("M"))
  )

fc1 <- myfit %>% 
  forecast(h=36)
fc2 <- myfit2 %>% 
  forecast(h=36)

fc1 %>% 
  autoplot(myseries)

fc2 %>% 
  autoplot(myseries)

myfit3 <- myseries %>% 
  model(
    `Regular`=ETS(Turnover~trend("A")+season("M")),
    `Damped`=ETS(Turnover~trend("Ad")+season("M"))
  )


fc3 <- myfit3 %>% 
  forecast(h=36)


fc3 %>% 
  autoplot(myseries,level=NULL)

gg_tsresiduals(myfit)

gg_tsresiduals(myfit2)

The second one is slightly better

accuracy(myfit3) %>% 
  select(.model,RMSE)

## # A tibble: 2 × 2
##   .model   RMSE
##   <chr>   <dbl>
## 1 Regular 0.671
## 2 Damped  0.667

The RMSE values confirm this.

mytrain <- myseries %>% 
  filter(year(Month)<2011)
mytest <- myseries %>% 
  filter(year(Month)>2010)
mytrainfit <- mytrain %>% 
  model(
    snaive=SNAIVE(Turnover),
    damped=ETS(Turnover~error("M")+trend("Ad")+season("M")),
    reg=ETS(Turnover~trend("A")+season("M"))
  )
mytrainfc <- mytrainfit %>% 
  forecast(h=96)
accuracy(mytrainfc, mytest ) %>% 
  select(.model, RMSE)

## # A tibble: 3 × 2
##   .model  RMSE
##   <chr>  <dbl>
## 1 damped  2.41
## 2 reg     4.19
## 3 snaive  2.29

My Damped Seasonality has a slightly worse RMSE than Seasonal NAIVE

comparison <- anti_join(myseries, mytrain, by = c('State', 'Industry', 'Series ID', 'Month', 'Turnover'))
yup <- mytrainfit %>% forecast(comparison)

autoplot(comparison, Turnover) +
  autolayer(yup, level = NULL) +
  labs(title = 'Forecast Method Comparison')

SNAIVE is the closest.

accuracy(mytrainfit) %>% 
  select(.model,RMSE)

## # A tibble: 3 × 2
##   .model  RMSE
##   <chr>  <dbl>
## 1 snaive 1.34 
## 2 damped 0.583
## 3 reg    0.619

For the data, however, the damped does the best.

Question 6

tot <- tourism %>% 
  summarise(Trips=sum(Trips))
tot %>% 
  autoplot(Trips)

Seasonal data with a large dip followed by a massive upward trend.

ugh <- tot %>%
  model(STL(Trips)) %>% components()

ugh %>%
  as_tsibble() %>%
  autoplot(season_adjust)

thing <- tot %>%
  model(
    dampdec=decomposition_model(STL(Trips),
                        ETS(season_adjust ~ error("A") + trend("Ad") + season("N")))) 
thingfc <- thing%>%
  forecast(h = "2 years") 
thingfc%>%
  autoplot(tot)

This forecast appears to be strong at first glance.

thing2 <- tot %>%
  model(
    regdec=decomposition_model(STL(Trips),
                        ETS(season_adjust ~ error("A") + trend("A") + season("N")))) 
thing2fc <- thing2%>%
  forecast(h = "2 years") 

thing2fc%>%
  autoplot(tot)

thing3 <- tot %>% 
  model(
    m=ETS(Trips))
thingfc3 <- thing3 %>% 
  forecast(h=8)
thingfc3 %>% 
  autoplot(tot)

accuracy(thing) %>% 
  select(.model,RMSE)

## # A tibble: 1 × 2
##   .model   RMSE
##   <chr>   <dbl>
## 1 dampdec  763.

accuracy(thing2) %>% 
  select(.model,RMSE)

## # A tibble: 1 × 2
##   .model  RMSE
##   <chr>  <dbl>
## 1 regdec  763.

accuracy(thing3) %>% 
  select(.model,RMSE)

## # A tibble: 1 × 2
##   .model  RMSE
##   <chr>  <dbl>
## 1 m       794.

thingfc

## # A fable: 8 x 4 [1Q]
## # Key:     .model [1]
##   .model  Quarter             Trips  .mean
##   <chr>     <qtr>            <dist>  <dbl>
## 1 dampdec 2018 Q1  N(28813, 613295) 28813.
## 2 dampdec 2018 Q2  N(27572, 779622) 27572.
## 3 dampdec 2018 Q3  N(27249, 976340) 27249.
## 4 dampdec 2018 Q4 N(28553, 1205316) 28553.
## 5 dampdec 2019 Q1 N(29839, 1482121) 29839.
## 6 dampdec 2019 Q2 N(28578, 1781723) 28578.
## 7 dampdec 2019 Q3 N(28234, 2118429) 28234.
## 8 dampdec 2019 Q4 N(29519, 2493604) 29519.

thing2fc

## # A fable: 8 x 4 [1Q]
## # Key:     .model [1]
##   .model Quarter             Trips  .mean
##   <chr>    <qtr>            <dist>  <dbl>
## 1 regdec 2018 Q1   N(28883, 6e+05) 28883.
## 2 regdec 2018 Q2  N(27683, 768628) 27683.
## 3 regdec 2018 Q3  N(27405, 961774) 27405.
## 4 regdec 2018 Q4 N(28760, 1186493) 28760.
## 5 regdec 2019 Q1 N(30102, 1458939) 30102.
## 6 regdec 2019 Q2 N(28901, 1755034) 28901.
## 7 regdec 2019 Q3 N(28624, 2089927) 28624.
## 8 regdec 2019 Q4 N(29978, 2466013) 29978.

thingfc3

## # A fable: 8 x 4 [1Q]
## # Key:     .model [1]
##   .model Quarter             Trips  .mean
##   <chr>    <qtr>            <dist>  <dbl>
## 1 m      2018 Q1   N(29068, 7e+05) 29068.
## 2 m      2018 Q2  N(27794, 870750) 27794.
## 3 m      2018 Q3 N(27619, 1073763) 27619.
## 4 m      2018 Q4 N(28627, 1311711) 28627.
## 5 m      2019 Q1 N(30336, 1587455) 30336.
## 6 m      2019 Q2 N(29062, 1903591) 29062.
## 7 m      2019 Q3 N(28887, 2262980) 28887.
## 8 m      2019 Q4 N(29895, 2668392) 29895.

I buy the undamped decomposition the most. It has the lowest RMSE.

gg_tsresiduals(thing2)

## Warning: Removed 4 row(s) containing missing values (geom_path).

## Warning: Removed 4 rows containing missing values (geom_point).

## Warning: Removed 4 rows containing non-finite values (stat_bin).

There apepars to be autocorrelation at a lag of 14, but otherwise the residuals definitely appear to be a white noise.

Question 7

nz <- aus_arrivals %>%
  filter(Origin=="NZ")
nz %>% 
  autoplot(Arrivals)

High seasonality, upward trend

trainz <- nz %>% 
  filter(year(Quarter)<2011)
testz <- nz %>% 
  filter(year(Quarter)>2010)
trainzfit <- trainz %>% 
  model(
    mets=ETS(Arrivals~error("M")+trend("A")+season("M"))
  )
trainzfc <- trainzfit %>% 
  forecast(h=7)
trainzfc %>% 
  autoplot(nz)

accuracy(trainzfc,testz)

## # A tibble: 1 × 11
##   .model Origin .type      ME   RMSE    MAE   MPE  MAPE  MASE RMSSE    ACF1
##   <chr>  <chr>  <chr>   <dbl>  <dbl>  <dbl> <dbl> <dbl> <dbl> <dbl>   <dbl>
## 1 mets   NZ     Test  -10467. 18302. 12746. -3.27  4.16   NaN   NaN -0.0857

Multiplicative seasonality is necessary here because the magnitude of the seasonality is inconstant over time.

bigfit <- trainz %>%
  model(
    etschosen=ETS(Arrivals),
    etschosenlog=ETS(box_cox(Arrivals,0)),
    snaive=SNAIVE(Arrivals),
    logdecomp=decomposition_model(STL(box_cox(Arrivals,0)),
                            ETS(season_adjust ~ error("A") + trend("Ad") + season("N")))
    )
bigfc <- bigfit %>% 
  forecast(h=7)
bigfc %>% 
  autoplot(nz,level=NULL)

accuracy(bigfc,testz) %>% 
  select(.model,RMSE)

## # A tibble: 4 × 2
##   .model         RMSE
##   <chr>         <dbl>
## 1 etschosen    18302.
## 2 etschosenlog 14763.
## 3 logdecomp    19979.
## 4 snaive       14962.

The one generated by the ETS function on logged data gave the lowest RMSE.

letsfckingo <- bigfit %>% select('etschosenlog')
letsfckingo %>%
  gg_tsresiduals()

nzcv <- nz %>%
  slice(1:(n()-3)) %>% 
  stretch_tsibble(.init = 36, .step = 3) %>% 
  relocate(Quarter, .id)
nczvfc <- nzcv %>%
  model(
    etschosen2=ETS(Arrivals),
    etschosenlog2=ETS(box_cox(Arrivals,0)),
    snaive2=SNAIVE(Arrivals),
    mod2=decomposition_model(STL(box_cox(Arrivals,0)),
                            ETS(season_adjust ~ error("M") + trend("Ad") + season("N")))) %>%
  forecast(h = 3)

nczvfc%>%
  accuracy(nz)

## # A tibble: 4 × 11
##   .model        Origin .type    ME   RMSE    MAE   MPE  MAPE  MASE RMSSE  ACF1
##   <chr>         <chr>  <chr> <dbl>  <dbl>  <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 etschosen2    NZ     Test  4627. 15327. 11799.  2.23  6.45 0.793 0.797 0.283
## 2 etschosenlog2 NZ     Test  4388. 15047. 11566.  1.99  6.36 0.778 0.782 0.268
## 3 mod2          NZ     Test  3020. 15114. 11864.  1.45  6.34 0.798 0.786 0.250
## 4 snaive2       NZ     Test  8244. 18768. 14422.  3.83  7.76 0.970 0.976 0.566

The best one here is now the decomposition model. The previous winner is second best now.

Question 8

cement <- aus_production %>% 
  select(Cement)
cement %>% autoplot(Cement)

Very strong upward trend, clear heteroskedasticity, high seasonality.

ccv <- cement %>% 
  stretch_tsibble(.init=20,.step=1) %>% 
  relocate(Quarter,.id)
ccvmod <- ccv %>% 
  model(
    ETS(Cement),
    SNAIVE(Cement)
  )

tail(ccvmod %>% forecast(h=4))

## # A fable: 6 x 5 [1Q]
## # Key:     .id, .model [2]
##     .id .model         Quarter         Cement .mean
##   <int> <chr>            <qtr>         <dist> <dbl>
## 1   199 ETS(Cement)    2011 Q1 N(2091, 21888) 2091.
## 2   199 ETS(Cement)    2011 Q2 N(2347, 35012) 2347.
## 3   199 SNAIVE(Cement) 2010 Q3 N(2325, 17963) 2325 
## 4   199 SNAIVE(Cement) 2010 Q4 N(2273, 17963) 2273 
## 5   199 SNAIVE(Cement) 2011 Q1 N(1904, 17963) 1904 
## 6   199 SNAIVE(Cement) 2011 Q2 N(2401, 17963) 2401

zoinks <- cement %>%
  stretch_tsibble(.init = 5, .step = 4) %>% 
  relocate(Quarter,.id)

cemented <- zoinks %>%
  model(ETS(Cement),
        SNAIVE(Cement)) %>%
  forecast(h= 4)

cementtesting <- accuracy(cemented, cement) %>%
  mutate(MSE = (RMSE)^2) %>% 
  select(RMSE,MSE,.model)

## Warning: The future dataset is incomplete, incomplete out-of-sample data will be treated as missing. 
## 3 observations are missing between 2010 Q3 and 2011 Q1

cementtesting

## # A tibble: 2 × 3
##    RMSE    MSE .model        
##   <dbl>  <dbl> <chr>         
## 1  99.8  9962. ETS(Cement)   
## 2 134.  18028. SNAIVE(Cement)

The ETS model is much better. It has a lower RMSE.