library(fpp3)
## Registered S3 methods overwritten by 'ggtime':
##   method           from      
##   autolayer.fbl_ts fabletools
##   autolayer.tbl_ts fabletools
##   autoplot.dcmp_ts fabletools
##   autoplot.fbl_ts  fabletools
##   autoplot.tbl_ts  fabletools
##   fortify.fbl_ts   fabletools
## ── Attaching packages ──────────────────────────────────────────── fpp3 1.0.2 ──
## ✔ tibble      3.3.0     ✔ tsibble     1.2.0
## ✔ dplyr       1.1.4     ✔ tsibbledata 0.4.1
## ✔ tidyr       1.3.1     ✔ feasts      0.5.0
## ✔ lubridate   1.9.4     ✔ fable       0.5.0
## ✔ ggplot2     4.0.1
## ── Conflicts ───────────────────────────────────────────────── fpp3_conflicts ──
## ✖ lubridate::date()    masks base::date()
## ✖ dplyr::filter()      masks stats::filter()
## ✖ tsibble::intersect() masks base::intersect()
## ✖ tsibble::interval()  masks lubridate::interval()
## ✖ dplyr::lag()         masks stats::lag()
## ✖ tsibble::setdiff()   masks base::setdiff()
## ✖ tsibble::union()     masks base::union()

8.1

#Preparing the df
unique(aus_livestock$Animal)
## [1] Bulls, bullocks and steers Calves                    
## [3] Cattle (excl. calves)      Cows and heifers          
## [5] Lambs                      Pigs                      
## [7] Sheep                     
## 7 Levels: Bulls, bullocks and steers Calves ... Sheep
unique(aus_livestock$State)
## [1] Australian Capital Territory New South Wales             
## [3] Northern Territory           Queensland                  
## [5] South Australia              Tasmania                    
## [7] Victoria                     Western Australia           
## 8 Levels: Australian Capital Territory New South Wales ... Western Australia
piggies <- aus_livestock |>
  filter(Animal == "Pigs", State == "Victoria")
#a

#Fitting a simple exponential smoothing model
fit_piggies <- piggies |>
  model(ETS(Count ~ error("A") + trend("N") + season("N")))


#Optimal values
print("Optimal values of alpha and l[0]")
## [1] "Optimal values of alpha and l[0]"
report(fit_piggies)
## Series: Count 
## Model: ETS(A,N,N) 
##   Smoothing parameters:
##     alpha = 0.3221247 
## 
##   Initial states:
##      l[0]
##  100646.6
## 
##   sigma^2:  87480760
## 
##      AIC     AICc      BIC 
## 13737.10 13737.14 13750.07
#Next 4 month forecast
fc_piggies <- fit_piggies |>
  forecast(h = 4)

fc_piggies
## # A fable: 4 x 6 [1M]
## # Key:     Animal, State, .model [1]
##   Animal State    .model                                                   Month
##   <fct>  <fct>    <chr>                                                    <mth>
## 1 Pigs   Victoria "ETS(Count ~ error(\"A\") + trend(\"N\") + season(\"… 2019 Jan
## 2 Pigs   Victoria "ETS(Count ~ error(\"A\") + trend(\"N\") + season(\"… 2019 Feb
## 3 Pigs   Victoria "ETS(Count ~ error(\"A\") + trend(\"N\") + season(\"… 2019 Mar
## 4 Pigs   Victoria "ETS(Count ~ error(\"A\") + trend(\"N\") + season(\"… 2019 Apr
## # ℹ 2 more variables: Count <dist>, .mean <dbl>
#It's difficult to see the forecast, so for the visualization I only used it on data from after 2015 January
fc_piggies |>
  autoplot(piggies |>
             filter(Month >= yearmonth("2015 Jan")))
## Warning: `autoplot.fbl_ts()` was deprecated in fabletools 0.6.0.
## ℹ Please use `ggtime::autoplot.fbl_ts()` instead.
## ℹ Graphics functions have been moved to the {ggtime} package. Please use
##   `library(ggtime)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

(b)

#b

#95% CI from R
intervals <- fc_piggies |> hilo() |> slice(1)
cat("R's 95% PI is [", intervals$'95%'$lower, ",", intervals$'95%'$upper, "] \n")
## R's 95% PI is [ 76854.79 , 113518.3 ]
#Manually calculating PI

#Residuals and std
sd <- augment(fit_piggies) |> pull(.resid) |> sd()
#y-hat
y_hat <- fc_piggies |> slice(1) |> pull(.mean)

lower <- y_hat - 1.96 * sd
upper <- y_hat + 1.96 * sd

cat("The manual 95% PI is [", lower, ",", upper, "]")
## The manual 95% PI is [ 76871.01 , 113502.1 ]

The two prediction intervals are very close, almost identical. R’s interval is a smidge wider than the one I manually calculated.

8.5

#Picking Australia just because it has no missing data
australia <- global_economy |>
  filter(Country == "Australia")
#a
australia |> autoplot(Exports) + labs(title = "Australia Annual Exports")
## Warning: `autoplot.tbl_ts()` was deprecated in fabletools 0.6.0.
## ℹ Please use `ggtime::autoplot.tbl_ts()` instead.
## ℹ Graphics functions have been moved to the {ggtime} package. Please use
##   `library(ggtime)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

There appears to be an upwards trend, and no seasonality to be observed. The fluctuations in the data points seem to be becoming more volatile as time goes on, meaning the variance might be increasing. Most of the drops recover fairly quickly, which tells us that it’s only a short-term change.

#b
fit_aus <- australia |>
  model(ETS(Exports ~ error("A") + trend("N") + season("N")))

#Forecast for the next 10 years
fc_aus <- fit_aus |> 
  forecast(h = 10)

fc_aus |> autoplot(australia)

Because the ANN forecast has no trend included, it’s flat.

#c
metrics <- accuracy(fit_aus)
print(metrics$RMSE)
## [1] 1.146794
#d

fit_aus_comp <- australia |>
  model(
    ANN = ETS(Exports ~ error("A") + trend("N") + season("N")),
    AAN = ETS(Exports ~ error("A") + trend("A") + season("N")))

comp_metrics <- accuracy(fit_aus_comp)
cat("ANN: ", comp_metrics$RMSE[1], "\n")
## ANN:  1.146794
cat("AAN: ",comp_metrics$RMSE[2])
## AAN:  1.116727

AAN has a slightly lower RMSE at 1.117. Because we can clearly see in the initial visualizations that the data has an upwards trend, it does make sense that AAN would be the more appropriate model - it has the extra trend component that ANN ignores. The difference is not that large, however, but I don’t really know enough to debate whether ANN would be fine to use as well.

#e
fit_aus_comp |> forecast(h = 10) |> autoplot(australia, level = NULL)

The AAN model accounts for the upwards trend, which seems to fit the data better overall, whereas the initial ANN one remains flat. So in this case, I think that AAN is better.

  1. Calculate a 95% prediction interval for the first forecast for each model, using the RMSE values and assuming normal errors. Compare your intervals with those produced using R.
#f
fc_comp <- fit_aus_comp |> forecast(h = 10)

#ANN
fc_ann <- fc_comp |> filter(.model == "ANN")
intervals_ann <- fc_ann |> hilo() |> slice(1)
cat("R's ANN 95% PI: [", intervals_ann$`95%`$lower, ",", intervals_ann$`95%`$upper, "]\n")
## R's ANN 95% PI: [ 18.3197 , 22.89462 ]
y_hat_ann <- fc_ann |> slice(1) |> pull(.mean)
lower_ann <- y_hat_ann - 1.96 * comp_metrics$RMSE[1]
upper_ann <- y_hat_ann + 1.96 * comp_metrics$RMSE[1]
cat("Manual ANN 95% PI: [", lower_ann, ",", upper_ann, "]\n\n")
## Manual ANN 95% PI: [ 18.35944 , 22.85488 ]
#AAN
fc_aan <- fc_comp |> filter(.model == "AAN")
intervals_aan <- fc_aan |> hilo() |> slice(1)
cat("R's AAN 95% PI: [", intervals_aan$`95%`$lower, ",", intervals_aan$`95%`$upper, "]\n")
## R's AAN 95% PI: [ 18.57028 , 23.107 ]
y_hat_aan <- fc_aan |> slice(1) |> pull(.mean)
lower_aan <- y_hat_aan - 1.96 * comp_metrics$RMSE[2]
upper_aan <- y_hat_aan + 1.96 * comp_metrics$RMSE[2]
cat("Manual AAN 95% PI: [", lower_aan, ",", upper_aan, "]\n")
## Manual AAN 95% PI: [ 18.64986 , 23.02743 ]

The manual and R intervals are very close for both models, with R’s versions being marginally wider. AAN produces slightly wider intervals than ANN, which makes sense because it’s reflecting the additional uncertainty from estimating the trend component too.

8.6

#8.6
china <- global_economy |>
  filter(Country == "China")

china |> autoplot(GDP)

lambda <- china |>
  features(GDP, features = guerrero) |>
  pull(lambda_guerrero)

fit_china <- china |>
  model(
    ANN = ETS(GDP ~ error("A") + trend("N") + season("N")),
    AAN = ETS(GDP ~ error("A") + trend("A") + season("N")),
    Damped = ETS(GDP ~ error("A") + trend("Ad") + season("N")),
    BoxCox = ETS(box_cox(GDP, lambda) ~ error("A") + trend("A") + season("N")),
    DampedBoxCox = ETS(box_cox(GDP, lambda) ~ error("A") + trend("Ad") + season("N"))
  )

fit_china |> forecast(h = 20) |> autoplot(china, level = NULL)

It’s widely accepted that GDP tends to grow exponentially, and the ANN trend ignores that and simply remains flat, meaning that it’s probably not the appropriate model here. AAN does include the trend component, and it does grow steadily, but it may be underestimating, given GDP’s exponential nature. The Damped model is similar to AAN, but the trend gradually flattens out, so this means that is produces a slightly more conservative long-term forecast. The Box-Cox transformation, on the other hand, gave an extremely aggressive upwards curve, which seemed unrealistic over 20 years. The optimal lambda I calculated being near 0 makes sense for GDP, because it effectively applies a log transform, which is the natural way to handle data that grows exponentially. The problem is that assuming exponential growth indefinitely gave us that aggressive curve. Due to this, I was prompted to try applying a Box-Cox transformation to a damped trend model, which would consider exponential growth without assuming that it would grow indefinitely. So I lastly tried out a damped Box-Cox model, and that one definitely seemed like the best fit.

8.7

#8.7
gas <- aus_production |> select(Gas)
gas |> autoplot(Gas)

fit_gas <- gas |>
  model(
    AAN = ETS(Gas ~ error("A") + trend("A") + season("A")),
    Multiplicative = ETS(Gas ~ error("M") + trend("A") + season("M")),
    Multiplicative_Damped = ETS(Gas ~ error("M") + trend("Ad") + season("M"))
  )

fit_gas |> forecast(h = 30) |> autoplot(gas, level = NULL)

accuracy(fit_gas) |> select(.model, RMSE)
## # A tibble: 3 × 2
##   .model                 RMSE
##   <chr>                 <dbl>
## 1 AAN                    4.76
## 2 Multiplicative         4.60
## 3 Multiplicative_Damped  4.59

Based on the initial visualization, it seems like multiplicative seasonality would be necessary here because the seasonal swings grows in proportion to the upwards trend in the rising gas production. Additive seasonality would add a fixed amount each season regardless of how high the gas production is, so it would use one average seasonal spike for the whole forecast, which would be too small for the later years and too large for the early years, essentially. Multiplicative seasonality will make it so that when gas production grows, the seasonal swings will grow with it.

Damped multiplicative wins on RMSE, but only marginally so over regular multiplicative. Both multiplicative models clearly beat AAN, which, again, confirms that multiplicative seasonality is necessary here. The damping provides a very slight improvement, so it’s debatable whether it’s really worth the extra parameter. ## 8.8

#a
set.seed(12345678)
myseries <- aus_retail |>
  filter(`Series ID` == sample(aus_retail$`Series ID`,1))

myseries |> autoplot(Turnover)

Just like in the previous example, multiplicative seasonality is necessary here because the seasonal variations are changing proportional to the level of the series.

#b
fit_retail <- myseries |>
  model(
    Multiplicative = ETS(Turnover ~ error("M") + trend("A") + season("M")),
    Damped = ETS(Turnover ~ error("M") + trend("Ad") + season("M"))
  )

fit_retail |> forecast(h = 30) |> autoplot(myseries, level = NULL)

(c)

#c
accuracy(fit_retail) |> select(.model, RMSE)
## # A tibble: 2 × 2
##   .model          RMSE
##   <chr>          <dbl>
## 1 Multiplicative 0.613
## 2 Damped         0.616

The Multiplicative model has the lower RMSE at 0.6130408, which suggests it might be more accurate for this series.

#d
fit_retail |> select(Multiplicative) |> gg_tsresiduals()
## Warning: `gg_tsresiduals()` was deprecated in feasts 0.4.2.
## ℹ Please use `ggtime::gg_tsresiduals()` instead.
## ℹ Graphics functions have been moved to the {ggtime} package. Please use
##   `library(ggtime)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

With the exception of a few outliers, the residuals look mostly like white noise, so I would say this is overall acceptable.

#e
train <- myseries |> filter(year(Month) <= 2010)
test <- myseries |> filter(year(Month) > 2010)

fit_train <- train |>
  model(
    Multiplicative = ETS(Turnover ~ error("M") + trend("A") + season("M")),
    Damped = ETS(Turnover ~ error("M") + trend("Ad") + season("M")),
    SNAIVE = SNAIVE(Turnover)
  )

#Training RMSE
fit_train |> accuracy() |> select(.model, RMSE)
## # A tibble: 3 × 2
##   .model          RMSE
##   <chr>          <dbl>
## 1 Multiplicative 0.518
## 2 Damped         0.519
## 3 SNAIVE         1.21
#Forecast and test RMSE
fit_train |> forecast(h = 96) |> accuracy(test) |> select(.model, RMSE) #using 96 because that's how many months are in the test dataset
## # A tibble: 3 × 2
##   .model          RMSE
##   <chr>          <dbl>
## 1 Damped          1.15
## 2 Multiplicative  1.78
## 3 SNAIVE          1.55
fit_train |> forecast(h = 96) |> autoplot(myseries, level = NULL) + facet_wrap(vars(.model))

fit_train |> forecast(h = 96) |> autoplot(train, level = NULL) + facet_wrap(vars(.model))

To evaluate forecast performance honestly, I compared both the training and test RMSE because training RMSE alone is insufficient, as a model can fit training data well by overfitting. The Multiplicative model had the lowest training RMSE, but also somehow the worst test RMSE, even losing to SNAIVE. Damped achieved the lowest test RMSE and was the only model to beat SNAIVE on test data, which I also confirmed visually. To me, it looks like damped tracks the test more closely without aggressively extrapolating the trend. But I will admit that Multiplicative performing so badly shocked me, because it doesn’t look that terrible on the visual at all. But I would overall choose Damped for its conservative estimates and lowest RMSE.

8.9

lambda2 <- train |>
  features(Turnover, features = guerrero) |>
  pull(lambda_guerrero)

fit_stl <- train |>
  model(STL_ETS = decomposition_model(
      STL(box_cox(Turnover, lambda2) ~ season(window = "periodic")),
      ETS(season_adjust ~ error("A") + trend("A") + season("N")))
  )

fc_stl <- fit_stl |> forecast(h = 96)
fc_stl |> accuracy(test) |> select(.model, RMSE)
## # A tibble: 1 × 2
##   .model   RMSE
##   <chr>   <dbl>
## 1 STL_ETS  4.02
fc_stl |> autoplot(myseries)

I tried following chapter 5.7 for this one, and I expected it to do better because I thought Box-Cox would stabilize the variance while STL handles the seasonality, but it actually turned out to be much worse than my best model - RMSE 4.02 vs 1.15.