Homework 6 - Exponential Smoothing

#8.1 Consider the the number of pigs slaughtered in Victoria, available in the aus_livestock dataset.

Use the ETS() function to estimate the equivalent model for simple exponential smoothing. Find the optimal values of α and ℓ0, and generate forecasts for the next four months.

library(fabletools)

pigs_slaugtered <-  aus_livestock %>% 
  filter(Animal == "Pigs", State == "Victoria")

fit <- pigs_slaugtered %>% 
  model(ANN = ETS(Count ~ error("A")+ trend("N") + season("N")))

report(fit)

## Series: Count 
## Model: ETS(A,N,N) 
##   Smoothing parameters:
##     alpha = 0.3221247 
## 
##   Initial states:
##      l[0]
##  100646.6
## 
##   sigma^2:  87480760
## 
##      AIC     AICc      BIC 
## 13737.10 13737.14 13750.07

The optimal values are α = 0.3221247 and ‘ℓ0 = 100646.6’

fc <- fit %>% 
  forecast(h = 4)

fc %>% autoplot(pigs_slaugtered)

Compute a 95% prediction interval for the first forecast using ^y ± 1.96s where s is the standard deviation of the residuals. Compare your interval with the interval produced by R.

pig_resid <- augment(fit)

s <-  sd(pig_resid$.resid, na.rm = FALSE)

y <-  100646.6

lower_bound <- y - 1.96 * s
upper_bound <- y + 1.96 * s

r_fc <- fc %>% hilo(95) %>% 
  pull('95%') %>% 
  head(1)

print(paste0("The lower bound is ", lower_bound," and the upper bound is ",upper_bound ))

## [1] "The lower bound is 82331.0550465242 and the upper bound is 118962.144953476"

print(paste0("R-Generated Prediction Interval: ", r_fc))

## [1] "R-Generated Prediction Interval: [76854.7888896402, 113518.325972343]95"

#8.5

Data set global_economy contains the annual Exports from many countries. Select one country to analyse.

Plot the Exports series and discuss the main features of the data.

Guyana_df <-  global_economy %>% 
  filter(Country == "Guyana")

Guyana_df %>% autoplot(.vars = GDP) + 
  labs(title = "Guyana Exports", y = "% of GDP", x = "Year")

Use an ETS(A,N,N) model to forecast the series, and plot the forecasts.

fit <- Guyana_df |>
  model(ETS(GDP ~ error("A") + trend("N") + season("N")))

fc <- fit  %>% 
  forecast(h= 20)

fc %>% autoplot(Guyana_df) + geom_line(aes(y = .fitted), col="#D55E00",
            data = augment(fit)) +
  labs(y="% of GDP", title="Guyana Exports") +
  guides(colour = "none")

Compute the RMSE values for the training data.

tidy(fit)

## # A tibble: 2 × 4
##   Country .model                                                  term  estimate
##   <fct>   <chr>                                                   <chr>    <dbl>
## 1 Guyana  "ETS(GDP ~ error(\"A\") + trend(\"N\") + season(\"N\")… alpha   1.00e0
## 2 Guyana  "ETS(GDP ~ error(\"A\") + trend(\"N\") + season(\"N\")… l[0]    2.13e8

accuracy(fit)

## # A tibble: 1 × 11
##   Country .model        .type     ME   RMSE    MAE   MPE  MAPE  MASE RMSSE  ACF1
##   <fct>   <chr>         <chr>  <dbl>  <dbl>  <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Guyana  "ETS(GDP ~ e… Trai… 5.88e7 1.32e8 7.90e7  4.01  9.64 0.992 0.992 0.501

The RSME is 132194432

Compare the results to those from an ETS(A,A,N) model. (Remember that the trended model is using one more parameter than the simpler model.) Discuss the merits of the two forecasting methods for this data set.

fit <- Guyana_df |>
  model(
    ANN = ETS(GDP ~ error("A") + trend("N") + season("N")),
    AAN = ETS(GDP ~ error("A") + trend("A") + season("N"))
  )

Compare the forecasts from both methods. Which do you think is best?

tidy(fit)

## # A tibble: 6 × 4
##   Country .model term       estimate
##   <fct>   <chr>  <chr>         <dbl>
## 1 Guyana  ANN    alpha         1.00 
## 2 Guyana  ANN    l[0]  212576297.   
## 3 Guyana  AAN    alpha         1.00 
## 4 Guyana  AAN    beta          0.266
## 5 Guyana  AAN    l[0]  160186896.   
## 6 Guyana  AAN    b[0]   12744222.

accuracy(fit)

## # A tibble: 2 × 11
##   Country .model .type           ME   RMSE    MAE   MPE  MAPE  MASE RMSSE   ACF1
##   <fct>   <chr>  <chr>        <dbl>  <dbl>  <dbl> <dbl> <dbl> <dbl> <dbl>  <dbl>
## 1 Guyana  ANN    Training 58772420. 1.32e8 7.90e7 4.01   9.64 0.992 0.992 0.501 
## 2 Guyana  AAN    Training 10743916. 9.97e7 5.53e7 0.698  8.11 0.694 0.749 0.0923

Calculate a 95% prediction interval for the first forecast for each model, using the RMSE values and assuming normal errors. Compare your intervals with those produced using R.

rmse <- sd(residuals(fit)$.resid)
first_forecast <- fc$.mean[1]

lower_bound <- first_forecast - 1.96 * rmse
upper_bound <- first_forecast + 1.96 * rmse

interval <- fc %>% hilo(95) %>% pull('95%') %>% head(1)

list(
  manual_lower_bound = lower_bound, 
  manual_upper_bound = upper_bound, 
  interval = interval
)

## $manual_lower_bound
## [1] 3400929654
## 
## $manual_upper_bound
## [1] 3841138840
## 
## $interval
## <hilo[1]>
## [1] [3357351790, 3884716704]95

#8.6

Forecast the Chinese GDP from the global_economy data set using an ETS model. Experiment with the various options in the ETS() function to see how much the forecasts change with damped trend, or with a Box-Cox transformation. Try to develop an intuition of what each is doing to the forecasts.

[Hint: use a relatively large value of h when forecasting, so you can clearly see the differences between the various options when plotting the forecasts.]

China <- global_economy %>% 
  filter(Country == "China")

fit <- China %>% 
  model( Damped = ETS(GDP ~ error("A") + trend("Ad") +
                   season("N"))
         )                                                                           

fc <- fit %>% 
  forecast(h = 5)

fc %>% autoplot(China) +
  labs(title = "Chinese GDP")

#8.7

Find an ETS model for the Gas data from aus_production and forecast the next few years. Why is multiplicative seasonality necessary here? Experiment with making the trend damped. Does it improve the forecasts?

aus_production %>% autoplot() %>% 
  labs(title = "Gas Production")

## Plot variable not specified, automatically selected `.vars = Beer`

## [[1]]

## 
## $title
## [1] "Gas Production"
## 
## attr(,"class")
## [1] "labels"

#8.8

Recall your retail time series data (from Exercise 7 in Section 2.10).

set.seed(12345678)
myseries <- aus_retail |>
  filter(`Series ID` == sample(aus_retail$`Series ID`,1))

myseries %>% autoplot()

## Plot variable not specified, automatically selected `.vars = Turnover`

Why is multiplicative seasonality necessary for this series?

Multiplicative seasonality is necessary because the data is trending upwards with variable seasonality

Apply Holt-Winters’ multiplicative method to the data. Experiment with making the trend damped.

fit <- myseries %>% 
  model(
    multiplicative = ETS(Turnover ~ error("M") + trend("A") + season("M")),
    'damped multiplicative' = ETS(Turnover ~ error("M") + trend("Ad") + season("M")))

fc <- fit |> forecast(h = "3 years")
fc |>
  autoplot(myseries, level = NULL)

Compare the RMSE of the one-step forecasts from the two methods. Which do you prefer?

accuracy(fit)

## # A tibble: 2 × 12
##   State      Industry .model .type      ME  RMSE   MAE     MPE  MAPE  MASE RMSSE
##   <chr>      <chr>    <chr>  <chr>   <dbl> <dbl> <dbl>   <dbl> <dbl> <dbl> <dbl>
## 1 Northern … Clothin… multi… Trai… -0.0128 0.613 0.450 -0.469   5.15 0.513 0.529
## 2 Northern … Clothin… dampe… Trai…  0.0398 0.616 0.444 -0.0723  5.06 0.507 0.531
## # ℹ 1 more variable: ACF1 <dbl>

The damped method appears to show less variation, whereas the multiplicative method exhibits a more pronounced upward trend and generates higher forecasts.

Check that the residuals from the best method look like white noise.

myseries %>%
  model(multiplicative = ETS(Turnover ~ error("M") + trend("A") + season("M"))) %>%
  gg_tsresiduals() +
  ggtitle("Multiplicative Method")

Now find the test set RMSE, while training the model to the end of 2010. Can you beat the seasonal naïve approach from Exercise 7 in Section 5.11?

myseries_train <- myseries %>%
  filter(year(Month) < 2011)

fit_train <- myseries_train %>%
  model(multi = ETS(Turnover ~ error("M") + trend("A") + season("M")),
        snaive = SNAIVE(Turnover))

#producing forecasts
fc <- fit_train %>%
  forecast(new_data = anti_join(myseries, myseries_train))

## Joining with `by = join_by(State, Industry, `Series ID`, Month, Turnover)`

fc %>% autoplot(myseries, level = NULL) +
  labs(title = "Multiplicative Method")

accuracy_output <- fc %>%
  accuracy(myseries) 

print(accuracy_output)

## # A tibble: 2 × 12
##   .model State  Industry .type     ME  RMSE   MAE    MPE  MAPE  MASE RMSSE  ACF1
##   <chr>  <chr>  <chr>    <chr>  <dbl> <dbl> <dbl>  <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 multi  North… Clothin… Test  -1.54   1.78  1.60 -12.3  12.6   1.75  1.47 0.495
## 2 snaive North… Clothin… Test   0.836  1.55  1.24   5.94  9.06  1.36  1.28 0.601

The multiplicative method appears to provide a better forecast for the data, as its RMSE is considerably lower than that of the seasonal naïve approach. This indicates that the multiplicative method is more suitable for the task.

#8.9

For the same retail data, try an STL decomposition applied to the Box-Cox transformed series, followed by ETS on the seasonally adjusted data. How does that compare with your best previous forecasts on the test set?

lambda <- myseries %>%
  features(Turnover, features = guerrero) %>%
  pull(lambda_guerrero)

myseries %>%
  model(STL(box_cox(Turnover,lambda) ~ season(window = "periodic"), robust = TRUE)) %>%
  components() %>%
  autoplot() +
  ggtitle("STL with Box-Cox")

dcmp <- myseries %>%
  model(STL_box = STL(box_cox(Turnover,lambda) ~ season(window = "periodic"), robust = TRUE)) %>%
  components()

myseries$Turnover_sa <- dcmp$season_adjust

myseries_train <- myseries %>%
  filter(year(Month) < 2011)

fit <- myseries_train %>%
  model(ETS(Turnover_sa ~ error("M") + trend("A") + season("M")))

fit %>% gg_tsresiduals()  +
  ggtitle("Residual Plots for Australian Retail Turnover")

#Box-Pierce test
myseries %>%
  model(multiplicative = ETS(Turnover_sa ~ error("M") + trend("A") + season("M"))) %>%
  augment() %>% 
  features(.innov, box_pierce, lag = 24, dof = 0)

## # A tibble: 1 × 5
##   State              Industry                           .model bp_stat bp_pvalue
##   <chr>              <chr>                              <chr>    <dbl>     <dbl>
## 1 Northern Territory Clothing, footwear and personal a… multi…    24.6     0.426

#produce forecasts for test data
fc <- fit %>%
  forecast(new_data = anti_join(myseries, myseries_train))

## Joining with `by = join_by(State, Industry, `Series ID`, Month, Turnover,
## Turnover_sa)`

accuracy_output <- fc %>%
  accuracy(myseries) 

print(accuracy_output)

## # A tibble: 1 × 12
##   .model   State Industry .type     ME  RMSE   MAE   MPE  MAPE  MASE RMSSE  ACF1
##   <chr>    <chr> <chr>    <chr>  <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 "ETS(Tu… Nort… Clothin… Test  -0.297 0.325 0.298 -10.4  10.4  1.96  1.60 0.687

accuracy_output <- fc %>%
  accuracy(myseries) 

print(accuracy_output)

## # A tibble: 1 × 12
##   .model   State Industry .type     ME  RMSE   MAE   MPE  MAPE  MASE RMSSE  ACF1
##   <chr>    <chr> <chr>    <chr>  <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 "ETS(Tu… Nort… Clothin… Test  -0.297 0.325 0.298 -10.4  10.4  1.96  1.60 0.687

The STL decomposition applied to the Box-Cox transformed series, followed by ETS on the seasonally adjusted data, produces a more accurate forecast compared to previous methods. The RMSE is significantly lower than both the seasonal naïve approach and the standard ETS method, indicating that the STL decomposition is more effective for this task. Specifically, the RMSE for the STL decomposition is 0.3252, a substantial improvement over the multiplicative method, which had an RMSE of 1.7. This suggests that STL decomposition better captures the underlying patterns in the data.

Data 624 HW 5

Mikhail Broomes

2024-09-23

Homework 6 - Exponential Smoothing