library(fpp3)
library(patchwork)
library(lubridate)
library(scales)
library(stringr)
library(forecast)
options(scipen=999)
data(aus_livestock)
Hyndman Chapter 8 Homework
8.1 Pigs from Victoria
Consider the the number of pigs slaughtered in Victoria, available in the aus_livestock
dataset.
= aus_livestock |>
victoria filter(State == "Victoria" & Animal == "Pigs")
- Use the
ETS()
function to estimate the equivalent model for simple exponential smoothing. Find the optimal values of α and ℓ0 and generate forecasts for the next four months.
Easy enough, but I spent all my time trying to get the l0 value extracted so I could print it.
<- capture.output(victoria
v_output |> model(ETS(Count))
|> report()
)
<- trimws(v_output[grep("Model:", v_output)])
model_line <- trimws(v_output[grep("alpha =", v_output)])
alpha_line <- grep("l\\[0\\]", v_output)
l0_index <- trimws(v_output[l0_index + 1])
trimmed_l0_line <- strsplit(trimmed_l0_line, " ")[[1]]
split_l0_line <- split_l0_line[split_l0_line != ""]
split_l0_line <- split_l0_line[1]
l0_value sprintf("Model %s %s, l0 value = %s", model_line, alpha_line, l0_value)
[1] "Model Model: ETS(A,N,A) alpha = 0.3579401, l0 value = 95487.5"
# No forcast out 8 intervals later.
= victoria |> model(ETS(Count))
v_fit = v_fit |> forecast(h = 4)
v_forecasts print(v_forecasts)
# A fable: 4 x 6 [1M]
# Key: Animal, State, .model [1]
Animal State .model Month Count .mean
<fct> <fct> <chr> <mth> <dist> <dbl>
1 Pigs Victoria ETS(Count) 2019 Jan N(84425, 60742898) 84425.
2 Pigs Victoria ETS(Count) 2019 Feb N(85158, 68525346) 85158.
3 Pigs Victoria ETS(Count) 2019 Mar N(91567, 76307794) 91567.
4 Pigs Victoria ETS(Count) 2019 Apr N(90098, 84090242) 90098.
- Compute a 95% prediction interval for the first forecast using yHat +- 1.96s where s is the standard deviation of the residuals. Compare your interval with the interval produced by R.
This was way more tricky. Eventually I flipped back over to what I know, which is expanding dataframes with columns to the right, and the just filtering or showing what I wanted. That, to me, was way easier than trying to take stuff out of the dataframe and compare it to stuff in the dataframe.
# print(v_fit)
# print(v_forecasts)
<- v_fit$`ETS(Count)`[[1]]$fit$par$estimate[1]
alpha <- v_fit$`ETS(Count)`[[1]]$fit$states[1, "l"]
l0 <- l0$l[1]
l0_value
= residuals(v_fit) |> as_tibble()
residuals_tibble <- sd(residuals_tibble$.resid)
v_stdev
# print(alpha)
# print(l0_value)
# print(v_standard_deviation)
# this is for all forecasts, not just the first one.
<- v_forecasts %>%
v_forecasts as_tibble() |>
mutate(
variance = as.numeric(str_extract(as.character(Count), "(?<=, )\\d+")),
stdev = sqrt(variance),
lower_b = .mean - 1.96 * stdev,
upper_b = .mean + 1.96 * stdev,
manual_lower_b = .mean - 1.96 * v_stdev,
man_upper_b = .mean + 1.96 * v_stdev
|>
) select(-Animal, -State, -.model, -Month, -Count)
print(v_forecasts[1, ])
# A tibble: 1 × 7
.mean variance stdev lower_b upper_b manual_lower_b man_upper_b
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 84425. 60742898 7794. 69149. 99701. 69328. 99521.
8.5 Gaza Exports
Data set global_economy
contains the annual Exports from many countries. Select one country to analyse.
= global_economy |>
gaza filter(Country == "West Bank and Gaza" & Year >= 1996) |>
select(Exports)
- Plot the Exports series and discuss the main features of the data.
There seems to be a slight upward trend since about 2010, but the ups and downs on the early 2.000’s are clearly represented and are a result of geopolitical security developments in the region.
I can’t imagine what it’s going to be this year, probably nothing at all.
|>
gaza autoplot ()
Plot variable not specified, automatically selected `.vars = Exports`
- Use an ETS(A,N,N) model to forecast the series, and plot the forecasts.
<- gaza |>
g1_model model(ETS(Exports ~ error("A") + trend("N") + season("N")))
<- g1_model |> forecast(h = 4)
g1_fc<- g1_model |> accuracy()
g1_accuracy_results <- round(g1_accuracy_results$RMSE, 3)
g1_rmse |> autoplot(gaza) g1_fc
- Compute the RMSE values for the training data.
sprintf("RMSE No Trend: %s", g1_rmse)
[1] "RMSE No Trend: 1.841"
- Compare the results to those from an ETS(A,A,N) model. (Remember that the trended model is using one more parameter than the simpler model.) Discuss the merits of the two forecasting methods for this data set.
The RMSE is lower for additive trend. And there does appear to be a trend. ETS(AAN) is a better model for this data set.
<- gaza |>
g2_model model(ETS(Exports ~ error("A") + trend("A") + season("N")))
<- g2_model |> forecast(h = 4)
g2_fc<- g2_model |> accuracy()
g2_accuracy_results <- round(g2_accuracy_results$RMSE, 3)
g2_rmse sprintf("RMSE Additive Trend : %s", g2_rmse)
[1] "RMSE Additive Trend : 1.837"
- Compare the forecasts from both methods. Which do you think is best?
In this model, the fit of the ETS(AAN) model fits better than the ETS(ANN) model, so we expect ETS(AAN) to be more correct with our forecasts. That said, after forecasting them out, it looks like the ETS(AAN) is showing slightly higher exports than the ETS(ANN).
print(g1_fc)
# A fable: 4 x 4 [1Y]
# Key: .model [1]
.model Year Exports .mean
<chr> <dbl> <dist> <dbl>
1 "ETS(Exports ~ error(\"A\") + trend(\"N\") + season(\"… 2018 N(18, 3.7) 18.4
2 "ETS(Exports ~ error(\"A\") + trend(\"N\") + season(\"… 2019 N(18, 6) 18.4
3 "ETS(Exports ~ error(\"A\") + trend(\"N\") + season(\"… 2020 N(18, 8.3) 18.4
4 "ETS(Exports ~ error(\"A\") + trend(\"N\") + season(\"… 2021 N(18, 11) 18.4
print(g2_fc)
# A fable: 4 x 4 [1Y]
# Key: .model [1]
.model Year Exports .mean
<chr> <dbl> <dist> <dbl>
1 "ETS(Exports ~ error(\"A\") + trend(\"A\") + season(\"… 2018 N(19, 4.1) 18.5
2 "ETS(Exports ~ error(\"A\") + trend(\"A\") + season(\"… 2019 N(19, 6.6) 18.6
3 "ETS(Exports ~ error(\"A\") + trend(\"A\") + season(\"… 2020 N(19, 9.2) 18.8
4 "ETS(Exports ~ error(\"A\") + trend(\"A\") + season(\"… 2021 N(19, 12) 18.9
- Calculate a 95% prediction interval for the first forecast for each model, using the RMSE values and assuming normal errors. Compare your intervals with those produced using R.
It’s simple to what we said above, the ETS(AAN) model shows a little higher of a set of bounds, and that’s what we expect due to the trending nature of this data.
## ETS(ANN)
<- g1_fc |>
g1_fc_expanded as_tibble() |>
mutate(
model="ETS(ANN)",
variance = as.numeric(str_extract(as.character(Exports), "(?<=, )\\d+")),
stdev = sqrt(variance),
lower_b = .mean - 1.96 * stdev,
upper_b = .mean + 1.96 * stdev,
manual_lower_b = .mean - 1.96 * v_stdev,
man_upper_b = .mean + 1.96 * v_stdev
|>
) select(-Exports, -.model)
## ETS(AAN)
<- g2_fc |>
g2_fc_expanded as_tibble() |>
mutate(
model="ETS(AAN)",
variance = as.numeric(str_extract(as.character(Exports), "(?<=, )\\d+")),
stdev = sqrt(variance),
lower_b = .mean - 1.96 * stdev,
upper_b = .mean + 1.96 * stdev,
manual_lower_b = .mean - 1.96 * v_stdev,
man_upper_b = .mean + 1.96 * v_stdev
|>
) select(-Exports, -.model)
<- bind_rows(
combined_rows |> slice(2),
g1_fc_expanded |> slice(2)
g2_fc_expanded
)
print(combined_rows)
# A tibble: 2 × 9
Year .mean model variance stdev lower_b upper_b manual_lower_b man_upper_b
<dbl> <dbl> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 2019 18.4 ETS(ANN) 6 2.45 13.6 23.2 -15078. 15115.
2 2019 18.6 ETS(AAN) 6 2.45 13.8 23.4 -15078. 15115.
8.6 Chinese GDP
Forecast the Chinese GDP from the global_economy
data set using an ETS model. Experiment with the various options in the ETS(
) function to see how much the forecasts change with damped trend, or with a Box-Cox transformation. Try to develop an intuition of what each is doing to the forecasts.
[Hint: use a relatively large value of h when forecasting, so you can clearly see the differences between the various options when plotting the forecasts.]
I am gaining two intuitions. Firstly, that overall it might be best to run ETS() in a way that you allow it to pick the model for you. In this case it was ETS(M,A,N). Secondly, dampening seems like the way to go. I’m not 100% sure how it works, but nothing goes on forever in a linear direction.
= global_economy |>
china filter(Country == "China") |>
mutate(gdp = GDP / 1e6) |> # to millions, a cool little trick!
mutate(gdp = round(gdp, 4)) |>
select(gdp)
<- china |>
c_fit model(ETS(gdp))
<- china |>
c_fit_dampened model(ETS(gdp ~ error("M") + trend("Ad") + season("N")))
# c_fit |> report()
# c_fit_dampened |> report()
<- c_fit |> forecast(h = 50)
c_forecast <- c_fit_dampened |> forecast(h = 50)
c_forecast_dampened
# Combine forecasts for plotting
<- bind_rows(
combined_forecasts %>% mutate(Model = "Without Damping"),
c_forecast %>% mutate(Model = "With Damping")
c_forecast_dampened
)
ggplot() +
geom_line(data = china, aes(x = Year, y = gdp), color = "black", size = 1) + # Actual data line
geom_line(data = combined_forecasts, aes(x = Year, y = .mean, color = Model)) + # Forecast lines
labs(title = "Comparison of ETS Forecasts",
subtitle = "With and Without Damped Trend",
x = "Year",
y = "GDP (millions)",
color = "Model") +
theme_minimal()
8.7 Australian Gas
Find an ETS model for the Gas data from aus_production
and forecast the next few years. Why is multiplicative seasonality necessary here? Experiment with making the trend damped. Does it improve the forecasts?
The first chart shows the ever-increasing seasonality of this data set, and that is the definitive look for what one should select multiplicative for in an ETS model. The next chart plots non-seasonal, seasonal, and seasonal dampened on top of each other. The ETS Dampened seems to follow the trend ot the eye, but the smallest RMSE is not from dampened, it’s from ETS(MAM)
<- aus_production |> select(Gas)
gas
<- gas |> model(ETS(Gas)) # ETS(M,A,M) is selected.
g_fit <- gas |> model(ETS(Gas ~ error("M") + trend("Ad") + season("M")))
g_fit_dampened <- gas |> model(ETS(Gas ~ error("A") + trend("N") + season("N")))
g_fit_no_season
<- g_fit |> forecast(h = 30)
g_forecast <- g_fit_dampened |> forecast(h = 30)
g_forecast_dampened <- g_fit_no_season |> forecast(h = 30)
g_forecast_no_season
<- g_forecast |>
g_forecast_tbl as_tibble() |>
mutate(Type = "ETS with Seasonality")
<- g_forecast_dampened |>
g_forecast_dampened_tbl as_tibble() |>
mutate(Type = "ETS Dampened")
<- g_forecast_no_season |>
g_forecast_no_season_tbl as_tibble() |>
mutate(Type = "ETS without Seasonality")
autoplot(gas) + labs(title="Gas seasonality - the perfect case for multiplicative")
ggplot(gas, aes(x = Quarter, y = Gas)) +
geom_line(color = "black") +
geom_line(data = g_forecast_tbl, aes(x = Quarter, y = .mean, color = Type)) +
geom_line(data = g_forecast_dampened_tbl, aes(x = Quarter, y = .mean, color = Type)) +
geom_line(data = g_forecast_no_season_tbl, aes(x = Quarter, y = .mean, color = Type)) +
scale_color_manual(values = c(
"ETS with Seasonality" = "blue",
"ETS without Seasonality" = "red",
"ETS Dampened" = "green"
+
)) labs(title = "Gas Production with and without Seasonality",
x = "Quarter",
y = "Gas Production",
color = "Forecast Type") +
theme_minimal()
8.8 Aus Retail
Recall your retail time series data (from Exercise 7 in Section 2.10).
# that was
set.seed(12345678)
<- aus_retail |>
retail_series filter(`Series ID` == sample(aus_retail$`Series ID`,1)) |>
select(Turnover)
- Why is multiplicative seasonality necessary for this series?
- A quick autoplot shows that characteristic ever-increasing seasonality number of multiplicative seasonality in ETS models.*
|> autoplot() retail_series
- Apply Holt-Winters’ multiplicative method to the data. Experiment with making the trend damped.
- Compare the RMSE of the one-step forecasts from the two methods. Which do you prefer?
I prefer the one with the least error, that seems to be ETS(AAM)
<- retail_series |>
fit model(
multiplicat = ETS(Turnover ~ error("A") + trend("A") + season("M")),
multip_damp = ETS(Turnover ~ error("A") + trend("Ad") + season("N")),
holt_winter = ETS(Turnover ~ error("A") + trend("A") + season("A")),
holt_w_damp = ETS(Turnover ~ error("A") + trend("Ad") + season("A"))
)tidy(fit)
# A tibble: 57 × 3
.model term estimate
<chr> <chr> <dbl>
1 multiplicat alpha 0.550
2 multiplicat beta 0.000100
3 multiplicat gamma 0.202
4 multiplicat l[0] 2.35
5 multiplicat b[0] 0.0334
6 multiplicat s[0] 0.813
7 multiplicat s[-1] 0.798
8 multiplicat s[-2] 0.775
9 multiplicat s[-3] 1.32
10 multiplicat s[-4] 0.992
# ℹ 47 more rows
accuracy(fit)
# A tibble: 4 × 10
.model .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
<chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 multiplicat Training -0.00119 0.600 0.443 -0.265 5.21 0.506 0.517 -0.0422
2 multip_damp Training 0.175 1.88 1.37 -1.64 15.3 1.57 1.63 0.214
3 holt_winter Training -0.0223 0.676 0.514 -0.515 6.65 0.586 0.584 0.109
4 holt_w_damp Training 0.0660 0.687 0.517 0.716 6.74 0.591 0.593 0.132
- Check that the residuals from the best method look like white noise.
It looks like it, nothing stands out as a pattern, it’s all just centered around zero.
augment(fit) |>
autoplot(.resid)
- Now find the test set RMSE, while training the model to the end of 2010. Can you beat the seasonal naïve approach from Exercise 7 in Section 5.11?
Assuming this is asking if ETS is better than SNAIVE. Yes, it appears it is!
set.seed(12345678)
<- aus_retail |>
retail_series2 filter(`Series ID` == sample(aus_retail$`Series ID`,1)) |>
select(Turnover) |>
filter(as.Date(Month) < as.Date("2010-01-01"))
<- retail_series2 |>
fit_retail2 model(
multiplicative = ETS(Turnover ~ error("A") + trend("A") + season("M")),
snaive = SNAIVE(Turnover)
)accuracy(fit_retail2)
# A tibble: 2 × 10
.model .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
<chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 multiplicative Training -0.0230 0.490 0.377 -0.656 5.51 0.410 0.402 0.184
2 snaive Training 0.440 1.22 0.919 5.37 12.7 1 1 0.795
8.9 More Aus Retail
For the same retail data, try an STL decomposition applied to the Box-Cox transformed series, followed by ETS on the seasonally adjusted data. How does that compare with your best previous forecasts on the test set?
If I did this right, all the means on box cox are 11.7, while the means on ETS are between 9 and 14. The BoxCox like the same thing as a Naive plot.
set.seed(12345678)
## our best performing model for aus_retail, ETS(M,A,M)
<- aus_retail |>
retail_series3 filter(`Series ID` == sample(aus_retail$`Series ID`,1)) |>
select(Turnover) |>
filter(as.Date(Month) < as.Date("2010-01-01"))
<- retail_series3 |> model(ETS(Turnover))
fit3 <- fit3 |> forecast(h ="2 years")
r_forecast3
## Now let's box cox our turnover column.
<- BoxCox.lambda(retail_series3$Turnover)
lambda <- retail_series3 |>
retail_series3 mutate(Turnover_transformed = BoxCox(Turnover, lambda))
## create the model.
<- retail_series3 |>
stl_fit model(
stl_bc = STL(Turnover_transformed ~ season(window = "periodic"))
)# Seems strange, bu supposedly the seasonally adjusted series is often
# computed by subtracting the seasonal component from the original series
<- stl_fit %>%
stl_components components() %>%
mutate(Season_adjust = Turnover_transformed - season_year) |>
rename(model_name = .model)
# now fit an ETS model on this seasonally adjusted column
<- model(stl_components,
ets_fit_on_sa ets = ETS(Season_adjust ~ error("A") + trend("N") + season("N")))
# and forecast
<- forecast(ets_fit_on_sa, h = "2 years")
forecast_ets_on_sa # invert back to original scale.
<- forecast_ets_on_sa %>%
forecast_ets_on_sa_inverted mutate(Mean = InvBoxCox(.mean, lambda))
# and compare
r_forecast3
# A fable: 24 x 4 [1M]
# Key: .model [1]
.model Month Turnover .mean
<chr> <mth> <dist> <dbl>
1 ETS(Turnover) 2010 Jan N(10, 0.48) 10.0
2 ETS(Turnover) 2010 Feb N(9.2, 0.54) 9.23
3 ETS(Turnover) 2010 Mar N(10, 0.83) 10.1
4 ETS(Turnover) 2010 Apr N(11, 1.1) 10.5
5 ETS(Turnover) 2010 May N(12, 1.7) 11.9
6 ETS(Turnover) 2010 Jun N(13, 2.3) 12.6
7 ETS(Turnover) 2010 Jul N(14, 3.2) 13.7
8 ETS(Turnover) 2010 Aug N(14, 3.7) 13.6
9 ETS(Turnover) 2010 Sep N(13, 3.7) 12.8
10 ETS(Turnover) 2010 Oct N(13, 4.3) 12.9
# ℹ 14 more rows
forecast_ets_on_sa_inverted
# A fable: 24 x 6 [1M]
# Key: model_name, .model [1]
model_name .model Month Season_adjust .mean Mean
<chr> <chr> <mth> <dist> <dbl> <dbl>
1 stl_bc ets 2010 Jan N(1.9, 0.0023) 1.94 11.7
2 stl_bc ets 2010 Feb N(1.9, 0.0032) 1.94 11.7
3 stl_bc ets 2010 Mar N(1.9, 0.0042) 1.94 11.7
4 stl_bc ets 2010 Apr N(1.9, 0.0051) 1.94 11.7
5 stl_bc ets 2010 May N(1.9, 0.006) 1.94 11.7
6 stl_bc ets 2010 Jun N(1.9, 0.007) 1.94 11.7
7 stl_bc ets 2010 Jul N(1.9, 0.0079) 1.94 11.7
8 stl_bc ets 2010 Aug N(1.9, 0.0089) 1.94 11.7
9 stl_bc ets 2010 Sep N(1.9, 0.0098) 1.94 11.7
10 stl_bc ets 2010 Oct N(1.9, 0.011) 1.94 11.7
# ℹ 14 more rows
That seems odd, let’s plot it!
# now let's plot.
<- retail_series3 |>
retail_series3_plot as_tibble() |>
select(Month, Turnover)
<- r_forecast3 |>
r_forecast3_plot as_tibble()|>
mutate(Model = "ETS(M,A,M)")
<- forecast_ets_on_sa_inverted |>
forecast_ets_on_sa_inverted_plot as_tibble() |>
mutate(Model = "ETS on Seasonally Adjusted Data")
# Bind all data together
<- bind_rows(
combined_data |> mutate(Model = "Original Data"),
retail_series3_plot %>% select(Month, Turnover = .mean, Model),
r_forecast3_plot %>% select(Month, Turnover = Mean, Model)
forecast_ets_on_sa_inverted_plot
)
# Plot using ggplot
ggplot(combined_data, aes(x = Month, y = Turnover, color = Model)) +
geom_line() +
labs(title = "Retail Turnover Forecast Comparison", x = "Month", y = "Turnover") +
scale_color_manual(values = c("Original Data" = "black",
"ETS(M,A,M)" = "blue",
"ETS on Seasonally Adjusted Data" = "red")) +
theme_minimal()