# Import required libraries and data
library(tsibble)
## Warning: package 'tsibble' was built under R version 4.3.3
## Registered S3 method overwritten by 'tsibble':
## method from
## as_tibble.grouped_df dplyr
##
## Attaching package: 'tsibble'
## The following objects are masked from 'package:base':
##
## intersect, setdiff, union
library(fpp3)
## Warning: package 'fpp3' was built under R version 4.3.3
## ── Attaching packages ──────────────────────────────────────────── fpp3 1.0.0 ──
## ✔ tibble 3.2.1 ✔ tsibbledata 0.4.1
## ✔ dplyr 1.1.4 ✔ feasts 0.3.2
## ✔ tidyr 1.3.1 ✔ fable 0.3.4
## ✔ lubridate 1.9.3 ✔ fabletools 0.4.2
## ✔ ggplot2 3.5.1
## ── Conflicts ───────────────────────────────────────────────── fpp3_conflicts ──
## ✖ lubridate::date() masks base::date()
## ✖ dplyr::filter() masks stats::filter()
## ✖ tsibble::intersect() masks base::intersect()
## ✖ lubridate::interval() masks tsibble::interval()
## ✖ dplyr::lag() masks stats::lag()
## ✖ tsibble::setdiff() masks base::setdiff()
## ✖ tsibble::union() masks base::union()
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ readr 2.1.5
## ✔ purrr 1.0.2 ✔ stringr 1.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ lubridate::interval() masks tsibble::interval()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
Consider the number of pigs slaughtered in Victoria, available in the aus_livestock dataset.
# Filter data
pigs_data <- aus_livestock |>
filter(Animal == 'Pigs') |>
filter(State == 'Victoria') |> select(Month, Count)
print(head(pigs_data))
## # A tsibble: 6 x 2 [1M]
## Month Count
## <mth> <dbl>
## 1 1972 Jul 94200
## 2 1972 Aug 105700
## 3 1972 Sep 96500
## 4 1972 Oct 117100
## 5 1972 Nov 104600
## 6 1972 Dec 100500
Use the ETS() function to estimate the equivalent model for simple exponential smoothing. Find the optimal values of \(\alpha\) and \(\ell_0\), and generate forecasts for the next four months.
First we need to see if the data shows a trend or seasonality.
# Plot the data
pigs_data |> autoplot(Count)
Since there is no trend or seasonality evident in this data, we can use simple exponential smoothing.
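For reference, simple exponential smoothing is equivalent to the ETS(A,N,N) state space model from the textbook:
\[
y_t = \ell_{t-1} + \varepsilon_t, \qquad \ell_t = \ell_{t-1} + \alpha\varepsilon_t, \qquad \varepsilon_t \sim \text{N}(0, \sigma^2).
\]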
# Get forecast parameters
pigs_fit <- pigs_data |> model(ETS(Count ~ error("A") + trend("N") + season("N")))
coef(pigs_fit)
## # A tibble: 2 × 3
## .model term estimate
## <chr> <chr> <dbl>
## 1 "ETS(Count ~ error(\"A\") + trend(\"N\") + season(\"N\"))" alpha 0.322
## 2 "ETS(Count ~ error(\"A\") + trend(\"N\") + season(\"N\"))" l[0] 100647.
The optimal value of \(\alpha\) is approximately 0.3221 and the optimal value of \(\ell_0\) is approximately 100,647.
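As a sanity check (a small sketch, not part of the original workflow), the point forecast can be reproduced by iterating the smoothing equation \(\ell_t = \alpha y_t + (1-\alpha)\ell_{t-1}\) with the rounded estimates above; the final level is the point forecast for every future month.
# Sketch: iterate the SES recursion with the (rounded) estimates above
alpha <- 0.322
level <- 100647
for (y in pigs_data$Count) {
  level <- alpha * y + (1 - alpha) * level
}
level # should be close to the .mean of the forecasts below (~95187)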
# Generate forecast
pigs_fit |> forecast(h = 4)
## # A fable: 4 x 4 [1M]
## # Key: .model [1]
## .model Month Count .mean
## <chr> <mth> <dist> <dbl>
## 1 "ETS(Count ~ error(\"A\") + trend(\"N\") + … 2019 Jan N(95187, 8.7e+07) 95187.
## 2 "ETS(Count ~ error(\"A\") + trend(\"N\") + … 2019 Feb N(95187, 9.7e+07) 95187.
## 3 "ETS(Count ~ error(\"A\") + trend(\"N\") + … 2019 Mar N(95187, 1.1e+08) 95187.
## 4 "ETS(Count ~ error(\"A\") + trend(\"N\") + … 2019 Apr N(95187, 1.1e+08) 95187.
Compute a 95% prediction interval for the first forecast using \(\hat{y}\pm1.96s\) where \(s\) is the standard deviation of the residuals. Compare your interval with the interval produced by R.
# Get the data for the first forecast
print(pigs_fit |> forecast(h = 1) |> select(Month, Count))
## # A fable: 1 x 2 [1M]
## Month Count
## <mth> <dist>
## 1 2019 Jan N(95187, 8.7e+07)
# Define variable values
p_hat <- 95187
s <- sd(augment(pigs_fit)$.resid)
# Calculate the lower and upper limits of the prediction interval
ll <- p_hat - (1.96 * s)
ul <- p_hat + (1.96 * s)
pigs_ci <- c(ll, ul)
print(pigs_ci)
## [1] 76871.46 113502.54
# Get the prediction interval produced by R
print(pigs_fit |> forecast(h = 1) |> hilo(level = 95) |> select(Month, '.mean', '95%'))
## # A tsibble: 1 x 3 [1M]
## Month .mean `95%`
## <mth> <dbl> <hilo>
## 1 2019 Jan 95187. [76854.79, 113518.3]95
My calculated prediction interval is largely the same as the one produced by R, but slightly narrower (mine has a "higher floor" and "lower ceiling"). The small difference arises because R uses the one-step forecast variance estimated by the model, which is not identical to the raw standard deviation of the residuals.
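As an aside, the point forecast can be pulled from the fable rather than typed in by hand (a small sketch; fc1 is a name introduced here):
# Sketch: compute the same interval without hardcoding the mean
fc1 <- pigs_fit |> forecast(h = 1)
fc1$.mean[1] + c(-1.96, 1.96) * s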
Data set global_economy contains the annual Exports from many countries. Select one country to analyse.
I’ve chosen to analyze the data for Greece.
export_data <- global_economy |> filter(Code == 'GRC') |> select(Year, Exports)
print(head(export_data))
## # A tsibble: 6 x 2 [1Y]
## Year Exports
## <dbl> <dbl>
## 1 1960 8.51
## 2 1961 8.65
## 3 1962 9.06
## 4 1963 9.35
## 5 1964 8.54
## 6 1965 8.36
Plot the Exports series and discuss the main features of the data.
export_data |> autoplot(Exports)
The data shows a clear upward trend. It is cyclic (it shows rises and falls) but not seasonal (the increases and decreases do not occur at regular intervals).
Use an ETS(A,N,N) model to forecast the series, and plot the forecasts.
# Generate a forecast
export_data |> model(ETS(Exports ~ error("A") + trend("N") + season("N"))) |>
forecast(h = 25) |> autoplot(export_data, level = NULL)
Compute the RMSE values for the training data.
# Create a training set from the first 80% of the data
export_train <- export_data[1:round(nrow(export_data)*.8),]
export_forecast <- export_train |>
model(ETS(Exports ~ error("A") + trend("N") + season("N"))) |>
forecast(new_data = anti_join(export_data, export_train))
## Joining with `by = join_by(Year, Exports)`
# Report the RMSE of the forecast
accuracy(export_forecast, export_data) |> select(.model, RMSE)
## # A tibble: 1 × 2
## .model RMSE
## <chr> <dbl>
## 1 "ETS(Exports ~ error(\"A\") + trend(\"N\") + season(\"N\"))" 7.20
The RMSE is approximately 7.1986.
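For reference, the RMSE over the \(n\) test-set forecast errors \(e_t = y_t - \hat{y}_t\) is
\[
\text{RMSE} = \sqrt{\frac{1}{n}\sum_{t=1}^{n} e_t^2}.
\]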
Compare the results to those from an ETS(A,A,N) model. (Remember that the trended model is using one more parameter than the simpler model.) Discuss the merits of the two forecasting methods for this data set.
# Generate an AAN forecast
export_data |>
model(ETS(Exports ~ error("A") + trend("A") + season("N"))) |>
forecast(h = 25) |>
autoplot(export_data, level = NULL)
The AAN model’s additional parameter allows it to account for the positive trend in the underlying data, which the ANN model does not consider.
Compare the forecasts from both methods. Which do you think is best?
The competing forecasts are graphed on the same axes below.
# Plot both forecasts on the same graph
export_data |>
model(
ANN = ETS(Exports ~ error("A") + trend("N") + season("N")),
AAN = ETS(Exports ~ error("A") + trend("A") + season("N"))) |>
forecast(h = 25) |>
autoplot(export_data, level = NULL)
I think the AAN model is more appropriate here: the data shows a clear upward trend, and a model that accounts for that trend should produce better forecasts than one that does not. To check, I'll do a train/test split and compute the RMSE for each model.
# Create each type of model from 80% of the original data
export_forecast_2 <- export_train |>
model(
ANN = ETS(Exports ~ error("A") + trend("N") + season("N")),
AAN = ETS(Exports ~ error("A") + trend("A") + season("N"))) |>
forecast(new_data = anti_join(export_data, export_train))
## Joining with `by = join_by(Year, Exports)`
# Report the RMSE of each model
accuracy(export_forecast_2, export_data) |> select(.model, RMSE)
## # A tibble: 2 × 2
## .model RMSE
## <chr> <dbl>
## 1 AAN 5.37
## 2 ANN 7.20
As expected, the AAN model has the lower RMSE on this test set, so it produces the better forecasts here.
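Since the exercise notes that the trended model uses an extra parameter, a complementary comparison is the AICc, which penalises those extra parameters (a sketch using glance()):
# Sketch: compare the two models on AICc, which accounts for the
# additional parameters of the trended model
export_data |>
  model(
    ANN = ETS(Exports ~ error("A") + trend("N") + season("N")),
    AAN = ETS(Exports ~ error("A") + trend("A") + season("N"))) |>
  glance() |>
  select(.model, AICc)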
Calculate a 95% prediction interval for the first forecast for each model, using the RMSE values and assuming normal errors. Compare your intervals with those produced using R.
# Define models
exports_mod_1 <- export_data |> model(ETS(Exports ~ error("A") + trend("N") + season("N")))
exports_mod_2 <- export_data |> model(ETS(Exports ~ error("A") + trend("A") + season("N")))
# Get s for each model
s_mod_1 <- sd(augment(exports_mod_1)$.resid)
s_mod_2 <- sd(augment(exports_mod_2)$.resid)
# Get p_hat for model 1
exports_mod_1 |>
forecast(h = 1) |>
select(Year, Exports)
## # A fable: 1 x 2 [1Y]
## Year Exports
## <dbl> <dist>
## 1 2018 N(33, 3.4)
# Get p_hat for model 2
exports_mod_2 |>
forecast(h = 1) |>
select(Year, Exports)
## # A fable: 1 x 2 [1Y]
## Year Exports
## <dbl> <dist>
## 1 2018 N(34, 3.3)
# Calculate a 95% prediction interval for model 1 (ANN)
ll_ann <- 33 - (1.96 * s_mod_1)
ul_ann <- 33 + (1.96 * s_mod_1)
ann_ci <- c(ll_ann, ul_ann)
print(ann_ci)
## [1] 29.50899 36.49101
# Get the 95% prediction interval for model 1 (ANN) using R
print(exports_mod_1 |> forecast(h = 1) |> hilo(level = 95) |> select(Year, '.mean', '95%'))
## # A tsibble: 1 x 3 [1Y]
## Year .mean `95%`
## <dbl> <dbl> <hilo>
## 1 2018 33.2 [29.5966, 36.84272]95
For the ANN model, I calculate a 95% prediction interval from 29.509 to 36.491, while R reports 29.5966 to 36.8427. The intervals are very similar; part of the difference comes from rounding the point forecast down to 33 (R uses 33.2), and part from R using the model's forecast variance rather than the residual standard deviation.
# Calculate a 95% prediction interval for model 2 (AAN)
ll_aan <- 34 - (1.96 * s_mod_2)
ul_aan <- 34 + (1.96 * s_mod_2)
aan_ci <- c(ll_aan, ul_aan)
print(aan_ci)
## [1] 30.50887 37.49113
# Get the 95% prediction interval for model 2 (AAN) using R
print(exports_mod_2 |> forecast(h = 1) |> hilo(level = 95) |> select(Year, '.mean', '95%'))
## # A tsibble: 1 x 3 [1Y]
## Year .mean `95%`
## <dbl> <dbl> <hilo>
## 1 2018 33.7 [30.07503, 37.24849]95
For the AAN model, I calculate a 95% prediction interval from 30.5089 to 37.4911, while R reports 30.075 to 37.2485. Again the intervals are very similar; here I rounded the point forecast up to 34 (R uses 33.7), which shifts my interval slightly higher on both ends.
Notably, the upper and lower limits of the prediction interval for the AAN model (which accounts for the upward trend in the existing data) are both higher than the corresponding limits of the prediction interval for the ANN model (which does not).
Forecast the Chinese GDP from the global_economy data set using an ETS model. Experiment with the various options in the ETS() function to see how much the forecasts change with damped trend, or with a Box-Cox transformation. Try to develop an intuition of what each is doing to the forecasts.
To make the differences easier to see, I will forecast 50 years ahead.
# Filter data
china_gdp_data <- global_economy |> filter(Code == 'CHN') |> select(Year, GDP)
print(head(china_gdp_data))
## # A tsibble: 6 x 2 [1Y]
## Year GDP
## <dbl> <dbl>
## 1 1960 59716467625.
## 2 1961 50056868958.
## 3 1962 47209359006.
## 4 1963 50706799903.
## 5 1964 59708343489.
## 6 1965 70436266147.
To start, here are the ANN and AAN models.
# Generate ANN and AAN forecasts
china_gdp_data |>
model(
ANN = ETS(GDP ~ error("A") + trend("N") + season("N")),
AAN = ETS(GDP ~ error("A") + trend("A") + season("N"))) |>
forecast(h = 50) |>
autoplot(china_gdp_data, level = NULL)
As discussed in the previous exercise, the AAN model accounts for the existing upward trend in the data, while the ANN model does not.
Next, I fit damped trend models with different values of \(\phi\), ranging from the minimum suggested by the textbook (0.8) to the maximum it suggests (0.98), with an intermediate value of 0.9.
# Generate damped AAN forecasts
china_gdp_data |>
model(
'phi = 0.8' = ETS(GDP ~ error("A") + trend("Ad", phi = 0.8) + season("N")),
'phi = 0.9' = ETS(GDP ~ error("A") + trend("Ad", phi = 0.9) + season("N")),
'phi = 0.98' = ETS(GDP ~ error("A") + trend("Ad", phi = 0.98) + season("N"))) |>
forecast(h = 50) |>
autoplot(china_gdp_data, level = NULL)
The damped trend model projects the existing trend forward but lets the growth level off. Values of \(\phi\) close to 1 "release" the trend slowly, so the forecast takes a long time to flatten; values near the textbook minimum of 0.8 "release" it quickly, so the forecast rises briefly and then rapidly levels off.
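This behaviour follows directly from the damped trend point forecast given in the textbook,
\[
\hat{y}_{t+h|t} = \ell_t + (\phi + \phi^2 + \cdots + \phi^h)\,b_t,
\]
which converges to \(\ell_t + \frac{\phi}{1-\phi}\,b_t\) as \(h \to \infty\): the smaller \(\phi\) is, the faster the geometric series converges and the sooner the forecast flattens.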
# Get a value for lambda
lambda_china <- china_gdp_data |>
features(GDP, features = guerrero) |>
pull(lambda_guerrero)
# Apply a Box-Cox transformation
china_bc <- china_gdp_data |>
mutate(b_c = box_cox(GDP, lambda_china)) |>
select(Year, b_c)
# Create forecasts of Box-Cox transformed data
china_bc |>
model(
'ANN' = ETS(b_c ~ error("A") + trend("N") + season("N")),
'AAN' = ETS(b_c ~ error("A") + trend("A") + season("N")),
'phi = 0.8' = ETS(b_c ~ error("A") + trend("Ad", phi = 0.8) + season("N")),
'phi = 0.9' = ETS(b_c ~ error("A") + trend("Ad", phi = 0.9) + season("N")),
'phi = 0.98' = ETS(b_c ~ error("A") + trend("Ad", phi = 0.98) + season("N"))) |>
forecast(h = 50) |>
autoplot(china_bc, level = NULL)
The Box-Cox transformation "straightens" the underlying data: the strongly exponential growth in GDP becomes roughly linear on the transformed scale, so the additive trend models can track it directly.
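For reference, the transformation applied here (for the positive GDP series and \(\lambda \neq 0\)) is \(w_t = (y_t^\lambda - 1)/\lambda\), with \(w_t = \log(y_t)\) when \(\lambda = 0\). Note that the forecasts above remain on the transformed scale. As a sketch (reusing lambda_china from above; the damped model with \(\phi = 0.9\) is just an illustrative choice), specifying the transformation inside the model formula lets fable back-transform the forecasts to the original GDP scale automatically:
# Sketch: box_cox() inside the model formula, so forecasts are
# back-transformed to the original GDP scale
china_gdp_data |>
  model(ETS(box_cox(GDP, lambda_china) ~ error("A") + trend("Ad", phi = 0.9) + season("N"))) |>
  forecast(h = 50) |>
  autoplot(china_gdp_data, level = NULL)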
Find an ETS model for the Gas data from aus_production
and forecast the next few years. Why is multiplicative seasonality
necessary here? Experiment with making the trend damped. Does it improve
the forecasts?
# Select Data
aus_gas_data <- aus_production |> select(Quarter, Gas)
print(head(aus_gas_data))
## # A tsibble: 6 x 2 [1Q]
## Quarter Gas
## <qtr> <dbl>
## 1 1956 Q1 5
## 2 1956 Q2 6
## 3 1956 Q3 7
## 4 1956 Q4 6
## 5 1957 Q1 5
## 6 1957 Q2 7
# Investigate Data
aus_gas_data |> autoplot(Gas)
This data shows an overall trend upward and seasonal variation, with the amount of variation increasing as the value of the series increases. This makes an MAM model appropriate here.
# Forecast data using an MAM model (multiplicative seasonality)
aus_gas_data |>
model(ETS(Gas ~ error("M") + trend("A") + season("M"))) |>
forecast(h = 20) |>
autoplot(aus_gas_data, level = NULL)
Multiplicative seasonality is necessary because the seasonal variation increases as the level of the series increases. At the beginning of the series the values and the seasonal swings are both small, but as the series grows the seasonal fluctuations grow with it. Additive seasonality cannot capture this progression: it produces seasonal variations of constant size, as shown below.
# Forecast data using an AAA model (additive seasonality)
aus_gas_data |>
model(ETS(Gas ~ error("A") + trend("A") + season("A"))) |>
forecast(h = 20) |>
autoplot(aus_gas_data, level = NULL)
To better observe the difference between the additive and multiplicative seasonal forecasts, here are the same forecasts over the next 50 years (instead of the next 5).
# Compare additive and multiplicative seasonality forecasts
aus_gas_data |>
model(
additive = ETS(Gas ~ error("A") + trend("A") + season("A")),
multiplicative = ETS(Gas ~ error("M") + trend("A") + season("M"))) |>
forecast(h = 200) |>
autoplot(aus_gas_data, level = NULL)
Because this is a multiplicative seasonality forecast, the size of the variation increases as the value of the series increases. In turn, this causes the “peaks” of the data to reach even higher, increasing variation further. This creates a feedback loop causing the forecast to take on an exponential shape, which is not evident in the underlying data. A damped forecast could correct for that.
# Create a damped forecast
aus_gas_data |>
model(ETS(Gas ~ error("A") + trend("Ad") + season("A"))) |>
forecast(h = 200) |>
autoplot(aus_gas_data, level = NULL)
Making the trend damped improves the forecast by leveling off the growth over time, producing a forecast that looks more consistent with the existing series.
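Note that the damped model above also switched to additive seasonality. Since the explosive shape came from the multiplicative model, it is also worth damping the trend while keeping the seasonality multiplicative (a sketch, ETS(M,Ad,M) with the default \(\phi\)):
# Sketch: damped trend with multiplicative seasonality retained
aus_gas_data |>
  model(ETS(Gas ~ error("M") + trend("Ad") + season("M"))) |>
  forecast(h = 200) |>
  autoplot(aus_gas_data, level = NULL)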
Recall your retail time series data (from Exercise 7 in Section 2.10).
# Recreate the retail time series data (code provided by the textbook)
set.seed(31415)
myseries <- aus_retail |>
filter(`Series ID` == sample(aus_retail$`Series ID`,1))
print(head(myseries))
## # A tsibble: 6 x 5 [1M]
## # Key: State, Industry [1]
## State Industry `Series ID` Month Turnover
## <chr> <chr> <chr> <mth> <dbl>
## 1 Northern Territory Other recreational goods ret… A3349843L 1988 Apr 0.6
## 2 Northern Territory Other recreational goods ret… A3349843L 1988 May 1.1
## 3 Northern Territory Other recreational goods ret… A3349843L 1988 Jun 0.9
## 4 Northern Territory Other recreational goods ret… A3349843L 1988 Jul 0.9
## 5 Northern Territory Other recreational goods ret… A3349843L 1988 Aug 1.1
## 6 Northern Territory Other recreational goods ret… A3349843L 1988 Sep 1.2
# Select relevant columns
myseries <- myseries |> select(Month, Turnover)
Why is multiplicative seasonality necessary for this series?
myseries |> autoplot(Turnover)
Multiplicative seasonality is necessary for this series because the seasonal variation increases with the level of the series. At the beginning, when the level is low, the seasonal swings are small; they become much larger as the series grows.
Apply Holt-Winters’ multiplicative method to the data. Experiment with making the trend damped.
# Forecast data using an MAM model with and without damping
myseries |>
model(
'Not Damped' = ETS(Turnover ~ error("M") + trend("A") + season("M")),
'Damped' = ETS(Turnover ~ error("M") + trend("Ad") + season("M"))) |>
forecast(h = 40) |>
autoplot(myseries, level = NULL)
The damped forecast results in a slight downward trend compared to the forecast that is not damped.
Compare the RMSE of the one-step forecasts from the two methods. Which do you prefer?
To evaluate RMSE we need a training set and a test set.
train_data <- myseries[1:round(nrow(myseries)*.8),]
myseries_forecast <- train_data |>
model(
'Not Damped' = ETS(Turnover ~ error("M") + trend("A") + season("M")),
'Damped' = ETS(Turnover ~ error("M") + trend("Ad") + season("M"))) |>
forecast(new_data = anti_join(myseries, train_data))
## Joining with `by = join_by(Month, Turnover)`
accuracy(myseries_forecast, myseries) |> select(.model, RMSE)
## # A tibble: 2 × 2
## .model RMSE
## <chr> <dbl>
## 1 Damped 1.31
## 2 Not Damped 1.71
The damped method produces the lower RMSE on this test set, so it would be preferred.
Check that the residuals from the best method look like white noise.
myseries |>
model(ETS(Turnover ~ error("M") + trend("Ad") + season("M"))) |>
gg_tsresiduals()
Since the residuals do not display a pattern or significant outliers, they look like white noise.
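A more formal check is the Ljung-Box test on the innovation residuals (a sketch; a large p-value is consistent with white noise, and lag = 24 follows the usual 2m guideline for monthly data):
# Sketch: Ljung-Box test on the innovation residuals
myseries |>
  model(ETS(Turnover ~ error("M") + trend("Ad") + season("M"))) |>
  augment() |>
  features(.innov, ljung_box, lag = 24)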
Now find the test set RMSE, while training the model to the end of 2010. Can you beat the seasonal naïve approach from Exercise 7 in Section 5.11?
# Filter data
train_2010 <- myseries |>
filter_index('1988 Apr' ~ '2010 Dec')
# Generate a forecast for the remaining data
myseries_forecast_2 <- train_2010 |>
model(ETS(Turnover ~ error("M") + trend("Ad") + season("M"))) |>
forecast(new_data = anti_join(myseries, train_2010))
## Joining with `by = join_by(Month, Turnover)`
# Report the RMSE
accuracy(myseries_forecast_2, myseries) |> select(.model, RMSE)
## # A tibble: 1 × 2
## .model RMSE
## <chr> <dbl>
## 1 "ETS(Turnover ~ error(\"M\") + trend(\"Ad\") + season(\"M\"))" 0.807
# Accuracy of the seasonal naïve method (code provided by the textbook)
myseries_train <- myseries |>
filter(year(Month) < 2011)
fit <- myseries_train |>
model(SNAIVE())
## Model not specified, defaulting to automatic modelling of the `Turnover`
## variable. Override this using the model formula.
fc <- fit |>
forecast(new_data = anti_join(myseries, myseries_train))
## Joining with `by = join_by(Month, Turnover)`
fc |> accuracy(myseries) |> select(.model, RMSE)
## # A tibble: 1 × 2
## .model RMSE
## <chr> <dbl>
## 1 SNAIVE() 0.953
With a test RMSE of 0.807 against 0.953 for the seasonal naïve benchmark, the ETS(M,Ad,M) model does beat the seasonal naïve approach.
For the same retail data, try an STL decomposition applied to the Box-Cox transformed series, followed by ETS on the seasonally adjusted data. How does that compare with your best previous forecasts on the test set?
# Determine a value for lambda
lambda <- myseries |>
features(Turnover, features = guerrero) |>
pull(lambda_guerrero)
# Apply a Box-Cox transformation
myseries_bc <- myseries |>
mutate(b_c = box_cox(Turnover, lambda)) |>
select(Month, b_c)
# Perform STL Decomposition
stl_adjusted <- myseries_bc |>
model(STL(b_c)) |>
components() |>
as_tsibble() |>
select(Month, season_adjust)
# Create a training data set, fit a model, and create a forecast
stl_train <- stl_adjusted |> filter(year(Month) < 2011)
ets_fit <- stl_train |> model(ETS(season_adjust ~ error("A") + trend("Ad") + season("N")))
new_fc <- ets_fit |> forecast(new_data = anti_join(stl_adjusted, stl_train))
## Joining with `by = join_by(Month, season_adjust)`
# Report the accuracy of the forecast
new_fc |> accuracy(stl_adjusted) |> select(.model, RMSE)
## # A tibble: 1 × 2
## .model RMSE
## <chr> <dbl>
## 1 "ETS(season_adjust ~ error(\"A\") + trend(\"Ad\") + season(\"N\"))" 0.133
With an RMSE of approximately 0.1327, this looks like the best forecast by far. Note, however, that this RMSE is measured on the Box-Cox transformed, seasonally adjusted scale, so it is not directly comparable to the earlier RMSEs, which are on the original Turnover scale.
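To get an RMSE that is comparable on the original Turnover scale, one option (a sketch using decomposition_model() and the lambda computed above; the unmodelled seasonal component defaults to SNAIVE()) is to let fable handle the decomposition, the re-seasonalising, and the back-transformation in a single model:
# Sketch: STL on the Box-Cox scale, ETS on the seasonally adjusted
# series; forecasts are re-seasonalised and back-transformed to the
# original Turnover scale, so this RMSE is comparable to the earlier ones
dcmp_fit <- myseries |>
  filter(year(Month) < 2011) |>
  model(decomposition_model(
    STL(box_cox(Turnover, lambda)),
    ETS(season_adjust ~ error("A") + trend("Ad") + season("N"))
  ))
dcmp_fit |>
  forecast(new_data = filter(myseries, year(Month) >= 2011)) |>
  accuracy(myseries) |>
  select(.model, RMSE)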