Produce forecasts for the following series using whichever of NAIVE(y), SNAIVE(y) or RW(y ~ drift()) is more appropriate in each case:
Assuming the packages have been installed, we can load the libraries:
library(fpp3)
## Warning: package 'fpp3' was built under R version 4.4.2
## Registered S3 method overwritten by 'tsibble':
## method from
## as_tibble.grouped_df dplyr
## ── Attaching packages ──────────────────────────────────────────── fpp3 1.0.1 ──
## ✔ tibble 3.2.1 ✔ tsibble 1.1.6
## ✔ dplyr 1.1.4 ✔ tsibbledata 0.4.1
## ✔ tidyr 1.3.1 ✔ feasts 0.4.1
## ✔ lubridate 1.9.3 ✔ fable 0.4.1
## ✔ ggplot2 3.5.1
## Warning: package 'tsibble' was built under R version 4.4.2
## Warning: package 'tsibbledata' was built under R version 4.4.2
## Warning: package 'feasts' was built under R version 4.4.2
## Warning: package 'fabletools' was built under R version 4.4.2
## Warning: package 'fable' was built under R version 4.4.2
## ── Conflicts ───────────────────────────────────────────────── fpp3_conflicts ──
## ✖ lubridate::date() masks base::date()
## ✖ dplyr::filter() masks stats::filter()
## ✖ tsibble::intersect() masks base::intersect()
## ✖ tsibble::interval() masks lubridate::interval()
## ✖ dplyr::lag() masks stats::lag()
## ✖ tsibble::setdiff() masks base::setdiff()
## ✖ tsibble::union() masks base::union()
library(tsibble)
library(dplyr)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ readr 2.1.5
## ✔ purrr 1.0.2 ✔ stringr 1.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ tsibble::interval() masks lubridate::interval()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(readr)
library(ggplot2)
library(gt)
## Warning: package 'gt' was built under R version 4.4.2
library(seasonal)
## Warning: package 'seasonal' was built under R version 4.4.2
##
## Attaching package: 'seasonal'
##
## The following object is masked from 'package:tibble':
##
## view
library(zoo)
##
## Attaching package: 'zoo'
##
## The following object is masked from 'package:tsibble':
##
## index
##
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
First we load the dataset
data(global_economy)
Now filter the Australian Population data
aus_pop <- global_economy %>%
filter(Country == "Australia") %>%
select(Year, Population) %>%
as_tsibble(index = Year)
Plot
autoplot(aus_pop, Population) +
labs(title = "Australian Population Over Time",
y = "Population (millions)")
In this case we fit a random walk with drift model, since population growth follows a steady trend over time. We then forecast the next 10 years and plot that forecast.
aus_rw_drift_fit <- aus_pop %>%
model(RW(Population ~ drift()))
aus_rw_drift_fc <- aus_rw_drift_fit %>%
forecast(h = 10)
autoplot(aus_pop, Population) +
autolayer(aus_rw_drift_fc, level = 95) +
labs(title = "Forecast: Australian Population (RW with Drift)",
y = "Population (millions)")
Now we repeat similar steps for the next dataset: load it, filter the Bricks data, and plot the result:
data(aus_production)
bricks <- aus_production %>% select(Quarter, Bricks) %>% as_tsibble(index = Quarter)
bricks <- bricks %>% drop_na()
autoplot(bricks, Bricks) +
labs(title = "Brick Production in Australia",
y = "Bricks (millions)")
Now we can try both SNAIVE and RW with drift, since brick production exhibits strong seasonal patterns but also a potential trend. We then forecast 5 years ahead and plot the results.
# Fit and forecast using RW with Drift
bricks_rw_drift_fit <- bricks %>%
model(RW(Bricks ~ drift()))
bricks_rw_drift_fc <- bricks_rw_drift_fit %>%
forecast(h = "1 years")
# Plot the forecast
autoplot(bricks, Bricks) +
autolayer(bricks_rw_drift_fc, level = 95) +
labs(title = "Forecast: Brick Production (RW with Drift, 5 Years)",
y = "Bricks (millions)")
bricks_snaive_fit <- bricks %>%
model(SNAIVE(Bricks))
bricks_snaive_fc <- bricks_snaive_fit %>%
forecast(h = "5 years")
# Plot SNAIVE forecast
autoplot(bricks, Bricks) +
autolayer(bricks_snaive_fc, level = 95) +
labs(title = "Forecast: Brick Production (Seasonal Naive, 5 Years)",
y = "Bricks (millions)")
Given the historical pattern of brick production: if recent years suggest that the seasonality remains strong, SNAIVE is the better choice; if the trend is uncertain or expected to drift unpredictably, RW with drift is safer. In this case SNAIVE makes more sense, since construction and manufacturing tend to follow predictable seasonal demand cycles.
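If we wanted to back this choice up quantitatively, one option (a sketch, not run above) is to hold out the last 12 quarters and compare the test-set accuracy of the two benchmarks:
# Sketch: hold out the last 12 quarters and compare SNAIVE vs RW with drift
bricks_train <- bricks %>% filter(Quarter <= max(Quarter) - 12)
bricks_train %>%
model(Seasonal_Naive = SNAIVE(Bricks),
Drift = RW(Bricks ~ drift())) %>%
forecast(h = 12) %>%
accuracy(bricks)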
Again, we load, filter and plot
data(aus_livestock)
nsw_lambs <- aus_livestock %>%
filter(State == "New South Wales", Animal == "Lambs") %>%
select(Month, Count) %>% # Use Count instead of Lambs
drop_na() %>%
as_tsibble(index = Month)
autoplot(nsw_lambs, Count) +
labs(title = "NSW Lamb Production",
y = "Lambs (thousands)")
Now we apply the seasonal naive method, since breeding cycles, weather conditions, and market demand strongly drive the seasonality in this data.
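As a quick sanity check of that seasonality (an optional step), we can draw a seasonal plot with feasts' gg_season():
# Seasonal plot: one line per year, months on the x-axis
nsw_lambs %>% gg_season(Count) +
labs(title = "Seasonal Plot: NSW Lambs Slaughtered")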
nsw_lambs_snaive_fit <- nsw_lambs %>%
model(SNAIVE(Count))
nsw_lambs_snaive_fc <- nsw_lambs_snaive_fit %>%
forecast(h = "5 years")
# Plot the forecast
autoplot(nsw_lambs, Count) +
autolayer(nsw_lambs_snaive_fc, level = 95) +
labs(title = "Forecast: NSW Lamb Production (Seasonal Naive, 5 Years)",
y = "Lambs (thousands)")
Load, filter and plot.
data(hh_budget)
household_wealth <- hh_budget %>%
select(Year, Wealth) %>% # Use Year instead of Quarter
drop_na() %>%
as_tsibble(index = Year)
autoplot(household_wealth, Wealth) +
labs(title = "Household Wealth Over Time",
y = "Wealth (millions)")
Now we apply RW with drift, since household wealth typically exhibits a long-term growth trend (and with annual data there is no seasonality to exploit).
household_wealth_rw_drift_fit <- household_wealth %>%
model(RW(Wealth ~ drift()))
household_wealth_rw_drift_fc <- household_wealth_rw_drift_fit %>%
forecast(h = "5 years")
household_wealth %>%
autoplot(Wealth) +
autolayer(household_wealth_rw_drift_fc, level = 95, alpha = 0.4) +
facet_wrap(~ Country) +
labs(title = "Forecast: Household Wealth (RW with Drift, 5 Years)",
y = "Wealth (millions)")
Load, filter and plot (we also re-cast Month with yearmonth()).
data(aus_retail)
takeaway_food <- aus_retail %>%
filter(Industry == "Takeaway food services") %>%
select(Month, Turnover) %>%
drop_na() %>%
mutate(Month = yearmonth(Month)) %>%
as_tsibble(index = Month)
autoplot(takeaway_food, Turnover) +
labs(title = "Australian Takeaway Food Turnover",
y = "Turnover (millions)")
Since retail sales (including takeaway food) are highly seasonal, the SNAIVE model seems like a better approach.
takeaway_snaive_fit <- takeaway_food %>%
model(SNAIVE(Turnover))
takeaway_snaive_fc <- takeaway_snaive_fit %>%
forecast(h = "5 years")
takeaway_food %>%
filter(Month >= yearmonth("2000 Jan")) %>%
autoplot(Turnover) +
autolayer(takeaway_snaive_fc, level = 95, alpha = 0.4) +
theme(legend.position = "none") +
facet_wrap(~ State, scales = "free_y") + # Separate states & adjust y-scale
labs(title = "Forecast: Australian Takeaway Food Turnover (Seasonal Naive, 5 Years)",
y = "Turnover (millions)")
Use the Facebook stock price (data set gafa_stock) to do the following:
As usual, we will start by loading the data, filtering for Facebook stock prices and plotting.
data(gafa_stock)
fb_stock <- gafa_stock %>%
filter(Symbol == "FB") %>%
select(Date, Close) %>%
arrange(Date) %>% # Ensure data is sorted by date
as_tsibble(index = Date)
autoplot(fb_stock, Close) +
labs(title = "Facebook Stock Price Over Time",
y = "Stock Price (USD)")
We can observe a general upward trend from 2014 to mid-2018, after which the price declines rapidly with some fluctuation.
When first attempting this step, we got the error "RW(Close ~ drift()) can't handle tsibble of irregular interval": stock prices are only observed on trading days, so the data are not regularly spaced, while the drift model requires a regular time index.
To work around this, we re-index the series by trading day and apply RW with drift to the opening price (from here on we work with Open).
We should also take the horizon into account: the stock market has roughly 21 trading days per month, so 6 months is about 126 trading days.
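As a trivial check of that horizon arithmetic:
# Roughly 21 trading days per month, so a 6-month horizon is about 126 days
trading_days_per_month <- 21
h_days <- 6 * trading_days_per_month # 126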
fb_stock <- gafa_stock %>%
filter(Symbol == "FB") %>%
mutate(day = row_number()) %>% # Create a sequential index
update_tsibble(index = day, regular = TRUE) # Force regularity
fb_drift_fit <- fb_stock %>%
model(RW(Open ~ drift()))
# Forecast the next 126 trading days (≈ 6 months)
fb_drift_fc <- fb_drift_fit %>%
forecast(h = 126)
autoplot(fb_stock, Open) +
autolayer(fb_drift_fc, level = 95) +
labs(title = "Facebook Stock Price Forecast (RW with Drift)",
y = "Stock Price (USD)")
We can see how the uncertainty grows over time, which is expected: under a random walk with drift, the prediction intervals widen as the forecast horizon increases.
Next, we show that these forecasts are equivalent to extending the straight line drawn between the first and last observations: we compute that line manually and overlay it on the RW with drift forecast.
# We need to save the first and last values of Open price
first_value <- first(fb_stock$Open)
last_value <- last(fb_stock$Open)
total_rows <- nrow(fb_stock) # Number of days in dataset
# To facilitate creating the line we will also compute the change per time step to get the slope
slope <- (last_value - first_value) / (total_rows - 1)
# With the previous info we can now create a drift line connecting first and last observation times slope
fb_stock <- fb_stock %>%
mutate(DriftLine = first_value + (day - 1) * slope)
# Finally we plot the data with the manually drawn trend line
autoplot(fb_stock, Open) +
autolayer(fb_drift_fc, level = NULL) +
geom_line(aes(x = day, y = DriftLine), color = "green") +
labs(title = "Facebook Stock Price Forecast (Comparing Forecast to Line Between Observations)",
x = "Days",
y = "Stock Price (USD)")
We can visually confirm that the calculated drift line aligns with the forecast.
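We can also confirm this numerically (a quick sketch): under the drift method, the h-step-ahead forecast equals the last observation plus h times the average historical change, which is exactly the extension of the line between the first and last observations.
# Drift forecast at the final horizon vs. the extended first-to-last line
h <- 126
tail(fb_drift_fc$.mean, 1) # forecast mean 126 trading days ahead
last_value + h * slope # extending the line between the first and last Open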
To explore the other benchmark methods, we can fit several of them to the same data and compare their forecasts.
# Fit different benchmark models
fb_benchmark_fit <- fb_stock %>%
model(
Naive = NAIVE(Open),
Seasonal_Naive = SNAIVE(Open),
RW_Drift = RW(Open ~ drift())
)
## Warning: 1 error encountered for Seasonal_Naive
## [1] Non-seasonal model specification provided, use RW() or provide a different lag specification.
# Forecast for the next 6 months (~126 trading days)
fb_benchmark_fc <- fb_benchmark_fit %>%
forecast(h = 126)
# Plot all forecasts for comparison
autoplot(fb_stock, Open) +
autolayer(fb_benchmark_fc, level = 95, alpha = 0.4) +
theme(legend.position = "none") +
facet_wrap(~ .model, scales = "free_y") +
labs(title = "Comparing Benchmark Forecasts for Facebook Stock",
y = "Stock Price (USD)")
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning
## -Inf
## Warning: Removed 126 rows containing missing values or values outside the scale range
## (`geom_line()`).
We can observe that SNAIVE produces no forecast: with the trading-day index there is no seasonal period defined, so the seasonal naive specification fails (as the warning above indicates), and in any case the Facebook stock data shows no clear seasonal pattern to exploit.
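To judge which of the remaining benchmarks performs best, one possibility (a sketch under the same trading-day index, not run above) is to hold out the last 126 trading days and compare test-set accuracy:
# Sketch: hold out ~6 months of trading days and compare NAIVE vs drift
fb_train <- fb_stock %>% filter(day <= max(day) - 126)
fb_train %>%
model(Naive = NAIVE(Open),
Drift = RW(Open ~ drift())) %>%
forecast(h = 126) %>%
accuracy(fb_stock)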
Apply a seasonal naïve method to the quarterly Australian beer production data from 1992. Check if the residuals look like white noise, and plot the forecasts
First, let’s get the data
beer_production <- aus_production |>
filter(year(Quarter) >= 1992)
We know beer production follows a seasonal pattern, so SNAIVE makes sense.
fit <- beer_production |> model(SNAIVE(Beer))
To check residuals
fit |> gg_tsresiduals()
## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 4 rows containing non-finite outside the scale range
## (`stat_bin()`).
Above we can see that the residuals fluctuate around zero but show some patterns rather than pure randomness, suggesting that the SNAIVE model does not capture all the structure in the data. The ACF shows significant spikes at what look like seasonal lags. The histogram looks roughly normal, but it is hard to tell whether the mean is centred at zero, so there may be some bias.
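To complement the visual check, we can run a Ljung-Box test on the innovation residuals (lag = 8, i.e. two seasonal periods, is a common choice for quarterly data); a small p-value indicates the residuals are not white noise:
# Ljung-Box test on the SNAIVE innovation residuals
fit |> augment() |> features(.innov, ljung_box, lag = 8)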
fit |> forecast() |> autoplot(beer_production)
Overall we can conclude that the residuals are not pure white noise, so SNAIVE captures the seasonality but may not account for other trends or longer-term variations. This is also evident from the sharp peaks and troughs repeated every year in the forecast plot, and the forecast uncertainty keeps growing over time.
Repeat the previous exercise using the Australian Exports series from global_economy and the Bricks series from aus_production. Use whichever of NAIVE() or SNAIVE() is more appropriate in each case.
Let’s load and plot data first
exports <- global_economy |>
filter(Country == "Australia") |>
select(Year, Exports) |>
as_tsibble(index = Year)
autoplot(exports, Exports) +
labs(title = "Australian Exports Over Time", y = "Exports (Millions USD)")
Since Exports is a yearly series, there is no seasonality to exploit and SNAIVE does not apply, so we use a NAIVE model, which assumes that next year's exports will equal this year's value.
fit_exports <- exports |> model(NAIVE(Exports))
fit_exports |> gg_tsresiduals()
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_bin()`).
fit_exports |> forecast(h = 5) |> autoplot(exports) +
labs(title = "Forecast: Australian Exports (NAIVE)", y = "Exports (Millions USD)")
The residuals do not appear to be completely random; there are some periods where they trend up or down, indicating the NAIVE model might not fully capture underlying trends in exports. The histogram appears roughly normal, but there may be some skewness, and the residuals are not completely white noise, implying some structure remains that the model does not capture. The NAIVE model is a simple approach but might not be appropriate here, given the overall upward trend in exports.
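Given that upward trend, a drift benchmark may be more suitable for this series; as a quick sketch, we can compare the two sets of point forecasts side by side:
# Sketch: NAIVE vs drift benchmarks for the Exports series
exports |>
model(Naive = NAIVE(Exports),
Drift = RW(Exports ~ drift())) |>
forecast(h = 5) |>
autoplot(exports, level = NULL) +
labs(title = "Australian Exports: NAIVE vs Drift Benchmarks", y = "Exports (% of GDP)")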
Load and plot first
bricks <- aus_production |>
select(Quarter, Bricks)
autoplot(bricks, Bricks) +
labs(title = "Brick Production in Australia", y = "Bricks (Millions)")
## Warning: Removed 20 rows containing missing values or values outside the scale range
## (`geom_line()`).
fit_bricks <- bricks |> filter(!is.na(Bricks)) %>%
model(SNAIVE(Bricks))
fit_bricks |> gg_tsresiduals()
## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 4 rows containing non-finite outside the scale range
## (`stat_bin()`).
fit_bricks |> forecast() |> autoplot(bricks) +
labs(title = "Forecast: Australian Bricks Production", y = "Bricks (Millions)")
## Warning: Removed 20 rows containing missing values or values outside the scale range
## (`geom_line()`).
In this case the residuals show strong fluctuations over time, there is significant autocorrelation at multiple lags, and the histogram is somewhat bell-shaped but skewed, indicating non-normal residuals.
We could say that SNAIVE was an appropriate choice since the data are clearly seasonal, but it does not fully capture the underlying trend.
For your retail time series (from Exercise 7 in Section 2.10)
Create a training dataset consisting of observations before 2011 using
myseries_train <- myseries |> filter(year(Month) < 2011)
We start by recreating the same myseries used earlier (same seed and everything):
set.seed(86863)
myseries <- aus_retail %>%
filter(`Series ID` == sample(aus_retail$`Series ID`, 1))
glimpse(myseries)
## Rows: 441
## Columns: 5
## Key: State, Industry [1]
## $ State <chr> "New South Wales", "New South Wales", "New South Wales", "…
## $ Industry <chr> "Takeaway food services", "Takeaway food services", "Takea…
## $ `Series ID` <chr> "A3349792X", "A3349792X", "A3349792X", "A3349792X", "A3349…
## $ Month <mth> 1982 Apr, 1982 May, 1982 Jun, 1982 Jul, 1982 Aug, 1982 Sep…
## $ Turnover <dbl> 85.4, 84.8, 80.7, 82.4, 80.7, 82.1, 87.3, 87.4, 97.2, 93.0…
Now we filter the dataset to include only data before 2011.
myseries_train <- myseries |>
filter(year(Month) < 2011)
Check that your data have been split appropriately by producing the following plot:
autoplot(myseries, Turnover) +
autolayer(myseries_train, Turnover, colour = "red")
Since this is monthly retail turnover data, a SNAIVE model seems like a good starting point.
fit_train <- myseries_train |> model(SNAIVE(Turnover))
fit_train |> gg_tsresiduals()
## Warning: Removed 12 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 12 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 12 rows containing non-finite outside the scale range
## (`stat_bin()`).
The residuals show non-random patterns, meaning the errors are not purely white noise. The ACF plot indicates significant correlations at multiple lags, while the histogram of residuals is approximately normal, perhaps with some skewness.
fc <- fit_train |>
forecast(new_data = anti_join(myseries, myseries_train))
## Joining with `by = join_by(State, Industry, `Series ID`, Month, Turnover)`
fc |> autoplot(myseries) +
labs(title = "Retail Turnover Forecast (SNAIVE)",
y = "Turnover (millions)")
This forecast follows the seasonal pattern but lacks adjustments for long-term trends.
Compare the accuracy of your forecasts against the actual values.
fit_train |> accuracy()
## # A tibble: 1 × 12
## State Industry .model .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 New Sou… Takeawa… SNAIV… Trai… 11.5 26.1 19.2 4.81 9.59 1 1 0.890
fc |> accuracy(myseries)
## # A tibble: 1 × 12
## .model State Industry .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 SNAIVE(T… New … Takeawa… Test 48.6 96.8 79.5 7.67 16.3 4.14 3.71 0.964
How sensitive are the accuracy measures to the amount of training data used?
Training accuracy: RMSE = 26.11, MAE = 19.22, MAPE = 9.59%, ACF1 (residual autocorrelation) = 0.89 (high)
Test accuracy: RMSE = 96.80 (much worse), MAE = 79.54 (worse), MAPE = 16.29% (higher error), ACF1 = 0.96 (very high)
We can see that the test-set accuracy is substantially worse than the training-set accuracy: the error measures (RMSE, MAE, MAPE) all increase, showing that the model is far less accurate on future data. A longer training set might better capture the underlying trend, while a shorter one could introduce bias, so the accuracy measures are quite sensitive to the amount of training data used.
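To make that sensitivity concrete, one could refit the same SNAIVE model on training sets of different lengths and compare the resulting test accuracy (a sketch; the cutoff years are arbitrary illustrative choices, and note that each cutoff also implies a different test period):
# Sketch: test accuracy of SNAIVE as a function of the training cutoff year
cutoffs <- c(2000, 2005, 2011)
purrr::map_dfr(cutoffs, function(y) {
train <- myseries |> filter(year(Month) < y)
test <- myseries |> filter(year(Month) >= y)
train |>
model(SNAIVE(Turnover)) |>
forecast(new_data = test) |>
accuracy(myseries) |>
mutate(cutoff = y)
})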