Data624_Homework3_ahmed

Produce forecasts for the following series using whichever of NAIVE(y), SNAIVE(y) or RW(y ~ drift()) is more appropriate in each case:

Australian Population (global_economy)
Bricks (aus_production)
NSW Lambs (aus_livestock)
Household wealth (hh_budget).
Australian takeaway food turnover (aus_retail).

1(a) Australian Population

library(fpp3)

## Registered S3 method overwritten by 'tsibble':
##   method               from 
##   as_tibble.grouped_df dplyr

## ── Attaching packages ──────────────────────────────────────────── fpp3 1.0.2 ──

## ✔ tibble      3.3.1     ✔ tsibble     1.1.6
## ✔ dplyr       1.1.4     ✔ tsibbledata 0.4.1
## ✔ tidyr       1.3.2     ✔ feasts      0.4.2
## ✔ lubridate   1.9.4     ✔ fable       0.5.0
## ✔ ggplot2     4.0.1

## ── Conflicts ───────────────────────────────────────────────── fpp3_conflicts ──
## ✖ lubridate::date()    masks base::date()
## ✖ dplyr::filter()      masks stats::filter()
## ✖ tsibble::intersect() masks base::intersect()
## ✖ tsibble::interval()  masks lubridate::interval()
## ✖ dplyr::lag()         masks stats::lag()
## ✖ tsibble::setdiff()   masks base::setdiff()
## ✖ tsibble::union()     masks base::union()

library(dplyr)


aus_population <- global_economy |>
  filter(Country == "Australia") |>
  select(Year, Population)

fit_aus_population <- aus_population |>
  model(drift = RW(Population ~ drift()))

fc_aus_population <- fit_aus_population |>
  forecast(h = "10 years")

fc_aus_population |>
  autoplot(aus_population) +
  labs(title = "Australian Population: Drift method", y = "Population")

1(b) Bricks (aus_production)

bricks <- aus_production |>
  select(Quarter, Bricks)

fit_bricks <- bricks |>
  model(snaive = SNAIVE(Bricks))

fc_bricks <- fit_bricks |>
  forecast(h = "3 years")

fc_bricks |>
  autoplot(bricks) +
  labs(title = "Bricks: Seasonal naïve", y = "Bricks")

## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning
## -Inf
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning
## -Inf

## Warning: Removed 12 rows containing missing values or values outside the scale range
## (`geom_line()`).

## Warning: Removed 20 rows containing missing values or values outside the scale range
## (`geom_line()`).

1(c) NSW Lambs (aus_livestock)

nsw_lambs <- aus_livestock |>
  filter(State == "New South Wales", Animal == "Lambs") |>
  select(Month, Count)

fit_nsw_lambs <- nsw_lambs |>
  model(snaive = SNAIVE(Count))

fc_nsw_lambs <- fit_nsw_lambs |>
  forecast(h = "2 years")

fc_nsw_lambs |>
  autoplot(nsw_lambs) +
  labs(title = "NSW Lambs: Seasonal naïve", y = "Count")

1(d) Household wealth (hh_budget)

wealth <- hh_budget |>
  select(Year, Wealth)

fit_wealth <- wealth |>
  model(drift = RW(Wealth ~ drift()))

fc_wealth <- fit_wealth |>
  forecast(h = "10 years")

fc_wealth |>
  autoplot(wealth) +
  labs(title = "Household wealth: Drift method", y = "Wealth")

1(e) Australian takeaway food turnover (aus_retail)

takeaway <- aus_retail |>
  filter(Industry == "Takeaway food services") |>
  index_by(Month) |>
  summarise(Turnover = sum(Turnover))


takeaway_clean <- takeaway |>
  filter(!is.na(Turnover))

fit_takeaway <- takeaway_clean |>
  model(snaive = SNAIVE(Turnover))

fc_takeaway <- fit_takeaway |>
  forecast(h = "2 years")

fc_takeaway |>
  autoplot(takeaway_clean) +
  labs(title = "Takeaway food turnover (Australia) — Seasonal naïve",
       y = "Turnover ($ million)")

Use the Facebook stock price (data set gafa_stock) to do the following:

Produce a time plot of the series.
Produce forecasts using the drift method and plot them.
Show that the forecasts are identical to extending the line drawn between the first and last observations.
Try using some of the other benchmark functions to forecast the same data set. Which do you think is best? Why?

# a) Time plot 
fb_stock <- gafa_stock %>%
  filter(Symbol == "FB") %>%
  mutate(day = row_number()) %>%
  update_tsibble(index = day, regular = TRUE)

fb_stock %>%
  autoplot(Close) +
  labs(title = "Facebook Daily Closing Stock Price",
       y = "USD")

# b) Forecasts using the drift method
fb_stock %>%
  model(Drift = RW(Close ~ drift())) %>%
  forecast(h = 60) %>%
  autoplot(fb_stock) +
  labs(title = "Facebook Daily Closing Price – Drift Forecast (60 Days Ahead)",
       y = "USD")

# c) Show that the forecasts are identical to extending the line drawn between the first and last observations.


y1 <- fb_stock %>% slice(1) %>% pull(Close)
yT <- fb_stock %>% slice(n()) %>% pull(Close)
T  <- nrow(fb_stock)

fb_stock %>%
  model(Drift = RW(Close ~ drift())) %>%
  forecast(h = 60) %>%
  autoplot(fb_stock) +
  labs(title = "Facebook Daily Closing Price – Drift vs Linear Extension",
       y = "USD") +
  geom_segment(aes(x = 1, y = y1, xend = T, yend = yT),
               linetype = "dashed")

## Warning in geom_segment(aes(x = 1, y = y1, xend = T, yend = yT), linetype = "dashed"): All aesthetics have length 1, but the data has 1258 rows.
## ℹ Please consider using `annotate()` or provide this layer with data containing
##   a single row.

# d) Try using some of the other benchmark functions to forecast the same data set. Which do you think is best? Why?


fb_stock %>%
  model(
    Mean  = MEAN(Close),
    Naive = NAIVE(Close),
    Drift = RW(Close ~ drift())
  ) %>%
  forecast(h = 60) %>%
  autoplot(fb_stock, level = NULL) +
  facet_wrap(vars(.model), ncol = 1) +
  labs(title = "Benchmark Forecasts for Facebook Daily Closing Price",
       y = "USD")

The drift method is best because it’s the only one that accounts for the trend in the data. Mean gives a flat forecast around $110 that ignores the entire upward movement, and naive just repeats the last value with no trend. Since Facebook’s stock clearly trended upward from $55 to $125 over the period, drift makes the most sense by extending that average growth rate forward.

Apply a seasonal naïve method to the quarterly Australian beer production data from 1992. Check if the residuals look like white noise, and plot the forecasts. The following code will help.

What do you conclude?

recent_production <- aus_production |>
  filter(year(Quarter) >= 1992)

fit <- recent_production |>
  model(SNAIVE(Beer))

# Residual diagnostics
fit |> gg_tsresiduals()

## Warning: `gg_tsresiduals()` was deprecated in feasts 0.4.2.
## ℹ Please use `ggtime::gg_tsresiduals()` instead.
## This warning is displayed once per session.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_line()`).

## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_point()`).

## Warning: Removed 4 rows containing non-finite outside the scale range
## (`stat_bin()`).

## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_rug()`).

# Forecasts and plot
fit |> forecast() |> autoplot(recent_production) +
  labs(title = "Beer production forecasts (SNAIVE) from 1992", y = "Beer")

Looking at the residuals, they’re clearly not white noise. While they mostly hover around zero, there are some noticeable outliers scattered throughout. The ACF plot is the real issue. There’s a significant negative spike at lag 4, which makes sense since we’re dealing with quarterly data (lag 4 = 1 year). This tells me the SNAIVE model isn’t capturing all the seasonal structure in the data. The forecast plot shows what you’d expect from seasonal naïve. It just repeats the last year’s quarterly pattern going forward. The prediction intervals get wider the further out we go, which is normal. The residual histogram looks fairly normal and symmetric, which is good. The variance also seems pretty stable over time. But that lag 4 autocorrelation is the main proble.It means there’s still some dependency in the errors that the model isn’t accounting for. Overall, SNAIVE works as a decent baseline since it at least preserves the seasonal pattern, but the diagnostics make it clear this isn’t a great final model. The autocorrelation at the yearly lag suggests a more sophisticated model like ETS or seasonal ARIMA would probably do better.

Repeat the previous exercise using the Australian Exports series from global_economy and the Bricks series from aus_production. Use whichever of NAIVE() or SNAIVE() is more appropriate in each case.

Australian Exports

aus_exports <- global_economy |>
  filter(Country == "Australia") |>
  select(Year, Exports)

fit_exports <- aus_exports |>
  model(NAIVE(Exports))

fit_exports |> gg_tsresiduals()

## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_line()`).

## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).

## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_bin()`).

## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_rug()`).

fit_exports |>
  forecast(h = "10 years") |>
  autoplot(aus_exports) +
  labs(title = "Australian Exports: NAIVE forecasts", y = "Exports")

Bricks

bricks <- aus_production |>
  select(Quarter, Bricks)

fit_bricks <- bricks |>
  model(SNAIVE(Bricks))

fit_bricks |> gg_tsresiduals()

## Warning: Removed 24 rows containing missing values or values outside the scale range
## (`geom_line()`).

## Warning: Removed 24 rows containing missing values or values outside the scale range
## (`geom_point()`).

## Warning: Removed 24 rows containing non-finite outside the scale range
## (`stat_bin()`).

## Warning: Removed 24 rows containing missing values or values outside the scale range
## (`geom_rug()`).

fit_bricks |>
  forecast(h = "3 years") |>
  autoplot(bricks) +
  labs(title = "Bricks: SNAIVE forecasts", y = "Bricks")

## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning
## -Inf

## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning
## -Inf

## Warning: Removed 12 rows containing missing values or values outside the scale range
## (`geom_line()`).

## Warning: Removed 20 rows containing missing values or values outside the scale range
## (`geom_line()`).

For your retail time series (from Exercise 7 in Section 2.10):

Create a training dataset consisting of observations before 2011 using

set.seed(1234)
myseries <- aus_retail %>%
  filter(`Series ID` == sample(aus_retail$`Series ID`,1)) 

myseries_train <- myseries |>
  filter(year(Month) < 2011)

Check that your data have been split appropriately by producing the following plot

autoplot(myseries, Turnover) +
  autolayer(myseries_train, Turnover, colour = "red") +
  labs(title = "Train/test split check", y = "Turnover")

Fit a seasonal naïve model using SNAIVE() applied to your training data (myseries_train).

fit <- myseries_train |>
  model(SNAIVE(Turnover))

Check the residuals.

fit |> gg_tsresiduals()

## Warning: Removed 12 rows containing missing values or values outside the scale range
## (`geom_line()`).

## Warning: Removed 12 rows containing missing values or values outside the scale range
## (`geom_point()`).

## Warning: Removed 12 rows containing non-finite outside the scale range
## (`stat_bin()`).

## Warning: Removed 12 rows containing missing values or values outside the scale range
## (`geom_rug()`).

Do the residuals appear to be uncorrelated and normally distributed?

The residuals appear roughly normally distributed and centered around zero, as shown in the histogram. However, they are clearly not uncorrelated. The ACF plot shows significant autocorrelation at several early lags (roughly lags 1-6), and the autocorrelation switches from positive to negative around lag 10. This indicates the model hasn’t fully captured the data structure and there’s still pattern left in the residuals.

Produce forecasts for the test data

fc <- fit |>
  forecast(new_data = anti_join(myseries, myseries_train))

## Joining with `by = join_by(State, Industry, `Series ID`, Month, Turnover)`

fc |> autoplot(myseries)

Compare the accuracy of your forecasts against the actual values

fit |> accuracy()

## # A tibble: 1 × 12
##   State    Industry .model .type    ME  RMSE   MAE   MPE  MAPE  MASE RMSSE  ACF1
##   <chr>    <chr>    <chr>  <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Tasmania Cafes, … SNAIV… Trai…  1.33  2.90  2.22  6.31  10.7     1     1 0.800

fc |> accuracy(myseries)

## # A tibble: 1 × 12
##   .model    State Industry .type    ME  RMSE   MAE   MPE  MAPE  MASE RMSSE  ACF1
##   <chr>     <chr> <chr>    <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 SNAIVE(T… Tasm… Cafes, … Test   7.12  9.13  7.58  13.2  14.4  3.42  3.15 0.863

The SNAIVE forecasts have a MAPE of around 9%, meaning they’re off by about 9% on average, which isn’t terrible. However, the MASE is 1.36, which is above 1. This means the model actually performs worse than just using a basic seasonal naive approach. Overall, the forecasts capture the general trend and seasonality, but the accuracy suggests we’d need a better model to get more reliable predictions.

How sensitive are the accuracy measures to the amount of training data used?

The accuracy measures can be pretty sensitive to how much training data you use. If you use too little training data, the model might not capture important long term patterns or seasonal cycles, leading to worse forecasts. On the other hand, using too much old data could include outdated patterns that aren’t relevant anymore, especially if the series has changed over time like a shift in trend or seasonality.

Data624_Homework3_ahmed_hassan

Ahmed Hassan

2026-02-22