Homework 3

library(dplyr)

Attaching package: 'dplyr'
The following objects are masked from 'package:stats':

    filter, lag
The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union
library(fpp3)
Registered S3 method overwritten by 'tsibble':
  method               from 
  as_tibble.grouped_df dplyr
── Attaching packages ──────────────────────────────────────────── fpp3 1.0.2 ──
✔ tibble      3.3.1     ✔ tsibble     1.1.6
✔ tidyr       1.3.2     ✔ tsibbledata 0.4.1
✔ lubridate   1.9.4     ✔ feasts      0.4.2
✔ ggplot2     4.0.1     ✔ fable       0.5.0
── Conflicts ───────────────────────────────────────────────── fpp3_conflicts ──
✖ lubridate::date()    masks base::date()
✖ dplyr::filter()      masks stats::filter()
✖ tsibble::intersect() masks base::intersect()
✖ tsibble::interval()  masks lubridate::interval()
✖ dplyr::lag()         masks stats::lag()
✖ tsibble::setdiff()   masks base::setdiff()
✖ tsibble::union()     masks base::union()

5.1 Produce forecasts for the following series using whichever of NAIVE(y), SNAIVE(y) or RW(y ~ drift()) is more appropriate in each case:

  • Australian Population (global_economy)
  • Bricks (aus_production)
  • NSW Lambs (aus_livestock)
  • Household wealth (hh_budget).
  • Australian takeaway food turnover (aus_retail). ### Australian Population (global_economy)
global_economy %>%
  # Filter and Create the column first
  filter(Country == "Australia") %>%
  mutate(Population_M = Population / 1e6) %>%
  # Model
  model(Drift = RW(Population_M ~ drift())) %>%
  forecast(h = 15) %>%
  # Make the graph
  autoplot(global_economy %>% filter(Country == "Australia") %>% mutate(Population_M = Population / 1e6)) +
  labs(title = "Australian Population (global_economy) 15 Year Forecast", 
       x = "Year", 
       y = "Population (Millions)")

Bricks (aus_production)

brick_data <- aus_production %>% filter(!is.na(Bricks))

brick_data %>%
  model(SNaive = SNAIVE(Bricks)) %>%
  forecast(h = 8) %>%
  autoplot(brick_data) + 
  labs(title = "Bricks Forecast", y = "Millions of Bricks")

NSW Lambs (aus_livestock)

nsw_lambs <- aus_livestock %>%
  filter(State == "New South Wales", Animal == "Lambs") %>%
  mutate(Lambs_M = Count / 1e6)

nsw_lambs %>%
  model(SNaive = SNAIVE(Lambs_M)) %>%
  forecast(h = 24) %>%
  autoplot(nsw_lambs) + # Simple and clean!
  labs(title = "NSW Lambs Production Forecast",
       y = "Lambs (Millions)",
       x = "Month")

Household wealth (hh_budget)

hh_budget %>%
  filter(Country == "Australia") %>%
  model(Drift = RW(Wealth ~ drift())) %>%
    forecast(h = 10) %>%
    autoplot(hh_budget %>% filter(Country == "Australia")) +
    labs(title = "Australian Household Wealth Forecast",
       subtitle = "Using Random Walk with Drift",
       y = "Wealth (% of disposable income)",
       x = "Year")

Australian takeaway food turnover (aus_retail)

takeaway_data <- aus_retail %>%
  filter(Industry == "Takeaway food services") %>%
  summarise(Total_Turnover = sum(Turnover))

takeaway_data %>%
  model(SNaive = SNAIVE(Total_Turnover)) %>%
  forecast(h = "2 years") %>%
  autoplot(takeaway_data) +
  labs(
    title = "Australian Takeaway Food Turnover Forecast",
    subtitle = "Seasonal Naive benchmark (2-year horizon)",
    y = "Turnover ($ Millions AUD)",
    x = "Month"
  )

5.2 Use the Facebook stock price (data set gafa_stock) to do the following:

  • Produce a time plot of the series.
  • Produce forecasts using the drift method and plot them.
  • Show that the forecasts are identical to extending the line drawn between the first and last observations.
  • Try using some of the other benchmark functions to forecast the same data set. Which do you think is best? Why?

Produce a time plot of the series.

fb_stock <- gafa_stock %>%
  filter(Symbol == "FB") 

autoplot(fb_stock, Close) +
  labs(title = "Facebook Closing Stock Price",
       subtitle = "2014 - 2018",
       y = "Price ($USD)",
       x = "Trading Days")

Produce forecasts using the drift method and plot them & Show that the forecasts are identical to extending the line drawn between the first and last observations.

#PREPARE DATA
fb_clean <- gafa_stock %>%
  filter(Symbol == "FB") %>%
  index_by(Month = yearmonth(Date)) %>%
  summarise(Close = mean(Close, na.rm = TRUE))

#Make Forecast
fb_clean %>%
  model(Drift = RW(Close ~ drift())) %>%
  forecast(h = 12) %>%
  autoplot(fb_clean) +
  annotate("segment", 
           x = min(fb_clean$Month), 
           y = fb_clean$Close[1], 
           xend = max(fb_clean$Month), 
           yend = tail(fb_clean$Close, 1),
           color = "red", linetype = "dashed") +
  labs(title = "Facebook Stock Price: Drift Forecast",
       subtitle = "Dashed red line connects the first and last observations",
       y = "Price ($USD)",
       x = "Year/Month")

Try using some of the other benchmark functions to forecast the same data set. Which do you think is best? Why?

gafa_stock %>%
  filter(Symbol == "FB") %>%
  index_by(Month = yearmonth(Date)) %>%
  summarise(Close = mean(Close, na.rm = TRUE)) %>%
  filter_index("2014 Jan" ~ "2017 Dec") %>%
  model(
    Mean = MEAN(Close),
    Naive = NAIVE(Close),
    Seasonal_Naive = SNAIVE(Close),
    Drift = RW(Close ~ drift())
  ) %>%
  forecast(h = 12) %>%
  autoplot(level = NULL) +
  labs(
    title = "Forecasts for Monthly Facebook Stock Price",
    subtitle = "Comparing benchmarks against actual 2018 data",
    y = "Close Price ($US)",
    x = "Year/Month"
  ) +
  guides(colour = guide_legend(title = "Model"))

5.3 Apply a seasonal naïve method to the quarterly Australian beer production data from 1992. Check if the residuals look like white noise, and plot the forecasts. The following code will help.

recent_production <- aus_production |>
  filter(year(Quarter) >= 1992)
# Define and estimate a model
fit <- recent_production |> model(SNAIVE(Beer))
# Look at the residuals
fit |> gg_tsresiduals()
Warning: `gg_tsresiduals()` was deprecated in feasts 0.4.2.
ℹ Please use `ggtime::gg_tsresiduals()` instead.
Warning: Removed 4 rows containing missing values or values outside the scale range
(`geom_line()`).
Warning: Removed 4 rows containing missing values or values outside the scale range
(`geom_point()`).
Warning: Removed 4 rows containing non-finite outside the scale range
(`stat_bin()`).
Warning: Removed 4 rows containing missing values or values outside the scale range
(`geom_rug()`).

# Look a some forecasts
fit |> forecast() |> autoplot(recent_production)

The residuals don’t look like white noise and are normally distribute, in the time plot I don’t see any patterns or trends, and most autocorrected bars are within the blue dashed line.

5.4 Repeat the previous exercise using the Australian Exports series from global_economy and the Bricks series from aus_production. Use whichever of NAIVE() or SNAIVE() is more appropriate in each case.

exports_fit <- global_economy %>%
  filter(Country == "Australia") %>%
  model(Naive = NAIVE(Exports))

exports_fit %>% gg_tsresiduals()
Warning: Removed 1 row containing missing values or values outside the scale range
(`geom_line()`).
Warning: Removed 1 row containing missing values or values outside the scale range
(`geom_point()`).
Warning: Removed 1 row containing non-finite outside the scale range
(`stat_bin()`).
Warning: Removed 1 row containing missing values or values outside the scale range
(`geom_rug()`).

exports_fit %>% 
  forecast(h = 10) %>% 
  autoplot(global_economy %>% filter(Country == "Australia")) +
  labs(title = "Australian Exports: Naive Forecast", y = "% of GDP")

bricks_fit <- aus_production %>%
  filter(!is.na(Bricks)) %>%
  model(SNaive = SNAIVE(Bricks))

bricks_fit %>% gg_tsresiduals()
Warning: Removed 4 rows containing missing values or values outside the scale range
(`geom_line()`).
Warning: Removed 4 rows containing missing values or values outside the scale range
(`geom_point()`).
Warning: Removed 4 rows containing non-finite outside the scale range
(`stat_bin()`).
Warning: Removed 4 rows containing missing values or values outside the scale range
(`geom_rug()`).

bricks_fit %>% 
  forecast(h = "5 years") %>% 
  autoplot(aus_production) +
  labs(title = "Clay Brick Production: Seasonal Naive Forecast", y = "Millions of bricks")
Warning: Removed 20 rows containing missing values or values outside the scale range
(`geom_line()`).

I don’t think Naive() or Snaive() are better. The residual patterns don’t look like white noise, no trend, or seasonal pattern. There’s a significant spike in lag 4 so I think that the relationship isn’t being fully captured.

5.7 For your retail time series (from Exercise 7 in Section 2.10):

a) Create a training dataset consisting of observations before 2011 using

myseries <- aus_retail %>%
  filter(`Series ID` == sample(aus_retail$`Series ID`, 1))

myseries_train <- myseries |>
  filter(year(Month) < 2011)

b) Check that your data have been split appropriately by producing the following plot.

autoplot(myseries, Turnover) +
  autolayer(myseries_train, Turnover, colour = "red")

c) Fit a seasonal naïve model using SNAIVE() applied to your training data (myseries_train).

fit <- myseries_train %>%
  model(SNAIVE(Turnover))

d) Check the residuals.

fit |> gg_tsresiduals()
Warning: Removed 12 rows containing missing values or values outside the scale range
(`geom_line()`).
Warning: Removed 12 rows containing missing values or values outside the scale range
(`geom_point()`).
Warning: Removed 12 rows containing non-finite outside the scale range
(`stat_bin()`).
Warning: Removed 12 rows containing missing values or values outside the scale range
(`geom_rug()`).

The hitsgram shows what looks like a normal distribution. I don’t think the residuals are uncorrelated.

e) Produce forecasts for the test data

fc <- fit |>
  forecast(new_data = anti_join(myseries, myseries_train))
Joining with `by = join_by(State, Industry, `Series ID`, Month, Turnover)`
fc |> autoplot(myseries)

f) Compare the accuracy of your forecasts against the actual values.

fit |> accuracy()
# A tibble: 1 × 12
  State    Industry .model .type    ME  RMSE   MAE   MPE  MAPE  MASE RMSSE  ACF1
  <chr>    <chr>    <chr>  <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 Tasmania Clothin… SNAIV… Trai… 0.532  1.76  1.37  3.03  8.57     1     1 0.597
fc |> accuracy(myseries)
# A tibble: 1 × 12
  .model    State Industry .type    ME  RMSE   MAE   MPE  MAPE  MASE RMSSE  ACF1
  <chr>     <chr> <chr>    <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 SNAIVE(T… Tasm… Clothin… Test   7.61  8.78  7.64  23.2  23.4  5.60  4.99 0.674

g)How sensitive are the accuracy measures to the amount of training data used?

Accuracy measures are highly sensitive to the amount of training data used. If the sample is to large then the model with overfit the training data and struggle with the data that looks slightly different. But if the sample is to small the model will underfit the date and struggle to finding anything valuable regardless of the data we give it.