library(fpp3)
## Warning: package 'fpp3' was built under R version 4.4.3
## Registered S3 method overwritten by 'tsibble':
##   method               from 
##   as_tibble.grouped_df dplyr
## ── Attaching packages ──────────────────────────────────────────── fpp3 1.0.2 ──
## ✔ tibble      3.2.1     ✔ tsibble     1.1.6
## ✔ dplyr       1.1.4     ✔ tsibbledata 0.4.1
## ✔ tidyr       1.3.1     ✔ feasts      0.4.2
## ✔ lubridate   1.9.3     ✔ fable       0.5.0
## ✔ ggplot2     3.5.1
## Warning: package 'dplyr' was built under R version 4.4.2
## Warning: package 'ggplot2' was built under R version 4.4.2
## Warning: package 'tsibble' was built under R version 4.4.3
## Warning: package 'tsibbledata' was built under R version 4.4.3
## Warning: package 'feasts' was built under R version 4.4.3
## Warning: package 'fabletools' was built under R version 4.4.3
## Warning: package 'fable' was built under R version 4.4.3
## ── Conflicts ───────────────────────────────────────────────── fpp3_conflicts ──
## ✖ lubridate::date()    masks base::date()
## ✖ dplyr::filter()      masks stats::filter()
## ✖ tsibble::intersect() masks base::intersect()
## ✖ tsibble::interval()  masks lubridate::interval()
## ✖ dplyr::lag()         masks stats::lag()
## ✖ tsibble::setdiff()   masks base::setdiff()
## ✖ tsibble::union()     masks base::union()
library(tsibble)

5.1

Produce forecasts for the following series using whichever of NAIVE(y), SNAIVE(y) or RW(y ~ drift()) is more appropriate in each case:

Australian Population (global_economy)
Bricks (aus_production)
NSW Lambs (aus_livestock)
Household wealth (hh_budget).
Australian takeaway food turnover (aus_retail).

Australian population

aus_pop <- global_economy |>
  filter(Country == "Australia") |>
  select(Year, Population)

autoplot(aus_pop)
## Plot variable not specified, automatically selected `.vars = Population`

Since this plot shows linear growth, a drift model is appropriate.

aus_pop |>
  model(RW(Population ~ drift())) |>
  forecast(h=20) |>
  autoplot(aus_pop, level=NULL)

Bricks

bricks_data <- aus_production |>
  select(Quarter, Bricks ) |>
  drop_na()

autoplot(bricks_data, Bricks)

Since this data has seasonality, SNAIVE should be utilized.

bricks_data |>
  model(SNAIVE(Bricks)) |>
  forecast(h=20) |>
  autoplot(bricks_data, level=NULL)

NSW Lambs

lambs <- aus_livestock |>
  filter(Animal == "Lambs") |>
  filter(State == "New South Wales") |>
  select(Month, Count)

autoplot(lambs, Count)

This dataset doesn’t appear to have seasonality nor does it have a clear trend so NAIVE model should be used.

lambs |> 
  model(NAIVE(Count)) |>
  forecast(h=50) |>
  autoplot(lambs, level=NULL)

Household wealth

For this question, the instructions don’t specify a country so I will select Australia to match the theme of the other excercises.

wealth_data <-hh_budget |>
  filter(Country == "Australia") |>
  select(Year, Wealth)

autoplot(wealth_data, Wealth)

While this dataset doesn’t appear to have a clear trend to me initially, it’s known that there was a market crash in 2008 that impacted many and likely caused the steep decline in the chart. Since we can see that growth has returned, I believe a drift model should be used in this case.

wealth_data |> 
  filter_index("2011" ~ "2016") |>
  model(RW(Wealth ~ drift())) |> 
  forecast(h=10) |>
  autoplot(wealth_data, level=NULL)

Australian takeaway turnover

The question did not specify the region so I will select Australian Capital Territory for this excercise.

turnover_data <- aus_retail |>
  filter(Industry =="Takeaway food services") |>
  filter(State =="Australian Capital Territory") |>
  select(Month, Turnover)

autoplot(turnover_data, Turnover)

This data has a trend but no seasonality so drift model is appropriate.

turnover_data |>
  model(RW(Turnover ~ drift())) |>
  forecast(h=50) |>
  autoplot(turnover_data, level = NULL)

5.2

Use the Facebook stock price (data set gafa_stock) to do the following:

Produce a time plot of the series

fb_data <- gafa_stock |>
  filter(Symbol == "FB") |>
  select(Date, Close) 

autoplot(fb_data, Close)

Produce forecasts using the drift method and plot them.

# Convert dates to row number to remove date gaps (I believe this is due to lack of data on weekends)
fb_data_reg <- fb_data |>
  mutate(Date = row_number()) |>
  as_tsibble(index = Date)

# Plot forecast 
fb_data_reg |>
  model(RW(Close ~ drift())) |>
  forecast(h = 100) |>
  autoplot(fb_data_reg, level = NULL)

Show that the forecasts are identical to extending the line drawn between the first and last observations.

Note: This question was a bit unclear to me but I will assume the equation for the drift is (Yt - Y1)/(T-1).

# Calculate drift
initial_y <- head(fb_data_reg$Close, 1)
final_y <- tail(fb_data_reg$Close, 1)
n <- nrow(fb_data_reg)

fit <- fb_data_reg |> model(RW(Close ~ drift()))
tidy(fit) 
cat("Model's drift:",tidy(fit)$estimate)
## Model's drift: 0.06076372
cat("Manual drift:", (final_y - initial_y) / (n - 1))
## Manual drift: 0.06076372

The drift/slopes are the same in both showing that an extended line would be the same through the first and last observation.

Try using some of the other benchmark functions to forecast the same data set. Which do you think is best? Why?

fb_data_reg |>
  model(NAIVE(Close)) |>
  forecast(h=100) |>
  autoplot(fb_data_reg, level = NULL)

I think the drift model is best for this data. The naive model forecast shows relative stagnant pricing but it is unlikely for stock prices for a tech giant to be stable like this. It is more likely that the price to trend upwards over time.

5.3

Apply a seasonal naïve method to the quarterly Australian beer production data from 1992. Check if the residuals look like white noise, and plot the forecasts. The following code will help.

# Extract data of interest
recent_production <- aus_production |>
  filter(year(Quarter) >= 1992)
# Define and estimate a model
fit <- recent_production |> model(SNAIVE(Beer))
# Look at the residuals
fit |> gg_tsresiduals()
## Warning: `gg_tsresiduals()` was deprecated in feasts 0.4.2.
## ℹ Please use `ggtime::gg_tsresiduals()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 4 rows containing non-finite outside the scale range
## (`stat_bin()`).

# Look a some forecasts
fit |> forecast() |> autoplot(recent_production)

What do you conclude?

The residuals seem to be normally distributed without time spikes. The acf residuals shows lag 4 negative correlation. It appears that the model is likely accounting for all data but a different model may capture the seasonal trend better.

5.4

Repeat the previous exercise using the Australian Exports series from global_economy and the Bricks series from aus_production. Use whichever of NAIVE() or SNAIVE() is more appropriate in each case.

Australian Exports

recent_exp <- global_economy |>
  filter(Country == 'Australia' & Year >= 1992) |>
  select(Year, Exports)

autoplot(recent_exp, Exports)

Since there is no seasonality, NAIVE seems more appropriate.

aus_exp_fit <- recent_exp |> model(NAIVE(Exports))
aus_exp_fit |> gg_tsresiduals()
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_bin()`).

aus_exp_fit |> forecast() |> autoplot(recent_exp, level = NULL)

The residuals are normally distributed and the acf does not show significant lag correlation.

Bricks

recent_bricks <- aus_production |>
  filter(year(Quarter) >= 1992) |>
  select(Quarter, Bricks) |>
  drop_na()

autoplot(recent_bricks, Bricks)

There seems to be seasonality in this dataset so SNAIVE is more appropriate.

bricks_fit <- recent_bricks |> model(SNAIVE(Bricks))
bricks_fit |> gg_tsresiduals()
## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 4 rows containing non-finite outside the scale range
## (`stat_bin()`).

bricks_fit |> forecast() |> autoplot(recent_bricks, level = NULL)

The residuals are skewed and show outliers.

5.7

  1. Create a training dataset consisting of observations before 2011 using
# Provided from exercise 
set.seed(12345678)
myseries <- aus_retail |>
  filter(`Series ID` == sample(aus_retail$`Series ID`,1))

# Select necessary variables
myseries <- myseries |>
  select(Month, Turnover)

# Provided from exercise 
myseries_train <- myseries |>
  filter(year(Month) < 2011)

print(tail(myseries_train))
## # A tsibble: 6 x 2 [1M]
##      Month Turnover
##      <mth>    <dbl>
## 1 2010 Jul     16.1
## 2 2010 Aug     13.8
## 3 2010 Sep     13.6
## 4 2010 Oct     12.3
## 5 2010 Nov     11.7
## 6 2010 Dec     17.9
  1. Check that your data have been split appropriately by producing the following plot.
# Provided from exercise 
autoplot(myseries, Turnover) +
  autolayer(myseries_train, Turnover, colour = "red")

Yes, the dataset is red until the end of 2010 and switches to black afterwards.

  1. Fit a seasonal naïve model using SNAIVE() applied to your training data (myseries_train).
fit <- myseries_train |>
  model(SNAIVE(Turnover))
  1. Check the residuals.Do the residuals appear to be uncorrelated and normally distributed?
# Provided by exercise
fit |> gg_tsresiduals()
## Warning: Removed 12 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 12 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 12 rows containing non-finite outside the scale range
## (`stat_bin()`).

The residuals seem to be normally distributed. There is more variance between Jan 1995 and 1997 but the overall seems to be stable.

  1. Produce forecasts for the test data
fc <- fit |>
  forecast(new_data = anti_join(myseries, myseries_train))
## Joining with `by = join_by(Month, Turnover)`
fc |> autoplot(myseries, level =NULL)

  1. Compare the accuracy of your forecasts against the actual values.
fit |> accuracy()
fc |> accuracy(myseries)
  1. How sensitive are the accuracy measures to the amount of training data used?

To answer this question, I think we will need to repeat with additional training set sizes.

# less data (stopping at 2002)
train_2002 <- myseries |> 
  filter(year(Month) < 2002)

fit_2002 <- train_2002 |> 
  model(SNAIVE(Turnover))

fc2 <- fit_2002|> 
  forecast(new_data = anti_join(myseries, train_2002))
## Joining with `by = join_by(Month, Turnover)`
fc2 |> autoplot(myseries, level=NULL)

accuracy(fc2, myseries)
# more data (through 2018)
train_2018 <- myseries |> 
  filter(year(Month) < 2018)

fit_2018 <- train_2018 |> 
  model(SNAIVE(Turnover))

fc3 <- fit_2018|> 
  forecast(new_data = anti_join(myseries, train_2018))
## Joining with `by = join_by(Month, Turnover)`
fc3 |> autoplot(myseries, level=NULL)

accuracy(fc3, myseries)

The inclusion of more data seems to improve accuracy in this model suggesting that it is sensitive to the amount of training data.