Data 624 Homework 3

library(fpp3)
library(tidyverse)
library(ggrepel)
library(seasonal)
library(fabletools)
theme_set(theme_bw())

5.1 Produce forecasts for the following series using whichever of NAIVE(y), SNAIVE(y) or RW(y ~ drift()) is more appropriate in each case:

Australian Population (global_economy)
Bricks (aus_production)
NSW Lambs (aus_livestock)
Household wealth (hh_budget).
Australian takeaway food turnover (aus_retail).

global_economy |>
  filter(Country == "Australia") |>
  model(RW(Population ~ drift())) |>
  forecast(h = 20) |>
  autoplot(global_economy) +
  labs(title = "Australian Population with 20-Year Forecast (Drift Method)") +
  theme(legend.position = "None")

The forecast method used above was the Drift method since the overall trend is nearly linear and positive.

aus_production |>
  filter(!is.na(Bricks)) |>
  model(SNAIVE(Bricks)) |>
  forecast(h = "5 years") |>
  autoplot(aus_production) +
  labs(title = "Brick Production with 5 Year Forecast Using Seasonal Naive Method",
       y = "Production (millons of bricks)") +
  theme(legend.position = "None")

The forecast method used above was the Seaonal Naive method since there is a seasonal component to the observations.

aus_livestock |>
  filter(Animal == "Lambs", State == "New South Wales") |>
  model(SNAIVE(Count)) |>
  forecast(h = "3 years") |>
  autoplot(aus_livestock) +
  labs(title = "New South Wales Lamb Production with 3 Year Forecast\n Using Seasonal Naive Method", y = "Number of Animals") +
  theme(legend.position = "None")

The forecast method used above was the Seaonal Naive method since there is a seasonal component to the observations.

hh_budget_total = hh_budget |>
  summarize(avg_wealth = mean(Wealth))

hh_budget_total  |>
  model(RW(avg_wealth ~ drift())) |>
  forecast(h = "5 years") |>
  autoplot(hh_budget_total) +
  labs(title = "Average Household Wealth with 5 Year Forecast Using Drift Method",
       y = "Average Wealth (% of Net Disposable Income)") +
  theme(legend.position = "None")

The forecast method used above was the Drift method since the overall trend is positive and can be estimated with a positive linear forecast.

aus_retail_total = aus_retail |>
  group_by(Industry) |>
  summarize(total_turnover = sum(Turnover)) |>
  filter(Industry == "Takeaway food services") 

aus_retail_total |>
  model(RW(total_turnover ~ drift())) |>
  forecast(h = "3 years") |>
  autoplot(aus_retail_total) +
  labs(title = "Takeaway Food Services Turnover with 3 Year Forecast Using Drift Method",
       y = "Turnover ($Million AUD)") +
  theme(legend.position = "None")

The forecast method used above was the Drift method since the overall trend is positive and can be estimated with a positive linear forecast.

5.2 Use the Facebook stock price (data set gafa_stock) to do the following:

Produce a time plot of the series.

fb_stock = gafa_stock |>
  filter(Symbol == "FB") |>
  mutate(Day = row_number()) |>
  update_tsibble(index = Day, regular = TRUE)

fb_stock |>
  autoplot(Close) +
  labs(title = "Daily Close Stock Price of Facebook from 2014-2018",
       x = "Day", y = "Close Price ($USD)")

Produce forecasts using the drift method and plot them.

fb_stock |>
  model(RW(Close ~ drift())) |>
  forecast(h = 30) |>
  autoplot(fb_stock) +
  labs(title = "Daily Close Stock Price of Facebook from 2014-2018 with\n30 Day Forecast Using Drift Method",
       x = "Day", y = "Close Price ($USD)") +
  theme(legend.position = "None")

Show that the forecasts are identical to extending the line drawn between the first and last observations.

diff_y = tail(fb_stock$Close, n = 1) - head(fb_stock$Close, n = 1)
diff_x = tail(fb_stock$Day, n = 1) - head(fb_stock$Day, n = 1)
fb_slope = diff_y / diff_x
fb_int = tail(fb_stock$Close, n = 1) - fb_slope * tail(fb_stock$Day, n = 1)

fb_stock |>
  model(RW(Close ~ drift())) |>
  forecast(h = 30) |>
  autoplot(fb_stock) +
  geom_abline(slope = fb_slope, intercept = fb_int, color = "red") +
  labs(title = "Daily Close Stock Price of Facebook from 2014-2018 with\n30 Day Forecast Using Drift Method",
       x = "Day", y = "Close Price ($USD)") +
  theme(legend.position = "None")

Try using some of the other benchmark functions to forecast the same data set. Which do you think is best? Why?

fb_stock |>
  model(NAIVE(Close)) |>
  forecast(h = 30) |>
  autoplot(fb_stock) +
  labs(title = "Daily Close Stock Price of Facebook from 2014-2018 with\n30 Day Forecast Using Naive Method",
       x = "Day", y = "Close Price ($USD)") +
  theme(legend.position = "None")

fb_stock |>
  model(MEAN(Close)) |>
  forecast(h = 30) |>
  autoplot(fb_stock) +
  labs(title = "Daily Close Stock Price of Facebook from 2014-2018 with\n30 Day Forecast Using Mean Method",
       x = "Day", y = "Close Price ($USD)") +
  theme(legend.position = "None")

Based on the plots above, I think the Naive method is the best since it uses the last observation as the forecast value going forward. Although the Drift method appears like a good option as well since the overall trend of the FB stock closing price has increased over time, we cannot confidently say whether the next 30 days will increase or decrease, since the closing price has seen extended periods of both increasing and decreasing trends. The Naive method makes more sense than the Mean method since the closing price of the next day is highly dependent on that of the previous day, so using the last observation makes sense.

5.3 Apply a seasonal naïve method to the quarterly Australian beer production data from 1992. Check if the residuals look like white noise, and plot the forecasts. The following code will help.

# Extract data of interest
recent_production <- aus_production |> filter(year(Quarter) >= 1992)

# Define and estimate a model 
fit <- recent_production |> model(SNAIVE(Beer))

# Look at the residuals 
fit |> gg_tsresiduals()

# Look a some forecasts
fit |> forecast() |>
  autoplot(recent_production) + theme(legend.position = "None")

What do you conclude?

The residual time plot above shows that the values of the residuals hover around 0, suggesting that the mean of the residuals is close to 0. This plot also suggests that the residual variance is relatively constant. These two points suggest that there is little to no bias in the forecast, meaning that no adjustments need to be made for bias. Neither the residual time plot nor the ACF plot shows any sort of correlation or pattern, meaning that there is no additional information from the residuals can be used to better the forecast. Finally, the distribution of residuals is nearly normal, which instills confidence in the forecast since prediction intervals are easier to calculate for normally distributed residuals. Based on all of these factors above, I can conclude that the forecast generated above appears to account for all information provided and looks like white noise, which sould result in a good forecast.

5.4 Repeat the previous exercise using the Australian Exports series from global_economy and the Bricks series from aus_production. Use whichever of NAIVE() or SNAIVE() is more appropriate in each case.

aus_economy = global_economy |>
  filter(Country == "Australia")

aus_exports = aus_economy |>
  model(NAIVE(Exports))

aus_exports |> gg_tsresiduals()

aus_exports |> forecast(h = 5) |>
  autoplot(aus_economy) + theme(legend.position = "None")

For the above forecast, the Naive method was used for the Australian exports since there is no seasonal component of this data.

The residual time plot above shows that the values of the residuals hover around 0, suggesting that the mean of the residuals is close to 0. This plot also suggests that the residual variance is relatively constant. These two points suggest that there is little to no bias in the forecast, meaning that no adjustments need to be made for bias. The residual time plot does not show any sort of correlation or pattern. However, the ACF plot shows a slight pattern where there are local maximums at lags of 8 and 16, and local minimums at lags 4 and 8. This suggests a level of cyclicity within the data that is not being accounted for in the forecast that could improve the forecast.

Finally, the distribution of residuals is nearly normal, which instills confidence in the forecast since prediction intervals are easier to calculate for normally distributed residuals. Based on all of these factors above, I can conclude that the prediction interval from the forecast is good, but the overall forecast could be improved by accounting for the cyclicity described above.

brick_prod = aus_production |>
  filter(!is.na(Bricks))

brick_model = brick_prod |>
  model(SNAIVE(Bricks))

brick_model |> gg_tsresiduals()

brick_model |> forecast(h = "5 years") |>
  autoplot(brick_prod) + theme(legend.position = "None")

For the above forecast, the Seasonal Naive method was used for the Brick production since there is a seasonal component of this data.

Both the residual time plot and the ACF plot shows a pattern. The residual time plot shows cycles every 5 or so years, whereas the ACF shows a decreasing correlation until about a lag of 7, followed by an increasing correlation until about a lag of 20. This pattern then resumes with a decreasing correlation. This suggests a strong level of cyclicity within the data that is not being accounted for in the forecast that could improve the forecast.

Finally, the distribution of residuals is left skewed, which suggests that the prediction interval is not easy to compute. Based on all of these factors above, I can conclude that the forecast is not very good and can be improved by accounting for other aspects of the data, such as its cyclicity.

5.7 For your retail time series (from Exercise 7 in Section 2.10):

Create a training dataset consisting of observations before 2011 using

set.seed(40194)
myseries <- aus_retail |>
  filter(`Series ID` == sample(aus_retail$`Series ID`,1))

myseries_train <- myseries |> filter(year(Month) < 2011)

Check that your data have been split appropriately by producing the following plot.

autoplot(myseries, Turnover) + 
  autolayer(myseries_train, Turnover, colour = "red")

Fit a seasonal naïve model using SNAIVE() applied to your training data (myseries_train).

turnover_fit <- myseries_train |> model(SNAIVE(Turnover))

Check the residuals.

turnover_fit |> gg_tsresiduals()

Do the residuals appear to be uncorrelated and normally distributed?

The residuals do not appear to be uncorrelated. The residuals appear to increase in magnitude as time increases. Additionally, the ACF plot shows that the correlation decreases until a lag of 12, then increases until a lag of 18, and then decreases again until a lag of 24. This suggests a level of cyclicity that’s not being accountd for in the forecast.

Based on the histogram above, the residuals appears to be nearly normally distributed.

Produce forecasts for the test data

turnover_fc <- turnover_fit |> 
  forecast(new_data = anti_join(myseries, myseries_train)) 
turnover_fc |> autoplot(myseries)

Compare the accuracy of your forecasts against the actual values.

turnover_fit |> accuracy()

turnover_fc |> accuracy(myseries)

As anticipated, the accuracy of the forecasts is worse than the accuracy of the training data itself, since the training data was used in generating the model and forecast.

How sensitive are the accuracy measures to the amount of training data used?

myseries_train_low <- myseries |> filter(year(Month) < 2009)
myseries_train_high <- myseries |> filter(year(Month) < 2013)

turnover_fit_low <- myseries_train_low |> model(SNAIVE(Turnover))
turnover_fit_high <- myseries_train_high |> model(SNAIVE(Turnover))

turnover_fc_low <- turnover_fit_low |> 
  forecast(new_data = anti_join(myseries, myseries_train_low)) 
turnover_fc_low |> autoplot(myseries) +
  labs(title = "Forecast with Training Data up to 2009")

turnover_fc_high <- turnover_fit_high |> 
  forecast(new_data = anti_join(myseries, myseries_train_high)) 
turnover_fc_high |> autoplot(myseries) +
  labs(title = "Forecast with Training Data up to 2013")

turnover_acc = turnover_fc |> accuracy(myseries) |>
  mutate(train_year = 2011) |>
  relocate(train_year)

turnover_low_acc = turnover_fc_low |> accuracy(myseries) |>
  mutate(train_year = 2009) |>
  relocate(train_year)

turnover_high_acc = turnover_fc_high |> accuracy(myseries) |>
  mutate(train_year = 2013) |>
  relocate(train_year)

bind_rows(turnover_low_acc, turnover_acc, turnover_high_acc) |>
  select(train_year, RMSE)

Looking at the tibble above, the RMSE increases by roughly 27.5% when there is less training data. With more training data, the RMSE also increases, but only by about 1.2%. This suggests that the accuracy is sensitive with less training data, but doesn’t change much once more training data is added.

Data 624 Homework 3

Michael Drake

2026-02-18