Instructions

Do exercises 5.1, 5.2, 5.3, 5.4 and 5.7 in the Hyndman book. Please submit your RPubs link as well as your .pdf file showing the code you ran and its output.

# Load libraries
library(fpp3)
library(fable)
library(tsibble)
library(dplyr)
library(ggplot2)
library(feasts)
library(gridExtra)
library(imputeTS)

Exercise 5.1

Produce forecasts for the following series using whichever of NAIVE(y), SNAIVE(y) or RW(y ~ drift()) is more appropriate in each case:

Australian Population (global_economy) - RW with drift - The population series is annual, with a strong trend and no seasonal component.

Bricks (aus_production) - SNAIVE - Brick production shows a strong quarterly seasonal pattern.

NSW Lambs (aus_livestock) - SNAIVE - The lamb slaughter counts show a strong seasonal pattern.

Household wealth (hh_budget) - RW with drift - Household wealth is annual financial data with a strong trend and no seasonal component.

Australian takeaway food turnover (aus_retail) - SNAIVE - Retail turnover has a strong monthly seasonal component.
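
These choices can be sanity-checked visually before fitting. A minimal sketch using gg_season() from feasts for the quarterly Bricks series, and a plain time plot for the annual population series:

# Quick visual check (sketch): quarterly brick production shows a clear
# within-year seasonal pattern, while the annual population series cannot.
aus_production %>%
  filter(!is.na(Bricks)) %>%
  gg_season(Bricks) +
  labs(title = "Seasonal plot: Brick production")

global_economy %>%
  filter(Country == "Australia") %>%
  autoplot(Population) +
  labs(title = "Australian population (annual, trend only)")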

# Australian Population data (global_economy)
aus_population <- global_economy %>% filter(Country == "Australia")

# Fit the RW with drift model
fit_population <- aus_population %>%
  model(RW = RW(Population ~ drift()))

# Create forecast
forecast_population <- fit_population %>% forecast(h = "10 years")

# Plot forecast
autoplot(forecast_population, aus_population) +
  labs(title = "Forecast: Australian Population (RW with Drift)", y = "Population")

# Bricks data (aus_production): drop the trailing quarters with missing Bricks values
bricks <- aus_production %>%
  filter(!is.na(Bricks))

# Fit the SNAIVE model
fit_bricks <- bricks %>%
  model(SNAIVE = SNAIVE(Bricks))

# Create forecast
forecast_bricks <- fit_bricks %>% forecast(h = "2 years")

# Plot the forecast
autoplot(forecast_bricks, bricks) +
  labs(title = "Forecast: Brick Production (SNAIVE)", y = "Bricks Production")

# NSW Lambs data (aus_livestock)
lambs <- aus_livestock %>% filter(Animal == "Lambs", State == "New South Wales")

# Fit the SNAIVE model
fit_lambs <- lambs %>%
  model(SNAIVE = SNAIVE(Count))

# Create forecast
forecast_lambs <- fit_lambs %>% forecast(h = "1 year")

# Plot forecast
autoplot(forecast_lambs, lambs) +
  labs(title = "Forecast: NSW Lamb Production (SNAIVE)", y = "Lambs (count)")

# Household Wealth data (hh_budget)
household_wealth <- hh_budget %>% filter(Country == "Australia")

# Fit the RW with drift model
fit_household_wealth <- household_wealth %>%
  model(RW = RW(Wealth ~ drift()))

# Create forecast
forecast_household_wealth <- fit_household_wealth %>% forecast(h = "5 years")

# Plot forecast
autoplot(forecast_household_wealth, household_wealth) +
  labs(title = "Forecast: Australia Household Wealth (RW with Drift)", y = "Wealth (Millions)")

# Australian Takeaway Food Turnover data (aus_retail)
aus_takeaway <- aus_retail %>% 
  filter(Industry == "Takeaway food services", State == "Western Australia")

# Fit the SNAIVE model
fit_takeaway <- aus_takeaway %>%
  model(SNAIVE = SNAIVE(Turnover))

# Create the forecast
forecast_takeaway <- fit_takeaway %>% forecast(h = "1 year")

# Plot forecast
autoplot(forecast_takeaway, aus_takeaway) +
  labs(title = "Forecast: Takeaway Food Turnover in Western Australia (SNAIVE)", 
       y = "Turnover (Millions)")

Exercise 5.2

Use the Facebook stock price (data set gafa_stock) to do the following:

  • Produce a time plot of the series.
  • Produce forecasts using the drift method and plot them.
  • Show that the forecasts are identical to extending the line drawn between the first and last observations.
  • Try using some of the other benchmark functions to forecast the same data set. Which do you think is best? Why?

It is difficult to tell which of these models is best. The seasonal naive method is clearly inappropriate, since the series has no seasonal component. The naive and drift forecasts (and the straight-line extension) are very similar, and nearly identical, for this data. If a single basic benchmark were needed, the naive method would be a reasonable choice, since it is consistent with the efficient market hypothesis: the best forecast of tomorrow's price is today's price.
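
To make the comparison concrete, the benchmark methods can be fitted side by side in a single call to model() and plotted together. A minimal sketch; fb_days is a helper object used only here, re-indexed by trading day in the same way as the chunks further below:

# Sketch: fit several benchmark methods at once and plot the point forecasts
fb_days <- gafa_stock %>%
  filter(Symbol == "FB") %>%
  mutate(TradingDay = row_number()) %>%
  update_tsibble(index = TradingDay, regular = TRUE)

fb_days %>%
  model(
    Mean  = MEAN(Close),
    Naive = NAIVE(Close),
    Drift = RW(Close ~ drift())
  ) %>%
  forecast(h = 100) %>%
  autoplot(fb_days, level = NULL) +
  labs(title = "Benchmark forecasts: Facebook closing price",
       y = "Stock Price (USD)")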

# Filter gafa_stock dataset for Facebook
fb_stock <- gafa_stock %>%
  filter(Symbol == "FB") %>%
  update_tsibble(regular = TRUE) %>%
  fill_gaps(Close = NA)

# Time plot of Facebook stock price
ptime <- autoplot(fb_stock, Close) +
  labs(title = "Facebook Stock Price", 
       x = "Date", 
       y = "Stock Price (USD)")

# Fit a drift model to the Facebook stock price data
fb_drift_model <- fb_stock %>%
  model(Drift = RW(Close ~ drift()))

# Generate forecasts
fb_drift_forecast <- fb_drift_model %>%
  forecast(h = "12 months")

# Plot data with the drift forecast
p1 <- autoplot(fb_drift_forecast, fb_stock) +
  labs(title = "Facebook Stock Price with Drift Forecast",
       x = "Date", 
       y = "Stock Price (USD)") +
  guides(colour = guide_legend(title = "Forecast"))

# Re-index the Facebook data by trading day so the series is regular
fb_stock_tsibble <- gafa_stock %>%
  filter(Symbol == "FB") %>%
  mutate(TradingDay = row_number()) %>%
  update_tsibble(index = TradingDay, regular = TRUE) %>%
  drop_na(Close)


# Fit a drift model
fb_drift_model <- fb_stock_tsibble %>%
  model(Drift = RW(Close ~ drift()))

# Generate forecasts
fb_drift_forecast <- fb_drift_model %>%
  forecast(h = 365)

# Plot the data with the drift forecast
p2 <- autoplot(fb_drift_forecast, fb_stock_tsibble) +
  labs(title = "Facebook Stock Price with Drift Forecast by Trading Day",
       x = "Trading Day", 
       y = "Stock Price (USD)") +
  guides(colour = guide_legend(title = "Forecast"))

# Display plots
grid.arrange(ptime, p1, p2, ncol = 1)

# Extract the first and last dates and closing prices
first_day <- first(fb_stock$Date)
last_day <- last(fb_stock$Date)
first_close <- first(fb_stock$Close)
last_close <- last(fb_stock$Close)

# Calculate the slope and intercept of the line
slope <- (last_close - first_close) / as.numeric(difftime(last_day, first_day, units = "days"))
intercept <- first_close - slope * as.numeric(first_day)

# Define forecast horizon
forecast_horizon <- 365

# Create a sequence of future dates
future_days <- seq(last_day + 1, by = 1, length.out = forecast_horizon)

# Create a data frame for the future dates and forecasted values
forecast_df <- tibble(
  Date = future_days,
  Close = intercept + slope * as.numeric(future_days))

# Create a data frame for the line between the first and last points
line_df <- tibble(
  Date = c(first_day, last_day),
  Close = c(first_close, last_close))

# Plot the original data along with the line and forecast
p3 <- autoplot(fb_stock, Close) +
  geom_line(data = line_df, aes(x = Date, y = Close), color = "blue", linewidth = 0.5) +
  geom_line(data = forecast_df, aes(x = Date, y = Close), color = "red", linetype = "dashed", linewidth = 0.5) +
  labs(title = "Facebook Stock Price with Linear Forecast",
       x = "Trading Day", 
       y = "Stock Price (USD)") +
  guides(colour = guide_legend(title = "Forecast"))

p3
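
The equivalence can also be checked numerically. Under the drift method, the point forecast at horizon h is the last observation plus h times the average historical change, (y_T - y_1)/(T - 1), which is exactly the slope of the line joining the first and last observations. A minimal sketch, assuming fb_stock_tsibble and fb_drift_forecast from the trading-day chunks above:

# Numerical check: drift point forecasts vs. the straight-line extension
y_first <- first(fb_stock_tsibble$Close)
y_last  <- last(fb_stock_tsibble$Close)
n_obs   <- nrow(fb_stock_tsibble)
line_slope <- (y_last - y_first) / (n_obs - 1)

manual_fc <- y_last + (1:10) * line_slope       # straight-line extension
drift_fc  <- head(fb_drift_forecast$.mean, 10)  # fable drift point forecasts
all.equal(manual_fc, drift_fc)                  # expected to be TRUE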

# Filter the gafa_stock dataset and create TradingDay variable
fb_stock_tsibble <- gafa_stock %>%
  filter(Symbol == "FB") %>%
  mutate(TradingDay = row_number()) %>%
  update_tsibble(index = TradingDay, regular = TRUE) %>%
  drop_na(Close)

# Fit a naive model
fb_naive_model <- fb_stock_tsibble %>%
  model(NAIVE(Close))

# Generate naive forecasts
fb_naive_forecast <- fb_naive_model %>%
  forecast(h = 365)

# Plot the original data and naive forecast
p4 <- fb_naive_forecast %>%
  autoplot(fb_stock_tsibble) +
  labs(title = "Facebook Stock Price with Naive Forecast",
       x = "Trading Day", 
       y = "Stock Price (USD)") +
  guides(colour = guide_legend(title = "Forecast"))


# Convert to a regular time series and fill missing dates
fb_stock <- fb_stock %>%
  update_tsibble(regular = TRUE) %>%
  fill_gaps(Close = NA) 

# Fit a seasonal naive model
fb_snaive_model <- fb_stock %>%
  model(SNAIVE(Close))

# Generate seasonal naive forecasts
fb_snaive_forecast <- fb_snaive_model %>%
  forecast(h = 365)

# Plot data and seasonal naive forecast
p5 <- fb_snaive_forecast %>%
  autoplot(fb_stock) +
  labs(title = "Facebook Stock Price with Seasonal Naive Forecast",
       x = "Date", 
       y = "Stock Price (USD)") +
  guides(colour = guide_legend(title = "Forecast"))

# Display plots
grid.arrange(p4, p5, ncol = 1)

Exercise 5.3

Apply a seasonal naïve method to the quarterly Australian beer production data from 1992. Check if the residuals look like white noise, and plot the forecasts. What do you conclude?

Visually, the residuals suggest that some seasonal pattern remains unaccounted for by the model. The innovation residuals have a mean near zero, but a seasonal pattern is still visible in the plot. The ACF plot has no significant spikes except at lag 4, and the residuals look roughly normally distributed. The forecasts themselves follow the seasonal pattern well.

However, the Ljung-Box test suggests the model is performing worse than the plots indicate. The test uses lag = 8, twice the seasonal period for quarterly data, following the book's rule of thumb. Because the p-value is well below 0.05, we reject the null hypothesis that the residuals are white noise and conclude that they still contain information the model has not captured. The large test statistic points to the same conclusion.

# Extract data of interest
recent_production <- aus_production |>
  filter(year(Quarter) >= 1992)
# Define and estimate a model
fit <- recent_production |> model(SNAIVE(Beer))
# Look at the residuals
fit |> gg_tsresiduals()

# Look at some forecasts
fit |> forecast() |> autoplot(recent_production)

# Ljung-Box test to check for autocorrelation
augment(fit) |>
  features(.innov, ljung_box, lag = 8) |>
  select(lb_stat, lb_pvalue)
## # A tibble: 1 × 2
##   lb_stat lb_pvalue
##     <dbl>     <dbl>
## 1    32.3 0.0000834
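
As a cross-check, the Box-Pierce variant of the portmanteau test can be applied with the same lag; a sketch is below, and it would be expected to lead to the same rejection of white noise:

# Box-Pierce test on the innovation residuals (sketch); lag = 8 = 2 x seasonal period
augment(fit) |>
  features(.innov, box_pierce, lag = 8) |>
  select(bp_stat, bp_pvalue)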

Exercise 5.4

Repeat the previous exercise using the Australian Exports series from global_economy and the Bricks series from aus_production. Use whichever of NAIVE() or SNAIVE() is more appropriate in each case.

  • Australian Exports: We use the NAIVE model because the series is annual and has no seasonal component. The residuals hover around a mean of zero, and the ACF shows no significant spikes except at lag 1. The residuals are approximately normally distributed. In addition, the Ljung-Box p-value is greater than 0.05, so we fail to reject the null hypothesis that the residuals are white noise. The moderately small test statistic points to the same conclusion.

  • Bricks Production: We use the SNAIVE model to account for the seasonality in the data. However, the residuals do not hover around a mean of zero and still appear to contain a seasonal pattern. The ACF plot shows several significant spikes, and the residuals are not normally distributed (they are left-skewed). The Ljung-Box p-value is below 0.05, so we reject the null hypothesis that the residuals are white noise and conclude there is seasonality the model has not accounted for. The large test statistic points to the same conclusion.

# Filter Exports data
recent_exports <- global_economy |>
  filter(Country == "Australia")|>
  select(Exports) |>
  drop_na()

# Define and estimate a model for Australian Exports using NAIVE
fit_exports <- recent_exports |> model(NAIVE(Exports))

# Exports Residuals
fit_exports |> gg_tsresiduals() +
  ggtitle("Residuals of Australian Exports Forecast") +
  labs(x = "Year", y = "Residuals")

# Exports Forecast
fit_exports |> forecast() |> autoplot(recent_exports) +
  ggtitle("Forecast for Australian Exports") +
  labs(x = "Year", y = "Exports (in millions)")

# Ljung-Box test on residuals for Exports
augment(fit_exports) |>
  features(.innov, ljung_box, lag = 10) |>
  select(lb_stat, lb_pvalue)
## # A tibble: 1 × 2
##   lb_stat lb_pvalue
##     <dbl>     <dbl>
## 1    16.4    0.0896
# Filter Bricks Data
recent_bricks <- aus_production |>
  filter(year(Quarter) >= 1992) |>
  select(Bricks) |>
  drop_na()

# Bricks using SNAIVE
fit_bricks <- recent_bricks |> model(SNAIVE(Bricks))

# Bricks Residuals
fit_bricks |> gg_tsresiduals() +
  ggtitle("Residuals of Bricks Forecast") +
  labs(x = "Quarter", y = "Residuals")

# Bricks Forecast
fit_bricks |> forecast() |> autoplot(recent_bricks) +
  ggtitle("Forecast for Bricks Production") +
  labs(x = "Quarter", y = "Bricks (in millions)")

# Ljung-Box test on residuals for Bricks
augment(fit_bricks) |>
  features(.innov, ljung_box, lag = 8) |>
  select(lb_stat, lb_pvalue)
## # A tibble: 1 × 2
##   lb_stat lb_pvalue
##     <dbl>     <dbl>
## 1    62.2  1.74e-10

Exercise 5.7

For your retail time series (from Exercise 7 in Section 2.10):

  • Create a training dataset consisting of observations before 2011 using filter().
  • Check that your data have been split appropriately by producing the following plot.
  • Fit a seasonal naïve model using SNAIVE() applied to your training data (myseries_train).
  • Check the residuals. Do the residuals appear to be uncorrelated and normally distributed?
  • Produce forecasts for the test data
  • Compare the accuracy of your forecasts against the actual values.
  • How sensitive are the accuracy measures to the amount of training data used?

For this exercise, I compared the accuracy of the SNAIVE and RW with drift models.

  • Both models perform substantially worse on the test set than on the training set across all metrics, but the gap between training and test performance is smaller for the SNAIVE model than for the RW with drift model.
  • Using MAPE as the primary comparison metric, the SNAIVE model performs better, with a test-set percentage error of about 15% versus roughly 32% for the RW with drift model.
  • The SNAIVE model does better because the retail series has a strong seasonal component that SNAIVE can capture and a random walk with drift cannot.
  • However, the SNAIVE residuals are volatile around the mean, show highly correlated lags, and are not normally distributed, suggesting the model is not capturing all of the seasonal patterns in the data (a formal Ljung-Box check is sketched after the residual plots below). A quick way to probe the sensitivity of the accuracy measures to the amount of training data is sketched after the accuracy table.

# My Series from 2.10
set.seed(123456789)
myseries <- aus_retail |>
  filter(`Series ID` == sample(aus_retail$`Series ID`, 1))

# Create a training dataset
myseries_train <- myseries |>
  filter(year(Month) < 2011)

# Plot split data
autoplot(myseries, Turnover) +
  autolayer(myseries_train, Turnover, colour = "orange")

# Fit a seasonal naïve model (SNAIVE)
fit_snaive <- myseries_train |>
  model(SNAIVE = SNAIVE(Turnover))

# Check the residuals for the SNAIVE model
fit_snaive |> gg_tsresiduals() +
  labs(title = "Residuals for Seasonal Naive Model (SNAIVE)")

# Fit a random walk with drift model
fit_rw <- myseries_train |>
  model(RW_Drift = RW(Turnover ~ drift()))

# Check the residuals for the Random Walk with Drift model
fit_rw |> gg_tsresiduals() +
  labs(title = "Residuals for Random Walk with Drift Model")

# Generate forecasts for the test data using both models
fc_snaive <- fit_snaive |>
  forecast(new_data = anti_join(myseries, myseries_train))

fc_rw <- fit_rw |>
  forecast(new_data = anti_join(myseries, myseries_train))

# Plot forecasts for SNAIVE model
fc_snaive |> autoplot(myseries) +
  autolayer(myseries_train, Turnover, colour = "orange") +
  labs(title = "Forecast: SNAIVE Model")

# Plot forecasts for Random Walk with Drift model
fc_rw |> autoplot(myseries) +
  autolayer(myseries_train, Turnover, colour = "orange") +
  labs(title = "Forecast: Random Walk with Drift Model")

# Compare the accuracy of the forecasts against the actual values for both models
combined_accuracy <- bind_rows(
  accuracy(fit_snaive),
  accuracy(fc_snaive, myseries),
  accuracy(fit_rw),
  accuracy(fc_rw, myseries)
) |> select(-State, -Industry, -ACF1)
combined_accuracy
## # A tibble: 4 × 9
##   .model   .type           ME  RMSE   MAE    MPE  MAPE  MASE RMSSE
##   <chr>    <chr>        <dbl> <dbl> <dbl>  <dbl> <dbl> <dbl> <dbl>
## 1 SNAIVE   Training  1.11e+ 0  2.74  1.81   6.81  14.0 1     1    
## 2 SNAIVE   Test     -5.45e- 1  5.47  4.54  -4.61  14.9 2.50  2.00 
## 3 RW_Drift Training -6.00e-16  2.33  1.49  -2.57  13.3 0.822 0.850
## 4 RW_Drift Test     -9.54e+ 0 10.4   9.57 -31.6   31.7 5.28  3.79
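
To probe how sensitive the accuracy measures are to the amount of training data, the same comparison can be repeated with different training cut-off years. A minimal sketch (the cut-off years are arbitrary illustrative choices, and only the SNAIVE test-set MAPE is returned):

# Sensitivity sketch: refit SNAIVE on training windows of different lengths
# and compare the resulting test-set MAPE
cutoffs <- c(2006, 2011, 2014)
sapply(cutoffs, function(y) {
  train <- myseries |> filter(year(Month) < y)
  test  <- myseries |> filter(year(Month) >= y)
  fit   <- train |> model(SNAIVE(Turnover))
  fc    <- fit |> forecast(new_data = test)
  accuracy(fc, myseries)$MAPE
})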