library(fpp3)
library(tidyverse)
library(kableExtra)
library(seasonal)
library(reactable)
library(tsibble)
options(scipen=100)

1. Produce forecasts for the following series using whichever of NAIVE(y), SNAIVE(y) or RW(y ~ drift()) is more appropriate in each case:

Australian Population (global_economy)

data("global_economy")
global_economy |>
  select(Country, Population) |>
  filter(Country == "Australia") |>
  autoplot() +
  labs(x = "Year", y = "Total Population") +
  ggtitle("Total Population in Australia") +
  theme(plot.title = element_text(hjust=0.3))

aus_pop <- global_economy |>
  select(Country, Population) |>
  filter(Country == "Australia")
aus_pop |>
  model(NAIVE(Population)) |> 
  forecast(h=10) |> 
  autoplot(aus_pop) + 
  labs(x = "Year", y = "Total Population") +
  ggtitle("10-Year Forecast for Australian Population") +
  theme(plot.title = element_text(hjust=0.3))

Bricks (aus_production)

data("aus_production") 
aus_production |>
  select(Bricks) |>
  filter(!is.na(Bricks)) |>
  autoplot() +
  labs(x = "Year(Quarter)", y = "Bricks Production") +
  ggtitle("Quarterly Production of Bricks in Australia") +
  theme(plot.title = element_text(hjust=0.3))

aus_bricks <- aus_production |>
  select(Bricks) |>
  filter(!is.na(Bricks))
aus_bricks |>
  model(SNAIVE(Bricks)) |>
  forecast(h=10) |>
  autoplot(aus_production) +
  labs(x = "Year(Quarter)", y = "Bricks Production") +
  ggtitle("10-Year Forecast of Bricks Production in Australia") +
  theme(plot.title = element_text(hjust=0.3))

NSW Lambs (aus_livestock)

data("aus_livestock")
aus_livestock |>
  select(Animal, State, Count) |>
  filter(Animal == "Lambs", State == "New South Wales") |>
  autoplot() + 
  labs(x = "Year", y = "Lamb Production") +
  ggtitle("Lamb Production in New South Wales") +
  theme(plot.title = element_text(hjust=0.3))

  #select(Count)
  #model(SNAIVE(Count)) |>
  #forecast(h=10)
  #autoplot(aus_live, show.legend = FALSE)
nsw_lamb_count <- aus_livestock |>
  select(Animal, State, Count) |>
  filter(Animal == "Lambs", State == "New South Wales")
nsw_lamb_count |>
  model(NAIVE(Count)) |>
  forecast(h=10) |>
  autoplot(nsw_lamb_count) +
  labs(x = "Month", y = "Lamb Production") +
  ggtitle("10-Year Forecast of Lamb Production in New South Wales") +
  theme(plot.title = element_text(hjust=0.3)) 

nsw_lambs <- aus_livestock %>% filter(Animal=='Lambs',State=='New South Wales') %>% select(Count)
nsw_lambs %>% model(NAIVE(Count)) %>% forecast(h=15) %>% autoplot(nsw_lambs)

Household wealth (hh_budget)

data("hh_budget")
hh_budget |>
  autoplot(Wealth) + 
  labs(x = "Year", y = "Household Wealth") +
  ggtitle("Yearly Household Wealth Per Country") +
  theme(plot.title = element_text(hjust=0.3)) 

hh_budget |>
  model(RW(Wealth ~ drift())) |>
  #model(NAIVE(Wealth)) |>
  forecast(h = 10) |>
  autoplot(hh_budget) +
  facet_wrap(Country ~ . , scales = "free") +
  labs(x = "Year", y = "Household Wealth ") +
  ggtitle("10-Year Forecast Household Wealth Per Country") +
  theme(plot.title = element_text(hjust=0.5))

Australian takeaway food turnover (aus_retail)

data("aus_retail")
aus_retail |>
  filter(Industry == "Takeaway food services") |>
  autoplot() +
  facet_wrap(State ~ ., scales = "free") +
  #labs(x = "Month", y = "Turnover") +
  ggtitle("Takeaway Food Services Turnover in Australia") +
  theme(plot.title = element_text(hjust=0.07)) +
  theme(legend.position = "none")

aus_food_turnover <- aus_retail |>
  select(State, Industry, Turnover) |>
  filter(Industry == "Takeaway food services")
aus_food_turnover |>
  model(SNAIVE(Turnover)) |>
  #model(RW(Turnover ~ drift())) |>
  forecast(h=10) |>
  autoplot() +
  facet_wrap(~State, scales = "free") +
  ggtitle("10-Year Forecast of Takeaway Food Services Turnover in Australia") +
  theme(plot.title = element_text(hjust=0.5))

2. Use the Facebook stock price (data set gafa_stock) to do the following:

a. Produce a time plot of the series.

data("gafa_stock")
fb_stock <- gafa_stock |>
  filter(Symbol == "FB") |>
  mutate(trading_day = row_number()) |>
  update_tsibble(index = trading_day, regular = TRUE)
fb_stock |>
  autoplot(Close) +
  labs(x = "Trading Day", y = "Closing Stock Price") +
  ggtitle("Trading Day Facebook Stock Closing Price") +
  theme(plot.title = element_text(hjust=0.5))

b. Produce forecasts using the drift method and plot them.

fb_stock |>
  model(RW(Close ~ drift())) |>
  forecast(h=60) |>
  autoplot(fb_stock) +
  labs(x = "Trading Day", y = "Closing Stock Price") +
  ggtitle("60-Day Trading Day Forecast of Facebook Stock Closing Price") +
  theme(plot.title = element_text(hjust=0.5))

c. Show that the forecasts are identical to extending the line drawn between the first and last observations.

fb_stock |>
  model(RW(Close ~ drift())) |>
  forecast(h=60) |>
  autoplot(fb_stock) +
  labs(x = "Trading Day", y = "Closing Stock Price") +
  ggtitle("60-Day Trading Day Forecast of Facebook Stock Closing Price") +
  theme(plot.title = element_text(hjust=0.5)) +
  geom_segment(aes(x = 1, y = 54.7, xend = 1258, yend = 131),
               colour = "blue", linetype = "dashed")

  # Used fb_stock[fb_stock$trading_day == "1", ] and fb_stock[fb_stock$trading_day == "1258", ] to get y values

d. Try using some of the other benchmark functions to forecast the same data set. Which do you think is best? Why?

fb_stock %>%
  model(Mean = MEAN(Close),
        #Seasonal_Naive = SNAIVE(Close),
        Naive = NAIVE(Close),
        Drift = NAIVE(Close ~ drift())) %>%
  forecast(h = 60) %>%
  autoplot(fb_stock) +
  labs(x = "Trading Day", y = "Closing Stock Price") +
  ggtitle("60-Day Trading Day Forecasts of Facebook Stock Closing Price") +
  theme(plot.title = element_text(hjust=0.5))

I think the drift function is the best forecast for this data. The interval levels appear to be smaller compared to Mean and Naive, and therefore it would lend itself to better predict the Facebook closing stock price.

3. Apply a seasonal naïve method to the quarterly Australian beer production data from 1992. Check if the residuals look like white noise, and plot the forecasts. The following code will help. What do you conclude?

# Extract data of interest
recent_production <- aus_production |>
  filter(year(Quarter) >= 1992)
# Define and estimate a model
fit <- recent_production |> model(SNAIVE(Beer))
# Look at the residuals
fit |> gg_tsresiduals()

# Look a some forecasts
fit |> forecast() |> autoplot(recent_production)

Based on the plots, there doesn’t appear to be any autocorrelation, which indicates that there is white noise. The residuals are not close to zero, and the histogram of the residuals does not show a normal distribution.

I will check the residuals residuals are distinguishable from a white noise series using the Box-Pierce and Ljung-Box tests:

aug <- recent_production |>
  model(SNAIVE(Beer)) |>
  augment()
autoplot(aug, .innov) +
  labs(y = "Beer Production",
       title = "Residuals from the Seasonal Naive Method") + 
  theme(plot.title = element_text(hjust=0.5))

The lag was set to 8, using the \(l = 2m\) method for quarterly seasonal data:

# Box-Pierce Test; lag = 2 * 4
aug |> features(.innov, box_pierce, lag = 8)
## # A tibble: 1 × 3
##   .model       bp_stat bp_pvalue
##   <chr>          <dbl>     <dbl>
## 1 SNAIVE(Beer)    29.7  0.000234
# Ljung-Box Test; lag = 2 * 4
aug |> features(.innov, ljung_box, lag = 8)
## # A tibble: 1 × 3
##   .model       lb_stat lb_pvalue
##   <chr>          <dbl>     <dbl>
## 1 SNAIVE(Beer)    32.3 0.0000834

The p-values for each test is less than 0.05, so we can conclude that the residuals are distinguishable from a white noise series.

4. Repeat the previous exercise using the Australian Exports series from global_economy and the Bricks series from aus_production. Use whichever of NAIVE() or SNAIVE() is more appropriate in each case.

Australian Exports - global_economy

aus_exports <- global_economy |>
  select(Year, Country, Exports) |>
  filter(Country == "Australia") 
fit <- aus_exports |> model(NAIVE(Exports))
fit |> gg_tsresiduals()

fit |> forecast() |> autoplot(aus_exports)

While the histogram of residuals appear to show a normal distribution with its center near zero, the innovation residuals does not show autocorrelation due to the residuals not appearing to be close to the zero threshold.

aug_exports <- aus_exports |>
  model(NAIVE()) |>
  augment()
autoplot(aug_exports, .innov) +
  labs(y = "Beer Production",
       title = "Residuals from the Naive Method") + 
  theme(plot.title = element_text(hjust=0.5))

# Box-Pierce Test
aug_exports |> features(.innov, box_pierce, lag = 10)
## # A tibble: 1 × 4
##   Country   .model  bp_stat bp_pvalue
##   <fct>     <chr>     <dbl>     <dbl>
## 1 Australia NAIVE()    14.6     0.148
# Ljung-Box Test
aug_exports |> features(.innov, ljung_box, lag = 10)
## # A tibble: 1 × 4
##   Country   .model  lb_stat lb_pvalue
##   <fct>     <chr>     <dbl>     <dbl>
## 1 Australia NAIVE()    16.4    0.0896

Since the p-values of each test are greater than 0.05, we can conclude that the residuals are not distinguishable from a white noise series.

Bricks - aus_production
fit <- aus_bricks |> model(SNAIVE(Bricks))
fit |> gg_tsresiduals()

fit |> forecast() |> autoplot(aus_bricks)

aug_bricks <- aus_bricks |>
  model(SNAIVE()) |>
  augment()
autoplot(aug_bricks, .innov) +
  labs(y = "Brick Production",
       title = "Residuals from the Seasonal Naive Method") + 
  theme(plot.title = element_text(hjust=0.5))

The innovation residuals starts off near the zero threshold, and then begins to have random fluctuations around 1975 Q1. The histogram residuals appear to show a skewness to the left. Based on the plots, there appears to be white noise in the data. We’ll use the Box-Pierce and Ljung-Box tests to see if we can conclude that the residuals are distinguishable from white noise:

# Box-Pierce Test; lag = 2 * 4
aug_bricks |> features(.innov, box_pierce, lag = 8)
## # A tibble: 1 × 3
##   .model   bp_stat bp_pvalue
##   <chr>      <dbl>     <dbl>
## 1 SNAIVE()    267.         0
# Ljung-Box Test; lag = 2 * 4
aug_bricks |> features(.innov, ljung_box, lag = 8)
## # A tibble: 1 × 3
##   .model   lb_stat lb_pvalue
##   <chr>      <dbl>     <dbl>
## 1 SNAIVE()    274.         0

With a p-value of 0, we can conclude that the residuals are distinguishable from the white noise series.

7. For your retail time series (from Exercise 7 in Section 2.10):

a. Create a training dataset consisting of observations before 2011 using:

set.seed(12345678)
myseries <- aus_retail |>
  filter(`Series ID` == sample(aus_retail$`Series ID`,1))
myseries_train <- myseries |>
  filter(year(Month) < 2011)

b. Check that your data have been split appropriately by producing the following plot.

autoplot(myseries, Turnover) +
  autolayer(myseries_train, Turnover, color = "red")

c. Fit a seasonal naïve model using SNAIVE() applied to your training data (myseries_train).

fit <- myseries_train |>
  model(SNAIVE(Turnover))

d. Check the residuals. Do the residuals appear to be uncorrelated and normally distributed?

fit |> gg_tsresiduals()

With the exception of a sharp downward spike between January 1995 and January 1997, the innovation residuals appear to be close to zero. The histogram residuals appears to have a normal distribution, but does show a slight skewness to the left and its center is not zero. The ACF plot shows positive residuals between 0 and 11, and negative residuals between 12 and 24. Based on these observations, the residuals doesn’t appear to uncorrelated and not normally distributed.

aug_retail <- fit |>
  augment() 
# Box-Pierce Test; lag = 2 * 12
bp <- aug_retail |> features(.innov, box_pierce, lag = 24)

# Ljung-Box Test; lag = 2 * 12
lb <- aug_retail |> features(.innov, ljung_box, lag = 24)
reactable(bp)
reactable(lb)

The Box-Pierce and Ljung-Box tests have a p-value of zero, allowing us to conclude that the residuals are distinguishable from white noise.

e. Produce forecasts for the test data.

fc <- fit |>
  forecast(new_data = anti_join(myseries, myseries_train))
fc |> autoplot(myseries)

f. Compare the accuracy of your forecasts against the actual values.

fit |> 
  accuracy() |>
  reactable()
fc |> 
  accuracy(myseries) |>
  reactable()

g. How sensitive are the accuracy measures to the amount of training data used?

With the exception of ACF1 and MAPE, the errors in the test data increased in comparison to the training data. The MASE in the test set increased, which is interesting because the data is seasonal and would assume that the MASE in the test set would decrease as a result. This indicates that the accuracy using SNAIVE for this dataset may not be better than using another method.