DATA 624 Homework 3

Author

Henock Montcho

Published

March 27, 2025

library(fpp3)
library(tsibble)
library(dplyr)
library(scales)

1-) Produce forecasts for the following series using whichever of NAIVE(y), SNAIVE(y) or RW(y ~ drift()) is more appropriate in each case:

Australian Population (global_economy)

#Exploration of Australian Population (global_economy) to determine what method

aus_population <- global_economy  |>
  filter(Country == "Australia")  |>
  select("Country", "Year", "Population")  

aus_population  |>
  autoplot(Population)

#The autoplot shows an increasiing trend. `RW(y ~ drift())` is then the most appropriate method

aus_population  |>
  model(RW(Population ~ drift()))  |>
  forecast(h = 8)  |>
  autoplot(aus_population) +
  scale_y_continuous(labels = comma) +
  labs(title = "Australia Population",
       subtitle = "Forecasted from 2017 through 2025")

Bricks (aus_production)

#Exploration of Bricks (aus_production) to determine what method
bricks <- aus_production  |>
  select("Bricks") 

bricks|>
  autoplot(Bricks)

#The autoplot shows some seasonal patterns, the SNAIVE is the most suitable method
bricks  |>
  filter(!is.na(Bricks))  |>
  model(SNAIVE(Bricks))  |>
  forecast(h = 22)  |>
  autoplot(bricks) +
  labs(title = "Australia Bricks Production",
       subtitle = "Forecasted from 1956 Q1 through 2010 Q4")

NSW Lambs (aus_livestock)

#Exploration of NSW Lambs (aus_livestock) to determine what method
nsw_lambs <- aus_livestock  |>
  filter(Animal == "Lambs", State == "New South Wales")

nsw_lambs  |>
  autoplot(Count)

#The autoplot does not show some strong seasonal patterns, the NAIVE is the most suitable method

nsw_lambs  |>
  model(NAIVE(Count))  |>
  forecast(h = 84)  |>
  autoplot(nsw_lambs) +
  scale_y_continuous(labels = comma) +
  labs(title = "Australia Livestock Slaughter",
       subtitle = "Forecasted from January 2019 through December 2025")

Household wealth (hh_budget)

#Exploration of Household wealth (hh_budget) to determine what method
hh_wealth <- hh_budget  |>
  select("Wealth")

hh_wealth  |>
  autoplot()

#The RW method is suitable for time series data with a trend. This method allows the forecasts to increase or decrease over time, with the amount of change (drift) being the average change observed in the historical data.

hh_wealth  |>
  model(RW(Wealth ~ drift()))  |>
  forecast(h = 5)  |>
  autoplot(hh_budget) +
  labs(title = "Household Wealth",
       subtitle = "Forecasted from 2017 through 2025")

Australian takeaway food turnover (aus_retail).

aus_takeaway <- aus_retail  |>
  filter(Industry == "Cafes, restaurants and takeaway food services")  |>
  select("Industry", "Month", "Turnover", "State")

aus_takeaway  |>
  autoplot(Turnover)

#The RW method is suitable for time series data with a trend. This method allows the forecasts to increase or decrease over time, with the amount of change (drift) being the average change observed in the historical data.

aus_takeaway  |>
  model(RW(Turnover ~ drift()))  |>
  forecast(h = 84)  |>
  autoplot(aus_takeaway) +
  facet_wrap(~State, scales = "free") +
  labs(title = "Australian Cafes, restaurants, and takeaway food services",
       subtitle = "Forecasted from January 2019 through Decembre 2025")

2-) Use the Facebook stock price (data set gafa_stock) to do the following:

a- Produce a time plot of the series

facebook <- gafa_stock  |>
  filter(Symbol == "FB")  |>
  mutate(Day = row_number())  |>
  update_tsibble(index = Day, regular = TRUE)

facebook  |>
  autoplot(Adj_Close) +
  labs(title= "Daily Adjusted Close Price of Facebook", y = "USD")

b- Produce forecasts using the drift method and plot them.

facebook_plot <- facebook  |>
  model(RW(Adj_Close ~ drift()))  |>
  forecast(h = 365)  |>
  autoplot(facebook) +
  labs(title = "Daily Adjusted Close Price of Facebook",
       subtitle = "Forecasted from 1 Jan 2019 through 31 Dec 2019",
       y = "USD")
facebook_plot

c- Show that the forecasts are identical to extending the line drawn between the first and last observations.

facebook_plot1 <- facebook_plot  +
  geom_segment(aes(x = min(facebook$Day), y = min(facebook$Adj_Close), xend = max(facebook$Day), yend = 131.09),
               colour = "blue", linetype = "dashed")
facebook_plot1

#131.09 for yend is the adjusted close price corresponding to the maximum of Day

d- Try using some of the other benchmark functions to forecast the same data set. Which do you think is best? Why?

facebook  |>
  model(`Naive` = NAIVE(Adj_Close),
        Mean = MEAN(Adj_Close),
        Drift = RW(Adj_Close ~ drift()))  |>
  forecast(h = 365)  |>
  autoplot(facebook, level = NULL) +
  labs(title = "Daily Adjusted Close Price of Facebook",
       subtitle = "Forecasted with other Benchmark Functions from 1 Jan 2019 through 31 Dec 2019",
       y = "USD")

Comment: The data not being seasonal, the SNAIVE method will not be appropriate. However, the Drift method seems the most appropriate for this method allows the forecasts to increase or decrease over time, with the amount of change (drift) being the average change observed in the historical data.

3-) Apply a seasonal naïve method to the quarterly Australian beer production data from 1992. Check if the residuals look like white noise, and plot the forecasts. The following code will help. What do you conclude?

# Extract data of interest
recent_production <- aus_production |>
  filter(year(Quarter) >= 1992)

# Define and estimate a model
fit <- recent_production |> model(SNAIVE(Beer))

# Look at the residuals
fit |> gg_tsresiduals() +
  ggtitle("Residual Plots for Australian Beer Production")

# Look a some forecasts
fit |> forecast() |> autoplot(recent_production) +
  ggtitle("Australian Quarterly Beer Production")

Let’s run the portmanteau tests:

augment(fit)  |>
  features(.resid, ljung_box, lag = 2)

# A tibble: 1 × 3
  .model       lb_stat lb_pvalue
  <chr>          <dbl>     <dbl>
1 SNAIVE(Beer)    4.11     0.128

Comment: lb_pvalue = 0.1283666 which is > 0.05, so the hypothesis of white noise is not rejected.

4-) Repeat the previous exercise using the Australian Exports series from global_economy and the Bricks series from aus_production. Use whichever of NAIVE() or SNAIVE() is more appropriate in each case.

Australian Exports series from global_economy

# Extract data of interest
aus_exports <- global_economy  |>
  filter(Country == "Australia")

# Define and estimate a model
fit <- aus_exports  |>
  model(NAIVE(Exports))

# Look at the residuals
fit  |>
  gg_tsresiduals() +
  ggtitle("Residual Plots for Australian Exports")

# Look at some forecasts
fit  |>
  forecast()  |>
  autoplot(aus_exports) +
  ggtitle("Annual Australian Exports")

#Box-Pierce test, ℓ=10 for non-seasonal data
fit  |>
  augment()  |>
  features(.innov, ljung_box, lag = 10)

# A tibble: 1 × 4
  Country   .model         lb_stat lb_pvalue
  <fct>     <chr>            <dbl>     <dbl>
1 Australia NAIVE(Exports)    16.4    0.0896

Comment: the .resid histogram shows a bell curve distribution centered around the mean which in this case is 0. The innovative residuals plot shows the points randomly scattered around the horizontal axis (zero line), indicating no discernible pattern. No trend is observed from the same plot. This is could be an indicator of white noises. However, lb_pvalue = 0.09 (0.1) which is close to 0.05. This indicates that the hypothesis of white is not rejected for a confidence level of 90%. It is a white noise distribution.

Bricks series from aus_production

# Extract data of interest
bricks <- aus_production  |>
  select("Bricks")  |>
  filter(!is.na(Bricks))  

# Define and estimate a model
fit <- bricks  |>
  model(SNAIVE(Bricks))

# Look at the residuals
fit  |>
  gg_tsresiduals() +
  ggtitle("Residual Plots for Australian Bricks production")

# Look at some forecasts
fit  |>
  forecast()  |>
  autoplot(bricks) +
  ggtitle("Quarterly Australian Bricks production")

#Box-Pierce test, ℓ=2 for seasonal data
fit  |>
  augment()  |>
  features(.innov, ljung_box, lag = 2)

# A tibble: 1 × 3
  .model         lb_stat lb_pvalue
  <chr>            <dbl>     <dbl>
1 SNAIVE(Bricks)    176.         0

Comment: the .resid histogram shows a bell curve distribution not centered around the mean 0 and skewed to the left. The innovative residuals plot shows the points randomly scattered around the horizontal axis (zero line) starting in 1975Q1 (approximately), indicating no discernible pattern. No trend is observed from the same plot. This is could be an indicator of white noises. However, lb_pvalue = 0 which is less than 0.05. This indicates that the hypothesis of white noise is rejected.

7-) For your retail time series (from Exercise 7 in Section 2.10):

Create a training dataset consisting of observations before 2011 using

set.seed(12345678)
myseries <- aus_retail |>
  filter(`Series ID` == sample(aus_retail$`Series ID`,1))

myseries_train <- myseries |>
  filter(year(Month) < 2011)

Check that your data have been split appropriately by producing the following plot.

autoplot(myseries, Turnover) +
  autolayer(myseries_train, Turnover, colour = "red")

Fit a seasonal naïve model using SNAIVE() applied to your training data (myseries_train).

fit <- myseries_train |>
  model(SNAIVE(Turnover))

Check the residuals.

fit |> gg_tsresiduals() +
  ggtitle("Residual Plots for Australian Retail Turnover")

From the ACF plot, more autocorrelations (vertical black lines) are outside the confidence interval (blue dashed line). This indicates that the residuals are correlated. The residuals distribution is not normal and not centered around zero.

Produce forecasts for the test data

fc <- fit |>
  forecast(new_data = anti_join(myseries, myseries_train))
fc |> autoplot(myseries)

Comment: Overall, beside the peaks from the actual data that do not fall in the forecasted data, the remaining of the actual data falls in the forecasted data.

Compare the accuracy of your forecasts against the actual values.

fit |> accuracy()

# A tibble: 1 × 12
  State    Industry .model .type    ME  RMSE   MAE   MPE  MAPE  MASE RMSSE  ACF1
  <chr>    <chr>    <chr>  <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 Norther… Clothin… SNAIV… Trai… 0.439  1.21 0.915  5.23  12.4     1     1 0.768

fc |> accuracy(myseries)

# A tibble: 1 × 12
  .model    State Industry .type    ME  RMSE   MAE   MPE  MAPE  MASE RMSSE  ACF1
  <chr>     <chr> <chr>    <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 SNAIVE(T… Nort… Clothin… Test  0.836  1.55  1.24  5.94  9.06  1.36  1.28 0.601

Comment: the errors are smaller on the training values compared to the actual values.

How sensitive are the accuracy measures to the amount of training data used?

Accuracy measures are sensitive to the amount of training data used, which can also depend on how the data used is split . Increasing the amount of training data can increase the accuracy. However, with the law of the diminishing returns, too much training data may have little to no effects, this does not increase the performance of the data. It will also depend on the model that we are using. A cross validation with the smallest RMSE computed will be useful.