library(fpp3)
library(latex2exp)
library(seasonal)
Produce forecasts for the following series using whichever of NAIVE(y), SNAIVE(y) or RW(y ~ drift()) is more appropriate in each case:
Since population usually increases over time, RW(y ~ drift()) would be the best choice for this dataset. We can fit all three models and compare the forecast accuracy measures; the model with the lowest RMSE is usually preferred.
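For reference, over a test set of n observations,

$$\text{RMSE} = \sqrt{\frac{1}{n}\sum_{t=1}^{n}(y_t - \hat{y}_t)^2},$$

so smaller values mean point forecasts closer to the actual values.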
pop_train <- global_economy %>%
  filter(Country == "Australia") %>%
  filter_index("1960" ~ "2000")
pop_fit <- pop_train %>%
  model(
    `Naïve` = NAIVE(Population),
    `Seasonal naïve` = SNAIVE(Population),
    Drift = RW(Population ~ drift())
  )
## Warning: 1 error encountered for Seasonal naïve
## [1] Non-seasonal model specification provided, use RW() or provide a different lag specification.
pop_fc <- pop_fit %>%
  forecast(h = 10)
# The forecast horizon (2001-2010) lies outside pop_train, so scoring against
# the training set alone yields NaN for every measure. Score against the full
# global_economy series instead, which contains those years. (The seasonal
# naïve rows remain empty because that model failed to fit, as the warning
# above shows.)
accuracy(pop_fc, global_economy)
pop_fc %>%
  autoplot(pop_train) +
  labs(title = "Forecasts of the Australian population")
Since brick production has seasonal patterns, SNAIVE() would be the best choice for this dataset. We can fit all three models and compare the forecast accuracy measures; the model with the lowest RMSE is usually preferred.
bricks <- aus_production %>%
  select(Quarter, Bricks)
bricks_train <- bricks %>%
  filter_index("1992 Q1" ~ "2004 Q4")
bricks_test <- bricks %>%
  filter_index("2005 Q1" ~ .)
bricks_fit <- bricks_train %>%
  model(
    `Naïve` = NAIVE(Bricks),
    `Seasonal naïve` = SNAIVE(Bricks),
    Drift = RW(Bricks ~ drift())
  )
bricks_fc <- bricks_fit %>%
  forecast(h = 10)
accuracy(bricks_fc, bricks_test)
## # A tibble: 3 × 10
## .model .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Drift Test -2.41 39.9 39.9 -1.65 10.3 NaN NaN -0.5
## 2 Naïve Test -2 40.0 40 -1.55 10.3 NaN NaN -0.5
## 3 Seasonal naïve Test -21 39.1 33 -6.23 8.98 NaN NaN -0.5
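The MASE and RMSSE columns are NaN because only the test window was supplied to accuracy(); the scaled errors are benchmarked against the training observations, which must be present in the truth data. A minimal fix (a sketch, output not shown) is to pass the full series instead:

# Supplying the full series (training period included) lets accuracy()
# compute the scaled errors MASE and RMSSE as well.
accuracy(bricks_fc, bricks)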
SNAIVE() is the better model here, as it has the lowest RMSE.
bricks_fc %>%
  autoplot(bricks_train, level = NULL) +
  autolayer(bricks_test, Bricks, colour = "black") +
  labs(title = "Brick production forecasts") +
  guides(colour = guide_legend(title = "Forecast"))
Since livestock slaughter counts have seasonal patterns, SNAIVE() would be the best choice for this dataset. We can fit all three models and compare the forecast accuracy measures; the model with the lowest RMSE is usually preferred.
lambs_data <- aus_livestock %>%
  filter(Animal == "Lambs", State == "New South Wales") %>%
  select(Month, Count)
lambs_train <- lambs_data %>%
  filter_index("2005 Jan" ~ "2017 Dec")
lambs_test <- lambs_data %>%
  filter_index("2018 Jan" ~ "2019 Feb")
lambs_fit <- lambs_train %>%
  model(
    `Naïve` = NAIVE(Count),
    `Seasonal naïve` = SNAIVE(Count),
    Drift = RW(Count ~ drift())
  )
lambs_fc <- lambs_fit %>%
  forecast(h = 14)
accuracy(lambs_fc, lambs_test)
## Warning: The future dataset is incomplete, incomplete out-of-sample data will be treated as missing.
## 2 observations are missing between 2019 Jan and 2019 Feb
## # A tibble: 3 × 10
## .model .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Drift Test 47547. 69580. 62076. 9.90 14.5 NaN NaN 0.206
## 2 Naïve Test 49242. 70397. 62858. 10.3 14.6 NaN NaN 0.196
## 3 Seasonal naïve Test 9367. 52218. 42733. 0.957 10.7 NaN NaN 0.519
SNAIVE() is the better model here, as it has the lowest RMSE.
lambs_fc %>%
  autoplot(lambs_train, level = NULL) +
  autolayer(lambs_test, Count, colour = "black") +
  labs(title = "New South Wales lamb slaughter forecasts") +
  guides(colour = guide_legend(title = "Forecast"))
Since household wealth usually trends upward and is recorded annually, RW(y ~ drift()) would be the best choice for this dataset. We can fit all three models and compare the forecast accuracy measures; the model with the lowest RMSE is usually preferred.
wealth_data <- hh_budget %>%
  select(Year, Wealth)
wealth_train <- wealth_data %>%
  filter(Year <= 2010)
wealth_test <- wealth_data %>%
  filter(Year > 2010)
wealth_fit <- wealth_train %>%
  model(
    `Naïve` = NAIVE(Wealth),
    `Seasonal naïve` = SNAIVE(Wealth),
    Drift = RW(Wealth ~ drift())
  )
## Warning: 4 errors (1 unique) encountered for Seasonal naïve
## [4] Non-seasonal model specification provided, use RW() or provide a different lag specification.
wealth_fc <- wealth_fit %>%
  forecast(h = 10)
accuracy(wealth_fc, wealth_test)
## Warning: The future dataset is incomplete, incomplete out-of-sample data will be treated as missing.
## 4 observations are missing between 2017 and 2020
## # A tibble: 12 × 11
## .model Country .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Drift Austra… Test 21.7 36.6 30.5 5.20 7.95 NaN NaN 0.346
## 2 Drift Canada Test 34.9 43.4 36.1 6.44 6.72 NaN NaN 0.371
## 3 Drift Japan Test 28.3 31.7 28.3 4.77 4.77 NaN NaN 0.273
## 4 Drift USA Test 36.0 51.9 46.8 5.87 8.00 NaN NaN 0.377
## 5 Naïve Austra… Test 28.1 43.0 36.2 6.86 9.42 NaN NaN 0.363
## 6 Naïve Canada Test 50.0 59.9 50.0 9.30 9.30 NaN NaN 0.414
## 7 Naïve Japan Test 54.7 59.9 54.7 9.25 9.25 NaN NaN 0.432
## 8 Naïve USA Test 47.1 63.2 54.8 7.78 9.28 NaN NaN 0.404
## 9 Seasonal na… Austra… Test NaN NaN NaN NaN NaN NaN NaN NA
## 10 Seasonal na… Canada Test NaN NaN NaN NaN NaN NaN NaN NA
## 11 Seasonal na… Japan Test NaN NaN NaN NaN NaN NaN NaN NA
## 12 Seasonal na… USA Test NaN NaN NaN NaN NaN NaN NaN NA
The drift method gives the lowest RMSE for every country, supporting the initial choice.
wealth_fc %>%
  autoplot(wealth_train, level = NULL) +
  autolayer(wealth_test, Wealth, colour = "black") +
  labs(title = "Household wealth forecasts") +
  guides(colour = guide_legend(title = "Forecast"))
## Warning: Removed 40 rows containing missing values or values outside the scale range
## (`geom_line()`).
Since takeaway food turnover has seasonal patterns, SNAIVE() would seem the best choice for this dataset. We can fit all three models and compare the forecast accuracy measures; the model with the lowest RMSE is usually preferred.
takeaway_data <- aus_retail %>%
  filter(Industry == "Takeaway food services",
         State == "Australian Capital Territory") %>%
  select(Month, Turnover)
takeaway_train <- takeaway_data %>%
  filter_index("2000 Jan" ~ "2014 Dec")
takeaway_test <- takeaway_data %>%
  filter_index("2015 Jan" ~ "2018 Dec")
takeaway_fit <- takeaway_train %>%
  model(
    `Naïve` = NAIVE(Turnover),
    `Seasonal naïve` = SNAIVE(Turnover),
    Drift = RW(Turnover ~ drift())
  )
takeaway_fc <- takeaway_fit %>%
  forecast(h = 10) # covers only the first 10 months of the 4-year test window
accuracy(takeaway_fc, takeaway_test)
## # A tibble: 3 × 10
## .model .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Drift Test -1.54 1.98 1.54 -8.23 8.23 NaN NaN 0.569
## 2 Naïve Test -1.11 1.80 1.23 -6.14 6.70 NaN NaN 0.606
## 3 Seasonal naïve Test 3.68 4.08 3.68 18.3 18.3 NaN NaN 0.562
Contrary to the expectation above, the naïve method has the lowest RMSE over this short horizon, and the seasonal naïve method performs worst.
takeaway_fc %>%
  autoplot(takeaway_train, level = NULL) +
  autolayer(takeaway_test, Turnover, colour = "black") +
  labs(title = "Australian takeaway food turnover forecasts") +
  guides(colour = guide_legend(title = "Forecast"))
Use the Facebook stock price (data set gafa_stock) to do the following:
Produce a time plot of the series.
facebook_stock <- gafa_stock %>%
  filter(Symbol == "FB") %>%
  # re-index by trading day so the irregular daily series becomes regular
  mutate(day = row_number()) %>%
  update_tsibble(index = day, regular = TRUE)
facebook_stock %>%
  autoplot(Close) +
  labs(title = "Facebook daily closing stock prices")
Produce forecasts using the drift method and plot them.
facebook_2015 <- facebook_stock |>
  filter(year(Date) == 2015)
facebook_fit <- facebook_2015 %>%
  model(Drift = RW(Close ~ drift()))
facebook_jan_2016 <- facebook_stock |>
  filter(yearmonth(Date) == yearmonth("2016 Jan"))
facebook_fc <- facebook_fit |>
  forecast(new_data = facebook_jan_2016)
# Plot the forecasts
facebook_fc |>
  autoplot(facebook_2015) +
  autolayer(facebook_jan_2016, Close, colour = "black") +
  labs(y = "$US",
       title = "Facebook daily closing stock prices",
       subtitle = "(Jan 2015 - Jan 2016)") +
  guides(colour = guide_legend(title = "Forecast"))
Show that the forecasts are identical to extending the line drawn between the first and last observations.
# The drift forecast is the straight line through the first and last
# observations extended into the future, so overlaying that line on the
# drift forecasts from above shows they coincide.
facebook_fc |>
  autoplot(facebook_2015) +
  geom_line(
    data = slice(facebook_2015, c(1, n())),
    aes(y = Close),
    linetype = "dashed", colour = "blue"
  ) +
  labs(y = "$US",
       title = "Facebook daily closing stock prices",
       subtitle = "(Jan 2015 - Jan 2016)") +
  guides(colour = guide_legend(title = "Forecast"))
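Numerically, the drift point forecast at horizon h is the last observation plus h times the average historical change, which is exactly the slope of the first-to-last line. A quick check (a sketch reusing the objects above; output not shown):

# Recompute the drift forecasts by hand: last value plus h times the
# slope of the line joining the first and last observations.
y <- facebook_2015$Close
n_obs <- length(y)
slope <- (y[n_obs] - y[1]) / (n_obs - 1)
manual_fc <- y[n_obs] + seq_len(nrow(facebook_jan_2016)) * slope
# Should agree with the model's point forecasts
all.equal(manual_fc, facebook_fc$.mean)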
Try using some of the other benchmark functions to forecast the same data set. Which do you think is best? Why?
To capture the overall trend, the drift method is the best choice for the Facebook stock series, because it projects the average historical change forward. For short-term forecasts, the naïve method may be a good choice, as it assumes the next value will be the same as the most recent one.
facebook_fit <- facebook_2015 %>%
  model(Mean = MEAN(Close))
facebook_fc <- facebook_fit |>
  forecast(new_data = facebook_jan_2016)
# Plot the forecasts
facebook_fc |>
  autoplot(facebook_2015) +
  autolayer(facebook_jan_2016, Close, colour = "black") +
  labs(y = "$US",
       title = "Facebook daily closing stock prices",
       subtitle = "(Jan 2015 - Jan 2016)") +
  guides(colour = guide_legend(title = "Forecast"))
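To make the comparison concrete, a minimal sketch (reusing the objects defined above; output not shown) fits the three benchmarks together and scores them against the January 2016 actuals:

# Fit all three benchmark methods at once and compare test accuracy.
# MASE/RMSSE will be NaN here because only the test window is supplied.
fb_bench <- facebook_2015 %>%
  model(
    Mean = MEAN(Close),
    `Naïve` = NAIVE(Close),
    Drift = RW(Close ~ drift())
  )
fb_bench %>%
  forecast(new_data = facebook_jan_2016) %>%
  accuracy(facebook_jan_2016)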
Apply a seasonal naïve method to the quarterly Australian beer production data from 1992. Check if the residuals look like white noise, and plot the forecasts. What do you conclude?
The residuals are centred around 0 and show no obvious trend. The histogram of the residuals is roughly symmetric and close to normal. The autocorrelation plot shows some significant spikes, especially at lag 4, which means some patterns are left unexplained. The seasonal naïve method does a good job of capturing seasonality, since it repeats the value from the same quarter of the previous year, but it does not capture all of the structure, as the remaining significant spikes in the ACF plot show.
# Extract data of interest
recent_production <- aus_production |>
filter(year(Quarter) >= 1992)
# Define and estimate a model
fit <- recent_production |> model(SNAIVE(Beer))
# Look at the residuals
fit |> gg_tsresiduals()
## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 4 rows containing non-finite outside the scale range
## (`stat_bin()`).
# Look at some forecasts
fit |> forecast() |> autoplot(recent_production)
The Ljung-Box test on the residuals of the seasonal naïve model gives a test statistic of 32.3 with a p-value of about 0.00008. Since the p-value is less than 0.05, we conclude the residuals are not white noise.
augment(fit) |> features(.innov, ljung_box, lag = 8) # lag = 2m, with m = 4 for quarterly data
## # A tibble: 1 × 3
## .model lb_stat lb_pvalue
## <chr> <dbl> <dbl>
## 1 SNAIVE(Beer) 32.3 0.0000834
augment(fit) |> features(.innov, box_pierce, lag = 8)
## # A tibble: 1 × 3
## .model bp_stat bp_pvalue
## <chr> <dbl> <dbl>
## 1 SNAIVE(Beer) 29.7 0.000234
Repeat the previous exercise using the Australian Exports series from global_economy and the Bricks series from aus_production. Use whichever of NAIVE() or SNAIVE() is more appropriate in each case.
Since the exports data is annual and has no seasonal pattern, NAIVE() is more appropriate. The residual distribution appears spread out and not exactly bell-shaped, with some large spikes, especially around 2009-2010. The p-value for the Ljung-Box test is 0.194 and for the Box-Pierce test is 0.414; both are greater than 0.05, suggesting the residuals behave like white noise.
export <- global_economy %>%
  filter(Country == "Australia", Year >= 1992) %>%
  select(Year, Exports)
export_fit <- export %>%
  model(NAIVE(Exports))
# Look at the residuals
export_fit %>%
  gg_tsresiduals()
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_bin()`).
# Look at some forecasts
export_fit %>%
  forecast(h = 10) %>%
  autoplot(export)
augment(export_fit) |> features(.innov, ljung_box, lag = 8)
## # A tibble: 1 × 3
## .model lb_stat lb_pvalue
## <chr> <dbl> <dbl>
## 1 NAIVE(Exports) 11.1 0.194
augment(export_fit) |> features(.innov, box_pierce, lag = 8)
## # A tibble: 1 × 3
## .model bp_stat bp_pvalue
## <chr> <dbl> <dbl>
## 1 NAIVE(Exports) 8.20 0.414
The residuals fluctuate, with large negative values around 1994-1995 and the early 2000s. There are strong positive autocorrelations at lags 1 and 2, and the remaining lags are negative. The distribution is skewed rather than normal, and the p-values of both tests are less than 0.05. All of this indicates that the residuals are not white noise.
# modified from code given by text
# Extract data of interest
recent_production <- aus_production |>
filter(year(Quarter) >= 1992, !is.na(Bricks))
# Define and estimate a model
fit <- recent_production |> model(SNAIVE(Bricks))
# Look at the residuals
fit |> gg_tsresiduals()
## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 4 rows containing non-finite outside the scale range
## (`stat_bin()`).
# Look at some forecasts
fit |> forecast() |> autoplot(recent_production)
augment(fit) |> features(.innov, ljung_box, lag=8)
## # A tibble: 1 × 3
## .model lb_stat lb_pvalue
## <chr> <dbl> <dbl>
## 1 SNAIVE(Bricks) 62.2 1.74e-10
augment(fit) |> features(.innov, box_pierce, lag=8)
## # A tibble: 1 × 3
## .model bp_stat bp_pvalue
## <chr> <dbl> <dbl>
## 1 SNAIVE(Bricks) 56.2 0.00000000256
For your retail time series (from Exercise 7 in Section 2.10):
Create a training dataset consisting of observations before 2011 using:
set.seed(12345678)
myseries <- aus_retail |>
filter(`Series ID` == sample(aus_retail$`Series ID`,1))
myseries_train <- myseries |>
filter(year(Month) < 2011)
Check that your data have been split appropriately by producing the following plot.
autoplot(myseries, Turnover) +
autolayer(myseries_train, Turnover, colour = "red")
Fit a seasonal naïve model using SNAIVE() applied to your training data (myseries_train).
fit <- myseries_train |>
model(SNAIVE(Turnover))
Check the residuals. Do the residuals appear to be uncorrelated and normally distributed?
The residuals fluctuate around 0, but the ACF plot shows significant autocorrelations at several lags, which means the residuals are correlated. The histogram is close to normal. Overall, the residuals are not white noise.
fit |> gg_tsresiduals()
## Warning: Removed 12 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 12 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 12 rows containing non-finite outside the scale range
## (`stat_bin()`).
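A formal portmanteau test (a sketch following the same pattern as above, with lag = 2m = 24 for monthly data; output not shown) backs up the visual impression:

# Ljung-Box test on the innovation residuals (lag = 2m = 24 for monthly data)
augment(fit) |>
  features(.innov, ljung_box, lag = 24)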
Produce forecasts for the test data
fc <- fit |>
forecast(new_data = anti_join(myseries, myseries_train))
## Joining with `by = join_by(State, Industry, `Series ID`, Month, Turnover)`
fc |> autoplot(myseries)
Compare the accuracy of your forecasts against the actual values.
The errors (RMSE, MAE) are higher on the test set than on the training set, which is expected.
fit |> accuracy()
## # A tibble: 1 × 12
## State Industry .model .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Norther… Clothin… SNAIV… Trai… 0.439 1.21 0.915 5.23 12.4 1 1 0.768
fc |> accuracy(myseries)
## # A tibble: 1 × 12
## .model State Industry .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 SNAIVE(T… Nort… Clothin… Test 0.836 1.55 1.24 5.94 9.06 1.36 1.28 0.601
How sensitive are the accuracy measures to the amount of training data used?
With less training data, the accuracy measures become less stable and typically worse; with more training data they usually improve. However, a very long training window can include old patterns that no longer hold, which can also hurt test accuracy.
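As a rough illustration (a sketch reusing the myseries object above; the window start years are arbitrary choices and the output is not shown), we can refit the seasonal naïve model on training windows of different lengths and compare test-set accuracy:

# Refit SNAIVE on progressively shorter training windows and compare
# test-set accuracy. (MASE/RMSSE are NaN because only the test window
# is supplied as the truth data.)
test <- myseries |> filter(year(Month) >= 2011)
window_starts <- c(1990, 1995, 2000, 2005)

bind_rows(lapply(window_starts, function(start_year) {
  myseries |>
    filter(year(Month) >= start_year, year(Month) < 2011) |>
    model(SNAIVE(Turnover)) |>
    forecast(new_data = test) |>
    accuracy(test) |>
    mutate(train_start = start_year, .before = 1)
}))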