HW 3: Residuals, Benchmark Methods for Forecasting, & Measures of Accuracy

HW 3

5.1, 5.2, 5.3, 5.4 and 5.7

library(fpp3)

5.1

Produce forecasts for the following series using whichever of NAIVE(y), SNAIVE(y) or RW(y ~ drift()) is more appropriate in each case:

Australian Population (global_economy)
Bricks (aus_production)
NSW Lambs (aus_livestock)
Household wealth (hh_budget).
Australian takeaway food turnover (aus_retail).

Australian Population (global_economy)

ge <- global_economy %>% 
  filter(Country=="Australia") %>% 
  select(Year, Population)

train <- ge %>% 
  filter_index("1960" ~ "2017")

ge_fit <- train |>
  model(
    `Naïve` = NAIVE(Population),
    `Drift` = RW(Population ~ drift())
  )

ge_fc <- ge_fit %>%  forecast(h = "10 years")

# Plot forecasts against actual values
ge_fc %>% 
  autoplot(train, level = NULL) +
  autolayer(
    filter_index(ge, "2018" ~ .),
    colour = "black"
  ) +
  labs(
    y = "Population",
    title = "Forecasts for Australian Population"
  ) +
  guides(color = guide_legend(title = "Forecast"))

## Plot variable not specified, automatically selected `.vars = Population`

Seasonal Naive method is not applicable here, since there is only one data point per year. Drift works best.

Bricks (aus_production)

bricks <- aus_production %>% 
  select(Quarter, Bricks)

train <- bricks %>% 
  filter_index("1990 Q1" ~ "2005 Q2")

bricks_fit <- train %>% 
  model(
    `Naïve` = NAIVE(Bricks),
    `Seasonal Naive` = SNAIVE(Bricks),
    `Drift` = RW(Bricks ~ drift())
  )

bricks_fc <- bricks_fit %>%  forecast(h = "5 years")

# Plot forecasts against actual values
bricks_fc %>% 
  autoplot(train, level = NULL) +
  autolayer(
    filter_index(bricks, "2005 Q3" ~ .),
    colour = "black"
  ) +
  labs(
    y = "Bricks",
    title = "Forecasts for Brick Production"
  ) +
  guides(color = guide_legend(title = "Forecast"))

## Plot variable not specified, automatically selected `.vars = Bricks`

## Warning: Removed 20 rows containing missing values (`geom_line()`).

Seasonal Naive follows the zig-zagging pattern best.

NSW Lambs (aus_livestock)

al <- aus_livestock %>% 
  filter(Animal=="Lambs", State=="New South Wales") %>% 
  select(Month, Count)

train <- al %>% 
  filter_index("2014 Jan" ~ "2018 Dec")

al_fit <- train |>
  model(
    `Naïve` = NAIVE(Count),
    `Seasonal Naive` = SNAIVE(Count),
    `Drift` = RW(Count ~ drift())
  )

al_fc <- al_fit %>%  forecast(h = "5 years")

# Plot forecasts against actual values
al_fc %>% 
  autoplot(train, level = NULL) +
  autolayer(
    filter_index(al, "2018 Dec" ~ .),
    colour = "black"
  ) +
  labs(
    y = "Count",
    title = "Forecasts for NSW Lambs"
  ) +
  guides(color = guide_legend(title = "Forecast"))

## Plot variable not specified, automatically selected `.vars = Count`
## `geom_line()`: Each group consists of only one observation.
## i Do you need to adjust the group aesthetic?

Seasonal Naive works best.

Household wealth (hh_budget).

hb <- hh_budget %>% 
  select(Year, Wealth)

train <- hb %>% 
  filter_index("1995" ~ "2016")

hb_fit <- train |>
  model(
    `Naïve` = NAIVE(Wealth),
    `Drift` = RW(Wealth ~ drift())
  )

hb_fc <- hb_fit %>%  forecast(h = "10 years")

# Plot forecasts against actual values
hb_fc %>% 
  autoplot(train, level = NULL) +
  autolayer(
    filter_index(hb, "2017" ~ .),
    colour = "black"
  ) +
  labs(
    y = "Wealth (percentage of net disposable income)",
    title = "Forecasts for Household Wealth"
  ) +
  guides(color = guide_legend(title = "Forecast"))

## Plot variable not specified, automatically selected `.vars = Wealth`

Again the Seasonal Naive method is not applicable here. Drift seems to be a better fit since it models the upward trend better.

Australian takeaway food turnover (aus_retail).

food <- aus_retail %>%
  filter(State %in% c("Australian Capital Territory", "South Australia", "Western Australia"),Industry == "Food retailing") %>% 
  select(Month, Turnover)

train <- food %>% 
  filter_index("2005 Jan" ~ "2010 Dec")

food_fit <- train %>% 
  model(
    `Naïve` = NAIVE(Turnover),
    `Seasonal Naive` = SNAIVE(Turnover),
    `Drift` = RW(Turnover ~ drift())
  )

food_fc <- food_fit %>%  forecast(h = "5 years")

# Plot forecasts against actual values
food_fc %>% 
  autoplot(train, level = NULL) +
  autolayer(
    filter_index(food, "2011 Jan" ~ .),
    colour = "black"
  ) +
  labs(
    y = "Turnover",
    title = "Forecasts for Food Retail Turnover"
  ) +
  guides(color = guide_legend(title = "Forecast"))

## Plot variable not specified, automatically selected `.vars = Turnover`

It wouldn’t have looked very organized if all States were plotted so only the ones with “Australia” in the name were chosen. For all three, Seasonal Naive captures the pattern well, but the lines need to be shifted higher. The Naive model is at the right level, but doesn’t capture the pattern. It is probably the best because it’s a decent average.

5.2

Use the Facebook stock price (data set gafa_stock) to do the following:

Produce a time plot of the series.

fb <- gafa_stock %>% 
  filter(Symbol == "FB", year(Date) >= 2015) %>% 
  select(Date, Close) #%>% 
  # mutate(day = row_number()) |>
  # update_tsibble(index = day, regular = TRUE)

fb %>% autoplot()

## Plot variable not specified, automatically selected `.vars = Close`

Produce forecasts using the drift method and plot them.

fb <- gafa_stock %>% 
  filter(Symbol == "FB", year(Date) >= 2015) %>% 
  select(Date, Close) %>% 
  mutate(day = row_number()) |>
  update_tsibble(index = day, regular = TRUE)

# Filter the year of interest
fb_2015 <- fb %>%  filter(year(Date) == 2015)
# Fit the models
fb_fit <- fb_2015 |>
  model(
    Drift = NAIVE(Close ~ drift())
  )
# Produce forecasts for the trading days in January 2016
fb_jan_2016 <- fb %>% 
  filter(yearmonth(Date) == yearmonth("2016 Jan"))
fb_fc <- fb_fit %>% 
  forecast(new_data = fb_jan_2016)
# Plot the forecasts
fb_fc %>% 
  autoplot(fb_2015, level = NULL) +
  autolayer(fb_jan_2016, Close, colour = "black") +
  labs(y = "$US",
       title = "Facebook daily closing stock prices",
       subtitle = "(Jan 2015 - Jan 2016)") +
  guides(colour = guide_legend(title = "Forecast"))

Show that the forecasts are identical to extending the line drawn between the first and last observations.

fb_fc %>% 
  autoplot(fb_2015, level = NULL) +
  autolayer(fb_jan_2016, Close, color = "black") +
  
  autolayer(bind_rows(fb_2015 %>% head(1),fb_2015 %>% tail(1)), Close, color="orange") + 


  labs(y = "$US",
       title = "Facebook daily closing stock prices",
       subtitle = "(Jan 2015 - Jan 2016)") +
  guides(colour = guide_legend(title = "Forecast"))

Try using some of the other benchmark functions to forecast the same data set. Which do you think is best? Why?

fb <- gafa_stock %>% 
  filter(Symbol == "FB", year(Date) >= 2015) %>% 
  select(Date, Close) %>% 
  mutate(day = row_number()) |>
  update_tsibble(index = day, regular = TRUE)

# Filter the year of interest
fb_2015 <- fb %>%  filter(year(Date) == 2015)
# Fit the models
fb_fit <- fb_2015 |>
  model(
    Mean = MEAN(Close),
    `Naïve` = NAIVE(Close),
    `Seasonal Naive` = SNAIVE(Close),
    Drift = NAIVE(Close ~ drift())
  )

## Warning: 1 error encountered for Seasonal Naive
## [1] Non-seasonal model specification provided, use RW() or provide a different lag specification.

# Produce forecasts for the trading days in January 2016
fb_jan_2016 <- fb %>% 
  filter(yearmonth(Date) == yearmonth("2016 Jan"))
fb_fc <- fb_fit %>% 
  forecast(new_data = fb_jan_2016)
# Plot the forecasts
fb_fc %>% 
  autoplot(fb_2015, level = NULL) +
  autolayer(fb_jan_2016, Close, colour = "black") +
  labs(y = "$US",
       title = "Facebook daily closing stock prices",
       subtitle = "(Jan 2015 - Jan 2016)") +
  guides(colour = guide_legend(title = "Forecast"))

## Warning: Removed 19 rows containing missing values (`()`).

None of these benchmark models work well because the stock price is too volatile. There needs to be a model that takes the zig-zag patterns into account.

5.3

Apply a seasonal naïve method to the quarterly Australian beer production data from 1992. Check if the residuals look like white noise, and plot the forecasts. The following code will help.

# Extract data of interest
recent_production <- aus_production |>
  filter(year(Quarter) >= 1992)
# Define and estimate a model
fit <- recent_production |> model(SNAIVE(Beer))
# Look at the residuals
fit |> gg_tsresiduals()

## Warning: Removed 4 rows containing missing values (`geom_line()`).

## Warning: Removed 4 rows containing missing values (`geom_point()`).

## Warning: Removed 4 rows containing non-finite values (`stat_bin()`).

# Look a some forecasts
fit |> forecast() |> autoplot(recent_production)

What do you conclude?

There is high autocorrelation on lag 4. To a lesser extent, lags 1, 3, and 10 also make contact with the dotted lines. In addition, the residual histogram is not normal-shaped. The residuals appear to be equally above and below the 0 line, but, overall they do not look like white noise.

5.4

Repeat the previous exercise using the Australian Exports series from global_economy and the Bricks series from aus_production. Use whichever of NAIVE() or SNAIVE() is more appropriate in each case.

Australian Exports

aus_exp <- global_economy %>% 
  filter(Country=="Australia", Year >=2005) %>% 
  select(Year, Exports)

fit <- aus_exp %>% model(NAIVE(Exports))

fit %>% gg_tsresiduals()

## Warning: Removed 1 row containing missing values (`geom_line()`).

## Warning: Removed 1 rows containing missing values (`geom_point()`).

## Warning: Removed 1 rows containing non-finite values (`stat_bin()`).

fit %>% forecast() %>% autoplot(aus_exp)

In the ACF plot, only the first lag makes contact with the dotted line, which isn’t bad. The residuals appear to be symmetric above and below the 0 line. However, the residuals histogram is not normal so this probably not white noise.

Bricks from aus_production

bricks <- aus_production %>% 
  select(Quarter, Bricks) %>% 
  filter(year(Quarter) >= 2000)

fit <- bricks %>% model(SNAIVE(Bricks))

fit %>% gg_tsresiduals()

## Warning: Removed 24 rows containing missing values (`geom_line()`).

## Warning: Removed 24 rows containing missing values (`geom_point()`).

## Warning: Removed 24 rows containing non-finite values (`stat_bin()`).

fit %>% forecast() %>% autoplot(bricks)

## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning
## -Inf

## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning
## -Inf

## Warning: Removed 8 rows containing missing values (`()`).

## Warning: Removed 20 rows containing missing values (`geom_line()`).

With a Seasonal Naive model, there is only autocorrelation for the first lag, which is to be expected. But the residuals histogram is clearly not normal, and the innovation residuals do not average to 0. This is not white noise.

5.7

For your retail time series (from Exercise 7 in Section 2.10):

set.seed(50)
myseries <- aus_retail |>
  filter(`Series ID` == sample(aus_retail$`Series ID`,1))

Create a training dataset consisting of observations before 2011 using

myseries_train <- myseries |>
  filter(year(Month) < 2011)

Check that your data have been split appropriately by producing the following plot.

autoplot(myseries, Turnover) +
  autolayer(myseries_train, Turnover, colour = "red")

Fit a seasonal naïve model using SNAIVE() applied to your training data (myseries_train).

fit <- myseries_train |>
  model(SNAIVE(Turnover))

Check the residuals.

fit |> gg_tsresiduals()

## Warning: Removed 12 rows containing missing values (`geom_line()`).

## Warning: Removed 12 rows containing missing values (`geom_point()`).

## Warning: Removed 12 rows containing non-finite values (`stat_bin()`).

Do the residuals appear to be uncorrelated and normally distributed? The residuals appear to be correlated. In the innovation residuals plot, the variance is not homoscedastic. Variance increases over time. In the ACF plot, the autocorrelation is significantly different from zero in several places. In the residuals histogram, the distributin is not normal and there are some outliers.

Produce forecasts for the test data

fc <- fit |>
  forecast(new_data = anti_join(myseries, myseries_train))

## Joining with `by = join_by(State, Industry, `Series ID`, Month, Turnover)`

fc |> autoplot(myseries)

Compare the accuracy of your forecasts against the actual values.

fit |> accuracy()

## # A tibble: 1 x 12
##   State    Industry .model .type    ME  RMSE   MAE   MPE  MAPE  MASE RMSSE  ACF1
##   <chr>    <chr>    <chr>  <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Queensl~ Other r~ SNAIV~ Trai~  2.94  7.16  5.23  6.38  10.2     1     1 0.713

fc |> accuracy(myseries)

## # A tibble: 1 x 12
##   .model    State Industry .type    ME  RMSE   MAE   MPE  MAPE  MASE RMSSE  ACF1
##   <chr>     <chr> <chr>    <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 SNAIVE(T~ Quee~ Other r~ Test   17.3  26.8  22.0  12.6  18.8  4.22  3.75 0.795

How sensitive are the accuracy measures to the amount of training data used? Having too much training data can make a model overfit, leading to decreased accuracy. Likewise, too little training data can make it underfit.

HW 3: Residuals, Benchmark Methods for Forecasting, & Measures of Accuracy

Tora Mullings

2023-09-23

HW 3

5.1

5.2

5.3

5.4

5.7