suppressWarnings({
library(fpp3)
library(dplyr)
library(tidyr)
library(fable)
})
## Registered S3 method overwritten by 'tsibble':
## method from
## as_tibble.grouped_df dplyr
## ── Attaching packages ──────────────────────────────────────────── fpp3 1.0.0 ──
## ✔ tibble 3.2.1 ✔ tsibble 1.1.5
## ✔ dplyr 1.1.2 ✔ tsibbledata 0.4.1
## ✔ tidyr 1.3.0 ✔ feasts 0.3.2
## ✔ lubridate 1.9.2 ✔ fable 0.3.4
## ✔ ggplot2 3.5.1 ✔ fabletools 0.4.2
## ── Conflicts ───────────────────────────────────────────────── fpp3_conflicts ──
## ✖ lubridate::date() masks base::date()
## ✖ dplyr::filter() masks stats::filter()
## ✖ tsibble::intersect() masks base::intersect()
## ✖ tsibble::interval() masks lubridate::interval()
## ✖ dplyr::lag() masks stats::lag()
## ✖ tsibble::setdiff() masks base::setdiff()
## ✖ tsibble::union() masks base::union()
aus_pop <- global_economy %>%
filter(Country == "Australia")
autoplot(aus_pop, Population) +
labs(title = "Australian Population")
Since this time series follows a clear linear trend, a drift model, which extrapolates the average change between the first and last observations, will produce the most appropriate forecasts.
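As a quick check on that claim, the slope the drift method extrapolates can be computed directly from the first and last observations (a minimal sketch; first_pop and last_pop are just illustrative names):
# Average yearly change between the first and last observations -- the slope
# that RW(Population ~ drift()) extrapolates forward
first_pop <- first(aus_pop$Population)
last_pop <- last(aus_pop$Population)
(last_pop - first_pop) / (nrow(aus_pop) - 1)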
aus_pop_model <- aus_pop %>%
model(RW(Population ~ drift()))
aus_pop_model %>%
forecast() %>%
autoplot(aus_pop) +
labs(title = "Australian Population Forecast")
aus_bricks <- aus_production %>%
  select(Quarter, Bricks) %>%
  drop_na()
autoplot(aus_bricks, Bricks) +
labs(title = "Australian Brick Production")
Since the bricks time series exhibits seasonality, the SNAIVE() method will be most appropriate.
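Because the seasonal naive method repeats the most recent observation from the same quarter, the forecasts will simply recycle the last four observed quarterly values; a minimal sketch to see them:
# The last complete seasonal cycle of observations -- the values SNAIVE repeats
tail(aus_bricks$Bricks, 4)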
aus_bricks_model <- aus_bricks %>%
model(SNAIVE(Bricks))
aus_bricks_model %>%
forecast() %>%
autoplot(aus_bricks) +
labs(title = "Australian Brick Production Forecast")
nsw_lambs <- aus_livestock %>%
filter(State == "New South Wales") %>%
filter(Animal == "Lambs")
autoplot(nsw_lambs, Count) +
labs(title = "Lambs in New South Wales")
Like brick production, the count of lambs slaughtered in New South Wales exhibits seasonality, so SNAIVE() is most appropriate.
nsw_lambs_model <- nsw_lambs %>%
model(SNAIVE(Count))
nsw_lambs_model %>%
forecast() %>%
autoplot(nsw_lambs) +
labs(title = "New South Wales Lambs Forecast")
hh_wealth <- hh_budget %>%
  select(Country, Year, Wealth)
autoplot(hh_wealth, Wealth) +
labs(title = "Household Wealth", y = "Wealth (USD)")
With an upward trend, the drift method will be appropriate.
hh_wealth_model <- hh_wealth %>%
model(RW(Wealth ~ drift()))
hh_wealth_model %>%
forecast() %>%
autoplot(hh_wealth) +
labs(title = "Household Wealth Forecast")
aus_takeaway <- aus_retail %>%
filter(Industry == "Takeaway food services")
autoplot(aus_takeaway, Turnover) +
labs(title = "Turnover at Australian Takeaways")
With clear seasonality, the SNAIVE() method will be most appropriate.
aus_takeaway_model <- aus_takeaway %>%
model(SNAIVE(Turnover))
aus_takeaway_forecast <- aus_takeaway_model %>%
forecast()
ggplot() +
geom_line(data = aus_takeaway, aes(x = Month, y = Turnover, color = State)) +
geom_line(data = aus_takeaway_forecast, aes(x = Month, y = .mean, color = State), linetype = "dashed") +
labs(title = "Australian Takeaway Turnover Forecast",
y = "Turnover",
x = "Time") +
theme_minimal()
# Keep the FB closing prices, declare the daily index regular, and fill the
# non-trading days with NA so the series has no gaps
fb_close_price <- gafa_stock %>%
  select(Symbol, Date, Close) %>%
  filter(Symbol == "FB", year(Date) >= 2014) %>%
  mutate(day = row_number()) %>%
  update_tsibble(index = Date, regular = TRUE) %>%
  fill_gaps()
autoplot(fb_close_price, Close) +
labs(title = "Facebook Closing Stock Price")
fb_close_model <- fb_close_price %>%
model(RW(Close ~ drift()))
fb_close_forecast <- fb_close_model %>%
forecast(h = "3 years")
fb_close_forecast %>%
autoplot(fb_close_price) +
labs(title = "Facebook Close Price Forecast")
We can show that the drift forecast line and the line drawn between the first and last observations are the same by comparing their slopes and y-intercepts.
fb_close_forecast <- fb_close_forecast %>%
  mutate(day = row_number())
# Compare slopes: 1096 is the number of forecast days (2019-2021), 1825 is the
# last row of the gap-filled observed series, and 1258 is the number of trading
# days in the 2014-2018 sample
slope_forecast <- (fb_close_forecast$.mean[1096] - fb_close_forecast$.mean[1]) / (1096 - 1)
slope_actual <- (fb_close_price$Close[1825] - fb_close_price$Close[1]) / (1258 - 1)
round(slope_actual, 5) == round(slope_forecast, 5)
## [1] TRUE
# Compare y-intercepts: 131.1508 is the first forecast mean, 54.71 is the first
# observed closing price, and day 1259 is the first step after the 1258 observed
# trading days
forecast_b <- 131.1508 - (slope_forecast * 1259)
actual_b <- 54.71 - (slope_actual * 1)
round(forecast_b, 3) == round(actual_b, 3)
## [1] TRUE
Since both the forecast and the line drawn between the first and last observations have the same slope and y-intercept, they are the same line.
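The same comparison can be made without hard-coded row numbers; a minimal sketch, assuming the fitted RW model exposes its drift coefficient through tidy():
# Estimated drift coefficient (average change per step) from the fitted model
tidy(fb_close_model)
# Slope of the straight line joining the first and last observed closing prices
observed <- fb_close_price %>% filter(!is.na(Close))
(last(observed$Close) - first(observed$Close)) / (nrow(observed) - 1)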
fb_close_price %>%
model(NAIVE(Close)) %>%
forecast(h = "3 years") %>%
autoplot(fb_close_price) +
labs(title = "Facebook Close Price Forecast: NAIVE Model")
fb_close_price %>%
model(SNAIVE(Close)) %>%
forecast(h = "3 years") %>%
autoplot(fb_close_price) +
labs(title = "Facebook Close Price Forecast: SNAIVE Model")
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_line()`).
I think the NAIVE model is best. The drift model is not appropriate because it relies on the historical data to predict an overall upward trend, when the recent trend has been downward. I would not use the SNAIVE model because there is no apparent seasonality.
The following code will help.
# Extract data of interest
recent_production <- aus_production |>
filter(year(Quarter) >= 1992)
# Define and estimate a model
fit <- recent_production |> model(SNAIVE(Beer))
# Look at the residuals
fit |> gg_tsresiduals()
## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 4 rows containing non-finite outside the scale range
## (`stat_bin()`).
# Look at some forecasts
fit |> forecast() |> autoplot(recent_production)
The residuals do appear to be uncorrelated. They are also approximately normally distributed, with a mean of 0. This suggests that the SNAIVE model is a good fit for the data.
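A Ljung-Box test gives a more formal check of the residual autocorrelation than reading the ACF plot alone; a minimal sketch (lag = 8, two seasonal periods, is a common rule of thumb rather than anything required here):
# Portmanteau test on the innovation residuals; a large p-value is consistent
# with white-noise residuals
fit |> augment() |> features(.innov, ljung_box, lag = 8)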
Use whichever of NAIVE() or SNAIVE() is more appropriate in each case.
aus_exports <- global_economy %>%
  select(Country, Year, Exports) %>%
  filter(Country == "Australia")
autoplot(aus_exports, Exports) +
labs(title = "Australian Exports")
aus_exports_model <- aus_exports %>%
model(NAIVE(Exports))
aus_exports_model %>%
forecast() %>%
autoplot(aus_exports) +
labs(title = "Australian Exports Forecast")
aus_exports_model %>% gg_tsresiduals()
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_bin()`).
I used the NAIVE model because the data are annual, so there is no seasonality to capture. The residuals appear to be uncorrelated and approximately normally distributed with a mean of 0, so we can conclude this is a good fit.
aus_bricks_model %>%
forecast() %>%
autoplot(aus_bricks) +
labs(title = "Australian Brick Production Forecast")
gg_tsresiduals(aus_bricks_model)
## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 4 rows containing non-finite outside the scale range
## (`stat_bin()`).
I used the SNAIVE model since there is seasonality in the data. These residual plots do not suggest as good a fit as the previous two models provided for their data sets. There is some heteroscedasticity, and the residuals appear to be skewed rather than normally distributed. However, the mean is still close to 0, and there is no obvious pattern in the residuals. Still, there may be a better way to model this data.
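One option worth exploring (a sketch only, not a claim that it outperforms the plain SNAIVE model) is to stabilise the variance with a Box-Cox transformation chosen by the Guerrero method; because the transformation sits inside the model formula, fable back-transforms the forecasts automatically:
# Pick a Box-Cox lambda, fit SNAIVE on the transformed series, and forecast;
# the forecasts are returned on the original Bricks scale
lambda <- aus_bricks %>%
  features(Bricks, features = guerrero) %>%
  pull(lambda_guerrero)
aus_bricks %>%
  model(SNAIVE(box_cox(Bricks, lambda))) %>%
  forecast() %>%
  autoplot(aus_bricks) +
  labs(title = "Brick Production Forecast with Box-Cox Transformation")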
set.seed(1989)
myseries <- aus_retail |>
filter(`Series ID` == sample(aus_retail$`Series ID`,1))
autoplot(myseries, Turnover) +
labs(title = "Turnover at Australian Retail Stores")
myseries_train <- myseries |>
filter(year(Month) < 2011)
autoplot(myseries, Turnover) +
autolayer(myseries_train, Turnover, colour = "red") +
labs(title = "Australian Retail Turnover: Test/ Train Split", y = "Turnover")
fit <- myseries_train |>
model(SNAIVE(Turnover))
fit |> gg_tsresiduals()
## Warning: Removed 12 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 12 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 12 rows containing non-finite outside the scale range
## (`stat_bin()`).
fc <- fit |>
forecast(new_data = anti_join(myseries, myseries_train))
## Joining with `by = join_by(State, Industry, `Series ID`, Month, Turnover)`
fc |> autoplot(myseries)
fit |> accuracy()
## # A tibble: 1 × 12
## State Industry .model .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Norther… Food re… SNAIV… Trai… 3.10 4.99 3.99 5.28 6.99 1.00 1 0.823
fc |> accuracy(myseries)
## # A tibble: 1 × 12
## .model State Industry .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 SNAIVE(T… Nort… Food re… Test 20.1 23.4 20.6 16.6 17.0 5.15 4.69 0.932
The forecast did not produce very accurate predictions for the test data, which we can see from accuracy measures such as the root mean squared error. The SNAIVE model accounted for the seasonality but not the upward trend, leaving the forecast significantly lower than the actual data. In addition, we had previously used a Box-Cox transformation on this time series because of its unequal variance. The model may be improved by first performing the Box-Cox transformation, then creating the model and forecast, and finally back-transforming the forecast to the original scale.
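In fable this whole sequence can be written in one step, because a transformation placed inside the model formula is automatically inverted when forecasting; a minimal sketch reusing the training split above (the lambda is re-estimated here with the Guerrero method rather than carried over from the earlier exercise, and fit_bc is an illustrative name):
# Estimate a Box-Cox lambda from the training data, fit SNAIVE on the
# transformed turnover, and evaluate the back-transformed forecasts
lambda <- myseries_train |>
  features(Turnover, features = guerrero) |>
  pull(lambda_guerrero)
fit_bc <- myseries_train |>
  model(SNAIVE(box_cox(Turnover, lambda)))
fit_bc |>
  forecast(new_data = anti_join(myseries, myseries_train)) |>
  accuracy(myseries)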
Given the upward trend in this series, the amount of training data used, and hence how far ahead the model must forecast, should significantly affect the accuracy of the forecast. For example, if we use all but the last year as training data:
myseries_train_2 <- myseries |>
filter(year(Month) < 2018)
fit_2 <- myseries_train_2 |>
model(SNAIVE(Turnover))
fit_2 |> gg_tsresiduals()
## Warning: Removed 12 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 12 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 12 rows containing non-finite outside the scale range
## (`stat_bin()`).
fc_2 <- fit_2 |>
forecast(new_data = anti_join(myseries, myseries_train_2))
## Joining with `by = join_by(State, Industry, `Series ID`, Month, Turnover)`
fc_2 |> autoplot(myseries)
fit_2 |> accuracy()
## # A tibble: 1 × 12
## State Industry .model .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Norther… Food re… SNAIV… Trai… 3.38 5.47 4.22 4.89 6.34 1.00 1 0.829
fc_2 |> accuracy(myseries)
## # A tibble: 1 × 12
## .model State Industry .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 SNAIVE(T… Nort… Food re… Test 2.22 2.98 2.6 1.67 2.02 0.616 0.546 0.194
We can see from the graph that this forecast tracks the actual data much more closely, and accuracy measures such as the RMSE decrease substantially compared with the first forecast, which used less training data.
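For a compact comparison of the two experiments, the test-set accuracy tables can be stacked into one tibble; a minimal sketch (the training_end labels are illustrative):
# Side-by-side test-set accuracy for the two training windows
bind_rows(
  fc |> accuracy(myseries) |> mutate(training_end = "Dec 2010"),
  fc_2 |> accuracy(myseries) |> mutate(training_end = "Dec 2017")
) |>
  select(training_end, .model, RMSE, MAE, MAPE, MASE)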