library(fpp3)
library(RColorBrewer)
library(seasonal)
Do exercises 5.1, 5.2, 5.3, 5.4 and 5.7 in the Hyndman book. Please submit your Rpubs link as well as your .pdf file showing your run code.
The Australian Population dataset from global_economy seems to be fairly linear. As such, it makes sense to forecast using the drift method, which will extrapolate a line drawn between the first and last observations.
global_economy_aus <- global_economy %>% filter(Country == "Australia")
global_economy_aus %>% autoplot(Population)
global_economy_aus %>% model(drift = RW(Population ~ drift())) %>% forecast(h = 10) %>%
autoplot(global_economy_aus, level = NULL, size = 2)
The Bricks series from aus_production appears to have a seasonal pattern that repeats over roughly 5 years. Since the index of this tsibble is quarterly, I specified a 5-year lag in the SNAIVE forecast, which repeats that 5-year seasonal pattern; in my case the forecast horizon was 40 quarters, equivalent to 10 years.
aus_production_bricks <- aus_production %>% filter(!is.na(Bricks))
aus_production_bricks %>% autoplot(Bricks)
aus_production_bricks %>% model(seasonal_naive = SNAIVE(Bricks ~ lag("5 year"))) %>%
forecast(h = 40) %>% autoplot(aus_production_bricks,level = NULL, size = 2)
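As an optional check beyond the exercise, the seasonal structure can be inspected before committing to a 5-year lag; this is a sketch using the feasts functions loaded by fpp3 (gg_season shows the within-year pattern, and the ACF over 40 quarters shows at which lags the series is most strongly correlated).
aus_production_bricks %>% gg_season(Bricks)
aus_production_bricks %>% ACF(Bricks, lag_max = 40) %>% autoplot()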
The New South Wales Lambs series from aus_livestock also appears to be seasonal, with the pattern repeating over a single year. As such, no lag was specified (SNAIVE defaults to the seasonal period of one year for this monthly data) and SNAIVE was used again for the forecast.
aus_livestock_lambs <- aus_livestock %>% filter(Animal == "Lambs", State == "New South Wales")
aus_livestock_lambs %>% autoplot(Count)
aus_livestock_lambs %>% model(seasonal_naive = SNAIVE(Count)) %>%
forecast(h = 10) %>% autoplot(aus_livestock_lambs,level = NULL)
The household wealth series from hh_budget does not show any seasonal pattern, although there are potentially cycles of wealth spikes and declines. A seasonal method with a drift component was a possibility, but I chose the drift method alone for this forecast, as the cyclic pattern is not regular enough to justify SNAIVE.
hh_budget_wealth <- hh_budget %>% filter(!is.na(Wealth))
hh_budget_wealth %>% autoplot(Wealth)
hh_budget_wealth %>%
model(drift = RW(Wealth ~ drift())) %>%
forecast(h = 10) %>%
autoplot(hh_budget_wealth, level = NULL, size = 2)
The Australian takeaway food turnover data from aus_retail shows a clear seasonal pattern as well as an upward trend. As such, I forecast with a decomposition model, using the SNAIVE method for the seasonal component and the drift method for the seasonally adjusted portion (trend-cycle plus remainder).
aus_takeaway <- aus_retail %>% filter(Industry == "Takeaway food services")
aus_takeaway %>% autoplot(Turnover)
aus_takeaway %>% filter(State == "Australian Capital Territory") %>%
model(stlf = decomposition_model(
STL(Turnover ~ trend(), robust = TRUE),
RW(season_adjust ~ drift()))) %>%
forecast(h = 120) %>% autoplot(aus_takeaway, level = NULL) +
labs(title = "Forecast of Turnover for Takeaway Food Services in Australian Capital Territory")
aus_takeaway %>% filter(State == "New South Wales") %>%
model(stlf = decomposition_model(
STL(Turnover ~ trend(), robust = TRUE),
RW(season_adjust ~ drift()))) %>%
forecast(h = 120) %>% autoplot(aus_takeaway, level = NULL) +
labs(title = "Forecast of Turnover for Takeaway Food Services in New South Wales")
aus_takeaway %>% filter(State == "Northern Territory") %>%
model(stlf = decomposition_model(
STL(Turnover ~ trend(), robust = TRUE),
RW(season_adjust ~ drift()))) %>%
forecast(h = 120) %>% autoplot(aus_takeaway, level = NULL) +
labs(title = "Forecast of Turnover for Takeaway Food Services in Northern Territory")
aus_takeaway %>% filter(State == "Queensland") %>%
model(stlf = decomposition_model(
STL(Turnover ~ trend(), robust = TRUE),
RW(season_adjust ~ drift()))) %>%
forecast(h = 120) %>% autoplot(aus_takeaway, level = NULL) +
labs(title = "Forecast of Turnover for Takeaway Food Services in Queensland")
aus_takeaway %>% filter(State == "South Australia") %>%
model(stlf = decomposition_model(
STL(Turnover ~ trend(), robust = TRUE),
RW(season_adjust ~ drift()))) %>%
forecast(h = 120) %>% autoplot(aus_takeaway, level = NULL) +
labs(title = "Forecast of Turnover for Takeaway Food Services in South Australia")
aus_takeaway %>% filter(State == "Queensland") %>%
model(stlf = decomposition_model(
STL(Turnover ~ trend(), robust = TRUE),
RW(season_adjust ~ drift()))) %>%
forecast(h = 120) %>% autoplot(aus_takeaway, level = NULL) +
labs(title = "Forecast of Turnover for Takeaway Food Services in Queensland")
aus_takeaway %>% filter(State == "Tasmania") %>%
model(stlf = decomposition_model(
STL(Turnover ~ trend(), robust = TRUE),
RW(season_adjust ~ drift()))) %>%
forecast(h = 120) %>% autoplot(aus_takeaway, level = NULL) +
labs(title = "Forecast of Turnover for Takeaway Food Services in Tasmania")
aus_takeaway %>% filter(State == "Victoria") %>%
model(stlf = decomposition_model(
STL(Turnover ~ trend(), robust = TRUE),
RW(season_adjust ~ drift()))) %>%
forecast(h = 120) %>% autoplot(aus_takeaway, level = NULL) +
labs(title = "Forecast of Turnover for Takeaway Food Services in Victoria")
aus_takeaway %>% filter(State == "Western Australia") %>%
model(stlf = decomposition_model(
STL(Turnover ~ trend(), robust = TRUE),
RW(season_adjust ~ drift()))) %>%
forecast(h = 120) %>% autoplot(aus_takeaway, level = NULL) +
labs(title = "Forecast of Turnover for Takeaway Food Services in Western Australia")
I filtered on the symbol FB. For the sake of this exercise, I chose one column, the stock price at open, to focus my analysis on.
gafa_stock_fb <- gafa_stock %>% filter(Symbol == "FB")
gafa_stock_fb %>% autoplot(Open) +
labs(title = "Facebook stock prices at open", y = "USD", x = "Year")
In order to forecast with the drift method, I needed to create a new row-number index and update the tsibble to use it, since stock prices have gaps on non-trading days and the Date index is therefore irregular. I forecast 252 days ahead, as there are on average 252 trading days per year.
gafa_stock_fb <- gafa_stock_fb %>%
mutate(new_index = row_number()) %>%
update_tsibble(index = new_index, regular = TRUE)
gafa_stock_forecasted <- gafa_stock_fb %>%
model(RW(Open ~ drift())) %>%
forecast(h = 252)
gafa_stock_forecasted %>% autoplot(gafa_stock_fb, level = NULL)
To show that the forecasts are identical to extending the line drawn between the first and last observations, I plotted the forecast and added a segment between those two observations. The two lines meet at the last observation and appear to have the same slope, so they lie on the same line.
first_open <- gafa_stock_fb %>%
filter(Date == min(Date)) %>% select(Open)
last_open <- gafa_stock_fb %>%
filter(Date == max(Date)) %>% select(Open)
proof <- gafa_stock_forecasted %>% autoplot(gafa_stock_fb, level = NULL)
proof + annotate("segment",
x = first_open$new_index,
y = first_open$Open,
xend = last_open$new_index,
yend = last_open$Open, color = "red")
SNAIVE, NAIVE, drift, and MEAN were used to forecast the series. Using SNAIVE is fairly nonsensical here, as there does not appear to be any seasonality; if seasonality were present, it would most plausibly be over a year (252 trading days). For SNAIVE I plotted both a one-year and a one-month season, and neither appears to be a good fit. MEAN and NAIVE also do not appear to be good fits, since the data show a clear upward trend. The only method that makes sense in this case is drift, and even then it does not capture the day-to-day fluctuation, which does not appear to be seasonal.
gafa_stock_fb %>% model(
seasonal_naive_month = SNAIVE(Open ~ lag(21)),
seasonal_naive_year = SNAIVE(Open ~ lag(252)),
naive = NAIVE(Open),
drift = RW(Open ~ drift()),
mean = MEAN(Open)) %>%
forecast(h = 252) %>%
autoplot(gafa_stock_fb, level = NULL)
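As a supplementary check beyond the exercise, the benchmark methods can also be compared numerically rather than only visually by holding out the final 252 trading days; fb_train below is an illustrative name for that training subset.
fb_train <- gafa_stock_fb %>% filter(new_index <= max(new_index) - 252)
fb_train %>%
  model(naive = NAIVE(Open), drift = RW(Open ~ drift()), mean = MEAN(Open)) %>%
  forecast(h = 252) %>%
  accuracy(gafa_stock_fb)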
The residual plot shows there might be some slight heteroscedasticity, as the variance of the residuals is not constant; it appears to increase and decrease a couple of times, with notable decreases around 1996-1997 and 2007-2008. The ACF plot shows a large spike at lag 4 and minor spikes at lags 1 and 3. Since several spikes fall outside the autocorrelation bounds, there is some remaining autocorrelation and the residuals therefore do not look like white noise. This would imply the model is not a good fit.
# Extract data of interest
recent_production <- aus_production |>
filter(year(Quarter) >= 1992)
# Define and estimate a model
fit <- recent_production |> model(SNAIVE(Beer))
# Look at the residuals
fit |> gg_tsresiduals()
## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 4 rows containing non-finite outside the scale range
## (`stat_bin()`).
# Look at some forecasts
fit |> forecast() |> autoplot(recent_production)
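The visual impression can be backed up with a Ljung-Box test on the innovation residuals (a sketch; lag = 8 is twice the quarterly seasonal period, and the same check applies to the later NAIVE and SNAIVE fits). A small p-value indicates the residuals are distinguishable from white noise.
# Formal test of residual autocorrelation
augment(fit) |> features(.innov, ljung_box, lag = 8)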
For the Australian Exports series from global_economy, the residuals appear generally homoscedastic, though their variance increases a bit towards the end of the series. The ACF values are almost all within the bounds, which supports the residuals being white noise. This indicates the NAIVE model is a good fit, likely because the series does not change drastically over the 60 years of data collection.
# Extract data of interest
GE_Australian_Exports <- global_economy |>
filter(Country == "Australia")
GE_Australian_Exports %>% autoplot(Exports)
# Define and estimate a model
fit <- GE_Australian_Exports |> model(NAIVE(Exports))
# Look at the residuals
fit |> gg_tsresiduals()
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_bin()`).
# Look at some forecasts
fit |> forecast() |> autoplot(GE_Australian_Exports) + ylim(0,25)
For the Bricks series from aus_production, I used SNAIVE to model and forecast the data. The residual plot is clearly heteroscedastic and the ACF values are all beyond the bounds. This indicates the model is not a good fit.
# Extract data of interest
aus_production |> autoplot(Bricks)
## Warning: Removed 20 rows containing missing values or values outside the scale range
## (`geom_line()`).
# Define and estimate a model
fit <- aus_production_bricks %>% model(seasonal_naive = SNAIVE(Bricks ~ lag("5 year")))
# Look at the residuals
fit |> gg_tsresiduals()
## Warning: Removed 20 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 20 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 20 rows containing non-finite outside the scale range
## (`stat_bin()`).
# Look at some forecasts
fit |> forecast(h = 40) |> autoplot(aus_production_bricks, level = NULL, size = 2)
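For comparison, and as a sketch beyond the exercise, the training accuracy of the default one-year seasonal naive can be set against the five-year lag used above:
aus_production_bricks %>%
  model(snaive_1yr = SNAIVE(Bricks),
        snaive_5yr = SNAIVE(Bricks ~ lag("5 year"))) %>%
  accuracy()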
set.seed(32)
myseries <- aus_retail |>
filter(`Series ID` == sample(aus_retail$`Series ID`,1))
myseries_train <- myseries |>
filter(year(Month) < 2011)
From the following plot, I can see the data has been split appropriately, with the colour change marking the training period.
autoplot(myseries, Turnover) +
autolayer(myseries_train, Turnover, colour = "red")
Since the data potentially has some seasonality, SNAIVE is applied.
fit <- myseries_train |>
model(SNAIVE(Turnover))
The residuals do not appear to be normally distributed, nor uncorrelated: the ACF plot is again largely outside the bounds and the residuals are heteroscedastic.
fit |> gg_tsresiduals()
## Warning: Removed 12 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 12 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 12 rows containing non-finite outside the scale range
## (`stat_bin()`).
The forecasts were plotted and the model does not seem to be a good predictor: the test data lie beyond the 95% prediction interval in multiple places.
fc <- fit |>
forecast(new_data = anti_join(myseries, myseries_train))
fc |> autoplot(myseries)
The forecast is not very accurate, as shown by the Root Mean Squared Scaled Error (RMSSE) on the test data. Because the errors are scaled against the training data, an RMSSE under 1 would indicate good accuracy; here the test value is 3.43.
fit |> accuracy()
## # A tibble: 1 × 12
## State Industry .model .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 South A… Pharmac… SNAIV… Trai… 3.13 6.61 4.55 7.89 10.9 1 1 0.795
fc |> accuracy(myseries)
## # A tibble: 1 × 12
## .model State Industry .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 SNAIVE(T… Sout… Pharmac… Test 18.5 22.7 18.8 15.1 15.5 4.13 3.43 0.912
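To make the RMSSE scaling concrete, here is a rough sketch of where the test value comes from, assuming a monthly seasonal period of m = 12: the test RMSE is divided by the in-sample RMSE of the one-step seasonal naive errors on the training data.
m <- 12
scale_rmse <- myseries_train |>
  as_tibble() |>
  summarise(s = sqrt(mean((Turnover - lag(Turnover, m))^2, na.rm = TRUE))) |>
  pull(s)
test_actual <- myseries |> filter(year(Month) >= 2011)
sqrt(mean((test_actual$Turnover - fc$.mean)^2)) / scale_rmse # should roughly reproduce the test RMSSE above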
To assess the sensitivity of the accuracy measures, I repeated the above exercise with different training/test splits, moving the split five years later and five years earlier than the original (2011 +/- 5 years). Given the large fluctuation in accuracy depending on the split, the accuracy measures do seem very sensitive to the amount of training data used. I would suggest this sensitivity is due to the shape of the dataset: observations closer in time to the test period are more informative, so including (or excluding) them in the training data contributes more to the sensitivity than the sheer amount of training data per se. For instance, cutting the data at 2016 instead of 2011 brings the test RMSSE down to 0.895, suggesting the model could be an acceptable fit, whereas cutting at 2006 results in a much higher RMSSE of 9.52.
set.seed(32)
myseries2 <- aus_retail |>
filter(`Series ID` == sample(aus_retail$`Series ID`,1))
myseries2_train <- myseries2 |>
filter(year(Month) < 2016)
fit2 <- myseries2_train |>
model(SNAIVE(Turnover))
fit2 |> gg_tsresiduals()
## Warning: Removed 12 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 12 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 12 rows containing non-finite outside the scale range
## (`stat_bin()`).
fc2 <- fit2 |>
forecast(new_data = anti_join(myseries2, myseries2_train))
fc2 |> autoplot(myseries2)
fit2 |> accuracy()
## # A tibble: 1 × 12
## State Industry .model .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 South A… Pharmac… SNAIV… Trai… 3.55 7.13 4.95 7.45 10.2 1 1 0.777
fc2 |> accuracy(myseries2)
## # A tibble: 1 × 12
## .model State Industry .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 SNAIVE(T… Sout… Pharmac… Test 0.900 6.38 5.17 0.580 4.07 1.04 0.895 0.536
set.seed(32)
myseries2 <- aus_retail |>
filter(`Series ID` == sample(aus_retail$`Series ID`,1))
myseries2_train <- myseries2 |>
filter(year(Month) < 2006)
fit2 <- myseries2_train |>
model(SNAIVE(Turnover))
fit2 |> gg_tsresiduals()
## Warning: Removed 12 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 12 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 12 rows containing non-finite outside the scale range
## (`stat_bin()`).
fc2 <- fit2 |>
forecast(new_data = anti_join(myseries2, myseries2_train))
fc2 |> autoplot(myseries2)
fit2 |> accuracy()
## # A tibble: 1 × 12
## State Industry .model .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 South A… Pharmac… SNAIV… Trai… 2.25 4.95 3.57 7.74 11.0 1 1 0.773
fc2 |> accuracy(myseries2)
## # A tibble: 1 × 12
## .model State Industry .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 SNAIVE(T… Sout… Pharmac… Test 42.4 47.1 42.4 39.0 39.0 11.9 9.52 0.932
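A more systematic way to probe this sensitivity (a sketch, with illustrative .init and .step values) is rolling-origin cross-validation via stretch_tsibble(), which averages the accuracy over many different training windows instead of a single split:
myseries |>
  stretch_tsibble(.init = 120, .step = 12) |>
  model(SNAIVE(Turnover)) |>
  forecast(h = 12) |>
  accuracy(myseries)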