library(fpp3)
## -- Attaching packages -------------------------------------------- fpp3 0.4.0 --
## v tibble      3.1.4          v tsibble     1.0.1.9999
## v dplyr       1.0.7          v tsibbledata 0.3.0.9000
## v tidyr       1.1.3          v feasts      0.2.2     
## v lubridate   1.7.10         v fable       0.3.1     
## v ggplot2     3.3.5
## -- Conflicts ------------------------------------------------- fpp3_conflicts --
## x lubridate::date()    masks base::date()
## x dplyr::filter()      masks stats::filter()
## x tsibble::intersect() masks base::intersect()
## x tsibble::interval()  masks lubridate::interval()
## x dplyr::lag()         masks stats::lag()
## x tsibble::setdiff()   masks base::setdiff()
## x tsibble::union()     masks base::union()

6.1

Produce forecasts for the following series using whichever of NAIVE(y), SNAIVE(y) or RW(y ~ drift()) is more appropriate in each case:

a) Australian Population (global_economy)
fit <- global_economy %>%
  filter(Country == 'Australia') %>%
  model(RW(Population ~ drift()))
fit %>%
  forecast(h = "10 years") %>%
  autoplot(global_economy %>% filter(Country == 'Australia'))

With the population series close to linear, the drift method is a pretty decent model.

b) Bricks (aus_production)
fit <- aus_production %>%
  filter_index("1970 Q1" ~ "2004 Q4") %>%
  select(Bricks) %>%
  fill_gaps() %>%
  model(SNAIVE(Bricks ~ lag("year")))  # lag("year"), not lag(year): a bare year passes lubridate's year() function and breaks the model
fit %>%
  forecast(h = "10 years") %>%
  autoplot(aus_production %>% select(Bricks))

With strong quarterly seasonality in brick production, the seasonal naïve method is the natural choice here.

c) NSW Lambs (aus_livestock)

targetData <- aus_livestock %>%
  filter(State == 'New South Wales' & Animal == 'Lambs')
fit <- targetData %>% model(SNAIVE(Count ~ lag("year")))

fit %>%
  forecast(h = "10 years") %>%
  autoplot(targetData) + labs(title = 'Seasonal lambs')

fit <- targetData %>% model(RW(Count ~ drift()))
fit %>%
  forecast(h = "10 years") %>%
  autoplot(targetData) + labs(title = 'Drift lambs')

Either a drift or a seasonal forecast might be preferred here depending on the need: a seasonal forecast suits planning for peak birthing capacity, while a drift forecast gives a more reasonable view of annual totals (say, for lamb vaccine production).
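
Since the two views serve different needs, both benchmarks can also be fitted in a single model() call and overlaid for direct comparison (a sketch in the style of the code above; output omitted):

fit_both <- targetData %>%
  model(
    Seasonal = SNAIVE(Count ~ lag("year")),  # repeats last year's monthly pattern
    Drift = RW(Count ~ drift())              # extends the long-run trend
  )
fit_both %>%
  forecast(h = "10 years") %>%
  autoplot(targetData, level = NULL) +  # drop the intervals so the two forecast lines stay readable
  labs(title = 'Seasonal vs drift lambs')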

d) Household wealth (hh_budget).
targetData <- hh_budget %>% select(Wealth)
fit <- targetData %>% model(RW(Wealth ~ drift()))
fit %>%
  forecast(h = "10 years") %>%
  autoplot(targetData) + labs(title = 'HH Wealth')

Drift is a serviceable method here: the annual series trend fairly steadily, and there is no seasonality to capture.

e) Australian takeaway food turnover (aus_retail).

Here, unlike the previous parts, we aggregate across the regions, since an “Australian” view seems to be what is asked for.

targetData <- aus_retail %>%
  filter(Industry == 'Takeaway food services') %>%
  summarise(Turnover = sum(Turnover))
fit <- targetData %>% model(RW(Turnover ~ drift()))
fit %>%
  forecast(h = "10 years") %>%
  autoplot(targetData) + labs(title = 'Turnover')

Drift is probably a reasonable choice in this case, though a case can be made for the seasonal naïve method as well (depending on the intended use, of course).
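
For comparison, the seasonal alternative can be fitted the same way (a sketch; output omitted):

fit_snaive <- targetData %>% model(SNAIVE(Turnover ~ lag("year")))
fit_snaive %>%
  forecast(h = "10 years") %>%
  autoplot(targetData) + labs(title = 'Turnover, seasonal naive')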

6.2

Use the Facebook stock price (data set gafa_stock) to do the following:

a) Produce a time plot of the series.
targetData <- gafa_stock %>%
  filter(Symbol == 'FB') %>%
  mutate(day = row_number()) %>%  # stock data has irregular calendar gaps, so re-index by trading day
  update_tsibble(index = day, regular = TRUE)
target_2018 <- targetData %>%
  filter(year(Date) == 2018 & yearmonth(Date) < yearmonth("2018 Dec"))
targetData %>% autoplot(Adj_Close)

b) Produce forecasts using the drift method and plot them.

fit <- target_2018 %>% model(RW(Adj_Close ~ drift()))
target_dec_2018 <- targetData %>%
  filter(yearmonth(Date) == yearmonth("2018 Dec"))
fit %>%
  forecast(new_data = target_dec_2018) %>%
  autoplot(targetData %>% filter(yearmonth(Date) < yearmonth("2018 Dec")))

c) Show that the forecasts are identical to extending the line drawn between the first and last observations.

(fit %>% forecast(new_data = target_dec_2018))$Adj_Close
## <distribution[19]>
##  [1] N(140, 18)  N(140, 37)  N(140, 55)  N(140, 74)  N(140, 93)  N(140, 112)
##  [7] N(139, 131) N(139, 150) N(139, 169) N(139, 189) N(139, 209) N(138, 229)
## [13] N(138, 249) N(138, 269) N(138, 289) N(138, 310) N(138, 331) N(137, 352)
## [19] N(137, 373)
The drift method extends the straight line through the first and last observations, so the h-step forecast is y_T + h * (y_T - y_1) / (T - 1). Checking the final horizon (h = 19) by hand:

n <- nrow(target_2018)  # trading days in the training set
target_2018$Adj_Close[[n]] + 19 * (target_2018$Adj_Close[[n]] - target_2018$Adj_Close[[1]]) / (n - 1)

This reproduces the 19-step-ahead point forecast of about 137 shown above.
d) Try using some of the other benchmark functions to forecast the same data set. Which do you think is best? Why?
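
One way to approach this is to fit the remaining benchmarks alongside drift and score them all against the held-out December (a sketch reusing target_2018 and target_dec_2018 from above; output omitted):

target_2018 %>%
  model(
    Mean = MEAN(Adj_Close),
    Naive = NAIVE(Adj_Close),
    Drift = RW(Adj_Close ~ drift())
  ) %>%
  forecast(new_data = target_dec_2018) %>%
  accuracy(targetData)

For a near-random-walk series like a stock price, the naïve and drift methods will generally beat the mean of the whole history.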

6.3

Apply a seasonal naïve method to the quarterly Australian beer production data from 1992. Check if the residuals look like white noise, and plot the forecasts. The following code will help.

Extract data of interest

recent_production <- aus_production %>%
  filter(year(Quarter) >= 1992)

Define and estimate a model

fit <- recent_production %>% model(SNAIVE(Beer))

Look at the residuals

fit %>% gg_tsresiduals()
## Warning: Removed 4 row(s) containing missing values (geom_path).
## Warning: Removed 4 rows containing missing values (geom_point).
## Warning: Removed 4 rows containing non-finite values (stat_bin).

Look at some forecasts

fit %>% forecast() %>% autoplot(recent_production)

What do you conclude?

The residuals are mostly normal, but imperfectly so, which suggests further refinement might reduce the remaining “wobble”. The strong negative spike at lag 4 in the ACF points to seasonal structure the model has not fully captured. As a general guide, though, the forecasts don’t look out of line.
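
Beyond eyeballing the ACF, a Ljung-Box test on the innovation residuals gives a formal white-noise check (a sketch; lag = 8 is a common choice for quarterly data; output omitted):

augment(fit) %>%
  features(.innov, ljung_box, lag = 8)

A small p-value would confirm the residuals are distinguishable from white noise, consistent with the lag-4 spike noted above.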

6.4

Repeat the previous exercise using the Australian Exports series from global_economy and the Bricks series from aus_production. Use whichever of NAIVE() or SNAIVE() is more appropriate in each case.

a) Exports

Extract data of interest

recent_production <- global_economy %>%
  filter(Country == 'Australia')

Define and estimate a model

fit <- recent_production %>% model(NAIVE(Exports))

Look at the residuals

fit %>% gg_tsresiduals()
## Warning: Removed 1 row(s) containing missing values (geom_path).
## Warning: Removed 1 rows containing missing values (geom_point).
## Warning: Removed 1 rows containing non-finite values (stat_bin).

Look at some forecasts

fit %>% forecast() %>% autoplot(recent_production)

What do you conclude?

Fairly balanced, roughly normal residuals suggest nothing is much amiss. We don’t use a seasonal naïve model because the data are annual, so there is no seasonal period to model.

b) Bricks

Extract data of interest

recent_production <- aus_production %>%
  filter(year(Quarter) >= 1992) %>%
  select(Bricks)

Define and estimate a model

fit <- recent_production %>% model(NAIVE(Bricks))

Look at the residuals

fit %>% gg_tsresiduals()
## Warning: Removed 21 row(s) containing missing values (geom_path).
## Warning: Removed 21 rows containing missing values (geom_point).
## Warning: Removed 21 rows containing non-finite values (stat_bin).

Look at some forecasts

fit %>% forecast() %>% autoplot(recent_production)
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning
## -Inf

## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning
## -Inf
## Warning: Removed 8 row(s) containing missing values (geom_path).
## Warning: Removed 20 row(s) containing missing values (geom_path).

What do you conclude?

Unbalanced residuals suggest the forecast is not a great one.

6.5

Are the following statements true or false? Explain your answer.

a) Good forecast methods should have normally distributed residuals.

True, in the sense that normally distributed residuals make the usual prediction intervals trustworthy. What matters even more is that the residuals are uncorrelated with mean zero, with no leftover trend or seasonal structure that another model might capture better; the size of the residuals is also important.

b) A model with small residuals will give good forecasts.

False. All that small residuals show is that the model would have been right in the past. A near-zero mean, constant variance, and controlled temporal structure suggest a good forecast, but there are plenty of situations where the past is simply not a guide. (Predicting the mobile phone market from pre-iPhone data would have everyone carrying very small, cheap phones with 100-megapixel cameras.)

c) The best measure of forecast accuracy is MAPE.

False; the best measure of forecast accuracy would be a time machine, and thus no error at all. Failing that, there is no single all-powerful metric: MAPE is undefined or unstable when actual values are at or near zero, and some error directions may matter more than others.
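
As a toy illustration of that weakness (the numbers are invented for the example, not taken from any series above), a single near-zero actual can dominate the whole score:

actual <- c(100, 100, 0.5)  # one near-zero actual value
fc <- c(110, 95, 1.5)
mean(abs((actual - fc) / actual)) * 100  # about 72%, driven almost entirely by the 0.5 observation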

d) If your model doesn’t forecast well, you should make it more complicated.

False. In real life not much can be predicted with even 90% confidence; added complexity can improve the backtest, but that doesn’t mean it improves actual forecast performance.

e) Always choose the model with the best forecast accuracy as measured on the test set.

False. Simplicity wins nine times out of ten: if including the phase of the moon and the colour of the leggings a celebrity wears that day adds 0.01% to the backtested accuracy of the spot price of oil, it still probably isn’t worth adding them.

6.6

For your retail time series (from Exercise 8 in Section 2.10):

Create a training dataset consisting of observations before 2011 using

set.seed(12345678-4321)
myseries <- aus_retail %>%
  filter(`Series ID` == sample(aus_retail$`Series ID`,1))
myseries_train <- myseries %>%
  filter(year(Month) < 2011)

Check that your data have been split appropriately by producing the following plot.

autoplot(myseries, Turnover) +
  autolayer(myseries_train, Turnover, colour = "red")

Fit a seasonal naïve model using SNAIVE() applied to your training data (myseries_train).

fit <- myseries_train %>%
  model(SNAIVE(Turnover))

Check the residuals.

fit %>% gg_tsresiduals()
## Warning: Removed 12 row(s) containing missing values (geom_path).
## Warning: Removed 12 rows containing missing values (geom_point).
## Warning: Removed 12 rows containing non-finite values (stat_bin).

Do the residuals appear to be uncorrelated and normally distributed?

There are definitely further seasonal and trend adjustments that could be made, judging by the residual autocorrelation; the distribution itself is reasonably close to normal, though.

Produce forecasts for the test data

fc <- fit %>%
  forecast(new_data = anti_join(myseries, myseries_train))
## Joining, by = c("State", "Industry", "Series ID", "Month", "Turnover")
fc %>% autoplot(myseries)

Compare the accuracy of your forecasts against the actual values.

fit %>% accuracy()
## # A tibble: 1 x 12
##   State  Industry  .model  .type    ME  RMSE   MAE   MPE  MAPE  MASE RMSSE  ACF1
##   <chr>  <chr>     <chr>   <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 New S~ Clothing~ SNAIVE~ Trai~  9.01  21.1  16.6  4.22  7.49     1     1 0.528
fc %>% accuracy(myseries)
## # A tibble: 1 x 12
##   .model  State  Industry  .type    ME  RMSE   MAE   MPE  MAPE  MASE RMSSE  ACF1
##   <chr>   <chr>  <chr>     <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 SNAIVE~ New S~ Clothing~ Test   104.  125.  106.  21.5  22.2  6.42  5.93 0.870

The model fails fairly badly to account for the trend, as the strong residual autocorrelation hinted.

How sensitive are the accuracy measures to the amount of training data used?

Quite sensitive. Extending the training window shrinks the test window, and because the main failure here is trend-related, training on more (and more recent) data reduces the error markedly.
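
The two manual variants below illustrate the point; here is a sketch of how the same sensitivity could be scanned systematically over several cutoff years (this assumes purrr, which fpp3 does not attach, and an arbitrary choice of cutoffs; output omitted):

library(purrr)
cutoffs <- c(2005, 2008, 2011, 2014, 2017)
map_dfr(cutoffs, function(y) {
  train <- myseries %>% filter(year(Month) < y)
  train %>%
    model(SNAIVE(Turnover)) %>%
    forecast(new_data = anti_join(myseries, train, by = "Month")) %>%
    accuracy(myseries) %>%
    mutate(cutoff = y)  # tag each accuracy row with its training cutoff
})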

myseries_train <- myseries %>%
  filter(year(Month) < 2018)
fit <- myseries_train %>%
  model(SNAIVE(Turnover))
fc <- fit %>%   forecast(new_data = anti_join(myseries, myseries_train))
## Joining, by = c("State", "Industry", "Series ID", "Month", "Turnover")
fc %>% autoplot(myseries)

fc %>% accuracy(myseries)
## # A tibble: 1 x 12
##   .model  State Industry .type    ME  RMSE   MAE   MPE  MAPE  MASE RMSSE    ACF1
##   <chr>   <chr> <chr>    <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>   <dbl>
## 1 SNAIVE~ New ~ Clothin~ Test   12.9  19.1  15.7  2.51  3.09 0.834 0.778 -0.0100

By simply reducing the number of years predicted and tested, we massively improve the measured forecast accuracy in this case, though the model results differ little from just using a shifted training window:

myseries_train <- myseries %>%
  filter(year(Month) < 2018 & year(Month) > 1989)
myseries_data <- myseries %>%
  filter(year(Month) > 1989)
fit <- myseries_train %>%
  model(SNAIVE(Turnover))
fc <- fit %>% forecast(new_data = anti_join(myseries_data, myseries_train))
## Joining, by = c("State", "Industry", "Series ID", "Month", "Turnover")
fc %>% autoplot(myseries_data)

fc %>% accuracy(myseries_data)
## # A tibble: 1 x 12
##   .model  State Industry .type    ME  RMSE   MAE   MPE  MAPE  MASE RMSSE    ACF1
##   <chr>   <chr> <chr>    <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>   <dbl>
## 1 SNAIVE~ New ~ Clothin~ Test   12.9  19.1  15.7  2.51  3.09 0.754 0.712 -0.0100