library(fpp3)
## -- Attaching packages -------------------------------------------- fpp3 0.4.0 --
## v tibble 3.1.4 v tsibble 1.0.1.9999
## v dplyr 1.0.7 v tsibbledata 0.3.0.9000
## v tidyr 1.1.3 v feasts 0.2.2
## v lubridate 1.7.10 v fable 0.3.1
## v ggplot2 3.3.5
## -- Conflicts ------------------------------------------------- fpp3_conflicts --
## x lubridate::date() masks base::date()
## x dplyr::filter() masks stats::filter()
## x tsibble::intersect() masks base::intersect()
## x tsibble::interval() masks lubridate::interval()
## x dplyr::lag() masks stats::lag()
## x tsibble::setdiff() masks base::setdiff()
## x tsibble::union() masks base::union()
Produce forecasts for the following series using whichever of NAIVE(y), SNAIVE(y) or RW(y ~ drift()) is more appropriate in each case:
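For reference, the three benchmarks behave as follows (standard definitions, restated here for context): NAIVE(y) sets every forecast to the last observation, y[T+h|T] = y[T]; SNAIVE(y) repeats the last observed value from the same season, y[T+h|T] = y[T+h-m(k+1)], where m is the seasonal period; and RW(y ~ drift()) extends the line between the first and last observations, y[T+h|T] = y[T] + h(y[T] - y[1])/(T - 1).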
a) Australian Population (global_economy)
fit <- global_economy %>%
  filter(Country == 'Australia') %>%
  model(RW(Population ~ drift()))
fit %>%
forecast(h = "10 years") %>%
autoplot(global_economy %>% filter(Country == 'Australia'))
Since the population series is close to linear with no seasonality, the drift method is a reasonable choice.
b) Bricks (aus_production)
fit <- aus_production %>%
  filter_index("1970 Q1" ~ "2004 Q4") %>%
  select(Bricks) %>%
  model(SNAIVE(Bricks ~ lag("year")))  # seasonal lag of one year (4 quarters)
fit %>%
forecast(h = "10 years") %>%
autoplot(aus_production %>% select(Bricks))
Bricks production has a strong quarterly seasonal pattern, so the seasonal naive method is the more appropriate benchmark here.
c) NSW Lambs (aus_livestock)
targetData <- aus_livestock %>%
  filter(State == 'New South Wales' & Animal == 'Lambs')
fit <- targetData %>% model(SNAIVE(Count ~ lag("year")))
fit %>%
  forecast(h = "10 years") %>%
  autoplot(targetData) + labs(title = 'Seasonal lambs')
fit <- targetData %>% model(RW(Count ~ drift()))
fit %>%
  forecast(h = "10 years") %>%
  autoplot(targetData) + labs(title = 'Drift lambs')
Here either a drift or a seasonal naive forecast might be preferred, depending on the purpose: a seasonal forecast would suit planning for peak birthing capacity, while a drift forecast might better serve a reasonable prediction of annual lamb vaccine production. Both are compared side by side in the sketch below.
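To make that trade-off concrete, both benchmarks can be fitted in a single model() call and their in-sample accuracy compared (a minimal sketch; the model labels snaive and drift are names chosen here, not anything from the original answer):
fits <- targetData %>%
  model(
    snaive = SNAIVE(Count ~ lag("year")),  # repeats last year's seasonal pattern
    drift  = RW(Count ~ drift())           # extends the long-run trend line
  )
fits %>% accuracy()                        # in-sample error measures, one row per model
fits %>% forecast(h = "10 years") %>% autoplot(targetData, level = NULL)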
d) Household wealth (hh_budget)
targetData <- hh_budget %>% select(Wealth)
fit <- targetData %>% model(RW(Wealth ~ drift()))
fit %>%
forecast(h = "10 years") %>%
autoplot(targetData) + labs(title = 'HH Wealth')
With a persistent upward trend and annual data, the drift method is a defensible choice.
e) Australian takeaway food turnover (aus_retail)
Unlike the previous series, here we aggregate across the states, since that better matches the requested "Australian" view.
targetData <- aus_retail %>%
  filter(Industry == 'Takeaway food services') %>%
  summarize(Turnover = sum(Turnover))
fit <- targetData %>% model(RW(Turnover ~ drift()))
fit %>%
forecast(h = "10 years") %>%
autoplot(targetData) + labs(title = 'Turnover')
Drift is probably a reasonable choice here, though a case can be made for the seasonal naive method as well, depending of course on the intended use.
Use the Facebook stock price (data set gafa_stock) to do the following:
a) Produce a time plot of the series.
Stock prices are only observed on trading days, so we re-index by trading day to get a regular tsibble:
targetData <- gafa_stock %>%
  filter(Symbol == 'FB') %>%
  mutate(day = row_number()) %>%
  update_tsibble(index = day, regular = TRUE)
target_2018 <- targetData %>%
  filter(year(Date) == 2018 & yearmonth(Date) < yearmonth("2018 Dec"))
targetData %>% autoplot(Adj_Close)
b) Produce forecasts using the drift method and plot them.
fit <- target_2018 %>% model(RW(Adj_Close ~ drift()))
target_dec_2018 <- targetData %>%
  filter(yearmonth(Date) == yearmonth("2018 Dec"))
fit %>%
  forecast(new_data = target_dec_2018) %>%
  autoplot(targetData %>% filter(yearmonth(Date) < yearmonth("2018 Dec")))
c) Show that the forecasts are identical to extending the line drawn between the first and last observations.
(fit %>% forecast(new_data = target_dec_2018))$Adj_Close
## <distribution[19]>
## [1] N(140, 18) N(140, 37) N(140, 55) N(140, 74) N(140, 93) N(140, 112)
## [7] N(139, 131) N(139, 150) N(139, 169) N(139, 189) N(139, 209) N(138, 229)
## [13] N(138, 249) N(138, 269) N(138, 289) N(138, 310) N(138, 331) N(137, 352)
## [19] N(137, 373)
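The drift forecast is the straight line through the first and last observations, extended h steps ahead: y[T+h|T] = y[T] + h(y[T] - y[1])/(T - 1). Here T is the number of trading days in target_2018 (232, January through November 2018, matching the indexing used above) and December 2018 adds h = 19 steps, so the final forecast can be reproduced by hand: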
last <- nrow(target_2018)  # 232 trading days, Jan-Nov 2018
target_2018$Adj_Close[[last]] +
  19 * (target_2018$Adj_Close[[last]] - target_2018$Adj_Close[[1]]) / (last - 1)
This reproduces the mean of the h = 19 forecast distribution above (approximately 137), confirming that the drift forecasts simply extend the line between the first and last observations.
Apply a seasonal naïve method to the quarterly Australian beer production data from 1992. Check if the residuals look like white noise, and plot the forecasts. The following code will help.
recent_production <- aus_production %>%
filter(year(Quarter) >= 1992)
fit <- recent_production %>% model(SNAIVE(Beer))
fit %>% gg_tsresiduals()
## Warning: Removed 4 row(s) containing missing values (geom_path).
## Warning: Removed 4 rows containing missing values (geom_point).
## Warning: Removed 4 rows containing non-finite values (stat_bin).
fit %>% forecast() %>% autoplot(recent_production)
What do you conclude?
The residuals are roughly normally distributed, but imperfectly so, which suggests further model development might reduce the remaining "wobble". More importantly, there is a very strong negative spike at lag 4 in the ACF, indicating leftover quarterly correlation, so the residuals are not white noise. As a general guide, though, the forecast itself doesn't look out of line.
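A Ljung-Box test makes the white-noise check formal (a minimal sketch; the choice of lag = 8, two seasonal periods, is a common convention rather than anything prescribed by the exercise):
fit %>%
  augment() %>%                         # residuals from the SNAIVE fit
  features(.innov, ljung_box, lag = 8)  # a small p-value means not white noise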
Repeat the previous exercise using the Australian Exports series from global_economy and the Bricks series from aus_production. Use whichever of NAIVE() or SNAIVE() is more appropriate in each case.
recent_production <- global_economy %>%
filter(Country == 'Australia')
fit <- recent_production %>% model(NAIVE(Exports))
fit %>% gg_tsresiduals()
## Warning: Removed 1 row(s) containing missing values (geom_path).
## Warning: Removed 1 rows containing missing values (geom_point).
## Warning: Removed 1 rows containing non-finite values (stat_bin).
fit %>% forecast() %>% autoplot(recent_production)
What do you conclude?
Balanced, roughly normal residuals suggest nothing too amiss. We don't use a seasonal naive model because the data are annual, so there is no seasonal period to exploit.
recent_production <- aus_production %>%
  filter(year(Quarter) >= 1992, !is.na(Bricks)) %>%  # Bricks is missing after 2005 Q2
  select(Bricks)
fit <- recent_production %>% model(NAIVE(Bricks))
fit %>% gg_tsresiduals()
fit %>% forecast() %>% autoplot(recent_production)
The residuals are unbalanced, which suggests a not-so-great forecast.
Are the following statements true or false? Explain your answer.
True: this indicates there are no strong trend or seasonal influences left that another model might account for, though the size of the residuals matters as well.
False. All that small residuals show is that the model would have been right in the past. A near-zero mean, constant variance, and controlled temporal structure suggest a good model, but there are plenty of situations where the past is simply not a guide. (Predicting the cell phone market from pre-iPhone data would have everyone carrying very small, cheap phones with 100-megapixel cameras.)
False. The best measure of forecast accuracy would be a time machine, and thus no error at all; failing that, there is no single all-powerful evaluation measure, and some error directions may matter more than others.
False. In real life, not much can be predicted with even 90% confidence; you can always improve the backtest, but that doesn't mean you have improved real-world performance.
False. Simplicity wins nine times out of ten: if including the phase of the moon and the colour of the leggings a celebrity wears that day adds 0.01% to the predicted accuracy of the spot price of oil, it still probably isn't worth adding them.
For your retail time series (from Exercise 8 in Section 2.10):
Create a training dataset consisting of observations before 2011 using
set.seed(12345678-4321)
myseries <- aus_retail %>%
filter(`Series ID` == sample(aus_retail$`Series ID`,1))
myseries_train <- myseries %>%
filter(year(Month) < 2011)
Check that your data have been split appropriately by producing the following plot.
autoplot(myseries, Turnover) +
autolayer(myseries_train, Turnover, colour = "red")
Fit a seasonal naïve model using SNAIVE() applied to your training data (myseries_train).
fit <- myseries_train %>%
model(SNAIVE(Turnover))
Check the residuals.
fit %>% gg_tsresiduals()
## Warning: Removed 12 row(s) containing missing values (geom_path).
## Warning: Removed 12 rows containing missing values (geom_point).
## Warning: Removed 12 rows containing non-finite values (stat_bin).
Do the residuals appear to be uncorrelated and normally distributed?
There are definitely further seasonal and trend adjustments that could be made; the ACF shows significant remaining autocorrelation. The residuals are reasonably close to normally distributed, though.
Produce forecasts for the test data
fc <- fit %>%
forecast(new_data = anti_join(myseries, myseries_train))
## Joining, by = c("State", "Industry", "Series ID", "Month", "Turnover")
fc %>% autoplot(myseries)
Compare the accuracy of your forecasts against the actual values.
fit %>% accuracy()
## # A tibble: 1 x 12
## State Industry .model .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 New S~ Clothing~ SNAIVE~ Trai~ 9.01 21.1 16.6 4.22 7.49 1 1 0.528
fc %>% accuracy(myseries)
## # A tibble: 1 x 12
## .model State Industry .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 SNAIVE~ New S~ Clothing~ Test 104. 125. 106. 21.5 22.2 6.42 5.93 0.870
It fails fairly badly: the test errors are several times the training errors because the model doesn't account for the trend, as the residual autocorrelation hinted. (The training MASE and RMSSE are 1 by construction, since the seasonal naive method is itself the scaling benchmark.)
How sensitive are the accuracy measures to the amount of training data used?
Quite sensitive. Lengthening the training set shrinks the test set, and because the failure here is trend-related, testing on a window closer to the end of the training data reduces the error markedly, as the refits below show.
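One way to quantify this sensitivity more systematically (not part of the original answer) is time-series cross-validation with stretch_tsibble(), which re-fits the model on progressively longer training sets; the .init and .step values here are arbitrary choices:
myseries %>%
  stretch_tsibble(.init = 10 * 12, .step = 12) %>%  # start with 10 years, grow by 1 year
  model(SNAIVE(Turnover)) %>%
  forecast(h = 12) %>%
  accuracy(myseries)                                # one row of test accuracy per fold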
myseries_train <- myseries %>%
filter(year(Month) < 2018)
fit <- myseries_train %>%
model(SNAIVE(Turnover))
fc <- fit %>% forecast(new_data = anti_join(myseries, myseries_train))
## Joining, by = c("State", "Industry", "Series ID", "Month", "Turnover")
fc %>% autoplot(myseries)
fc %>% accuracy(myseries)
## # A tibble: 1 x 12
## .model State Industry .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 SNAIVE~ New ~ Clothin~ Test 12.9 19.1 15.7 2.51 3.09 0.834 0.778 -0.0100
By simply reducing the number of years predicted and tested, we massively improve the forecast accuracy in this case; as the next fit shows, trimming the early training years makes little further difference to the results.
myseries_train <- myseries %>%
  filter(year(Month) < 2018 & year(Month) >= 1990)
myseries_data <- myseries %>% filter(year(Month) >= 1990)
fit <- myseries_train %>%
model(SNAIVE(Turnover))
fc <- fit %>% forecast(new_data = anti_join(myseries_data, myseries_train))
## Joining, by = c("State", "Industry", "Series ID", "Month", "Turnover")
fc %>% autoplot(myseries_data)
fc %>% accuracy(myseries_data)
## # A tibble: 1 x 12
## .model State Industry .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 SNAIVE~ New ~ Clothin~ Test 12.9 19.1 15.7 2.51 3.09 0.754 0.712 -0.0100
The point forecasts, and hence ME, RMSE, MAE, MPE and MAPE, are identical to the previous fit: SNAIVE() only uses the final year of the training data, so dropping the pre-1990 observations changes nothing except the scaled measures (MASE and RMSSE), whose training benchmark has changed.