Data 624 Homework 3

library(fpp3)
## Warning: package 'fpp3' was built under R version 4.4.2
## Registered S3 method overwritten by 'tsibble':
##   method               from 
##   as_tibble.grouped_df dplyr
## ── Attaching packages ──────────────────────────────────────────── fpp3 1.0.1 ──
## ✔ tibble      3.2.1     ✔ tsibble     1.1.5
## ✔ dplyr       1.1.4     ✔ tsibbledata 0.4.1
## ✔ tidyr       1.3.1     ✔ feasts      0.4.1
## ✔ lubridate   1.9.3     ✔ fable       0.4.1
## ✔ ggplot2     3.5.1
## Warning: package 'tsibble' was built under R version 4.4.2
## Warning: package 'tsibbledata' was built under R version 4.4.2
## Warning: package 'feasts' was built under R version 4.4.2
## Warning: package 'fabletools' was built under R version 4.4.2
## Warning: package 'fable' was built under R version 4.4.2
## ── Conflicts ───────────────────────────────────────────────── fpp3_conflicts ──
## ✖ lubridate::date()    masks base::date()
## ✖ dplyr::filter()      masks stats::filter()
## ✖ tsibble::intersect() masks base::intersect()
## ✖ tsibble::interval()  masks lubridate::interval()
## ✖ dplyr::lag()         masks stats::lag()
## ✖ tsibble::setdiff()   masks base::setdiff()
## ✖ tsibble::union()     masks base::union()
library(plotly)
## Warning: package 'plotly' was built under R version 4.4.3
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(tsibble)

5.1 Produce forecasts for the following series using whichever of ‘NAIVE(y)’, ‘SNAIVE(y)’ or ‘RW(y ~ drift())’ is more appropriate in each case:

Australian Population (global_economy)

Australia's population rises steadily with no seasonality, so the drift method is the most appropriate of the three benchmarks (a naïve forecast would stay flat and ignore the trend).

aus_pop <- global_economy %>% filter(Country=='Australia') %>% select(Population)
aus_pop %>% model(RW(Population ~ drift())) %>% forecast(h=15) %>% autoplot(aus_pop)

Bricks (aus_production)

Brick production has a strong quarterly seasonal pattern, so the seasonal naïve method is the most appropriate benchmark.

bricks <- aus_production %>% select(Quarter,Bricks) %>% filter(!is.na(Bricks))
bricks %>% model(SNAIVE(Bricks)) %>% forecast(h=15) %>% autoplot(bricks)

NSW Lambs (aus_livestock)

The lamb slaughter counts wander without a persistent trend, so the naïve method is a reasonable benchmark (a case could also be made for SNAIVE(), given some within-year seasonality in slaughter counts).

nsw_lambs <- aus_livestock %>% filter(Animal=='Lambs',State=='New South Wales') %>% select(Count)
nsw_lambs %>% model(NAIVE(Count)) %>% forecast(h=15) %>% autoplot(nsw_lambs)

Household wealth (hh_budget)

Household wealth is annual data with a general upward drift, so the drift method is appropriate. Note that autoplot() must be told to plot Wealth; otherwise it auto-selects Debt, the first measured variable, and the drift forecasts are layered on the wrong series.

house_w <- hh_budget
house_w_mod <- house_w %>% model(RW(Wealth ~ drift())) %>% forecast(h=15)
autoplot(house_w, Wealth) + autolayer(house_w_mod)

Australian takeaway food turnover (aus_retail).

Takeaway food turnover is monthly retail data with strong seasonality, so the seasonal naïve method is the most appropriate benchmark. (The tsibble is keyed by State, so one model is fitted per state.)

aus_take <- aus_retail %>% filter(Industry=='Takeaway food services') %>% select(Turnover)
aus_take_mod <- aus_take %>% model(SNAIVE(Turnover)) %>% forecast(h=15)
autoplot(aus_take, Turnover) + autolayer(aus_take_mod)

5.2 Use the Facebook stock price (data set gafa_stock) to do the following: Produce a time plot of the series.

Stock prices are only observed on trading days, so the irregular calendar dates are replaced with a regular trading-day index before modelling.

fb <- gafa_stock %>% filter(Symbol=='FB') %>% mutate(Day = row_number()) %>% update_tsibble(index=Day, regular=TRUE)
fb %>% autoplot(Open)

Produce forecasts using the drift method and plot them.

fb_mod <- fb %>% model(RW(Open ~ drift())) %>% forecast(h=60)
autoplot(fb, Open) + autolayer(fb_mod)

Show that the forecasts are identical to extending the line drawn between the first and last observations.

autoplot(fb, Open) +
  autolayer(fb_mod) +
  geom_segment(x = min(fb$Day), y = fb$Open[which.min(fb$Day)],
               xend = max(fb$Day), yend = fb$Open[which.max(fb$Day)])

The drift forecasts lie exactly on the extension of this segment, as expected: the drift method extrapolates the average historical change, which is the slope of the line joining the first and last observations.
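
As a quick numerical check (a sketch, assuming fb_mod still holds the drift forecasts from above): the drift slope is (y_T − y_1)/(T − 1), so the h-step-ahead forecast is y_T + h × slope.

# The drift forecast equals a straight-line extrapolation from first to last
n <- nrow(fb)
slope <- (last(fb$Open) - first(fb$Open)) / (n - 1)
manual_fc <- last(fb$Open) + (1:60) * slope
all.equal(manual_fc, fb_mod$.mean)  # expect TRUE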

Try using some of the other benchmark functions to forecast the same data set. Which do you think is best? Why?

SNAIVE() is omitted here: after re-indexing by trading day the series has no seasonal period, so a seasonal naïve model cannot be specified (it fails with "Non-seasonal model specification provided, use RW() or provide a different lag specification").

fb_mod <- fb %>% model(Mean = MEAN(Open),
                        Naive = NAIVE(Open),
                        Drift = RW(Open ~ drift())) %>% forecast(h=60)
autoplot(fb, Open) + autolayer(fb_mod)

The mean method performs worst here because it ignores the trend entirely. The naïve and drift forecasts are close; the naïve method is arguably the best benchmark, since stock prices behave approximately like a random walk and the last observed price is then the best available predictor, while drift extrapolates a historical trend that may not persist.
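
To compare the benchmarks quantitatively, one could hold out the last 60 trading days and score the forecasts against them; a minimal sketch (the cut-off is illustrative, output not shown):

# Fit the benchmarks on all but the last 60 trading days,
# then measure test-set accuracy against the held-out prices
fb_train <- fb %>% filter(Day <= max(Day) - 60)
fb_train %>%
  model(Mean = MEAN(Open),
        Naive = NAIVE(Open),
        Drift = RW(Open ~ drift())) %>%
  forecast(h = 60) %>%
  accuracy(fb)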

5.3 Apply a seasonal naïve method to the quarterly Australian beer production data from 1992. Check if the residuals look like white noise, and plot the forecasts. The following code will help.

Apply seasonal naïve method

recent_production <- aus_production |>
  filter(year(Quarter) >= 1992)
aus_beer_mod <- recent_production %>% model(SNAIVE(Beer))

Check if residuals look like white noise

aus_beer_mod %>% gg_tsresiduals()
## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 4 rows containing non-finite outside the scale range
## (`stat_bin()`).

The residuals do not look like white noise. The histogram shows a bimodal distribution, and the ACF plot has several lags extending beyond the significance bounds. We conclude that the residuals are neither uncorrelated nor normally distributed.

Plot forecasts

aus_beer_mod %>% forecast() %>% autoplot(recent_production)

Tests

augment(aus_beer_mod) |> features(.innov, box_pierce, lag = 10)
## # A tibble: 1 × 3
##   .model       bp_stat bp_pvalue
##   <chr>          <dbl>     <dbl>
## 1 SNAIVE(Beer)    34.4  0.000160
augment(aus_beer_mod) |> features(.innov, ljung_box, lag = 10)
## # A tibble: 1 × 3
##   .model       lb_stat lb_pvalue
##   <chr>          <dbl>     <dbl>
## 1 SNAIVE(Beer)    37.8 0.0000412
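
Since the data are quarterly, the textbook's suggested lag for seasonal data is ℓ = 2m = 2 × 4 = 8; the same test can be rerun at that lag (output not shown):

# Ljung-Box with lag = 2m = 8 for quarterly data
augment(aus_beer_mod) |> features(.innov, ljung_box, lag = 8)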

What do you conclude? Both tests, Box-Pierce and Ljung-Box, return p-values well below 0.05, so the residuals are distinguishable from white noise: the seasonal naïve model leaves autocorrelated structure in the data unexplained.

5.4 Repeat the previous exercise using the Australian Exports series from global_economy and the Bricks series from aus_production. Use whichever of NAIVE() or SNAIVE() is more appropriate in each case.
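
For the Australian Exports series, the data are annual and therefore non-seasonal, so NAIVE() is the appropriate choice. A sketch of the analogous analysis (output not shown):

# Annual exports: no seasonal period, so NAIVE() rather than SNAIVE()
aus_exports <- global_economy %>% filter(Country=='Australia')
aus_exports_mod <- aus_exports %>% model(NAIVE(Exports))
aus_exports_mod %>% gg_tsresiduals()
aus_exports_mod %>% forecast(h=10) %>% autoplot(aus_exports)
augment(aus_exports_mod) |> features(.innov, ljung_box, lag = 10)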

Apply method to the Bricks series

aus_bricks <- recent_production %>% filter(!is.na(Bricks))
aus_bricks_mod <- aus_bricks %>% model(NAIVE(Bricks))

Check if residuals look like white noise

aus_bricks_mod %>% gg_tsresiduals()
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_bin()`).

The residuals do not look like white noise. The histogram is bimodal, so the residuals are unlikely to have mean zero or to be normally distributed. The ACF plot shows many lags exceeding the significance bounds. We conclude that the residuals are neither normally distributed nor uncorrelated.

Plot forecasts

aus_bricks_mod %>% forecast() %>% autoplot(bricks)

Tests

augment(aus_bricks_mod) |> features(.innov, box_pierce, lag = 10)
## # A tibble: 1 × 3
##   .model        bp_stat bp_pvalue
##   <chr>           <dbl>     <dbl>
## 1 NAIVE(Bricks)    70.7  3.22e-11
augment(aus_bricks_mod) |> features(.innov, ljung_box, lag = 10)
## # A tibble: 1 × 3
##   .model        lb_stat lb_pvalue
##   <chr>           <dbl>     <dbl>
## 1 NAIVE(Bricks)    82.8  1.42e-13

What do you conclude? Both tests, Box-Pierce and Ljung-Box, return p-values far below 0.05, so the residuals are clearly distinguishable from white noise; the naïve model leaves substantial structure unexplained. Since brick production is strongly seasonal, much of that leftover structure is likely seasonal, which suggests SNAIVE() may in fact be the more appropriate benchmark here.

5.7 For your retail time series (from Exercise 8 in Section 2.10): Create a training dataset consisting of observations before 2011 using

set.seed(246810)
myseries <- aus_retail %>%
  filter(`Series ID` == 'A3349849A')

myseries_train <- myseries %>%
  filter(year(Month) < 2011)

Check that your data have been split appropriately by producing the following plot.

autoplot(myseries, Turnover) +
  autolayer(myseries_train, Turnover, colour = "red")

Fit a seasonal naïve model using SNAIVE() applied to your training data (myseries_train).

fit <- myseries_train %>%
  model(SNAIVE(Turnover))

Check the residuals.

fit %>% gg_tsresiduals()
## Warning: Removed 12 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 12 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 12 rows containing non-finite outside the scale range
## (`stat_bin()`).

Do the residuals appear to be uncorrelated and normally distributed? No. The histogram shows residuals that are not centred on zero and are right-skewed, so they are not normally distributed, and the ACF plot shows significant autocorrelations, so they are not uncorrelated either.
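
The visual impression can be confirmed with a portmanteau test, using ℓ = 2m = 24 for monthly data (output not shown):

# Ljung-Box test on the training-set residuals
augment(fit) |> features(.innov, ljung_box, lag = 24)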

Produce forecasts for the test data

fc <- fit %>%
  forecast(new_data = anti_join(myseries, myseries_train))
## Joining with `by = join_by(State, Industry, `Series ID`, Month, Turnover)`
fc %>% autoplot(myseries)

Compare the accuracy of your forecasts against the actual values.

fit %>% accuracy()
## # A tibble: 1 × 12
##   State    Industry .model .type    ME  RMSE   MAE   MPE  MAPE  MASE RMSSE  ACF1
##   <chr>    <chr>    <chr>  <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Austral… Cafes, … SNAIV… Trai… 0.985  3.37  2.53  5.05  16.1     1     1 0.826
fc %>% accuracy(myseries)
## # A tibble: 1 × 12
##   .model    State Industry .type    ME  RMSE   MAE   MPE  MAPE  MASE RMSSE  ACF1
##   <chr>     <chr> <chr>    <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 SNAIVE(T… Aust… Cafes, … Test   5.86  8.42  7.38  13.4  19.0  2.92  2.50 0.847

The test-set errors are much larger than the training-set errors (RMSE 8.42 vs 3.37; MASE 2.92 vs 1.00), so the model forecasts the held-out period considerably worse than it fits the training data.

How sensitive are the accuracy measures to the amount of training data used? They are quite sensitive. Training-set measures generally improve as more data are used to fit the model, but test-set accuracy need not improve and can worsen, because older observations may not reflect the recent behaviour of the series.
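
One way to check this (a sketch; the cut-off years are illustrative and output is not shown) is to hold the 2011-onwards test period fixed while varying how much history the model sees:

# Keep the test period fixed and vary the start of the training window
test_set <- myseries %>% filter(year(Month) >= 2011)
for (start in c(1990, 2000, 2005)) {
  fit_s <- myseries %>%
    filter(year(Month) >= start, year(Month) < 2011) %>%
    model(SNAIVE(Turnover))
  fit_s %>% forecast(new_data = test_set) %>% accuracy(myseries) %>% print()
}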