\[Libraries \]

library(fpp3)

## Warning: package 'fpp3' was built under R version 4.3.3

## Registered S3 method overwritten by 'tsibble':
##   method               from 
##   as_tibble.grouped_df dplyr

## ── Attaching packages ──────────────────────────────────────────── fpp3 1.0.0 ──

## ✔ tibble      3.2.1     ✔ tsibble     1.1.5
## ✔ dplyr       1.1.4     ✔ tsibbledata 0.4.1
## ✔ tidyr       1.3.1     ✔ feasts      0.3.2
## ✔ lubridate   1.9.3     ✔ fable       0.3.4
## ✔ ggplot2     3.5.1     ✔ fabletools  0.4.2

## Warning: package 'dplyr' was built under R version 4.3.2

## Warning: package 'tidyr' was built under R version 4.3.2

## Warning: package 'lubridate' was built under R version 4.3.2

## Warning: package 'ggplot2' was built under R version 4.3.3

## Warning: package 'tsibble' was built under R version 4.3.3

## Warning: package 'tsibbledata' was built under R version 4.3.3

## Warning: package 'feasts' was built under R version 4.3.3

## Warning: package 'fabletools' was built under R version 4.3.3

## Warning: package 'fable' was built under R version 4.3.3

## ── Conflicts ───────────────────────────────────────────────── fpp3_conflicts ──
## ✖ lubridate::date()    masks base::date()
## ✖ dplyr::filter()      masks stats::filter()
## ✖ tsibble::intersect() masks base::intersect()
## ✖ tsibble::interval()  masks lubridate::interval()
## ✖ dplyr::lag()         masks stats::lag()
## ✖ tsibble::setdiff()   masks base::setdiff()
## ✖ tsibble::union()     masks base::union()

library(tidyverse)

## Warning: package 'tidyverse' was built under R version 4.3.3

## Warning: package 'readr' was built under R version 4.3.2

## Warning: package 'purrr' was built under R version 4.3.2

## Warning: package 'stringr' was built under R version 4.3.2

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0     ✔ readr   2.1.5
## ✔ purrr   1.0.2     ✔ stringr 1.5.1

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter()     masks stats::filter()
## ✖ tsibble::interval() masks lubridate::interval()
## ✖ dplyr::lag()        masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

\[5.1:\]

# Australian Population (global_economy):

global_economy |>
  filter(Country == 'Australia') |>
  model(RW(Population ~ drift())) |>
  forecast(h = 5) |>
  autoplot(global_economy) + 
  labs(title = 'Australian Population',
       subtitle = 'Drift Method: 5 year forecast')

Use of the RW(y ~ drift()) is most appropriate because the data is increasing and has no seasonality.

#Bricks (aus_production):

aus_production |>
  filter(!is.na(Bricks)) |>
  model(SNAIVE(Bricks)) |>
  forecast(h = '5 years') |>
  autoplot(aus_production) + 
  labs(title = 'Austrialian Clay Brick Production',
       subtitle = 'SNAIVE Method: 5 year forecast')

## Warning: Removed 20 rows containing missing values or values outside the scale range
## (`geom_line()`).

Use of the SNAIVE method is most appropriate because the data is highly seasonal.

#NSW Lambs (aus_livestock):

aus_livestock |>
  filter(Animal == 'Lambs',
         State == 'New South Wales',
         !is.na(Count)) |>
  model(classical_decomposition(Count, type = 'additive')) |>
  components() |>
  autoplot()

## Warning: Removed 6 rows containing missing values or values outside the scale range
## (`geom_line()`).

#The seasonal component is insignificant compared to the trend and random components.

aus_livestock |>
  filter(Animal == 'Lambs',
         State == 'New South Wales',
         !is.na(Count)) |>
  model(NAIVE(Count)) |>
  forecast(h = '5 years') |>
  autoplot(aus_livestock) +
  labs(title = 'New South Wales Lambs Slaughter',
       subtitle = 'NAIVE Model: 5 year forecast')

Use of the NAIVE method is appropriate because there is no seasonality to the data.

# Household wealth (hh_budget).

hh_budget |>
  filter(!is.na(Wealth)) |>
  model(RW(Wealth ~ drift())) |>
  forecast(h = '5 years') |>
  autoplot(hh_budget) +
  labs(title = 'Household Wealth',
       subtitle = 'Drift Method: 5 year forecast')

Use of the RW(y ~ drift()) is most appropriate the data has no seasonality and is generally increasing.

# Australian takeaway food turnover (aus_retail).

aus_retail |>
  filter(Industry == 'Takeaway food services') |>
  model(stl = STL(Turnover)) |>
  components() |>
  autoplot()

aus_retail |>
  filter(Industry == 'Takeaway food services') |>
  model(SNAIVE(Turnover)) |>
  forecast(h = '5 years') |>
  autoplot(aus_retail) + 
  labs(title = 'Australian Takeaway food services Turnover',
       subtitle = 'SNAIVE Model: 5 year forecast') +
  facet_wrap(~State, scales = 'free')

Use of the SNAIVE model is most appropriate because the STL decomposition shows a small seasonal component.

\[5.2:\]

# a.

fb <- gafa_stock |>
  filter(Symbol == 'FB') |>
  mutate(day = row_number()) |>
  update_tsibble(index = day, regular = TRUE)

fb |>
  autoplot(Close) +
  labs(title = 'Daily close price of Facebook')

# b.

fb |>
  model(RW(Close ~ drift())) |>
  forecast(h = 60) |>
  autoplot(fb) +
  labs(title = 'Daily Close price of Facebook',
       subtitle = 'Drift Method: 60 traiding day forecast')

#c.

fb |>
  model(RW(Close ~ drift())) |>
  forecast(h = 60) |>
  autoplot(fb) +
  labs(title = 'Daily Close price of Facebook',
       subtitle = 'Drift Method: 60 traiding day forecast') + 
  geom_segment(x = fb |>
                 filter(day == min(day)) |>
                 pull(day), 
               y = fb |>
                 filter(day == min(day)) |>
                 pull(Close),
               xend = fb |>
                 filter(day == max(day)) |>
                 pull(day),
               yend = fb |>
                 filter(day == max(day)) |>
                 pull(Close),
               colour = 'blue',
               linetype = 'dashed')

#d.

fb |>
  model(mean = MEAN(Close),
        Naive = NAIVE(Close),
        Drift = RW(Close ~ drift())) |>
  forecast(h = 60) |>
  autoplot(fb)

# It makes no sense to use the SNAIVE method since the data has no seasonality component. Using MEAN and NAIVE produce a constant forecast which doesn’t take the overall trend into account.

\[5.3:\]

# Extract data of interest
recent_production <- aus_production |>
  filter(year(Quarter) >= 1992)
# Define and estimate a model
fit <- recent_production |> model(SNAIVE(Beer))
# Look at the residuals
fit |> gg_tsresiduals()

## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_line()`).

## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_point()`).

## Warning: Removed 4 rows containing non-finite outside the scale range
## (`stat_bin()`).

# Look a some forecasts
fit |> forecast() |> autoplot(recent_production)

\[Box-Pierce Test\]

fit |>
  augment() |>
  features(.innov, box_pierce, lag = 8)

## # A tibble: 1 × 3
##   .model       bp_stat bp_pvalue
##   <chr>          <dbl>     <dbl>
## 1 SNAIVE(Beer)    29.7  0.000234

\[Ljung-Box Test\]

fit |>
  augment() |>
  features(.innov, ljung_box, lag = 8)

## # A tibble: 1 × 3
##   .model       lb_stat lb_pvalue
##   <chr>          <dbl>     <dbl>
## 1 SNAIVE(Beer)    32.3 0.0000834

Conclusion:The p-value from both tests is less than 0.05, which indicates that the residuals are not explained by white noise.

\[5.4:\]

# Australian Exports from global_economy:

global_economy |>
  filter(Country == 'Australia') |>
  autoplot(Exports) + 
  labs(title = 'Australian Exports')

model_ae <- global_economy |>
  filter(Country == 'Australia') |>
  model(NAIVE(Exports))

model_ae |> gg_tsresiduals()

## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_line()`).

## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).

## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_bin()`).

model_ae |> forecast() |> autoplot(global_economy)

#The residuals are centered around zero and appear to be normally distributed..

model_ae |>
  augment() |>
  features(.innov, ljung_box, lag = 10)

## # A tibble: 1 × 4
##   Country   .model         lb_stat lb_pvalue
##   <fct>     <chr>            <dbl>     <dbl>
## 1 Australia NAIVE(Exports)    16.4    0.0896

#The Ljung-Box test produces a p-value greater than 0.05 which allows us to conclude that the residuals resemble white noise.

#Bricks from aus_production:


aus_production |>
  autoplot(Bricks) + 
  labs(title = 'Australian Clay Brick Production')

## Warning: Removed 20 rows containing missing values or values outside the scale range
## (`geom_line()`).

model_b <- aus_production |>
  model(SNAIVE(Bricks))

model_b |> gg_tsresiduals()

## Warning: Removed 24 rows containing missing values or values outside the scale range
## (`geom_line()`).

## Warning: Removed 24 rows containing missing values or values outside the scale range
## (`geom_point()`).

## Warning: Removed 24 rows containing non-finite outside the scale range
## (`stat_bin()`).

model_b |> forecast() |> autoplot(aus_production)

## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning
## -Inf

## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning
## -Inf

## Warning: Removed 8 rows containing missing values or values outside the scale range
## (`geom_line()`).

## Warning: Removed 20 rows containing missing values or values outside the scale range
## (`geom_line()`).

# The residuals do not seem to be centered at zero.

model_b |>
  augment() |>
  features(.innov, ljung_box, lag = 8)

## # A tibble: 1 × 3
##   .model         lb_stat lb_pvalue
##   <chr>            <dbl>     <dbl>
## 1 SNAIVE(Bricks)    274.         0

#The Ljung-Box test produces a p-value less than 0.05 which allows us to conclude that the residuals do not resemble white noise.

\[5.7:\]

set.seed(1234)
myseries <- aus_retail |>
  filter(`Series ID` == sample(aus_retail$`Series ID`, 1))

#a. 

myseries_train <- myseries |> filter(year(Month) < 2011)

#b. 
autoplot(myseries, Turnover) +
  autolayer(myseries_train, Turnover, colour = "green")

#c.

myseries_fit <- myseries_train |>
  model(SNAIVE(Turnover))

# d.

myseries_fit |> gg_tsresiduals()

## Warning: Removed 12 rows containing missing values or values outside the scale range
## (`geom_line()`).

## Warning: Removed 12 rows containing missing values or values outside the scale range
## (`geom_point()`).

## Warning: Removed 12 rows containing non-finite outside the scale range
## (`stat_bin()`).

#The residuals do not appear to be uncorrelated nor normally distributed.

#e.

fc <- myseries_fit |>
  forecast(new_data = anti_join(myseries, myseries_train, 
                                by = join_by(State, Industry, `Series ID`, Month, Turnover)))
fc |> autoplot(myseries)

#f.

myseries_fit |> 
  accuracy() |>
  select(.model, .type, RMSE, MAE, MAPE, MASE, RMSSE)

## # A tibble: 1 × 7
##   .model           .type     RMSE   MAE  MAPE  MASE RMSSE
##   <chr>            <chr>    <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 SNAIVE(Turnover) Training  2.90  2.22  10.7     1     1

fc |> 
  accuracy(myseries) |>
  select(.model, .type, RMSE, MAE, MAPE, MASE, RMSSE)

## # A tibble: 1 × 7
##   .model           .type  RMSE   MAE  MAPE  MASE RMSSE
##   <chr>            <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 SNAIVE(Turnover) Test   9.13  7.58  14.4  3.42  3.15

#g.Accuracy measures are very sensitive to the amount of training data used. The more training data, the better the accuracy. With time series it is also important to use training data as close to the time periods being forecast.

DATA 624 HW3

CN

2024-09-14