Homework Instruction:

Do exercises 5.1, 5.2, 5.3, 5.4 and 5.7 in the Hyndman book. Please submit your RPubs link as well as your .pdf file showing the code you ran and its output.

library(fpp3)
## Registered S3 method overwritten by 'tsibble':
##   method               from 
##   as_tibble.grouped_df dplyr
## ── Attaching packages ──────────────────────────────────────────── fpp3 1.0.2 ──
## ✔ tibble      3.3.0     ✔ tsibble     1.1.6
## ✔ dplyr       1.1.4     ✔ tsibbledata 0.4.1
## ✔ tidyr       1.3.1     ✔ feasts      0.4.2
## ✔ lubridate   1.9.4     ✔ fable       0.4.1
## ✔ ggplot2     3.5.2
## ── Conflicts ───────────────────────────────────────────────── fpp3_conflicts ──
## ✖ lubridate::date()    masks base::date()
## ✖ dplyr::filter()      masks stats::filter()
## ✖ tsibble::intersect() masks base::intersect()
## ✖ tsibble::interval()  masks lubridate::interval()
## ✖ dplyr::lag()         masks stats::lag()
## ✖ tsibble::setdiff()   masks base::setdiff()
## ✖ tsibble::union()     masks base::union()

Exercise 5.1

Produce forecasts for the following series using whichever of NAIVE(y), SNAIVE(y) or RW(y ~ drift()) is more appropriate in each case:

#### a. Australian Population (global_economy)

head(global_economy)
## # A tsibble: 6 x 9 [1Y]
## # Key:       Country [1]
##   Country     Code   Year         GDP Growth   CPI Imports Exports Population
##   <fct>       <fct> <dbl>       <dbl>  <dbl> <dbl>   <dbl>   <dbl>      <dbl>
## 1 Afghanistan AFG    1960  537777811.     NA    NA    7.02    4.13    8996351
## 2 Afghanistan AFG    1961  548888896.     NA    NA    8.10    4.45    9166764
## 3 Afghanistan AFG    1962  546666678.     NA    NA    9.35    4.88    9345868
## 4 Afghanistan AFG    1963  751111191.     NA    NA   16.9     9.17    9533954
## 5 Afghanistan AFG    1964  800000044.     NA    NA   18.1     8.89    9731361
## 6 Afghanistan AFG    1965 1006666638.     NA    NA   21.4    11.3     9938414
aus_pop <- global_economy  %>% 
  filter(Country=="Australia") %>%
  select (Population)

aus_pop %>%
  autoplot(Population)

The Australian population series shows a steady upward trend with no seasonality, so the drift method is the most appropriate benchmark. Generate forecasts for 10 years and plot them.

aus_pop %>%
  model(
    Drift = RW(Population ~ drift())
  ) %>%
  forecast(h = 10) %>%
  autoplot(aus_pop) +
  labs(title = "Australian Population Forecast")

#### b. Bricks (aus_production)

bricks <- aus_production %>%
  select (Bricks)

bricks %>%
  autoplot()
## Plot variable not specified, automatically selected `.vars = Bricks`
## Warning: Removed 20 rows containing missing values or values outside the scale range
## (`geom_line()`).

Brick production is quarterly with strong seasonality, so the seasonal naive method is the most appropriate benchmark. Generate forecasts and plot them.

bricks %>%
  filter(!is.na(Bricks)) %>%
  model(SNAIVE(Bricks ~ lag("year"))) %>%
  forecast(h = 12) %>%
  autoplot(bricks) +
  labs(title = "Bricks Forecast")
## Warning: Removed 20 rows containing missing values or values outside the scale range
## (`geom_line()`).

#### c. NSW Lambs (aus_livestock)

nsw_lambs <- aus_livestock %>%
  filter (State == "New South Wales",
          Animal == "Lambs") 

autoplot(nsw_lambs,Count)

Generate forecasts and plot them.

nsw_lambs %>%
  filter(!is.na(Count)) %>%
  model(Drift = RW(Count ~ drift())) %>%
  forecast(h = 12) %>%
  autoplot(nsw_lambs) +
  labs(title = "NSW Lambs Forecast")

#### d. Household wealth (hh_budget).

hsh_wth <- hh_budget %>%
  select(Country, Wealth)

autoplot(hsh_wth, Wealth)

Household wealth is an annual series with a general upward trend and no seasonality, so the drift method is the most appropriate benchmark. Generate forecasts and plot them.

hsh_wth %>%
  filter(!is.na(Wealth)) %>%
  model(Drift = RW(Wealth ~ drift())) %>%
  forecast(h = 10) %>%
  autoplot(hsh_wth) +
  labs(title = "Household Wealth Forecast")

#### e. Australian takeaway food turnover (aus_retail).

takeaway <- aus_retail %>%
  filter (Industry == "Takeaway food services", 
          State == "Australian Capital Territory")

autoplot(takeaway,Turnover)

Takeaway food turnover is a monthly series with strong seasonality, so the seasonal naive method is the most appropriate benchmark. Generate forecasts for 24 months and plot them.

takeaway %>%
  filter(!is.na(Turnover)) %>%
  model(SNAIVE(Turnover ~ lag("year"))) %>%
  forecast(h = 24) %>%
  autoplot(takeaway) +
  labs(title = "Australian Capital Territory's Takeaway Food Service Turnover Forecast")

Exercise 5.2

Use the Facebook stock price (data set gafa_stock) to do the following:

#### a. Produce a time plot of the series.

head(gafa_stock)
## # A tsibble: 6 x 8 [!]
## # Key:       Symbol [1]
##   Symbol Date        Open  High   Low Close Adj_Close    Volume
##   <chr>  <date>     <dbl> <dbl> <dbl> <dbl>     <dbl>     <dbl>
## 1 AAPL   2014-01-02  79.4  79.6  78.9  79.0      67.0  58671200
## 2 AAPL   2014-01-03  79.0  79.1  77.2  77.3      65.5  98116900
## 3 AAPL   2014-01-06  76.8  78.1  76.2  77.7      65.9 103152700
## 4 AAPL   2014-01-07  77.8  78.0  76.8  77.1      65.4  79302300
## 5 AAPL   2014-01-08  77.0  77.9  77.0  77.6      65.8  64632400
## 6 AAPL   2014-01-09  78.1  78.1  76.5  76.6      65.0  69787200
fb <- gafa_stock %>%
  filter(Symbol == "FB") %>%
  # Stock prices are only observed on trading days, so re-index by row number
  # to obtain a regular tsibble before forecasting.
  mutate(day = row_number()) %>%
  select(day, Close) %>%
  update_tsibble(index = day, regular = TRUE)

autoplot(fb,Close)

#### b. Produce forecasts using the drift method and plot them.

fb  %>%
  model(Drift = RW(Close ~ drift())) %>%
  forecast(h = 50) %>%
  autoplot(fb) +
  labs(title = "Facebook Close Price Forecast - Drift Method")

#### c. Show that the forecasts are identical to extending the line drawn between the first and last observations.

ggplot(fb, aes(x = day)) +
  geom_line(aes(y = Close)) +
  # annotate() draws a single segment from the first to the last observation,
  # avoiding the warning from recycling one value across every row of the data.
  annotate("segment",
           x = min(fb$day), y = first(fb$Close),
           xend = max(fb$day), yend = last(fb$Close),
           color = "blue", linetype = "dashed")
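A more direct check (a sketch, not part of the original output) is to overlay the drift forecasts with the first-to-last line extended across the forecast horizon; the drift slope equals (last - first) / (n - 1), so the forecasts fall exactly on this extended line.

fb %>%
  model(Drift = RW(Close ~ drift())) %>%
  forecast(h = 50) %>%
  autoplot(fb) +
  # Line through the first and last observations, extended over the horizon;
  # its slope is the drift estimate (last - first) / (n - 1).
  geom_abline(
    slope = (last(fb$Close) - first(fb$Close)) / (nrow(fb) - 1),
    intercept = first(fb$Close) -
      (last(fb$Close) - first(fb$Close)) / (nrow(fb) - 1),
    color = "blue", linetype = "dashed"
  )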

#### d. Try using some of the other benchmark functions to forecast the same data set. Which do you think is best? Why?

fb %>%
  model(
    Mean  = MEAN(Close),
    Naive = NAIVE(Close),
    Drift = NAIVE(Close ~ drift())
  ) %>%
  forecast(h = 50) %>%
  autoplot(fb, level = NULL)

I would prefer the naive method. The series turns clearly downward around day 1125, so the drift method, whose slope is estimated from the full (mostly rising) history, is unlikely to suit the current pattern, while the mean method is overly conservative. The naive method, which simply carries the last observed value forward, aligns more closely with the recent data. The time plot also shows that prices tend to fluctuate within a relatively stable range over long stretches, suggesting limited directional momentum. Moreover, forecasting stock prices based solely on historical data is insufficient; external events and market triggers at specific points in time must also be considered to improve predictive accuracy.
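As a rough quantitative cross-check (a sketch, not required by the exercise; the 50-day split is an arbitrary choice), the last 50 trading days can be held out as a test set and the three benchmarks compared on test-set accuracy:

# Hold out the last 50 trading days (arbitrary choice) as a test set and
# compare the benchmark methods on test-set accuracy.
fb_train <- fb %>% filter(day <= max(day) - 50)

fb_train %>%
  model(
    Mean  = MEAN(Close),
    Naive = NAIVE(Close),
    Drift = NAIVE(Close ~ drift())
  ) %>%
  forecast(h = 50) %>%
  accuracy(fb)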

Exercise 5.3

Apply a seasonal naïve method to the quarterly Australian beer production data from 1992. Check if the residuals look like white noise, and plot the forecasts. The following code will help. What do you conclude?

# Extract data of interest
recent_production <- aus_production |>
  filter(year(Quarter) >= 1992)
# Define and estimate a model
fit <- recent_production |> model(SNAIVE(Beer))
# Look at the residuals
fit |> gg_tsresiduals()
## Warning: `gg_tsresiduals()` was deprecated in feasts 0.4.2.
## ℹ Please use `ggtime::gg_tsresiduals()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 4 rows containing non-finite outside the scale range
## (`stat_bin()`).

# Look at some forecasts
fit |> forecast() |> autoplot(recent_production)

The residuals appear to be centered around zero with roughly constant variance. However, the ACF plot shows a noticeable spike at lag 4, the seasonal period for quarterly data, which suggests some remaining autocorrelation in the residuals.

Portmanteau tests

#Box-Pierce test 
augment(fit) %>% 
  features(.innov, box_pierce, lag=8)
## # A tibble: 1 × 3
##   .model       bp_stat bp_pvalue
##   <chr>          <dbl>     <dbl>
## 1 SNAIVE(Beer)    29.7  0.000234
#Ljung-Box test 
augment(fit) %>% 
  features(.innov, ljung_box, lag=8)
## # A tibble: 1 × 3
##   .model       lb_stat lb_pvalue
##   <chr>          <dbl>     <dbl>
## 1 SNAIVE(Beer)    32.3 0.0000834

Both tests use lag = 8, following the guideline of 2m for seasonal data (m = 4 for quarterly data). In both cases the p-value is well below 0.05, so we reject the null hypothesis that the residuals are white noise. The residuals are therefore not white noise: the seasonal naive model leaves some information in the data unexplained.

Exercise 5.4

Repeat the previous exercise using the Australian Exports series from global_economy and the Bricks series from aus_production. Use whichever of NAIVE() or SNAIVE() is more appropriate in each case.

Australian Exports series from global_economy. Exports is an annual series with no seasonality, so NAIVE() is the appropriate choice.

aus_export <- global_economy %>%
  filter (Country == "Australia") %>%
  select (Country, Exports)

autoplot(aus_export)
## Plot variable not specified, automatically selected `.vars = Exports`

fit <- aus_export %>%
  model(NAIVE(Exports))

fit %>% gg_tsresiduals()
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_bin()`).

#Box-Pierce test 
augment(fit) %>% 
  features(.innov, box_pierce, lag=10)
## # A tibble: 1 × 4
##   Country   .model         bp_stat bp_pvalue
##   <fct>     <chr>            <dbl>     <dbl>
## 1 Australia NAIVE(Exports)    14.6     0.148
#Ljung-Box test 
augment(fit) %>% 
  features(.innov, ljung_box, lag=10)
## # A tibble: 1 × 4
##   Country   .model         lb_stat lb_pvalue
##   <fct>     <chr>            <dbl>     <dbl>
## 1 Australia NAIVE(Exports)    16.4    0.0896

According to the above test results, both p-values are greater than 0.05, indicating a lack of statistical significance. Therefore, we cannot reject the null hypothesis, and we conclude that the residuals are not distinguishable from a white noise series.
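As in Exercise 5.3, the forecasts can also be plotted; a minimal sketch (the 10-year horizon is an assumption):

fit %>%
  forecast(h = 10) %>%
  autoplot(aus_export) +
  labs(title = "Australian Exports Forecast - Naive Method")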

Bricks series from aus_production. Bricks is a quarterly series with strong seasonality, so SNAIVE() is the appropriate choice.

bricks <- aus_production %>%
  select(Bricks)

autoplot(bricks)
## Plot variable not specified, automatically selected `.vars = Bricks`
## Warning: Removed 20 rows containing missing values or values outside the scale range
## (`geom_line()`).

fit <- bricks %>%
  model(SNAIVE(Bricks))

fit %>% gg_tsresiduals()
## Warning: Removed 24 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 24 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 24 rows containing non-finite outside the scale range
## (`stat_bin()`).

Portmanteau tests 1 (lag = 8)

#Box-Pierce test 
augment(fit) %>% 
  features(.innov, box_pierce, lag=8)
## # A tibble: 1 × 3
##   .model         bp_stat bp_pvalue
##   <chr>            <dbl>     <dbl>
## 1 SNAIVE(Bricks)    267.         0
#Ljung-Box test 
augment(fit) %>% 
  features(.innov, ljung_box, lag=8)
## # A tibble: 1 × 3
##   .model         lb_stat lb_pvalue
##   <chr>            <dbl>     <dbl>
## 1 SNAIVE(Bricks)    274.         0

Portmanteau tests 2 (lag = 10)

#Box-Pierce test 
augment(fit) %>% 
  features(.innov, box_pierce, lag=10)
## # A tibble: 1 × 3
##   .model         bp_stat bp_pvalue
##   <chr>            <dbl>     <dbl>
## 1 SNAIVE(Bricks)    292.         0
#Ljung-Box test 
augment(fit) %>% 
  features(.innov, ljung_box, lag=10)
## # A tibble: 1 × 3
##   .model         lb_stat lb_pvalue
##   <chr>            <dbl>     <dbl>
## 1 SNAIVE(Bricks)    301.         0

According to the Portmanteau tests at lags 8 and 10, all p-values are effectively zero (reported as 0 after rounding). This allows us to reject the null hypothesis of white noise; the residuals are not white noise.

Exercise 5.7

For your retail time series (from Exercise 7 in Section 2.10):

set.seed(12345678)
myseries <- aus_retail |>
  filter(`Series ID` == sample(aus_retail$`Series ID`,1))

a. Create a training dataset consisting of observations before 2011 using

myseries_train <- myseries |>
  filter(year(Month) < 2011)

b. Check that your data have been split appropriately by producing the following plot.

autoplot(myseries, Turnover) +
  autolayer(myseries_train, Turnover, colour = "red")

c. Fit a seasonal naïve model using SNAIVE() applied to your training data (myseries_train).

fit <- myseries_train |>
  model(SNAIVE(Turnover))

d. Check the residuals.

fit |> gg_tsresiduals()
## Warning: Removed 12 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 12 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 12 rows containing non-finite outside the scale range
## (`stat_bin()`).

Do the residuals appear to be uncorrelated and normally distributed? Based on the plots, the residuals show right skewness and are not centered around zero, and the ACF plot reveals substantial autocorrelation. So the residuals are neither uncorrelated nor normally distributed.
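A Ljung-Box test can confirm the visual impression; a minimal sketch, using lag = 24 (2m for monthly data, m = 12):

# Ljung-Box test on the training residuals; lag = 24 follows the 2m guideline
# for monthly (seasonal) data.
augment(fit) %>%
  features(.innov, ljung_box, lag = 24)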

e. Produce forecasts for the test data

fc <- fit |>
  forecast(new_data = anti_join(myseries, myseries_train))
## Joining with `by = join_by(State, Industry, `Series ID`, Month, Turnover)`
fc |> autoplot(myseries)

f. Compare the accuracy of your forecasts against the actual values.

fit |> accuracy()
## # A tibble: 1 × 12
##   State    Industry .model .type    ME  RMSE   MAE   MPE  MAPE  MASE RMSSE  ACF1
##   <chr>    <chr>    <chr>  <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Norther… Clothin… SNAIV… Trai… 0.439  1.21 0.915  5.23  12.4     1     1 0.768
fc |> accuracy(myseries)
## # A tibble: 1 × 12
##   .model    State Industry .type    ME  RMSE   MAE   MPE  MAPE  MASE RMSSE  ACF1
##   <chr>     <chr> <chr>    <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 SNAIVE(T… Nort… Clothin… Test  0.836  1.55  1.24  5.94  9.06  1.36  1.28 0.601

Overall, the training set had smaller errors than the test set (e.g. RMSE 1.21 vs 1.55, MAE 0.92 vs 1.24), which is expected, since accuracy is measured on the same data the model was fitted to.

g. How sensitive are the accuracy measures to the amount of training data used?

Accuracy measures are sensitive to the amount of training data used. Generally, increasing the volume of training data leads to better generalization and lower error rates, up to a certain point. In other words, as more data is added, model accuracy tends to improve, particularly when the data captures meaningful patterns such as seasonality or trends.
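One way to see this empirically (a sketch, not required by the exercise; the cutoff years are arbitrary choices) is to refit the seasonal naive model on training windows of different lengths and compare the resulting test-set accuracy:

# Refit SNAIVE on training sets ending in different years (arbitrary cutoffs)
# and compare accuracy on the corresponding test sets. Note that the test
# period also changes with the cutoff, so this is only a rough comparison.
cutoffs <- c(2001, 2006, 2011)

results <- lapply(cutoffs, function(yr) {
  train <- myseries %>% filter(year(Month) < yr)
  test  <- myseries %>% filter(year(Month) >= yr)
  train %>%
    model(SNAIVE(Turnover)) %>%
    forecast(new_data = test) %>%
    accuracy(myseries) %>%
    mutate(train_end = yr)
})

bind_rows(results)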