Homework # 3 (624)

5.1 Produce forecasts for the following series

library(fpp3)

## Warning: package 'fpp3' was built under R version 4.3.2

## ── Attaching packages ────────────────────────────────────────────── fpp3 0.5 ──

## ✔ tibble      3.2.1     ✔ tsibble     1.1.3
## ✔ dplyr       1.1.2     ✔ tsibbledata 0.4.1
## ✔ tidyr       1.3.0     ✔ feasts      0.3.1
## ✔ lubridate   1.9.2     ✔ fable       0.3.3
## ✔ ggplot2     3.4.2     ✔ fabletools  0.3.4

## Warning: package 'tsibble' was built under R version 4.3.2

## Warning: package 'tsibbledata' was built under R version 4.3.2

## Warning: package 'feasts' was built under R version 4.3.2

## Warning: package 'fabletools' was built under R version 4.3.2

## Warning: package 'fable' was built under R version 4.3.2

## ── Conflicts ───────────────────────────────────────────────── fpp3_conflicts ──
## ✖ lubridate::date()    masks base::date()
## ✖ dplyr::filter()      masks stats::filter()
## ✖ tsibble::intersect() masks base::intersect()
## ✖ tsibble::interval()  masks lubridate::interval()
## ✖ dplyr::lag()         masks stats::lag()
## ✖ tsibble::setdiff()   masks base::setdiff()
## ✖ tsibble::union()     masks base::union()

head(global_economy)

## # A tsibble: 6 x 9 [1Y]
## # Key:       Country [1]
##   Country     Code   Year         GDP Growth   CPI Imports Exports Population
##   <fct>       <fct> <dbl>       <dbl>  <dbl> <dbl>   <dbl>   <dbl>      <dbl>
## 1 Afghanistan AFG    1960  537777811.     NA    NA    7.02    4.13    8996351
## 2 Afghanistan AFG    1961  548888896.     NA    NA    8.10    4.45    9166764
## 3 Afghanistan AFG    1962  546666678.     NA    NA    9.35    4.88    9345868
## 4 Afghanistan AFG    1963  751111191.     NA    NA   16.9     9.17    9533954
## 5 Afghanistan AFG    1964  800000044.     NA    NA   18.1     8.89    9731361
## 6 Afghanistan AFG    1965 1006666638.     NA    NA   21.4    11.3     9938414

A. Global_Economy for Australian Population

use whatever of naive(y),snaive(y) or Rnaive(y)

## Make sure to select one column so that the autoplot only selects the Population column 
Auspop <- global_economy %>%
  filter(Country == "Australia") %>%
  select(Population)

Auspop %>%
  autoplot() + 
  labs(title = "Australia Population")

## Plot variable not specified, automatically selected `.vars = Population`

## Using the naive forecast to forecast the next 5 years.. 
Auspop %>%
  model(Naive = NAIVE(Population)) %>%
  forecast(h = 5) %>%
  autoplot(Auspop) + labs(title = "Australia Population")

## Use the other forecasting methods to see how it does.. 
Aus_train <- Auspop %>%
  filter_index("2000" ~ "2017")

Aus_fit <- Aus_train %>%
  model(
    Naive = NAIVE(Population),
    Drift = RW(Population ~ drift()
  ))
Aus_fc <- Aus_fit %>%
  forecast(h = "5 years") 

Aus_fc %>%
  autoplot(Aus_train,level = NULL) + 
  autolayer(
    filter_index(Auspop,"2000" ~.),
    color = "black"
  ) +
  labs(title = "Australia Population") +
  guides(color = guide_legend(title = "Forecast")
  )

## Plot variable not specified, automatically selected `.vars = Population`

B. Bricks from Aus_production

Bricks1 <- aus_production %>%
  select(Bricks)

## There is empty values in this chart..
Bricks1 %>%
  autoplot() + labs(title = "Australia Brick Productions")

## Plot variable not specified, automatically selected `.vars = Bricks`

## Warning: Removed 20 rows containing missing values (`geom_line()`).

## Filter out na values
Bricks1 <- Bricks1 %>%
  filter(!is.na(Bricks))

## Now we forecast between 1992 to 2009
Bricks_train <- Bricks1 %>%
  filter_index("1992 Q1" ~ "2009 Q4")
  

Bricks_fit <- Bricks_train %>%
  model(
    Naive = NAIVE(Bricks),
    `Seasonal Naive` = SNAIVE(Bricks),
    Drift = RW(Bricks~drift())
  )

## Forecast for the next 4 quarters
Bricks_fc <- Bricks_fit %>%
  forecast(h = 18) 

Bricks_fc %>%
  autoplot(Bricks_train,level = NULL) + 
  autolayer(
    filter_index(Bricks1,"1992 Q1" ~.),
    color = "black"
  ) +
  labs(title = "Bricks Production") +
  guides(color = guide_legend(title = "Forecast")
  )

## Plot variable not specified, automatically selected `.vars = Bricks`

C. NSW Lambs From livestock

NSWlamb <- aus_livestock %>%
  filter(Animal == "Lambs",State == "New South Wales") %>%
  select(Count)

NSWlamb %>%
  autoplot() + labs(title = "New South Wales Lambs")

## Plot variable not specified, automatically selected `.vars = Count`

NSW_train <- NSWlamb %>%
  filter_index("2000 Jan"~.)

NSW_fit <- NSW_train %>%
  model(
    Naive = NAIVE(Count),
    `Seasonal Naive` = SNAIVE(Count),
    Drift = RW(Count~drift())
  )

## Forecast for the next 12 months
NSW_fc <- NSW_fit %>%
  forecast(h = 12) 

NSW_fc %>%
  autoplot(NSW_train,level = NULL) + 
  autolayer(
    filter_index(NSW_train,"2000 Jan" ~.),
    color = "black"
  ) +
  labs(title = "Lambs slaughter") +
  guides(color = guide_legend(title = "Forecast")
  )

## Plot variable not specified, automatically selected `.vars = Count`

D. Household Wealth (hh_budget)

hhwealth <- hh_budget %>%
  select(Wealth)

hhwealth %>%
  autoplot()

## Plot variable not specified, automatically selected `.vars = Wealth`

This plot seems to display a positive trend of wealth during the early 2010s followed by a decrease in 2008 and then climbs up afterwards.

hh_fit <- hhwealth %>%
  model(
    Naive = NAIVE(Wealth),
    `Seasonal Naive` = SNAIVE(Wealth),
    Drift = RW(Wealth~drift())
  )

## Warning: 4 errors (1 unique) encountered for Seasonal Naive
## [4] Non-seasonal model specification provided, use RW() or provide a different lag specification.

## Forecast for the next 12 months
hh_fc <- hh_fit %>%
  forecast(h = 12) 

hh_fc %>%
  autoplot(hhwealth,level = NULL) +
  labs(title = "HH wealth") +
  guides(color = guide_legend(title = "Forecast")
  )

## Warning: Removed 48 rows containing missing values (`()`).

E. Takeaway Food

Takeaway <- aus_retail %>%
  filter(Industry == "Takeaway food services",State == "Australian Capital Territory") %>%
  select(Turnover)

Takeaway %>%
  model(
    naive = NAIVE(Turnover),
    `seasonal_naive` = SNAIVE(Turnover),
    Drift = RW(Turnover~drift())) %>%
  forecast(h = 15) %>%
  autoplot(Takeaway,level = NULL) + labs(title = "Food Takeaway in Australia")

5.2 Facebook stock prices

A. Plot the Time-Series

fb_stock <- gafa_stock %>%
  filter(Symbol == "FB") 

ggplot(fb_stock,aes(x=Date)) +
  geom_line(aes(y=Close)) + 
  labs(title = "Facebook Stock Closing Price")

B. Produce Forecast with drift method.

Since the fb time-series are all irregular I am going to use the textbook example to fix it the irregular time series..

## Use example 5.2 from the textbook to re index it. 
## Set up a new time index based on the trading day rather than calendar days. 
fb_stock <- fb_stock %>%
  mutate(day = row_number()) %>%
  update_tsibble(index = day,regular = TRUE) %>%
  select(Close)

## Model the data..
fb_stock_fit <- fb_stock %>%
  model(
    drift = RW(Close~drift())
  )

fb_plot <- fb_stock_fit %>%
  forecast(h = 50) %>%
  autoplot(fb_stock) + labs(title = "Facebook stocks Forecast")

fb_plot

C. Show forecasts are identical by drawing line from first obs to last obs.

## Forecast the data by 50 days and then autoplot the fb_stock with it
ggplot(fb_stock,aes(x = day)) +
  geom_line(aes(y = Close)) + geom_segment(aes(x = min(day),y = first(Close),
                             xend = max(day),yend = last(Close)), color = "red",linetype = "dashed") +
  labs(title = "Facebook Stock Closing Prices")

D. Try other benchmarking methods..

fb_stock %>%
  model(
    Mean = MEAN(Close),
    `Naïve` = NAIVE(Close),
    Drift = NAIVE(Close ~ drift())
    ) %>%
  ## Forecast for a 100 days
  forecast(h = 100) %>%
  autoplot(fb_stock,level = NULL) + labs(title = "Facebook stocks forecast") +
  guides(colour = guide_legend(title = "Forecast"))

It is really difficult to tell but I would assume that the drift method is the best overall since the time series seems to have a positive trend overall in the long-run.

5.3 Apply a seasonal naïve method

Applying the code from the textbook

# Extract data of interest We can see some seasonal patterns for each quarter 
recent_production <- aus_production |>
  filter(year(Quarter) >= 1992)
recent_production %>%
  autoplot()

## Plot variable not specified, automatically selected `.vars = Beer`

# Define and estimate a model
fit <- recent_production |> model(SNAIVE(Beer))
# Look at the residuals
fit |> gg_tsresiduals()

## Warning: Removed 4 rows containing missing values (`geom_line()`).

## Warning: Removed 4 rows containing missing values (`geom_point()`).

## Warning: Removed 4 rows containing non-finite values (`stat_bin()`).

# Look at some forecasts
fit |> forecast() |> autoplot(recent_production)

It seems the residuals doesn’t look like white noise the residuals seems to be centered around 0 but displays high variance while the histogram shows a bi modal normal distribution, and the acf doesn’t display any signs of correlation. Thus, the residuals seems normally distributed and not correlated.

Pormanteu Tests

We can use the two tests to see if the residuals are white noise or not.

## Use 8 for lag because the data is seasonal data and there are 4 quarters per year and we multiply it by 2
fit %>%
  augment() %>%
  features(.innov,box_pierce,lag = 8)

## # A tibble: 1 × 3
##   .model       bp_stat bp_pvalue
##   <chr>          <dbl>     <dbl>
## 1 SNAIVE(Beer)    29.7  0.000234

fit %>%
  augment() %>%
  features(.innov,ljung_box, lag = 8)

## # A tibble: 1 × 3
##   .model       lb_stat lb_pvalue
##   <chr>          <dbl>     <dbl>
## 1 SNAIVE(Beer)    32.3 0.0000834

Besides on both tests, since the p-value is less than 0.05 we can reject the hypothesis of white noise thus the residuals are not white noise..

5.4 Repeat the previous exercise..

Aus_exports <- global_economy %>%
  filter(Country == "Australia") %>%
  select(Exports)

Australian Exports

## Use the naive method.
Aus_fit <- Aus_exports %>%
  model(NAIVE(Exports))


## Forecast for the next 15 years..  
Aus_mod <- Aus_fit %>%
  forecast(h = 15)

Aus_mod %>%
  autoplot(Aus_exports) +
  labs(title = "Australian Exports")

## Check the residuals
Aus_fit %>%
  gg_tsresiduals()

## Warning: Removed 1 row containing missing values (`geom_line()`).

## Warning: Removed 1 rows containing missing values (`geom_point()`).

## Warning: Removed 1 rows containing non-finite values (`stat_bin()`).

The residuals look normally distributed but the lag plot shows some correlation at certain points, and the variance is centered around 0 but has high variance.

Pormanteu Plots..

## THe plot doesn't seems seasonal so I will use 10 according to the authors
Aus_fit %>%
  augment() %>%
  features(.innov,box_pierce,lag = 10)

## # A tibble: 1 × 3
##   .model         bp_stat bp_pvalue
##   <chr>            <dbl>     <dbl>
## 1 NAIVE(Exports)    14.6     0.148

Aus_fit %>%
  augment() %>%
  features(.innov,ljung_box,lag = 10)

## # A tibble: 1 × 3
##   .model         lb_stat lb_pvalue
##   <chr>            <dbl>     <dbl>
## 1 NAIVE(Exports)    16.4    0.0896

The p-value are not significant for both tests for the Australia Exports thus it seems that we fail to reject the null of white noise and conclude that the residuals are white noise….

Bricks

## This is quarterly data of bricks so lag = 8? 
Brickss <- aus_production %>%
  select(Bricks) %>%
  filter(!is.na(Bricks))

## Data appears seasonal 
Brickss %>%
  autoplot() +
  labs(title = "Clay Bricks Production")

## Plot variable not specified, automatically selected `.vars = Bricks`

Brick_fit <- Brickss %>%
  model(SNAIVE(Bricks))

Brick_fit %>%
  gg_tsresiduals()

## Warning: Removed 4 rows containing missing values (`geom_line()`).

## Warning: Removed 4 rows containing missing values (`geom_point()`).

## Warning: Removed 4 rows containing non-finite values (`stat_bin()`).

The innovation residuals had low variation at first but then had higher variation during 1980, the ACF had a strong value at the start and the residuals are left-tail skewed, perhaps the residuals are white-noise?

Pormanteu Tests.

Brick_fit %>%
  augment() %>%
  features(.innov,box_pierce,lag = 10)

## # A tibble: 1 × 3
##   .model         bp_stat bp_pvalue
##   <chr>            <dbl>     <dbl>
## 1 SNAIVE(Bricks)    292.         0

Brick_fit %>%
  augment() %>%
  features(.innov,ljung_box,lag = 10)

## # A tibble: 1 × 3
##   .model         lb_stat lb_pvalue
##   <chr>            <dbl>     <dbl>
## 1 SNAIVE(Bricks)    301.         0

The p-value is zero but i feel like I messed somewhere.. cause the residuals seems like white noise to me.

5.7 For your retail time series

set.seed(12345678)
myseries <- aus_retail |>
  filter(`Series ID` == sample(aus_retail$`Series ID`,1))

myseries_train <- myseries %>%
  filter(year(Month) < 2011)

autoplot(myseries, Turnover) +
  autolayer(myseries_train, Turnover, colour = "red")

fit <- myseries_train |>
  model(SNAIVE(Turnover))

fit |> gg_tsresiduals()

## Warning: Removed 12 rows containing missing values (`geom_line()`).

## Warning: Removed 12 rows containing missing values (`geom_point()`).

## Warning: Removed 12 rows containing non-finite values (`stat_bin()`).

The residuals doesn’t seem to be not uncorrelated and the distribution seems unimodal normal distribution. The ACF seems to shows correlation between each point of lag until the 11th value which decreases.

fc <- fit |>
  forecast(new_data = anti_join(myseries, myseries_train))

## Joining with `by = join_by(State, Industry, `Series ID`, Month, Turnover)`

fc |> autoplot(myseries)

## Accuracy of the fit,. 
fit |> accuracy()

## # A tibble: 1 × 12
##   State    Industry .model .type    ME  RMSE   MAE   MPE  MAPE  MASE RMSSE  ACF1
##   <chr>    <chr>    <chr>  <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Norther… Clothin… SNAIV… Trai… 0.439  1.21 0.915  5.23  12.4     1     1 0.768

## Actual accuracy
fc |> accuracy(myseries)

## # A tibble: 1 × 12
##   .model    State Industry .type    ME  RMSE   MAE   MPE  MAPE  MASE RMSSE  ACF1
##   <chr>     <chr> <chr>    <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 SNAIVE(T… Nort… Clothin… Test  0.836  1.55  1.24  5.94  9.06  1.36  1.28 0.601

I am unsure but it appears that the accuracy isn’t that sensitive since the MAE and the RMSE are pretty close in values in the training set.