DATA 624 - Homework 3

5.1 Produce forecasts for the following series using whichever of NAIVE(y), SNAIVE(y) or RW(y ~ drift()) is more appropriate in each case:

Australian Population (global_economy)

 knitr::opts_chunk$set(warning = FALSE, message = FALSE)   
  # Import fpp3 libraries
library(fpp3)

## Registered S3 method overwritten by 'tsibble':
##   method               from 
##   as_tibble.grouped_df dplyr

## ── Attaching packages ──────────────────────────────────────────── fpp3 1.0.1 ──

## ✔ tibble      3.2.1     ✔ tsibble     1.1.6
## ✔ dplyr       1.1.4     ✔ tsibbledata 0.4.1
## ✔ tidyr       1.3.1     ✔ feasts      0.4.1
## ✔ lubridate   1.9.4     ✔ fable       0.4.1
## ✔ ggplot2     3.5.1

## ── Conflicts ───────────────────────────────────────────────── fpp3_conflicts ──
## ✖ lubridate::date()    masks base::date()
## ✖ dplyr::filter()      masks stats::filter()
## ✖ tsibble::intersect() masks base::intersect()
## ✖ tsibble::interval()  masks lubridate::interval()
## ✖ dplyr::lag()         masks stats::lag()
## ✖ tsibble::setdiff()   masks base::setdiff()
## ✖ tsibble::union()     masks base::union()

head(global_economy)

## # A tsibble: 6 x 9 [1Y]
## # Key:       Country [1]
##   Country     Code   Year         GDP Growth   CPI Imports Exports Population
##   <fct>       <fct> <dbl>       <dbl>  <dbl> <dbl>   <dbl>   <dbl>      <dbl>
## 1 Afghanistan AFG    1960  537777811.     NA    NA    7.02    4.13    8996351
## 2 Afghanistan AFG    1961  548888896.     NA    NA    8.10    4.45    9166764
## 3 Afghanistan AFG    1962  546666678.     NA    NA    9.35    4.88    9345868
## 4 Afghanistan AFG    1963  751111191.     NA    NA   16.9     9.17    9533954
## 5 Afghanistan AFG    1964  800000044.     NA    NA   18.1     8.89    9731361
## 6 Afghanistan AFG    1965 1006666638.     NA    NA   21.4    11.3     9938414

tail(global_economy)

## # A tsibble: 6 x 9 [1Y]
## # Key:       Country [1]
##   Country  Code   Year         GDP Growth   CPI Imports Exports Population
##   <fct>    <fct> <dbl>       <dbl>  <dbl> <dbl>   <dbl>   <dbl>      <dbl>
## 1 Zimbabwe ZWE    2012 17114849900 16.7    107.    49.0    25.2   14710826
## 2 Zimbabwe ZWE    2013 19091020000  1.99   109.    36.7    22.0   15054506
## 3 Zimbabwe ZWE    2014 19495519600  2.38   109.    33.7    20.9   15411675
## 4 Zimbabwe ZWE    2015 19963120600  1.78   106.    37.6    19.2   15777451
## 5 Zimbabwe ZWE    2016 20548678100  0.756  105.    31.3    19.9   16150362
## 6 Zimbabwe ZWE    2017 22040902300  4.70   106.    30.4    19.7   16529904

global_economy %>% 
  filter(Country == "Australia") %>%
  model(RW(Population ~ drift())) %>%
  forecast(h = 5) %>%
  
  autoplot(global_economy) +
  labs(title = "Australia Population Forecast")

The population of Australia has an increasing trend, it is good to use the RW(Population ~ drift()) method, to show growth in the forecast.

Bricks (aus_production)

  aus_bricks <- aus_production[,c(1,4)] %>%
  drop_na()

autoplot(aus_bricks, Bricks) +
  labs(title = "Australian Brick Production")

Since the bricks time series exhibits seasonality, the SNAIVE() method will be most appropriate.

austra_bricks_model <- aus_bricks %>%
  model(SNAIVE(Bricks))

austra_bricks_model %>%
  forecast() %>%
  autoplot(aus_bricks) +
  labs(title = " Forecasts for Australian Brick Production")

NSW Lambs (aus_livestock)

aus_livestock %>%
  filter(State == "New South Wales", 
         Animal == "Lambs") %>%
  model(NAIVE(Count)) %>%
  forecast(h = 24) %>%
  autoplot(aus_livestock) +
  labs(title = "Lambs in New South Wales")

Plot does not seem to be a constant trend or seasonality. The NAIVE() method seems to be the best out of three without any transformations done to it.

Household wealth (hh_budget).

# to see first 6 line of the data
head(hh_budget)

## # A tsibble: 6 x 8 [1Y]
## # Key:       Country [1]
##   Country    Year  Debt    DI Expenditure Savings Wealth Unemployment
##   <chr>     <dbl> <dbl> <dbl>       <dbl>   <dbl>  <dbl>        <dbl>
## 1 Australia  1995  95.7  3.72        3.40   5.24    315.         8.47
## 2 Australia  1996  99.5  3.98        2.97   6.47    315.         8.51
## 3 Australia  1997 108.   2.52        4.95   3.74    323.         8.36
## 4 Australia  1998 115.   4.02        5.73   1.29    339.         7.68
## 5 Australia  1999 121.   3.84        4.26   0.638   354.         6.87
## 6 Australia  2000 126.   3.77        3.18   1.99    350.         6.29

hh_wealth <- hh_budget |>
  filter(
    Country == "Australia"
  ) |>
  select(Wealth)

autoplot(hh_wealth)

From this plot, I don’t see any seasonality but I see that there are windows where trends stay consistent. I can see that it’s on a growth trajectory. I believe a RW(y ~ drift()) will be most applicable here:

Australian takeaway food turnover (aus_retail).

aus_taway <- aus_retail %>%
  filter(Industry == "Takeaway food services")

autoplot(aus_taway, Turnover) +
  labs(title = "Turnover at Australian Takeaways")

With clear seasonality, the SNAIVE() method will be most appropriate.

aus_takeaway_model <- aus_taway %>%
  model(SNAIVE(Turnover))
aus_takeaway_forecast <- aus_takeaway_model %>%
  forecast()

ggplot() +
  geom_line(data = aus_taway, aes(x = Month, y = Turnover, color = State)) +
  geom_line(data = aus_takeaway_forecast, aes(x = Month, y = .mean, color = State), linetype = "dashed") + 
  labs(title = "Australian Takeaway Turnover Forecast", 
       y = "Turnover", 
       x = "Time") +
  theme_minimal()

5.2:Use the Facebook stock price (data set gafa_stock) to do the following:

a.) Produce a time plot of the series.

data(gafa_stock)
facebook_stock <- gafa_stock |>
    filter(Symbol == "FB") |>
    mutate(day = row_number()) |>
    update_tsibble(index = day, regular = TRUE)
facebook_stock |>
    autoplot(Close)

b.) Produce forecasts using the drift method and plot them.

train <- facebook_stock |>
  filter_index("1" ~ "1228")
facebook_stock_fit <- train |>
  model(Drift = RW(Close ~ drift()))
facebook_stock_fc <- facebook_stock_fit |>
  forecast(h = 30)
p1 <- facebook_stock_fc |>
  autoplot(train, level=NULL) +
  autolayer(filter_index(facebook_stock, "1228" ~ .),
            Close,
            colour = "black") +
  labs(y = "Close",
       title = "Daily Forecasts for Facebook") +
  guides(color = guide_legend(title = "Forecast")) +
  theme()
p1

c.) Show that the forecasts are identical to extending the line drawn between the first and last observations.

facebook_stock %>%
  model(RW(Open ~ drift())) %>%
  forecast(h = 63) %>%
  autoplot(facebook_stock) +
  labs(title = "Daily Open Price of Facebook", y = "USD")

d.) Try using some of the other benchmark functions to forecast the same data set. Which do you think is best? Why?

SNAIVE() did not work on the data as it is not seasonal. It is hard to choose one as the daily open price increased in price and then fell in our data. The drift function may be the best to capture that increase.

facebook_stock %>%
  model(Mean = MEAN(Open),
        `Naive` = NAIVE(Open),
        Drift = NAIVE(Open ~ drift())) %>%
  forecast(h = 63) %>%
  autoplot(facebook_stock, level = NULL) +
  labs(title = "Daily Open Price of Facebook", y = "USD")

The NAIVE(y) method is the best forecasting method here. Its forecasts are generally closer to the actual values for the last 30 days of data than either the Mean or Drift methods.

5.3 Apply a seasonal naïve method to the quarterly Australian beer production data from 1992. Check if the residuals look like white noise, and plot the forecasts. The following code will help.

# Extract data of interest
recent_production <- aus_production |>
  filter(year(Quarter) >= 1992)
# Define and estimate a model
fit <- recent_production |> model(SNAIVE(Beer))
# Look at the residuals
fit |> gg_tsresiduals()

# Look a some forecasts
fit |> forecast() |> autoplot(recent_production)

What do you conclude?

looking at these graphs: 1. The mean of the residuals is close to 0 2. The time plot of the residuals show that the variation stays fairly consistent. 3. The histogram seems to be a little normal but not very normal. To me it looks like a bimodal distribution. With this, This suggests that the SNAIVE model is a good fit for the data.

5.4 Repeat the previous exercise using the Australian Exports series from global_economy and the Bricks series from aus_production. Use whichever of NAIVE() or SNAIVE() is more appropriate in each case. Australian Exports

aus_exports <- global_economy[,c(1,3,8)] %>%
  filter(Country == "Australia")

autoplot(aus_exports, Exports) +
  labs(title = "Australian Exports")

aus_exports_model <- aus_exports %>%
  model(NAIVE(Exports))

aus_exports_model %>%
  forecast() %>%
  autoplot(aus_exports) +
  labs(title = "Australian Exports Forecast")

aus_exports_model %>% gg_tsresiduals()

I used the NAIVE model so it does not appear to be seasonality in the data. The residuals are uncorrelated and normally distributed with a mean of 0, we can conclude that forecasts built using this method will likely be pretty good.

Australian Bricks

bricks <- aus_production |>
  select(c("Quarter", "Bricks")) |>
  filter(!is.na(Bricks))
fit <- bricks |>
  model(SNAIVE(Bricks))
# residuals
fit |>
  gg_tsresiduals()

fit |>
    forecast() |>
    autoplot(bricks)

For the Australian bricks production data, I see it has spikes that lie outside the blue bounds of the ACF plot. So there is evidence of autocorrelation, the data are not white noise.

5.7 For your retail time series (from Exercise 7 in Section 2.10):

a.) Create a training dataset consisting of observations before 2011 using

data(aus_retail)
set.seed(1221)
my_series <- aus_retail |>
  filter(`Series ID` == sample(aus_retail$`Series ID`, 1))
myseries_train <- my_series |>
    filter(year(Month) < 2011)

b.) Check that your data have been split appropriately by producing the following plot.

autoplot(my_series, Turnover) +
    autolayer(myseries_train, Turnover, color = "blue")

Yes, the data have been split appropriately.

c.) Fit a seasonal naïve model using SNAIVE() applied to your training data (myseries_train).

fit <- myseries_train |>
  model(SNAIVE(Turnover))

d.) Check the residuals.

fit |>
  gg_tsresiduals()

Do the residuals appear to be uncorrelated and normally distributed?

The residuals are reasonably uncorrelated and normally distributed, but the mean is a bit higher than zero. The model may benefit from adding the mean of the residuals to the forecast.

e.) Produce forecasts for the test data

fc <- fit |>
  forecast(new_data = anti_join(my_series, myseries_train))

fc |> autoplot(my_series)

Forecasted data does not account for the increase in the actual data. The actual data appears to fall mostly within the 80% confidence interval.

f.) Compare the accuracy of your forecasts against the actual values.

fit |> accuracy()

## # A tibble: 1 × 12
##   State    Industry .model .type    ME  RMSE   MAE   MPE  MAPE  MASE RMSSE  ACF1
##   <chr>    <chr>    <chr>  <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Austral… Newspap… SNAIV… Trai… 0.226  1.55  1.11  2.19  15.0     1     1 0.809

fc |> accuracy(my_series)

## # A tibble: 1 × 12
##   .model    State Industry .type    ME  RMSE   MAE   MPE  MAPE  MASE RMSSE  ACF1
##   <chr>     <chr> <chr>    <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 SNAIVE(T… Aust… Newspap… Test  -3.51  3.62  3.51 -66.3  66.3  3.18  2.34 0.457

Forecast using this model are not very accurate. Since the MASE of the forecast model is greater than 1, I think the model is no better than a naive model.

g.) How sensitive are the accuracy measures to the amount of training data used?

with too little data the model will not have enough of an opportunity to build the model and train itself leading to an underfit model.The MASE accuracy measure we used to evaluate the above model is independent of the amount of training data used.

DATA 624 - Homework 3

Bikash Bhowmik

23 Feb 2025