library(fpp3)
## Registered S3 methods overwritten by 'ggtime':
##   method           from      
##   autolayer.fbl_ts fabletools
##   autolayer.tbl_ts fabletools
##   autoplot.dcmp_ts fabletools
##   autoplot.fbl_ts  fabletools
##   autoplot.tbl_ts  fabletools
##   fortify.fbl_ts   fabletools
## ── Attaching packages ──────────────────────────────────────────── fpp3 1.0.2 ──
## ✔ tibble      3.3.0     ✔ tsibble     1.2.0
## ✔ dplyr       1.1.4     ✔ tsibbledata 0.4.1
## ✔ tidyr       1.3.1     ✔ feasts      0.5.0
## ✔ lubridate   1.9.4     ✔ fable       0.5.0
## ✔ ggplot2     4.0.1
## ── Conflicts ───────────────────────────────────────────────── fpp3_conflicts ──
## ✖ lubridate::date()    masks base::date()
## ✖ dplyr::filter()      masks stats::filter()
## ✖ tsibble::intersect() masks base::intersect()
## ✖ tsibble::interval()  masks lubridate::interval()
## ✖ dplyr::lag()         masks stats::lag()
## ✖ tsibble::setdiff()   masks base::setdiff()
## ✖ tsibble::union()     masks base::union()

5.1

#Australian Population (global_economy)

aus_population <- global_economy |>
  filter(Country == "Australia") |>
  select(Population)

aus_population |> autoplot(Population) + labs(title = "Australian Population Over Time")

aus_population |>
  model(RW(Population ~ drift())) |>
  forecast(h = 15) |>
  autoplot(aus_population) + labs(title = "Australian Population Over Time Forecast")

The population series shows a steady upward trend and no seasonality, so I use the drift method, which extrapolates the average historical change per period.
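
As a sketch of what the drift method computes (base R only, no model fitting; the variable names are mine), the point forecast at horizon h is the last value plus h times the average historical change, (y_T - y_1)/(T - 1):

# Sketch: manual drift point forecasts for the next 15 years
y <- aus_population$Population
n <- length(y)
tail(y, 1) + (1:15) * (tail(y, 1) - y[1]) / (n - 1)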

#Bricks (aus_production)
bricks <- aus_production |>
  select(Bricks) |>
  filter(!is.na(Bricks))

bricks |> autoplot(Bricks) + labs(title = "Australian Brick Production Over Time")

bricks |>
  model(SNAIVE(Bricks ~ lag("year"))) |>
  forecast(h = 15) |>
  autoplot(bricks) + labs(title = "Australian Brick Production Over Time Forecast")

In the bricks series, there is a strong seasonal pattern and we have quarterly data, so I will use the seasonal naive method, which simply repeats the most recent year of observations.
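
Since each SNAIVE point forecast is the observation from the same quarter one year earlier, the first four forecasts should reproduce the last four observed quarters. A minimal sketch of that check (fc_bricks is my own name):

# Sketch: SNAIVE forecasts repeat the corresponding quarters of the last year
fc_bricks <- bricks |> model(SNAIVE(Bricks)) |> forecast(h = 4)
all.equal(fc_bricks$.mean, as.numeric(tail(bricks$Bricks, 4)))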

#NSW Lambs (aus_livestock)

lambs <- aus_livestock |>
  filter(State == "New South Wales" & Animal == "Lambs")

lambs |> autoplot(Count) + labs(title = "Australian Lambs Slaughtered Over Time")

lambs |>
  model(SNAIVE(Count)) |>
  forecast(h = 15) |>
  autoplot(lambs) + labs(title = "Australian Lambs Slaughtered Over Time Forecast")

In the lambs dataset, a seasonal pattern is also present, so I will use the seasonal naive method here too.

#Household wealth (hh_budget).

household_wealth <- hh_budget |>
  filter(Country == "Australia") |>
  select(Wealth)

household_wealth |> autoplot(Wealth) + labs(title = "Australian Household Wealth Over Time")

household_wealth |>
  model(RW(Wealth ~ drift())) |>
  forecast(h = 15) |>
  autoplot(household_wealth) + labs(title = "Australian Household Wealth Over Time Forecast")

Household wealth shows an upward trend and no seasonality (annual data), so the drift method is again the appropriate benchmark.

#Australian takeaway food turnover (aus_retail).

aus_food <- aus_retail |>
  filter(Industry == "Cafes, restaurants and takeaway food services") |>
  select(Turnover) |>
  summarize(Turnover = sum(Turnover))

aus_food |> autoplot(Turnover) + labs(title = "Australian Takeaway Food Turnover Over Time")

aus_food |>
  model(SNAIVE(Turnover)) |>
  forecast(h = 15) |>
  autoplot(aus_food) + labs(title = "Australian Takeaway Food Turnover Over Time Forecast")

We have clear seasonality and monthly data, so SNAIVE is the natural benchmark for turnover.

5.2

#a. Produce a time plot of the series.

facebook <- gafa_stock |>
  filter(Symbol == "FB") |>
  mutate(trading_day = row_number()) |> # re-index: non-trading days are skipped
  update_tsibble(index = trading_day, regular = TRUE)

facebook |> autoplot(Close) + labs(y = "Closing Price ($USD)", title = "Facebook Stock Closing Price Over Time")

#b. Produce forecasts using the drift method and plot them.
facebook |>
  model(RW(Close ~ drift())) |>
  forecast(h = 100) |>
  autoplot(facebook) + labs(y = "Closing Price ($USD)", title = "Facebook Stock Closing Price Over Time Forecast")

#c. Show that the forecasts are identical to extending the line drawn between the first and last observations.

n_obs <- nrow(facebook) # `n_obs` rather than `T`, which masks the TRUE shorthand
y_1 <- head(facebook$Close, 1) # first observation
y_T <- tail(facebook$Close, 1) # last observation

facebook |>
  model(Drift = RW(Close ~ drift())) |>
  forecast(h = 100) |>
  autoplot(facebook, level = NULL) +
  annotate(
    "segment",
    x = 1, y = y_1,
    xend = n_obs, yend = y_T,
    colour = "firebrick3", linetype = "dashed") +
  labs(y = "Closing Price ($USD)", title = "Facebook Stock Closing Price Over Time Forecast with Drift Line")

#d. Try using some of the other benchmark functions to forecast the same data set. Which do you think is best? Why?

facebook |>
  model(Mean = MEAN(Close),
        Naive = NAIVE(Close),
        Drift = RW(Close ~ drift())) |>
  forecast(h = 100) |>
  autoplot(facebook) +
  labs(y = "Closing Price ($USD)", title = "Facebook Stock Forecast Method Benchmark Comparison")

facebook |>
  model(Mean = MEAN(Close),
        Naive = NAIVE(Close),
        Drift = RW(Close ~ drift())) |>
  forecast(h = 100) |>
  autoplot(facebook, level = NULL) +
  labs(y = "Closing Price ($USD)", title = "Facebook Stock Forecast Method Benchmark Comparison")

I believe the best benchmark for the closing price is the naive method, which is commonly used for stock prices: if prices follow a random walk, the optimal point forecast is simply the last observed value. The mean method would not give a useful forecast because it averages the entire history, including much lower early prices that say little about what's to come. The drift method, meanwhile, implies that the historical trend will continue indefinitely, which is not guaranteed for a stock.
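
A minimal sketch confirming the flat-forecast behaviour of NAIVE (fc_naive is my own name):

# Sketch: every NAIVE point forecast equals the final observed close
fc_naive <- facebook |> model(NAIVE(Close)) |> forecast(h = 100)
all.equal(fc_naive$.mean, rep(tail(facebook$Close, 1), 100))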

5.3

# Extract data of interest
recent_production <- aus_production |>
  filter(year(Quarter) >= 1992)
# Define and estimate a model
fit <- recent_production |> model(SNAIVE(Beer))
# Look at the residuals
fit |> gg_tsresiduals()

# Look at some forecasts
fit |> forecast() |> autoplot(recent_production)

The residuals in the first plot appear mostly homoscedastic and show no trends or patterns. The mean of the .innov residuals is close to 0, though not perfectly centered. However, there is autocorrelation: the ACF bars exceed the blue significance bounds at three lags, so the residuals are not white noise. The residual distribution looks roughly normal. The forecast plot also fails to capture the slight downward trend. All in all, the model does not satisfy every condition: the residuals are not white noise, meaning they still contain information that a better model could capture and use to produce more accurate forecasts.
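
To make the white-noise conclusion formal, a Ljung-Box test can be run on the innovation residuals. A sketch, using lag = 8 (2m for quarterly data); a small p-value indicates the residuals are not white noise:

# Sketch: Ljung-Box test on the innovation residuals of the SNAIVE fit
augment(fit) |>
  features(.innov, ljung_box, lag = 8)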

5.4

# Extract data of interest
aus_export <- global_economy |>
  filter(Country == "Australia")

# Define and estimate a model
fit <- aus_export |> model(NAIVE(Exports))
# Look at the residuals
fit |> gg_tsresiduals()

# Look at some forecasts
fit |> forecast(h = 5) |> autoplot(aus_export)

Since the exports series is annual with no seasonal pattern, NAIVE is more appropriate than SNAIVE. The .innov residuals stay mostly close to 0, and I do not see much autocorrelation, suggesting the residuals are white noise. They are homoscedastic for the most part, apart from one large spike near the end, after which the variance settles back down. I would say the NAIVE method is a good model to apply in this situation.
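
As in 5.3, a Ljung-Box sketch can formalize the white-noise impression (lag = 10 is a common default for non-seasonal data):

# Sketch: Ljung-Box test for the NAIVE fit on annual exports
augment(fit) |>
  features(.innov, ljung_box, lag = 10)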

# Extract data of interest
aus_bricks <- aus_production |>
  select(Bricks) |>
  filter(!is.na(Bricks))

# Define and estimate a model
fit <- aus_bricks |> model(SNAIVE(Bricks))
# Look at the residuals
fit |> gg_tsresiduals()

# Look at some forecasts
fit |> forecast() |> autoplot(aus_bricks)

For Australian brick production, the residuals are heteroscedastic: their spread differs markedly across time periods. Another condition that is not satisfied is whiteness of the residuals: the ACF shows they are strongly autocorrelated, not white noise. The distribution is also left-skewed. So although the residuals' mean is about 0, the other conditions fail, which means applying this SNAIVE model would not be a good idea.

5.7

set.seed(33)
myseries <- aus_retail |>
  filter(`Series ID` == sample(aus_retail$`Series ID`,1))

#a
myseries_train <- myseries |>
  filter(year(Month) < 2011)
#b
autoplot(myseries, Turnover) +
  autolayer(myseries_train, Turnover, colour = "red")

#c
fit <- myseries_train |>
  model(SNAIVE(Turnover))
#d
fit |> gg_tsresiduals()

The residuals are clearly autocorrelated, so they are not white noise, and they are also heteroscedastic. Their mean is not 0 either, and the distribution is slightly right-skewed. All of this suggests the SNAIVE model is not appropriate for this data.

#e
fc <- fit |>
  forecast(new_data = anti_join(myseries, myseries_train))
## Joining with `by = join_by(State, Industry, `Series ID`, Month, Turnover)`
fc |> autoplot(myseries)

#f
fit |> accuracy() #training
## # A tibble: 1 × 12
##   State    Industry .model .type    ME  RMSE   MAE   MPE  MAPE  MASE RMSSE  ACF1
##   <chr>    <chr>    <chr>  <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Queensl… Food re… SNAIV… Trai…  55.7  71.1  56.2  7.88  7.94     1     1 0.851
fc |> accuracy(myseries) #test
## # A tibble: 1 × 12
##   .model    State Industry .type    ME  RMSE   MAE   MPE  MAPE  MASE RMSSE  ACF1
##   <chr>     <chr> <chr>    <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 SNAIVE(T… Quee… Food re… Test   326.  362.  326.  15.5  15.5  5.80  5.08 0.916

The training set scores lower on every metric, and for all of these metrics lower means better performance. Note that the training MASE and RMSSE are exactly 1 by construction: both scale the errors by the in-sample seasonal naive errors, so SNAIVE always scores 1 on its own training set. The last year of training data (2010) is the highest point the series had reached, so when SNAIVE copies that year forward as its forecast, it starts from the most recent and highest base, and the initial test forecasts are not too far off. On the test data, however, which is what matters, the model performs much worse: SNAIVE assumes only the most recent seasonal pattern repeats, so the upward trend is absent from the prediction and the forecasts fall further behind over time. In short, the model performs far better on the training data than on the test data, so it generalizes poorly.

#g - How sensitive are the accuracy measures to the amount of training data used?

#Earlier cutoff, less training data
myseries_train_2005 <- myseries |>
  filter(year(Month) < 2005)
fit_2005 <- myseries_train_2005 |>
  model(SNAIVE(Turnover))
fc_2005 <- fit_2005 |>
  forecast(new_data = anti_join(myseries, myseries_train_2005))
## Joining with `by = join_by(State, Industry, `Series ID`, Month, Turnover)`
fit_2005 |> accuracy()
## # A tibble: 1 × 12
##   State    Industry .model .type    ME  RMSE   MAE   MPE  MAPE  MASE RMSSE  ACF1
##   <chr>    <chr>    <chr>  <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Queensl… Food re… SNAIV… Trai…  42.0  49.8  42.2  8.05  8.09     1     1 0.683
fc_2005 |> accuracy(myseries)
## # A tibble: 1 × 12
##   .model    State Industry .type    ME  RMSE   MAE   MPE  MAPE  MASE RMSSE  ACF1
##   <chr>     <chr> <chr>    <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 SNAIVE(T… Quee… Food re… Test   694.  784.  695.  36.0  36.0  16.5  15.7 0.955

One way to see how sensitive the accuracy measures are to the amount of training data is to change the train/test split. Using an earlier cutoff of 2005 instead of 2011 means the model copies an older seasonal cycle that is further from the test period, which produced much worse test accuracy. So the accuracy measures are quite sensitive to the amount of training data: the earlier the training cutoff, the worse SNAIVE performs, because it forecasts from a lower base while the actual values keep growing.
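
To push this further, the same split can be repeated over several cutoffs. A minimal sketch (cutoff years chosen purely for illustration; purrr is not attached by fpp3, so it is loaded explicitly):

# Sketch: test-set accuracy of SNAIVE for several training cutoffs
library(purrr)
cutoffs <- c(2000, 2005, 2011)
map_dfr(cutoffs, function(cut) {
  fit <- myseries |>
    filter(year(Month) < cut) |>
    model(SNAIVE(Turnover))
  fit |>
    forecast(new_data = myseries |> filter(year(Month) >= cut)) |>
    accuracy(myseries) |>
    mutate(cutoff = cut)
})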