homework 3

5.1 Produce forecasts for the following series using whichever of NAIVE(y), SNAIVE(y) or RW(y ~ drift()) is more appropriate in each case:

Australian Population (global_economy) Bricks (aus_production) NSW Lambs (aus_livestock) Household wealth (hh_budget). Australian takeaway food turnover (aus_retail).

library(tidyverse)

## Warning: package 'tidyverse' was built under R version 4.5.2

## Warning: package 'ggplot2' was built under R version 4.5.2

## Warning: package 'tibble' was built under R version 4.5.2

## Warning: package 'readr' was built under R version 4.5.2

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.6
## ✔ forcats   1.0.1     ✔ stringr   1.5.2
## ✔ ggplot2   4.0.1     ✔ tibble    3.3.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.1.0     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(fpp3)

## Warning: package 'fpp3' was built under R version 4.5.2

## Registered S3 method overwritten by 'tsibble':
##   method               from 
##   as_tibble.grouped_df dplyr
## ── Attaching packages ──────────────────────────────────────────── fpp3 1.0.2 ──
## ✔ tsibble     1.1.6     ✔ feasts      0.4.2
## ✔ tsibbledata 0.4.1     ✔ fable       0.5.0

## Warning: package 'tsibble' was built under R version 4.5.2

## Warning: package 'tsibbledata' was built under R version 4.5.2

## Warning: package 'feasts' was built under R version 4.5.2

## Warning: package 'fabletools' was built under R version 4.5.2

## Warning: package 'fable' was built under R version 4.5.2

## ── Conflicts ───────────────────────────────────────────────── fpp3_conflicts ──
## ✖ lubridate::date()    masks base::date()
## ✖ dplyr::filter()      masks stats::filter()
## ✖ tsibble::intersect() masks base::intersect()
## ✖ tsibble::interval()  masks lubridate::interval()
## ✖ dplyr::lag()         masks stats::lag()
## ✖ tsibble::setdiff()   masks base::setdiff()
## ✖ tsibble::union()     masks base::union()

#Australian population shows a pretty linear increase
autoplot(global_economy |> filter(Country == "Australia"), Population)

#so I went with drift

aus_pop <- global_economy |> filter(Country == "Australia")

aus_pop |> model(RW(Population ~ drift())) |> forecast(h = "5 years") |>  autoplot(aus_pop)

#Bricks production is more seasonal
bricks <- aus_production |> select(Quarter, Bricks)

#removing NA values because this wasn't plotting

bricks |>
    filter(!is.na(Bricks)) |>
    model(SNAIVE(Bricks ~ lag("year"))) |>
    forecast(h = 4) |>
    autoplot(bricks)

## Warning: Removed 20 rows containing missing values or values outside the scale range
## (`geom_line()`).

#Lambs in NSW

lambs <- aus_livestock |> filter(Animal == "Lambs" & State == "New South Wales") 
autoplot(lambs)

## Plot variable not specified, automatically selected `.vars = Count`

#I wasn't 100% if this was seasonal, so I used X-13 ARIMA on it

x11_dcmp <- lambs |>
    model(x11 = X_13ARIMA_SEATS(Count ~ x11())) |>
    components()
autoplot(x11_dcmp)

#the seasonal trend isn't super strong, but it seems like the most sensible choice for livestock

lambs |>
    model(SNAIVE(Count)) |>
    forecast(h = "5 years") |>
    autoplot(lambs)

#I also plotted with mean (which is not one of the options)

lambs |>
    model(MEAN(Count)) |>
    forecast(h = "5 years") |>
    autoplot(lambs)

#wealth
aus_wealth <- hh_budget |> filter(Country == "Australia")
autoplot(aus_wealth, Wealth)

#the data has been trending upward for a while, so I will use the dirft method


aus_wealth |> model(RW(Wealth ~ drift())) |> forecast(h = 5) |>  autoplot(aus_wealth)

#Australian takeaway food turnover (aus_retail).


aus_retail_takeaway <- aus_retail |> 
    filter(Industry == "Takeaway food services") |> 
    index_by(Month) |>        
    summarise(Turnover = sum(Turnover))

autoplot(aus_retail_takeaway)

## Plot variable not specified, automatically selected `.vars = Turnover`

#food services is really seasonal, so I will use seasonal naive

aus_retail_takeaway |>
    model(SNAIVE(Turnover ~ lag("year"))) |>
    forecast(h = 24) |>
    autoplot(aus_retail_takeaway)

5.2 Use the Facebook stock price (data set gafa_stock) to do the following:

Produce a time plot of the series. Produce forecasts using the drift method and plot them. Show that the forecasts are identical to extending the line drawn between the first and last observations. Try using some of the other benchmark functions to forecast the same data set. Which do you think is best? Why?

fb <- gafa_stock |> filter(Symbol == "FB")

#time plot
autoplot(fb,Close)

#taken from the book and adjusted

fb_stock <- gafa_stock |>
  filter(Symbol == "FB", year(Date) >= 2015) |>
  mutate(day = row_number()) |>
  update_tsibble(index = day, regular = TRUE)
# Filter the year of interest
fb_2015 <- fb_stock |> filter(year(Date) == 2015)
# Fit the models
fb_fit <- fb_2015 |>
  model(
    Drift = NAIVE(Close ~ drift())
  )

#forecasting starting in jan 2016

fb_jan_2016 <- fb_stock |>
  filter(yearmonth(Date) == yearmonth("2016 Jan"))


fb_fc <- fb_fit |>
  forecast(new_data = fb_jan_2016)


fb_fc |>
  autoplot(fb_2015, level = NULL) +
  autolayer(fb_jan_2016, Close, colour = "black") +
  labs(y = "$US",
       title = "Facebook daily closing stock prices",
       subtitle = "(Jan 2015 - Jan 2016)") +
  guides(colour = guide_legend(title = "Forecast"))

#Show that the forecasts are identical to extending the line drawn between the first and last observations.

fb_fc |>
    autoplot(fb_2015, level = NULL) +
    autolayer(fb_jan_2016, Close, colour = "black") +
    labs(y = "$US",
         title = "Facebook daily closing stock prices",
         subtitle = "(Jan 2015 - Jan 2016)") +
    guides(colour = guide_legend(title = "Forecast")) +
    geom_line(data = slice(fb_2015, range(1, n())), 
              aes(x = day, y = Close), 
              linetype = "dashed", color = "blue")

#Try using some of the other benchmark functions to forecast the same data set. Which do you think is best? Why?

#The simple version (using all the data)

fb_fit <- fb_stock |>
  model(
    Mean = MEAN(Close),
    Naive = NAIVE(Close),
    Drift = RW(Close ~ drift())
  )


fb_fit |> forecast(h = 90) |> autoplot(fb_stock, level = NULL)

#using the earlier code and the training data set to visualize prediction accuracy

fb_stock <- gafa_stock |>
  filter(Symbol == "FB", year(Date) >= 2015) |>
  mutate(day = row_number()) |>
  update_tsibble(index = day, regular = TRUE)
# Filter the year of interest
fb_2015 <- fb_stock |> filter(year(Date) == 2015)
# Fit the models
fb_fit <- fb_2015 |>
  model(
     Mean = MEAN(Close),
    `Naïve` = NAIVE(Close),
  )


fb_jan_2016 <- fb_stock |>
  filter(yearmonth(Date) == yearmonth("2016 Jan"))


fb_fc <- fb_fit |>
  forecast(new_data = fb_jan_2016)


fb_fc |>
  autoplot(fb_2015, level = NULL) +
  autolayer(fb_jan_2016, Close, colour = "black") +
  labs(y = "$US",
       title = "Facebook daily closing stock prices",
       subtitle = "(Jan 2015 - Jan 2016)") +
  guides(colour = guide_legend(title = "Forecast"))

#naive is better for stocks -- they tend to go up over time (in general)
#so, having more data going farther back can mean a way less accurate mean

5.3 Apply a seasonal naïve method to the quarterly Australian beer production data from 1992. Check if the residuals look like white noise, and plot the forecasts. The following code will help.

# Extract data of interest
recent_production <- aus_production |>
  filter(year(Quarter) >= 1992)
# Define and estimate a model
fit <- recent_production |> model(SNAIVE(Beer))
# Look at the residuals
fit |> gg_tsresiduals()

## Warning: `gg_tsresiduals()` was deprecated in feasts 0.4.2.
## ℹ Please use `ggtime::gg_tsresiduals()` instead.
## This warning is displayed once per session.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_line()`).

## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_point()`).

## Warning: Removed 4 rows containing non-finite outside the scale range
## (`stat_bin()`).

## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_rug()`).

# Look a some forecasts
fit |> forecast() |> autoplot(recent_production)

#the ACF plot suggests in lag 4 that the model didn't capture the data's seasonality (lag 4 is comparing from the previous year).
#Aditionally, the histogram isn't exactly normal (skewed left, somewhat lumpy).
#In the time plot, the variation doesn't stay the same across the historical data, particularly around 1994 and 1996/1997. There's another big dip in 2004.

#The forecast -- beer production has been generally trending downward over the years, so, while the seasonal naive plot captures the seasonality, it doesn't capture as much of the overall downward trend.

5.4 Repeat the previous exercise using the Australian Exports series from global_economy and the Bricks series from aus_production. Use whichever of NAIVE() or SNAIVE() is more appropriate in each case.

#australian exports
#since this data is yearly, I won't filter to just post-1992 data as with beer production
aus_econ <- global_economy |> filter (Country == "Australia") 

#this has generally trended upward with some small drops (the data is not seasonal - only includes years - and cannot be plotted using the seasonal naive method)
aus_econ |> autoplot(Exports)

# Naive
fit_econ <- aus_econ |> model(NAIVE(Exports))
# Look at the residuals
fit_econ |> gg_tsresiduals()

## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_line()`).

## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).

## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_bin()`).

## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_rug()`).

# Look a some forecasts
fit_econ |> forecast() |> autoplot(aus_econ)

#residuals - Only one line on the ACF plot goes slightly outside of the line, suggesting that the residuals qualify as noise. The histogram is a little bit lumpy, but close to normal. The time plot does have some big swings, particularly compared to smaller peaks and troughs, so the variation is generally not the same across historical data. 

#The naive forecast is less variable than the data has tended to be. Assuming the same value year-on-year doesn't quite work for a variable that's been generally trending upward and varying somewhat since 1960.

#As noted above, Bricks production has been more seasonal (and unlike exports, quarterly data is available), but it's not the most perfectly seasonal. 

#removing the NA values because they will cause issues later
bricks <- aus_production |> select(Quarter, Bricks) |> filter(!is.na(Bricks)) |> as_tsibble(index = Quarter)

bricks |> autoplot(Bricks)

# seasonal naive
fit_bricks <- bricks |> model(SNAIVE(Bricks))
# Look at the residuals
fit_bricks |> gg_tsresiduals()

## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_line()`).

## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_point()`).

## Warning: Removed 4 rows containing non-finite outside the scale range
## (`stat_bin()`).

## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_rug()`).

# Look a some forecasts
fit_bricks |> forecast() |> autoplot(bricks)

# Residuals -- All three plots suggest strongly that the residuals aren't just noise. The ACF plot has an ascending/descending pattern with almost all values outside the dotted line. The time plot has several big, irregular dips, and the histogram is skewed left. 

#Brick production has been somewhat cyclical, with big drops and gradual climbs over a period of years. The seasonal naive forecast doesn't really capture that aspect. 

#let's try it again just to see if the residuals plot looks better with naive

fit_bricks_naive <- bricks |> model(NAIVE(Bricks))
# Look at the residuals
fit_bricks_naive |> gg_tsresiduals()

## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_line()`).

## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).

## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_bin()`).

## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_rug()`).

#the residuals look even less like noise in the ACF plot, particularly every second and fourth plot, suggesting seasonality. This is not a better option.

5.7 For your retail time series (from Exercise 7 in Section 2.10):

Create a training dataset consisting of observations before 2011 using

aus_retail_3 <- aus_retail |> filter (Industry == "Cafes, restaurants and catering services") |> summarize(Turnover = sum(Turnover))

myseries_train <- aus_retail_3 |>
  filter(year(Month) < 2011)

Check that your data have been split appropriately by producing the following plot.

autoplot(aus_retail_3, Turnover) +
  autolayer(myseries_train, Turnover, colour = "red")

#the red overlay stops in 2011

Fit a seasonal naïve model using SNAIVE() applied to your training data (myseries_train).

fit <- myseries_train |>
  model(SNAIVE())

## Model not specified, defaulting to automatic modelling of the `Turnover`
## variable. Override this using the model formula.

#model automatically chooses turnover

Check the residuals.

fit |> gg_tsresiduals()

## Warning: Removed 12 rows containing missing values or values outside the scale range
## (`geom_line()`).

## Warning: Removed 12 rows containing missing values or values outside the scale range
## (`geom_point()`).

## Warning: Removed 12 rows containing non-finite outside the scale range
## (`stat_bin()`).

## Warning: Removed 12 rows containing missing values or values outside the scale range
## (`geom_rug()`).

#Do the residuals appear to be uncorrelated and normally distributed?
#The histogram looks close to normally distributed, with a slight skew to the right, but the time plot has big dips and peaks, particularly later on, showing that the variation of the residuals does not stay the same across time, and the ACF plot is outside the dotted line for all but 2 of 25, suggesting that the model missed a lot of patterns.

Produce forecasts for the test data

fc <- fit |>
  forecast(new_data = anti_join(aus_retail_3, myseries_train))

## Joining with `by = join_by(Month, Turnover)`

fc |> autoplot(aus_retail_3)

#The data is supposed to fall into the darker periwinkle forecasted range 80% of the time and the lighter part 95% of the time, but this is not the case. The overlap is far less.

Compare the accuracy of your forecasts against the actual values.

fit |> accuracy()

## # A tibble: 1 × 10
##   .model   .type       ME  RMSE   MAE   MPE  MAPE  MASE RMSSE  ACF1
##   <chr>    <chr>    <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 SNAIVE() Training  47.7  78.7  60.0  7.64  9.13     1     1 0.838

fc |> accuracy(aus_retail_3)

## # A tibble: 1 × 10
##   .model   .type    ME  RMSE   MAE   MPE  MAPE  MASE RMSSE  ACF1
##   <chr>    <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 SNAIVE() Test   453.  514.  454.  22.3  22.4  7.56  6.53 0.940

How sensitive are the accuracy measures to the amount of training data used?

This model used a lot of training data (monthly from the 80s up until 2011), and still isn’t very accurate. It doesn’t account for the upward overall turnover trend. I’ll break down some of the numbers here.

MAPE error rate went from 9.1 in training to 22.4 in the test set. The model is much less accurate on new data than on historical data. RMSE and MAE values are several times higher in the test set. The model isn’t keeping up with the new data points. A MASE of 7.56 means that the forecast error is 7.56 times more than in the testing data. The ACF in both tables shows there’s a lot of information in the residuals that the model doesn’t account for. The mean error rate shows the average deviation of forecasted or measured values from actual values, and it’s very high (453).

homework 3

Samantha Barbaro

2026-02-17