library(fpp3)
## Registered S3 method overwritten by 'tsibble':
## method from
## as_tibble.grouped_df dplyr
## ── Attaching packages ──────────────────────────────────────────── fpp3 1.0.1 ──
## ✔ tibble 3.2.1 ✔ tsibble 1.1.6
## ✔ dplyr 1.1.4 ✔ tsibbledata 0.4.1
## ✔ tidyr 1.3.1 ✔ feasts 0.4.1
## ✔ lubridate 1.9.3 ✔ fable 0.4.1
## ✔ ggplot2 3.5.1
## ── Conflicts ───────────────────────────────────────────────── fpp3_conflicts ──
## ✖ lubridate::date() masks base::date()
## ✖ dplyr::filter() masks stats::filter()
## ✖ tsibble::intersect() masks base::intersect()
## ✖ tsibble::interval() masks lubridate::interval()
## ✖ dplyr::lag() masks stats::lag()
## ✖ tsibble::setdiff() masks base::setdiff()
## ✖ tsibble::union() masks base::union()
library(ggplot2)
Exercises 5.1, 5.2, 5.3, 5.4 and 5.7
Exercise 5.1: Produce forecasts for the following series using whichever of NAIVE(y), SNAIVE(y) or RW(y ~ drift()) is more appropriate in each case.
Australian Population (global_economy)
Original data:
global_economy_aus <- global_economy |>
filter(Country == "Australia")
global_economy_aus |>
autoplot(Population) +
geom_point() +
labs(title= "Australian Population")
The data are annual, so there isn’t any seasonality and we won’t try the seasonal naive model.
Let’s test out the models:
# Experiment with all forecast models
pop_fit <- global_economy |>
filter(!is.na(Population), Country == "Australia") |> # just want data that contains population data
model(
Naive = NAIVE(Population),
Drift = RW(Population ~ drift()),
Mean = MEAN(Population)
)
pop_fc <- pop_fit |>
forecast(h = "5 years")
pop_fc |>
autoplot(global_economy, level = NULL) +
labs(title = "Australian Population",
y = "Population") +
guides(colour = guide_legend(title = "Forecast"))
The mean is clearly the worst option here since it sits far below the recent data. The drift method captures the steady upward trend in the population, so it looks like the best choice, and the final graph would be:
# Final
pop_fit <- global_economy |>
filter(!is.na(Population), Country == "Australia") |> # just want data that contains population data
model(
Drift = RW(Population ~ drift())
)
pop_fc <- pop_fit |>
forecast(h = "5 years")
pop_fc |>
autoplot(global_economy, level = NULL) +
labs(title = "Australian Population",
y = "Population") +
guides(colour = guide_legend(title = "Forecast"))
Bricks (aus_production)
# Experiment with all forecast models
brick_fit <- aus_production |>
filter(!is.na(Bricks)) |> # just want data that contains Bricks data
model(
Seasonal_naive = SNAIVE(Bricks),
Naive = NAIVE(Bricks),
Drift = RW(Bricks ~ drift()),
Mean = MEAN(Bricks)
)
brick_fc <- brick_fit |>
forecast(h = "5 years")
brick_fc |>
autoplot(aus_production, level = NULL) +
labs(title = "Clay brick production in Australia",
y = "Millions of bricks") +
guides(colour = guide_legend(title = "Forecast"))
## Warning: Removed 20 rows containing missing values or values outside the scale range
## (`geom_line()`).
Seasonal naive is the one that makes the most sense here, as it is the only one of these methods that accounts for seasonality.
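As a quick numerical check of that claim, the STL-based seasonal strength feature from feasts can be computed; this is just a sketch, and a value near 1 indicates strong seasonality.
# STL features: seasonal_strength_year near 1 means a strong seasonal pattern
aus_production |>
  filter(!is.na(Bricks)) |>
  features(Bricks, feat_stl) |>
  select(trend_strength, seasonal_strength_year)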
So the final forecast would be:
# Final forecast model
brick_fit <- aus_production |>
filter(!is.na(Bricks)) |> # just want data that contains Bricks data
model(
Seasonal_naive = SNAIVE(Bricks)
)
brick_fc <- brick_fit |>
forecast(h = "5 years")
brick_fc |>
autoplot(aus_production, level = NULL) +
labs(title = "Clay brick production in Australia",
y = "Millions of bricks") +
guides(colour = guide_legend(title = "Forecast"))
## Warning: Removed 20 rows containing missing values or values outside the scale range
## (`geom_line()`).
NSW Lambs (aus_livestock)
Original data:
aus_livestock_nsw_lambs <- aus_livestock |>
filter(Animal == "Lambs", State == "New South Wales")
aus_livestock_nsw_lambs |>
autoplot(Count) +
labs(title= "NSW Lambs")
This series is clearly seasonal, so the seasonal naive method is likely the best option.
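A seasonal plot makes the within-year pattern easier to judge before committing to a model; this is just a quick visual check.
# Seasonal plot: each line is one year, so a repeating shape indicates seasonality
aus_livestock_nsw_lambs |>
  gg_season(Count) +
  labs(title = "Seasonal plot: NSW lambs", y = "Number of animals slaughtered")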
# Experiment with all forecast models
aus_livestock_nsw_lambs_fit <- aus_livestock_nsw_lambs |>
model(
Seasonal_naive = SNAIVE(Count),
Naive = NAIVE(Count),
Drift = RW(Count ~ drift()),
Mean = MEAN(Count)
)
aus_livestock_nsw_lambs_fc <- aus_livestock_nsw_lambs_fit |>
forecast(h = "5 years")
aus_livestock_nsw_lambs_fc |>
autoplot(aus_livestock_nsw_lambs, level = NULL) +
labs(title = "NSW Lambs",
y = "Number of animals slaughtered") +
guides(colour = guide_legend(title = "Forecast"))
The seasonal_naive looks to be the best option, so the final graph would be:
aus_livestock_nsw_lambs_fit <- aus_livestock_nsw_lambs |>
model(
Seasonal_naive = SNAIVE(Count)
)
aus_livestock_nsw_lambs_fc <- aus_livestock_nsw_lambs_fit |>
forecast(h = "5 years")
aus_livestock_nsw_lambs_fc |>
autoplot(aus_livestock_nsw_lambs, level = NULL) +
labs(title = "NSW Lambs",
y = "Number of animals slaughtered") +
guides(colour = guide_legend(title = "Forecast"))
Household wealth (hh_budget)
Original data:
hh_budget |>
autoplot(Wealth) +
labs(title= "Household wealth")
These data are annual, so they are not seasonal; we’ll drop the seasonal naive model from the comparison:
# Experiment with all forecast models
hh_budget_fit <- hh_budget |>
model(
Naive = NAIVE(Wealth),
Drift = RW(Wealth ~ drift()),
Mean = MEAN(Wealth)
)
hh_budget_fc <- hh_budget_fit |>
forecast(h = "5 years")
hh_budget_fc |>
autoplot(hh_budget, level = NULL) +
labs(title = "NSW Lambs",
y = "Number of animals slaughtered") +
guides(colour = guide_legend(title = "Forecast"))
The drift looks to be the best option here. The naive is not a bad second option either.
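As a rough numerical cross-check of that impression, the in-sample (training) accuracy of the three models can be compared; this is not a proper hold-out comparison, just a sanity check.
# Training accuracy per country; a lower RMSE/MAE supports the visual ranking
hh_budget_fit |>
  accuracy() |>
  select(Country, .model, RMSE, MAE) |>
  arrange(Country, RMSE)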
The final graph would be:
hh_budget_fit <- hh_budget |>
model(
Drift = RW(Wealth ~ drift())
)
hh_budget_fc <- hh_budget_fit |>
forecast(h = "5 years")
hh_budget_fc |>
autoplot(hh_budget, level = NULL) +
labs(title = "NSW Lambs",
y = "Number of animals slaughtered") +
guides(colour = guide_legend(title = "Forecast"))
Australian takeaway food turnover (aus_retail)
Original data:
aus_retail_takeaway_food <- aus_retail |>
filter(Industry == "Takeaway food services")
aus_retail_takeaway_food |>
autoplot(Turnover) +
labs(title= "Australian takeaway food turnover")
# Experiment with all forecast models
aus_retail_takeaway_food_fit <- aus_retail_takeaway_food |>
model(
Seasonal_naive = SNAIVE(Turnover),
Naive = NAIVE(Turnover),
Drift = RW(Turnover ~ drift()),
Mean = MEAN(Turnover)
)
aus_retail_takeaway_food_fc <- aus_retail_takeaway_food_fit |>
forecast(h = "5 years")
aus_retail_takeaway_food_fc |>
autoplot(aus_retail_takeaway_food, level = NULL) +
facet_grid(vars(State), scales = "free_y") +
labs(title= "Australian takeaway food turnover") +
guides(colour = guide_legend(title = "Forecast"))
Drift is not too bad here, but since these graphs show seasonality, the
seasonal naive would most likely be best.
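To back this up with numbers rather than just the plot, here is a small time-series cross-validation sketch for a single state (Victoria, chosen arbitrarily; the .init and .step values are arbitrary choices too).
# Stretching the tsibble creates successive training sets; accuracy() then
# averages the out-of-sample errors over all folds
aus_retail_takeaway_food |>
  filter(State == "Victoria") |>
  stretch_tsibble(.init = 200, .step = 12) |>
  model(
    Seasonal_naive = SNAIVE(Turnover),
    Drift = RW(Turnover ~ drift())
  ) |>
  forecast(h = 12) |>
  accuracy(aus_retail_takeaway_food) |>
  select(.model, RMSE, MAE)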
So the final graph would be:
aus_retail_takeaway_food_fit <- aus_retail_takeaway_food |>
model(
Seasonal_naive = SNAIVE(Turnover)
)
aus_retail_takeaway_food_fc <- aus_retail_takeaway_food_fit |>
forecast(h = "5 years")
aus_retail_takeaway_food_fc |>
autoplot(aus_retail_takeaway_food, level = NULL) +
facet_grid(vars(State), scales = "free_y") +
labs(title= "Australian takeaway food turnover") +
guides(colour = guide_legend(title = "Forecast"))
Below is some of my own experimenting.
Some of the original series show a lot of variance, so let’s see what a Box-Cox transformation does, just for Victoria:
# Experimentation code
aus_retail_takeaway_food_cap <- aus_retail_takeaway_food |>
filter(State == "Victoria")
aus_retail_takeaway_food_cap |>
autoplot(Turnover)
lambda_aus_retail_takeaway_food <- aus_retail_takeaway_food_cap |>
features(Turnover, features = guerrero) |>
pull(lambda_guerrero)
# lambda_aus_retail_takeaway_food
aus_retail_takeaway_food_cap |>
autoplot(box_cox(Turnover, lambda_aus_retail_takeaway_food))
fc <- aus_retail_takeaway_food_cap |>
model(SNAIVE(box_cox(Turnover, lambda_aus_retail_takeaway_food))) |>
forecast(h = 50) |>
mutate(.median = median(Turnover))
fc |>
autoplot(aus_retail_takeaway_food_cap |> filter(!is.na(Turnover)), level = 80) +
geom_line(aes(y = .median), data = fc, linetype = 2, col = "blue")
Seasonal naive still looks like a good model here. The mean and median forecasts are close to one another, with the mean sitting slightly above the median; that gap is the bias adjustment that comes from back-transforming the forecast distribution.
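To see that numerically rather than just on the plot, here is a quick look at the gap between the back-transformed mean and the median over the first few horizons (a sketch using the fc object created above).
# The bias-adjusted mean should sit slightly above the back-transformed median
fc |>
  as_tibble() |>
  mutate(mean_minus_median = .mean - .median) |>
  select(Month, .mean, .median, mean_minus_median) |>
  head()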
Exercise 5.2: Use the Facebook stock price (data set gafa_stock) to do the following: produce a time plot of the series, produce forecasts using the drift method and plot them, show that the forecasts are identical to extending the line drawn between the first and last observations, and try some of the other benchmark functions.
fb_stock <- gafa_stock |>
filter(Symbol == "FB") |>
mutate(trading_day = row_number()) |>
update_tsibble(index = trading_day, regular = TRUE)
fb_stock |>
autoplot(Close)
fb_stock |>
model(RW(Close ~ drift())) |>
forecast(h = 42) |>
autoplot(fb_stock, level = NULL) +
labs(title = "Facebook closing stock price", y = "$US") +
guides(colour = guide_legend(title = "Forecast"))
Let’s draw the line through the first and last observations using geom_abline(); fitting lm() through just those two points gives its intercept and slope:
# grab the first and last observation
last_row <- tail(fb_stock, n = 1)
first_row <- head(fb_stock, n = 1)
# create the data points
x <- c(first_row$trading_day, last_row$trading_day)
y <- c(first_row$Close, last_row$Close)
# create a data frame
data <- data.frame(x, y)
# create a linear model
regression_line <- lm(formula = y ~ x,
data=data)
# get the parameters
coeff <- coefficients(regression_line)
intercept <- coeff[1]
slope <- coeff[2]
fb_stock |>
model(RW(Close ~ drift())) |>
forecast(h = 42) |>
autoplot(fb_stock, level = NULL) +
labs(title = "Facebook closing stock price", y = "$US") +
guides(colour = guide_legend(title = "Forecast")) +
geom_abline(intercept = intercept, slope = slope, color="red", linetype="dashed", linewidth=1.5)
As you can see, the red dashed line runs right along the blue forecast line, showing that the drift forecast is identical to extending the line drawn between the first and last observations.
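The same thing can be checked numerically; this sketch reuses first_row and last_row from above, since the estimated drift term should equal the slope of the line joining the first and last observations, (y_T - y_1) / (T - 1).
# Estimated drift coefficient from the fitted model
fb_stock |>
  model(Drift = RW(Close ~ drift())) |>
  tidy() |>
  select(term, estimate)
# Slope of the line between the first and last observations
(last_row$Close - first_row$Close) /
  (last_row$trading_day - first_row$trading_day)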
fb_stock_fit <- fb_stock |>
model(
Seasonal_naive = SNAIVE(Close),
Naive = NAIVE(Close),
Mean = MEAN(Close)
)
## Warning: 1 error encountered for Seasonal_naive
## [1] Non-seasonal model specification provided, use RW() or provide a different lag specification.
fb_stock_fit_fc <- fb_stock_fit |>
forecast(h = 42)
fb_stock_fit_fc |>
autoplot(fb_stock, level = NULL) +
labs(title = "Facebook closing stock price", y = "$US") +
guides(colour = guide_legend(title = "Forecast"))
## Warning: Removed 42 rows containing missing values or values outside the scale range
## (`geom_line()`).
The seasonal naive model failed to fit, which makes sense: after re-indexing by trading day the series has no defined seasonal period, and there is no clear seasonality to exploit anyway. The naive forecast is reasonable, but the drift method is probably best because it captures the overall trend; this series shows sustained increases and decreases, so drift should be slightly more accurate than the plain naive. The mean is far too low and ignores the trend entirely, so it is the worst option here.
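As a rough numerical tiebreaker (training accuracy only, so take it with a grain of salt), the benchmark methods can be compared directly, adding drift back in alongside the models fitted above.
# In-sample accuracy of the benchmark methods on the Facebook series
fb_stock |>
  model(
    Naive = NAIVE(Close),
    Drift = RW(Close ~ drift()),
    Mean = MEAN(Close)
  ) |>
  accuracy() |>
  select(.model, RMSE, MAE)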
Exercise 5.3: Apply a seasonal naive method to the quarterly Australian beer production data from 1992. Check if the residuals look like white noise, and plot the forecasts.
# Extract data of interest
recent_production <- aus_production |>
filter(year(Quarter) >= 1992)
# Define and estimate a model
fit <- recent_production |> model(SNAIVE(Beer))
# Look at the residuals
fit |> gg_tsresiduals()
## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 4 rows containing non-finite outside the scale range
## (`stat_bin()`).
# Look at some forecasts
fit |> forecast() |> autoplot(recent_production)
What do you conclude?
Apart from a spike around 1996 Q1, the time plot of the residuals shows fairly constant variance over time. The ACF of the residuals shows little autocorrelation at most lags, and the histogram looks roughly normal.
Now let’s check the residuals for white noise. Since the series is seasonal, we use l = 2m, where m is the seasonal period; with quarterly data m = 4, so l = 2 × 4 = 8.
# Box-Pierce test
augment(fit) |> features(.innov, box_pierce, lag = 8)
# Ljung-Box test
augment(fit) |> features(.innov, ljung_box, lag = 8)
Both tests give a p-value below 0.05, so we reject the null hypothesis that the residuals are white noise: the remaining autocorrelation is unlikely to be due to chance, and there may be information left in the residuals that the forecasts are not using.
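Just to make the decision rule explicit in code (a small sketch), the Ljung-Box p-value can be pulled out and compared against the 5% threshold directly.
# TRUE would mean the residuals are consistent with white noise at the 5% level
augment(fit) |>
  features(.innov, ljung_box, lag = 8) |>
  mutate(consistent_with_white_noise = lb_pvalue >= 0.05)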
Exercise 5.4: Repeat the previous exercise using the Australian Exports series from global_economy and the Bricks series from aus_production. Use whichever of NAIVE() or SNAIVE() is more appropriate in each case.
Australian Exports (global_economy)
Original data:
global_economy_export_aus <- global_economy |>
filter(Country == "Australia")
global_economy_export_aus |>
autoplot(Exports) +
geom_point() +
labs(title= "Australian Exports")
There does not seem to be any seasonality here, so let’s use NAIVE().
# Define and estimate a model
fit <- global_economy_export_aus |> model(NAIVE(Exports))
# Look at the residuals
fit |> gg_tsresiduals()
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_bin()`).
# Look at some forecasts
fit |> forecast() |> autoplot(global_economy_export_aus)
# Box-Pierce test
augment(fit) |> features(.innov, box_pierce, lag = 10)
# Ljung-Box test
augment(fit) |> features(.innov, ljung_box, lag = 10)
For both tests, the p-value is larger than 0.05, so the autocorrelation in the residuals could easily be due to chance. These residuals are not easily distinguished from white noise; we fail to reject the white noise hypothesis.
Bricks (aus_production)
Original data:
autoplot(aus_production, Bricks) + geom_point()
## Warning: Removed 20 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 20 rows containing missing values or values outside the scale range
## (`geom_point()`).
There looks to be seasonality here, so let’s use SNAIVE():
# Experiment with all forecast models
brick_fit <- aus_production |>
filter(!is.na(Bricks)) |> # just want data that contains Bricks data
model(
Seasonal_naive = SNAIVE(Bricks)
)
# Look at the residuals
brick_fit |> gg_tsresiduals()
## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 4 rows containing non-finite outside the scale range
## (`stat_bin()`).
# Look at some forecasts
brick_fit |> forecast() |> autoplot(aus_production)
## Warning: Removed 20 rows containing missing values or values outside the scale range
## (`geom_line()`).
The time plot of the residuals shows large and changing variance, which means this forecast model needs some work. Additionally, the ACF shows strong autocorrelation, and the histogram is left-skewed with some possible outliers. The residuals fail the checks of constant variance and normality.
Now let’s check the residuals for white noise. Since the series is seasonal, we again use l = 2m; with quarterly data m = 4, so l = 2 × 4 = 8.
# Box-Pierce test
augment(brick_fit) |> features(.innov, box_pierce, lag = 8)
# Ljung-Box test
augment(brick_fit) |> features(.innov, ljung_box, lag = 8)
For both tests, the p-value is less than 0.05 (essentially zero), so the remaining autocorrelation is very unlikely to be due to chance. These residuals are easily distinguished from white noise; we reject the white noise hypothesis.
Exercise 5.7: For my retail time series (from the earlier retail exercise), create a training dataset consisting of observations before 2011, check the split, fit a seasonal naive model, and compare its accuracy on the training and test data.
set.seed(100)
myseries <- aus_retail |>
filter(`Series ID` == sample(aus_retail$`Series ID`,1))
myseries_train <- myseries |>
filter(year(Month) < 2011)
autoplot(myseries, Turnover) +
autolayer(myseries_train, Turnover, colour = "red")
Fit a seasonal naive model using SNAIVE() applied to the training data (myseries_train).
fit <- myseries_train |>
model(SNAIVE(Turnover))
# myseries_train
fit |> gg_tsresiduals()
## Warning: Removed 12 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 12 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 12 rows containing non-finite outside the scale range
## (`stat_bin()`).
Do the residuals appear to be uncorrelated and normally distributed?
No, the time plot of the residuals shows a lot of variance, and the ACF shows strong correlation. Additionally, the histogram does not show a normal distribution as the distribution is right skewed with an outlier.
fc <- fit |>
forecast(new_data = anti_join(myseries, myseries_train)) # test data
## Joining with `by = join_by(State, Industry, `Series ID`, Month, Turnover)`
fc |> autoplot(myseries)
fit |> accuracy() # myseries_train
fc |> accuracy(myseries) # test data
The errors on the test data are quite large: the test RMSE is roughly six times the training RMSE. You can also see this visually, as the forecasts sit well below the actual test values.
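That ratio can be computed directly from the two accuracy tables (a quick sketch; it assumes a single model per table, as is the case here).
# Ratio of test RMSE to training RMSE for the seasonal naive model
test_rmse <- fc |> accuracy(myseries) |> pull(RMSE)
train_rmse <- fit |> accuracy() |> pull(RMSE)
test_rmse / train_rmse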
Let’s add another 5 years of training:
set.seed(100)
myseries <- aus_retail |>
filter(`Series ID` == sample(aus_retail$`Series ID`,1))
myseries_train <- myseries |>
filter(year(Month) < 2016)
fit <- myseries_train |>
model(SNAIVE(Turnover))
fit |> gg_tsresiduals()
## Warning: Removed 12 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 12 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 12 rows containing non-finite outside the scale range
## (`stat_bin()`).
fc <- fit |>
forecast(new_data = anti_join(myseries, myseries_train)) # test data
## Joining with `by = join_by(State, Industry, `Series ID`, Month, Turnover)`
fc |> autoplot(myseries)
fit |> accuracy() # myseries_train
fc |> accuracy(myseries) # test data
Adding just five more years of training data cut the test errors by more than half. Visually, the forecasts now track the actual values much more closely.