Homework 3

Author

Naomi Buell

pacman::p_load(
  tidyverse,
  janitor,
  fpp3,
  USgas,
  readxl,
  feasts,
  ggplot2,
  scales,
  seasonal,
  lubridate,
  skimr,
  forecast,
  zoo
)
set.seed(9219)

5.1

  1. Produce forecasts for the following series using whichever of NAIVE(y), SNAIVE(y) or RW(y ~ drift()) is more appropriate in each case:
  • Australian Population (global_economy)

First, I explore the population data to assess which model to use.

        #| label: global-econ
        #| warning: false
        #| message: false

        # Convert population to millions so the large values on the y-axis are easier to read
        australian_pop <- global_economy |> 
          filter(Country == "Australia") |> 
          mutate(Population = Population / 1000000) |>
          select(Population) |> 
          drop_na()

        australian_pop |>
          autoplot(Population) +
          theme_classic() +
          theme(legend.position = "none") +
          labs(title = "Australian population appears to increase linearly.",
               x = "Year",
               y = "Population (millions)") +
          scale_x_continuous(breaks = seq(1950, 2100, 10))

Since the Australian population rise appears to be linear, I will use the drift method.

        #| label: aus-pop-model
        #| warning: false
        #| message: false

        # Fit the model
        australian_pop_model <- australian_pop |>
          model(RW(Population ~ drift()))

        # Forecast the next 10 years
        australian_pop_fc <- australian_pop_model |>
          forecast(h = 10)

        # Plot the forecast
        australian_pop_fc |>
          autoplot() +
          autolayer(australian_pop, Population) +
          labs(
            title = "Australian Population Forecast",
            subtitle = "Drift method",
            x = "Year",
            y = "Population (millions)"
          ) +
          theme_classic() +
          scale_x_continuous(breaks = seq(1950, 2100, 10))

  • Bricks (aus_production)

First, I explore the bricks data to assess what model to use.

        #| label: bricks-exploratory
        #| warning: false
        #| message: false

        # get bricks
        bricks <- aus_production |>
          select(Bricks)  |>
          drop_na()

        # Autoplot
        bricks |> autoplot(Bricks) +
          theme_classic() +
          theme(axis.line = element_blank()) +
          labs(
            title = "Brick production trends upwards until the 1980s, then drops off.",
            subtitle = "Cyclical booms and crashes in 1975, 1983, 1991, 1996, and 2001.",
            x = "",
            y = "Clay brick production (in millions of bricks)"
          ) +
          scale_x_yearquarter(breaks = seq(yearquarter("1950 Q1"), yearquarter("2010 Q1"), by = 20))

        # Seasonal plot
        bricks |> gg_season(Bricks) +
          theme_classic() +
          theme(axis.line = element_blank()) +
          labs(title = "Production is seasonal, starting low in Q1 and peaking in Q3.", x = "", y = "Clay brick production (in millions of bricks)")

        # Subseries
        bricks |> gg_subseries(Bricks) +
          theme_classic() +
          theme(axis.line = element_blank()) +
          labs(title = "Bricks are produced seasonally, peaking in Q3.", x = "", y = "Production (millions of bricks)")

Since Bricks appear seasonal, I will use the seasonal naive method.

        #| label: bricks-model
        #| warning: false
        #| message: false

        # Fit the model
        bricks_model <- bricks |>
          model(SNAIVE(Bricks))

        # Forecast the next 10 quarters
        bricks_fc <- bricks_model |>
          forecast(h = 10)

        # Plot the forecast
        bricks_fc |>
          autoplot() +
          autolayer(bricks, Bricks) +
          labs(
            title = "Brick Production Forecast",
            subtitle = "SNAIVE method",
            x = "Year",
            y = "Clay brick production (in millions of bricks)"
          ) +
          theme_classic() +
          scale_x_yearquarter(breaks = seq(yearquarter("1950 Q1"), yearquarter("2030 Q1"), by = 20))

  • NSW Lambs (aus_livestock)

First, I explore the lambs data to assess what model to use.

        #| label: nsw-lambs-exploratory
        #| message: false
        #| warning: false

        # get lambs
        lambs <- aus_livestock |>
          filter(Animal == "Lambs", State == "New South Wales") |>
          select(-Animal) |>
          mutate(Count = Count / 10000)  |>
          drop_na()

        # plot lambs
        lambs |>
          autoplot(Count) +
          theme_classic() +
          labs(title = "Lambs slaughtered in New South Wales", x = "Year", y = "10k lambs slaughtered")

        # Decompose
        lambs |>
          model(x11 = X_13ARIMA_SEATS(Count ~ x11())) |>
          components() |>
          autoplot() +
          labs(
            title = "X-11 decomposition of lambs doesn't make it clear which forecast method to use.",
            subtitle = "Seasonality has variance over time, and trend is not linear.",
            x = "Year",
            y = "10k lambs slaughtered"
          ) +
          theme_classic() +
          scale_x_yearmonth(breaks = "2 year", labels = label_date("%Y")) +
          theme(axis.text.x = element_text(angle = 45, hjust = 1))

It’s unclear from the trends in the data which forecast method will fit best, so I will plot and test the accuracy of all three.

        #| label: lambs-model
        #| warning: false
        #| message: false

        # Fit the model
        lambs_model <- lambs |>
          model(
            `Naïve` = NAIVE(Count),
            `SNAIVE` = SNAIVE(Count),
            `RW` = RW(Count ~ drift())
          )

        # Forecast the next 10 months
        lambs_fc <- lambs_model |>
          forecast(h = 10)

        # Function that reports, for each accuracy metric, the lowest value and the model achieving it
        calculate_accuracy <- function(model) {
          accuracy_df <- model |>
            accuracy() |>
            select(.model, MAE, RMSE, MAPE, MASE, RMSSE) |>
            pivot_longer(
              cols = where(is.numeric),
              names_to = "score",
              values_to = "value"
            ) |>
            group_by(score) |>
            summarize(
              min_value = value[which.min(value)], # lowest value of each metric
              model = .model[which.min(value)]     # model achieving that minimum
            )

          print(accuracy_df)
        }

        # calculate best models for accuracy
        calculate_accuracy(lambs_model)
# A tibble: 5 × 3
  score min_value model
  <chr>     <dbl> <chr>
1 MAE       4.01  RW   
2 MAPE      9.96  RW   
3 MASE      0.958 RW   
4 RMSE      5.03  RW   
5 RMSSE     0.910 RW   

Based on the accuracy measures, the drift method appears to have the best accuracy scores. I plot the drift forecast below.

        #| label: lambs-plot-model
        #| warning: false
        #| message: false

        # Plot the forecast
        lambs_fc |>
          filter(.model == "RW")  |>
          autoplot() +
          autolayer(lambs, Count) +
          labs(
            title = "New South Wales Lambs Slaughtered Forecast",
            subtitle = "Drift method",
            x = "Year",
            y = "10k lambs slaughtered"
          ) +
          theme_classic() +
          scale_x_yearmonth(breaks = "5 year", labels = label_date("%Y")) +
          theme(axis.text.x = element_text(angle = 45, hjust = 1))
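
Training-set accuracy can favour whichever method happens to track the history most closely, so as an additional check one could compare the three methods out of sample with rolling-origin cross-validation. The sketch below is not part of the assignment; the initial window of 36 months, the 12-month step, and the 12-month horizon are arbitrary choices of mine.

        # Hedged sketch: rolling-origin (time series) cross-validation of the benchmarks.
        # The window sizes below are arbitrary assumptions, not part of the assignment.
        lambs |>
          stretch_tsibble(.init = 36, .step = 12) |>
          model(
            `Naïve` = NAIVE(Count),
            `SNAIVE` = SNAIVE(Count),
            `RW` = RW(Count ~ drift())
          ) |>
          forecast(h = 12) |>
          accuracy(lambs) |>
          select(.model, RMSE, MAE, MAPE)

Folds near the end of the series forecast beyond the available data, so accuracy() may warn about incomplete out-of-sample values; the comparison is still usable.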

  • Household wealth (hh_budget)

First, I plot the data. Then, I compare the accuracy scores of all three forecasting methods to determine which will be the best model.

        #| label: hh-wealth-explor
        #| message: false
        #| warning: false

        # get hh
        wealth <- hh_budget |>
          select(Wealth)  |>
          drop_na()

        # plot hh
        wealth |>
          autoplot(Wealth) +
          theme_classic() +
          labs(title = "Household wealth",
               x = "Year",
               y = "Wealth as a percentage of net disposable income")

        # Fit the model
        wealth_model <- wealth |>
          model(
            `Naïve` = NAIVE(Wealth),
            `SNAIVE` = SNAIVE(Wealth),
            `RW` = RW(Wealth ~ drift())
          )
Warning: 4 errors (1 unique) encountered for SNAIVE
[4] Non-seasonal model specification provided, use RW() or provide a different lag specification.
        # get accuracy
        calculate_accuracy(wealth_model)
# A tibble: 5 × 3
  score min_value model
  <chr>     <dbl> <chr>
1 MAE      12.1   RW   
2 MAPE      2.28  RW   
3 MASE      0.761 RW   
4 RMSE     16.4   RW   
5 RMSSE     0.877 RW   

Because the data are annual, the SNAIVE model could not be fit (hence the warning above), so the comparison is effectively between the naïve and drift methods. Of these, the drift method has the best accuracy scores.

        #| label: hh-forecast
        #| message: false
        #| warning: false

        # Forecast the next 10 years
        wealth_fc <- wealth_model |>
          forecast(h = 10)

        # Plot the forecast
        wealth_fc |>
          filter(.model == "RW")  |>
          autoplot() +
          autolayer(wealth, Wealth) +
          labs(title = "Household wealth forecast", x = "Year", y = "Wealth as a percentage of net disposable income") +
          theme_classic()          

  • Australian takeaway food turnover (aus_retail)

First, I plot the data. Then, I compare the accuracy scores of all three forecasting methods to determine which will be the best model.

        #| label: aus-takeaway-explor
        #| message: false
        #| warning: false

        # get takeaway food turnover
        takeaway <- aus_retail |>
          filter(Industry == "Takeaway food services") |>
          select(-Industry) |>
          drop_na()

        # plot takeaway food turnover
        takeaway |>
          autoplot(Turnover) +
          theme_classic() +
          labs(title = "Australian Takeaway Food Turnover", x = "Year", y = "Turnover in million AUD")

        # Fit the model
        takeaway_model <- takeaway |>
          model(
            `Naïve` = NAIVE(Turnover),
            `SNAIVE` = SNAIVE(Turnover),
            `RW` = RW(Turnover ~ drift())
          )

        # get accuracy
        calculate_accuracy(takeaway_model)
# A tibble: 5 × 3
  score min_value model
  <chr>     <dbl> <chr>
1 MAE       0.666 Naïve
2 MAPE      5.29  RW   
3 MASE      0.410 Naïve
4 RMSE      0.950 RW   
5 RMSSE     0.399 RW   

The drift method has the best MAPE, RMSE, and RMSSE, while the naïve method has the best MAE and MASE. Since the drift method wins on the majority of the measures, I plot the drift forecast below.

        #| label: aus-takeaway-forecast
        #| message: false
        #| warning: false

        # Forecast the next 10 months
        takeaway_model |>
          forecast(h = 10) |>
          filter(.model == "RW")  |>
          autoplot() +
          autolayer(takeaway, Turnover) +
          labs(title = "Australian Takeaway Food Turnover Forecast", x = "Year", y = "Turnover in million AUD") +
          theme_classic()

5.2

  1. Use the Facebook stock price (data set gafa_stock) to do the following:

  a. Produce a time plot of the series.

        #| label: time-plot
        #| message: false
        #| warning: false

        # get fb closing prices
        fb <- gafa_stock |>
          filter(Symbol == "FB") |>
          select(Date, Close) |>
          # Re-index based on trading days
          mutate(day = row_number()) |>
          # Divide day by the number of trading days in a year to plot years since 2014-01-02
          mutate(years_since = (day - 1) / 252) |>
          update_tsibble(index = years_since, regular = TRUE)

        # Get the minimum date in the tsibble
        min_date <- min(fb$Date)

        # plot fb
        fb |>
          autoplot(Close) +
          theme_classic() +
          labs(title = "Time plot of Facebook stock price",
               y = "Closing price",
               x = paste("Years since", min_date))

  b. Produce forecasts using the drift method and plot them.
        #| label: forecast-drift-fb
        #| message: false
        #| warning: false

        # Fit the model
        fb_model <- fb |>
          model(RW = RW(Close ~ drift()))

        # Forecast the next year of trading days
        fb_fc <- fb_model |> forecast(h = 252)

        # Plot the forecast
        plot <- fb_fc |>
          autoplot() +
          autolayer(fb, Close) +
          labs(title = "Forecast of Facebook stock price using Drift Method",
               x = paste("Years since", min_date),
               y = "Closing price") +
          theme_classic() +
          scale_x_continuous(breaks = seq(0, max(fb$years_since) + 1, 1))

        plot

  c. Show that the forecasts are identical to extending the line drawn between the first and last observations.
        plot +
          # plot straight line through first and last observations
          geom_abline(
            # get slope
            slope = (fb$Close[nrow(fb)] - fb$Close[1]) / (max(fb$years_since) - min(fb$years_since)),
            # get intercept
            intercept = fb$Close[1] - (fb$Close[nrow(fb)] - fb$Close[1]) / (max(fb$years_since) - min(fb$years_since)) * min(fb$years_since),
            color = "red",
            linetype = "dotted"
          )

The dotted red line through the first and last observations overlaps exactly with the drift forecast.
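
As a further numeric check (a small sketch of mine that re-derives the point forecasts by hand), the h-step drift forecast equals the last observation plus h times the average historical change, which is exactly the line through the first and last observations:

        # Hedged check: drift point forecasts are y_T + h * (y_T - y_1) / (T - 1),
        # i.e. the straight line through the first and last observations, extended forward.
        T_obs <- nrow(fb)
        avg_change <- (fb$Close[T_obs] - fb$Close[1]) / (T_obs - 1)
        manual_fc <- fb$Close[T_obs] + (1:252) * avg_change
        all.equal(manual_fc, as.numeric(fb_fc$.mean))

If the drift model is fitted as above, this comparison should return TRUE (up to floating-point tolerance).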

  d. Try using some of the other benchmark functions to forecast the same data set. Which do you think is best? Why?
        # Fit the model
        fb_models <- fb |>
          model(
            `Naïve` = NAIVE(Close),
            `Mean` = MEAN(Close),
            `RW` = RW(Close ~ drift())
          )

        # get accuracy
        calculate_accuracy(fb_models)
# A tibble: 5 × 3
  score min_value model
  <chr>     <dbl> <chr>
1 MAE       1.46  RW   
2 MAPE      1.26  RW   
3 MASE      0.998 RW   
4 RMSE      2.41  RW   
5 RMSSE     1.00  RW   

I compare the naïve, mean, and drift methods; because the series is indexed by trading day rather than a seasonal period, the seasonal naïve method does not apply. The drift method (RW) has the lowest MAE, MAPE, MASE, RMSE, and RMSSE of the three benchmarks, so I think the drift method is best.
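
Because all three benchmarks are fitted and scored on the same data above, a holdout comparison would be a fairer test. The sketch below re-indexes the series by trading day and holds out the last 60 trading days; both the day index and the 60-day holdout are assumptions of mine, not part of the exercise.

        # Hedged sketch: out-of-sample comparison of the benchmark methods.
        # The trading-day index and the 60-day holdout are arbitrary assumptions.
        fb_days <- gafa_stock |>
          filter(Symbol == "FB") |>
          mutate(day = row_number()) |>
          update_tsibble(index = day, regular = TRUE)

        fb_days |>
          filter(day <= max(day) - 60) |>
          model(
            `Naïve` = NAIVE(Close),
            `Mean` = MEAN(Close),
            `RW` = RW(Close ~ drift())
          ) |>
          forecast(h = 60) |>
          accuracy(fb_days) |>
          select(.model, RMSE, MAE, MAPE)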

5.3

  1. Apply a seasonal naïve method to the quarterly Australian beer production data from 1992. Check if the residuals look like white noise, and plot the forecasts. The following code will help.
    #| label: q3-help
    #| warning: false

    # Extract data of interest
    recent_production <- aus_production |>
      filter(year(Quarter) >= 1992)
    # Define and estimate a model
    fit <- recent_production |> model(SNAIVE(Beer))
    # Look at the residuals
    fit |> gg_tsresiduals()
Warning: Removed 4 rows containing missing values or values outside the scale range
(`geom_line()`).
Warning: Removed 4 rows containing missing values or values outside the scale range
(`geom_point()`).
Warning: Removed 4 rows containing non-finite outside the scale range
(`stat_bin()`).

    # Look at some forecasts
    fit |> forecast() |> autoplot(recent_production) 

What do you conclude?

The mean of the residuals appears close to zero in both the time plot and the histogram, there is no obvious pattern in the residual series, and the variance appears constant (homoskedastic). The histogram looks roughly normal, and the ACF shows no significant autocorrelation apart from a single spike, so the residuals resemble white noise and the seasonal naïve forecasts appear reasonable. The forecast also lines up well with the seasonality in the data.
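
A more formal check of the white-noise claim is a Ljung-Box test on the innovation residuals. This is a sketch rather than part of the exercise; lag = 8 (two seasonal periods for quarterly data) is a common rule-of-thumb choice, not prescribed here.

    # Hedged sketch: Ljung-Box test on the SNAIVE innovation residuals.
    # lag = 8 (two years of quarters) is a conventional choice, not required by the exercise.
    fit |> augment() |> features(.innov, ljung_box, lag = 8)

A large p-value would be consistent with the residuals being white noise.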

5.4

  1. Repeat the previous exercise using the Australian Exports series from global_economy and the Bricks series from aus_production. Use whichever of NAIVE() or SNAIVE() is more appropriate in each case.

For the Australian Exports, I first plot the data to determine whether it has strong seasonality.

    #| label: aus-exports-exploratory
    #| message: false

    # Extract data of interest
    aus_exports <- global_economy |>
      filter(Country == "Australia") |>
      select(Exports)

    # Plot
    aus_exports |>
      autoplot(Exports) +
      theme_classic() +
      labs(title = "Australian Exports", x = "Year", y = "Exports of goods and services (% of GDP)")

Since the data does not look seasonal, I choose the naive method.

    #| label: aus-exports-model

    # Define and estimate a model
    fit <- aus_exports |> model(NAIVE(Exports))
    # Look at the residuals
    fit |> gg_tsresiduals()
Warning: Removed 1 row containing missing values or values outside the scale range
(`geom_line()`).
Warning: Removed 1 row containing missing values or values outside the scale range
(`geom_point()`).
Warning: Removed 1 row containing non-finite outside the scale range
(`stat_bin()`).

    # Look at some forecasts
    fit |> forecast() |> autoplot(aus_exports) + theme_classic() + labs(title = "Australian Exports", x = "Year", y = "Exports of goods and services (% of GDP)")

The residuals appear to have constant variance, show no meaningful autocorrelation, and are roughly normally distributed, all of which suggests the naive forecasts are reasonable.

For the Bricks series, I determined in section 5.1 that the seasonal naive method is appropriate, so I use it again here.

    #| label: bricks-resids
    #| message: false
      
    # Define and estimate a model
    fit <- bricks |> model(SNAIVE(Bricks))
    # Look at the residuals
    fit |> gg_tsresiduals()
Warning: Removed 4 rows containing missing values or values outside the scale range
(`geom_line()`).
Warning: Removed 4 rows containing missing values or values outside the scale range
(`geom_point()`).
Warning: Removed 4 rows containing non-finite outside the scale range
(`stat_bin()`).

    # Look at some forecasts
    fit |> forecast() |> autoplot(bricks) +
      labs(
        title = "Brick Production Forecast",
        subtitle = "SNAIVE method",
        x = "Year",
        y = "Clay brick production (in millions of bricks)"
      ) +
      theme_classic() +
      scale_x_yearquarter(
        breaks = seq(yearquarter("1950 Q1"), yearquarter("2030 Q1"), by = 20))

The residuals have a long lower tail, so they are not normally distributed, and the ACF plot shows strong autocorrelation. Since there is a detectable pattern in the residuals, there is information about brick production that has not been captured by this model.
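
One way the leftover structure might be captured, sketched here only as an illustration and not part of the exercise, is to forecast with an STL decomposition: apply the drift method to the seasonally adjusted series and let decomposition_model() re-seasonalise the result.

    # Hedged sketch: forecast bricks via an STL decomposition rather than plain SNAIVE.
    # The seasonal component is re-added automatically (by default with SNAIVE).
    bricks |>
      model(stl_drift = decomposition_model(
        STL(Bricks),
        RW(season_adjust ~ drift())
      )) |>
      forecast(h = 10) |>
      autoplot(bricks)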

5.7

  1. For your retail time series (from Exercise 7 in Section 2.10):

  a. Create a training dataset consisting of observations before 2011 using:

        # get random retail series
        random_id <- sample(aus_retail$`Series ID`, 1)
        myseries <- aus_retail |>
          filter(`Series ID` == random_id)
        industry_name <- myseries$Industry[1]

        myseries_train <- myseries |>
          filter(year(Month) < 2011)

  b. Check that your data have been split appropriately by producing the following plot.
        autoplot(myseries, Turnover) +
          autolayer(myseries_train, Turnover, colour = "red")

Yes, the data have been split appropriately, with the training data (shown in red) covering the period before 2011 and the test data covering 2011 onward.

  c. Fit a seasonal naïve model using SNAIVE() applied to your training data (myseries_train).
        fit <- myseries_train |>
          model(SNAIVE(Turnover))

  d. Check the residuals.
        fit |> gg_tsresiduals()
Warning: Removed 12 rows containing missing values or values outside the scale range
(`geom_line()`).
Warning: Removed 12 rows containing missing values or values outside the scale range
(`geom_point()`).
Warning: Removed 12 rows containing non-finite outside the scale range
(`stat_bin()`).

Do the residuals appear to be uncorrelated and normally distributed?

No, the residuals do not appear to be uncorrelated. The ACF plot shows a strong negative correlation at the 12-month lag and a positive correlation at the 24-month lag, suggesting there is still seasonality that a better model could capture. The residuals also do not appear normally distributed: the histogram has a long upper tail.

  e. Produce forecasts for the test data.
        # Use the model trained on the training data to forecast the test data
        fc <- fit |>
          forecast(new_data = anti_join(myseries, myseries_train))
Joining with `by = join_by(State, Industry, `Series ID`, Month, Turnover)`
        # Plot the forecast of the test data using the model trained on the training data
        fc |> autoplot(myseries)

  f. Compare the accuracy of your forecasts against the actual values.
        # Get the accuracy of the model on the training data
        fit |> accuracy() 
# A tibble: 1 × 12
  State    Industry .model .type    ME  RMSE   MAE   MPE  MAPE  MASE RMSSE  ACF1
  <chr>    <chr>    <chr>  <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 Victoria Cafes, … SNAIV… Trai…  22.1  37.6  27.6  6.81  8.76     1     1 0.829
        # Get the accuracy of the model on the test data
        fc |> accuracy(myseries) 
# A tibble: 1 × 12
  .model    State Industry .type    ME  RMSE   MAE   MPE  MAPE  MASE RMSSE  ACF1
  <chr>     <chr> <chr>    <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 SNAIVE(T… Vict… Cafes, … Test   69.0  116.  89.9  7.68  10.8  3.25  3.09 0.921

The accuracy measures of the model on the training data are all lower (i.e., better) than those on the test data. This suggests the model fits the training data considerably better than it forecasts new data.
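
To make the comparison easier to read, the two accuracy tables can be stacked into one; this is a small sketch using the fit and fc objects already defined above.

        # Hedged sketch: training and test accuracy side by side for comparison.
        bind_rows(accuracy(fit), accuracy(fc, myseries)) |>
          select(.type, RMSE, MAE, MAPE, MASE, RMSSE)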

  g. How sensitive are the accuracy measures to the amount of training data used?

To answer this, I will repeat the process with a smaller training set. I will use the data before 2000 as my training data set.

        #| label: sensitivity

        myseries_train_smaller <- myseries |>
          filter(year(Month) < 2000)

        autoplot(myseries, Turnover) +
          autolayer(myseries_train_smaller, Turnover, colour = "red")

        fit_smaller <- myseries_train_smaller |>
          model(SNAIVE(Turnover))

        # Use the model trained on the training data to forecast the test data
        fc_smaller <- fit_smaller |>
          forecast(new_data = anti_join(myseries, myseries_train_smaller))
Joining with `by = join_by(State, Industry, `Series ID`, Month, Turnover)`
        # Plot the forecast of the test data using the model trained on the training data
        fc_smaller |> autoplot(myseries)

        # Get the accuracy of the model on the training data
        fit |> accuracy()
# A tibble: 1 × 12
  State    Industry .model .type    ME  RMSE   MAE   MPE  MAPE  MASE RMSSE  ACF1
  <chr>    <chr>    <chr>  <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 Victoria Cafes, … SNAIV… Trai…  22.1  37.6  27.6  6.81  8.76     1     1 0.829
        # Get the accuracy of the first model on the test data
        fc |> accuracy(myseries)
# A tibble: 1 × 12
  .model    State Industry .type    ME  RMSE   MAE   MPE  MAPE  MASE RMSSE  ACF1
  <chr>     <chr> <chr>    <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 SNAIVE(T… Vict… Cafes, … Test   69.0  116.  89.9  7.68  10.8  3.25  3.09 0.921
        # Get the accuracy of the smaller model on the test data
        fc_smaller |> accuracy(myseries) 
# A tibble: 1 × 12
  .model    State Industry .type    ME  RMSE   MAE   MPE  MAPE  MASE RMSSE  ACF1
  <chr>     <chr> <chr>    <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 SNAIVE(T… Vict… Cafes, … Test   294.  343.  295.  43.5  44.0  17.7  14.3 0.970

The accuracy measures are quite sensitive to the amount of training data used. The model trained on the smaller training set has much higher (worse) error measures on the test data than the model trained on the larger training set, although part of the difference is that the smaller-training model must also forecast a much longer test period.
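
A more systematic way to see this sensitivity, sketched below with cutoff years of my own choosing rather than anything prescribed by the exercise, is to refit the seasonal naïve model for several training cutoffs and compare the test-set errors.

        # Hedged sketch: test-set accuracy of SNAIVE for several training cutoffs.
        # The cutoff years are arbitrary; earlier cutoffs also imply longer test periods.
        purrr::map_dfr(c(1995, 2000, 2005, 2010), function(yr) {
          train <- myseries |> filter(year(Month) < yr)
          train |>
            model(SNAIVE(Turnover)) |>
            forecast(new_data = anti_join(myseries, train)) |>
            accuracy(myseries) |>
            mutate(cutoff = yr) |>
            select(cutoff, RMSE, MAE, MAPE, MASE)
        })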