Exercise 5.1

Produce forecasts for the following series using whichever of NAIVE(y) , SNAIVE(y) or RW(y~ drift()) is more appropriate in each case:

Australian Population ( `global_economy` )

The plot below shows that the population of Australia trended upward from 1960 to 2017. The time series interval is one year, so there is no seasonality, which rules out SNAIVE().

# Australian rows of global_economy, with population rescaled to millions
# so the y-axis of the plot is easy to read
aus_population <- global_economy %>%
  filter(Country == "Australia") %>%
  mutate(Population = Population / 1e6)

# Time plot of the annual series
autoplot(aus_population, Population) +
  labs(
    y = "Population (million)",
    title = "Australia Population, 1960-2017"
  ) +
  xlim(1960, 2020) +
  guides(x = guide_axis(minor.ticks = TRUE)) +
  theme_classic() +
  theme(
    plot.title = element_text(face = "bold"),
    axis.title = element_text(face = "bold")
  )

I then compared the NAIVE() and RW() forecasting methods. Given the upward trend in the historical data, the random walk with drift forecasting method is more appropriate.

# Fit the two candidate benchmarks: naive and random walk with drift
population_fit <- model(
  aus_population,
  Naive = NAIVE(Population),
  RW = RW(Population ~ drift())
)

# Forecast three years beyond the end of the series
population_forecast <- forecast(population_fit, h = 3)

# Point forecasts (prediction intervals suppressed) drawn over the history
autoplot(population_forecast, aus_population, level = NULL) +
  # NOTE(review): the series appears to end in 2017, so this layer of
  # observations from 2018 onward is presumably empty -- confirm
  autolayer(
    filter_index(aus_population, "2018" ~ .),
    .vars = Population
  ) +
  guides(x = guide_axis(minor.ticks = TRUE)) +
  labs(
    title = "Australia Population Forecast",
    y = "Population (million)",
    color = "Forecast\nmethod"
  ) +
  theme_classic() +
  theme(
    plot.title = element_text(face = "bold"),
    axis.title = element_text(face = "bold")
  )

Bricks ( `aus_production` )

The plot below shows that brick production in Australia trended upward from 1956 to 1974, then gradually trended downward.

# Keep only the quarter index and the Bricks column, then discard any rows
# that contain missing values
aus_bricks <- drop_na(select(aus_production, Quarter, Bricks))

# Time plot of the quarterly series
autoplot(aus_bricks, Bricks) +
  labs(
    y = "Bricks produced (million)",
    title = "Clay brick production in Australia, 1956-2005"
  ) +
  guides(x = guide_axis(minor.ticks = TRUE)) +
  theme_classic() +
  theme(
    plot.title = element_text(face = "bold"),
    axis.title = element_text(face = "bold")
  )

The subseries plot shows seasonality (quarterly period). This indicates SNAIVE() would be a more appropriate forecasting method than NAIVE().

# Subseries plot: one panel per quarter, used to judge seasonality
aus_bricks %>%
  ggtime::gg_subseries(Bricks) +
  labs(y = "Bricks produced (million)")

# Seasonal naive benchmark for the quarterly brick series
bricks_fit <- model(aus_bricks, Seasonal_Naive = SNAIVE(Bricks))

# Forecast six quarters ahead
bricks_forecast <- forecast(bricks_fit, h = 6)

# Point forecasts (no prediction intervals) drawn over the history
autoplot(bricks_forecast, aus_bricks, level = NULL) +
  # NOTE(review): this layer adds observations from 2005 Q3 onward; it is
  # presumably empty if the series ends in 2005 Q2 -- confirm
  autolayer(
    filter_index(aus_bricks, "2005 Q3" ~ .),
    .vars = Bricks
  ) +
  guides(x = guide_axis(minor.ticks = TRUE)) +
  labs(
    title = "Brick Production Forecast",
    subtitle = "SNAIVE forecasting method",
    y = "Bricks produced (million)"
  ) +
  theme_classic() +
  theme(
    plot.title = element_text(face = "bold"),
    axis.title = element_text(face = "bold")
  )

NSW Lambs ( `aus_livestock` )

This time series does not have an obvious overall trend. There was a downward trend from 1985 to 1995 and an upward trend thereafter.

# Select data of interest: lambs slaughtered in New South Wales only.
# Count is rescaled to units of 100,000 animals for a readable y-axis.
nsw_lambs <- aus_livestock %>%
  filter(State == 'New South Wales', Animal == 'Lambs') %>%
  mutate(
    Count = Count / 1E5
  ) %>%
  select(Month, Count)

# Plot the time series
# The title names New South Wales (not Australia) because the series is
# filtered to that single state above.
nsw_lambs %>%
  autoplot(Count) +
    ylab('Count (x100,000)') +
    ggtitle('Lambs Slaughtered in New South Wales, 1972-2018') + 
    coord_cartesian(xlim = c(as.Date('1970-01-01'), as.Date('2020-01-01')),
                    ylim = c(2, 7)) +
    guides(
      x = guide_axis(minor.ticks = TRUE)
    ) +
    theme_classic() +
    theme(
      axis.title = element_text(face = 'bold'),
      plot.title = element_text(face = 'bold')
    )

The subseries plot shows seasonality (monthly period). This indicates SNAIVE() would be a more appropriate forecasting method than NAIVE().

# Subseries plot: one panel per calendar month, used to judge seasonality
nsw_lambs %>%
  ggtime::gg_subseries(Count) +
  labs(y = "Count (x100,000)")

# Fit the model: seasonal naive benchmark for the monthly NSW lamb series
slaughter_fit <- nsw_lambs %>%
  model(
    Seasonal_Naive = SNAIVE(Count)
  )

# Generate forecast for 18 months in future
slaughter_forecast <- slaughter_fit %>%
  forecast(h = 18)

# Plot forecast with historical data
slaughter_forecast %>%
  autoplot(nsw_lambs, level = NULL) +
  # Overlay any observations that fall in the forecast period. The index is
  # monthly, so the start is given as a month ('2019 Jan'), matching the
  # other monthly series in this report; the previous '2005 Q3' was a
  # quarterly label copied from the bricks example.
  # NOTE(review): assumes the series ends in 2018 Dec, as the title states
  autolayer(
    .vars = Count,
    filter_index(nsw_lambs, '2019 Jan' ~ .)
  ) +
  guides(
    x = guide_axis(minor.ticks = TRUE)
  ) +
  labs(
    y = 'Count (x100,000)',
    # The data is filtered to New South Wales, so the title names the state
    title = 'Lambs Slaughtered in New South Wales, 1972-2018 + 18-Month Forecast',
    subtitle = 'SNAIVE forecasting method'
  ) +
  theme_classic() +
  theme(
    axis.title = element_text(face = 'bold'),
    plot.title = element_text(face = 'bold')
  )

Household wealth ( `hh_budget` )

Notes:

The text did not specify a country for this exercise, so I focused on Australia (since all the other exercises use data in Australia).
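The units of Wealth may look odd at first. The documentation for the dataset states that Wealth is expressed as a percentage of net disposable income, and the values exceed 100. This is not a contradiction: a ratio of this kind is not bounded at 100%, so a value of 400 means that household wealth is about four times annual net disposable income.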

The time series plot does not show a clear overall trend for household wealth in Australia between 1995 and 2016. The time series interval is one year, so there is no seasonality, which rules out SNAIVE().

# Australian household wealth: keep only the year index and Wealth
aus_wealth <- select(filter(hh_budget, Country == "Australia"), Year, Wealth)

# Time plot of the annual series
autoplot(aus_wealth, Wealth) +
  labs(
    y = "Value",
    title = "Household Wealth in Australia, 1995-2016"
  ) +
  ylim(300, 450) +
  theme_classic() +
  theme(
    plot.title = element_text(face = "bold"),
    axis.title = element_text(face = "bold")
  )

I compared the NAIVE() and RW() with drift forecasting methods; however, it is a little difficult to know which is more appropriate. The upward trend from 2011 to 2016 suggests that the random walk with drift method may be appropriate. However, the naive method may also be reasonable since the historical data shows that upward trends do not continue forever.

# Fit the two candidate benchmarks: naive and random walk with drift
wealth_fit <- model(
  aus_wealth,
  Naive = NAIVE(Wealth),
  RW = RW(Wealth ~ drift())
)

# Forecast two years beyond the end of the series
wealth_forecast <- forecast(wealth_fit, h = 2)

# Point forecasts (no prediction intervals) drawn over the history
autoplot(wealth_forecast, aus_wealth, level = NULL) +
  # NOTE(review): this layer adds observations from 2017 onward; it is
  # presumably empty if the series ends in 2016 -- confirm
  autolayer(
    filter_index(aus_wealth, "2017" ~ .),
    .vars = Wealth
  ) +
  ylim(300, 450) +
  labs(
    title = "Household Wealth in Australia, 1995-2016 + 2-Year Forecast",
    y = "Value",
    color = "Forecast"
  ) +
  theme_classic() +
  theme(
    plot.title = element_text(face = "bold"),
    axis.title = element_text(face = "bold")
  )

Australian takeaway food turnover ( `aus_retail` )

Note: The text did not specify the state for this exercise, so I focused on the Australian Capital Territory.

The time series plot shows that takeaway food turnover trended upward from 1982 to 2018.

# Select data of interest: the cafes, restaurants and takeaway food services
# industry in the Australian Capital Territory (ACT) only
aus_food_turnover <- aus_retail %>%
  filter(Industry == 'Cafes, restaurants and takeaway food services',
         State == 'Australian Capital Territory') %>%
  select(Month, Turnover)

# Plot the time series
# The title names the ACT (not Australia) because the series is filtered to
# that single territory above.
aus_food_turnover %>%
  autoplot(Turnover) +
    ylab('Turnover (million AUD)') +
    ggtitle('Takeaway Food Turnover in the Australian Capital Territory, 1982-2018') + 
    coord_cartesian(xlim = c(as.Date('1980-01-01'), as.Date('2020-01-01')),
                    ylim = c(0, 80)) +
    theme_classic() +
    theme(
      axis.title = element_text(face = 'bold'),
      plot.title = element_text(face = 'bold')
    )

The subseries plot shows seasonality (monthly period). This suggests that SNAIVE() would be a more appropriate forecasting method than NAIVE().

# Subseries plot: one panel per calendar month, used to judge seasonality
aus_food_turnover %>%
  ggtime::gg_subseries(Turnover) +
  labs(y = "Turnover (million AUD)")

# Fit the model: seasonal naive benchmark for the monthly turnover series.
# The model is labelled Seasonal_Naive (it was mislabelled 'Naive') so the
# label matches the method and the other SNAIVE fits in this report.
turnover_fit <- aus_food_turnover %>%
  model(
    Seasonal_Naive = SNAIVE(Turnover)
  )

# Generate forecast for 18 months in future
turnover_forecast <- turnover_fit %>%
  forecast(h = 18)

# Plot forecast with historical data
turnover_forecast %>%
  autoplot(aus_food_turnover, level = NULL) +
  autolayer(
    .vars = Turnover,
    filter_index(aus_food_turnover, '2019 Jan' ~ .)
  ) +
  # The upper x limit is 2021 so the whole 18-month forecast is visible.
  # NOTE(review): assumes the series ends in 2018 Dec (per the title), so
  # the forecast runs to mid-2020 and the old limit of 2020-01-01 cut it off
  coord_cartesian(xlim = c(as.Date('1980-01-01'), as.Date('2021-01-01')),
                  ylim = c(0, 80)) +
  labs(
    y = 'Turnover (million AUD)',
    # The data is filtered to the ACT, so the title names the territory
    title = 'Takeaway Food Turnover in the Australian Capital Territory, 1982-2018 + 18-Month Forecast',
    subtitle = 'SNAIVE forecasting method'
  ) +
  theme_classic() +
  theme(
    axis.title = element_text(face = 'bold'),
    plot.title = element_text(face = 'bold')
  )

Exercise 5.2

Use the Facebook stock price (data set gafa_stock ) to do the following:

Note: The exercise does not specify which variable in gafa_stock to analyze. I focused on the closing price. The analyses would be similar for other variables.

a. Produce a time plot of the series

# Facebook (FB) closing prices.
# Trading days are irregularly spaced, so the series is re-indexed by row
# number to obtain a regular index. The 'regular' argument was also found to
# be necessary for the model fit in part (b).
facebook_stock_price <- gafa_stock %>%
  filter(Symbol == "FB") %>%
  mutate(day = row_number()) %>%
  update_tsibble(index = day, regular = TRUE) %>%
  select(day, Close)

# Time plot against the trading-day index
autoplot(facebook_stock_price, Close) +
  xlim(0, 1500) +
  ylim(50, 250) +
  labs(
    title = "Closing Price of Facebook on Irregular Trading Days, 2014-2018",
    x = "Trading Day Index",
    y = "Price"
  ) +
  guides(x = guide_axis(minor.ticks = TRUE)) +
  theme_classic() +
  theme(
    plot.title = element_text(face = "bold"),
    axis.title = element_text(face = "bold")
  )

b. Produce forecasts using the drift method and plot them

# Random walk with drift: fit, forecast 90 index 'days' ahead, and plot the
# forecast over the history. The plot object is kept for reuse in part (c).
ex5_2b_plot <- facebook_stock_price %>%
  model(RW(Close ~ drift())) %>%
  forecast(h = 90) %>%
  autoplot(facebook_stock_price) +
  xlim(0, 1500) +
  ylim(50, 250) +
  labs(
    title = "Closing Price of Facebook, 2014-2018 + Forecast",
    subtitle = "Random walk with drift forecasting method",
    x = "Trading Day Index",
    y = "Price"
  ) +
  guides(x = guide_axis(minor.ticks = TRUE)) +
  theme_classic() +
  theme(
    plot.title = element_text(face = "bold"),
    axis.title = element_text(face = "bold")
  )

ex5_2b_plot

c. Show that the forecasts are identical to extending the line drawn between the first and last observations

The plot below shows that the point forecasts (ie, not including the prediction intervals) lie on the extension of the straight line drawn between the first and last observations of the historical data.

# Forecast horizon of the drift forecast in ex5_2b_plot; this must match the
# h used in part (b) so the extended line covers the whole forecast
horizon <- 90

# Coordinates of the first and last observations.
# Close is looked up at the position of the smallest / largest day value
# rather than by using the day value itself as a row number, so the lookup
# does not depend on day being identical to the row position.
start_x <- min(facebook_stock_price$day)
start_y <- facebook_stock_price$Close[which.min(facebook_stock_price$day)]
end_x <- max(facebook_stock_price$day)
end_y <- facebook_stock_price$Close[which.max(facebook_stock_price$day)]

# Slope of the line through the first and last observations (price change
# per index step)
slope <- (end_y - start_y) / (end_x - start_x)

# Add the line to the previous plot, extended 'horizon' steps past the last
# observation so that it can be compared directly with the point forecasts.
# NOTE(review): the extended end point must stay inside the xlim/ylim set on
# ex5_2b_plot, otherwise ggplot2 drops the segment -- confirm on render
ex5_2b_plot +
  annotate('segment', x = start_x, y = start_y,
           xend = end_x + horizon, yend = end_y + slope * horizon,
           color = 'steelblue', linetype = 'dashed')

d. Try using some of the other benchmark functions to forecast the same data set. Which do you think is best? Why?

I think the random walk with drift method is best because it reflects the overall upward trend of the historical data. In addition, the historical data shows that price decreases are followed by price increases (eg, index ~750). Although the historical data ends in a downward trend, the closing price will eventually rebound, and the random walk with drift is the only benchmark function that captures that phenomenon.

# Compare three benchmarks (mean, naive, random walk with drift) over a
# 90-step horizon. Prediction intervals are omitted (level = NULL) so the
# three sets of forecasts do not overlap.
facebook_stock_price %>%
  model(
    Mean = MEAN(Close),
    Naive = NAIVE(Close),
    RW_Drift = RW(Close ~ drift())
  ) %>%
  forecast(h = 90) %>%
  autoplot(facebook_stock_price, level = NULL) +
  xlim(0, 1500) +
  ylim(50, 250) +
  labs(
    title = "Closing Price of Facebook, 2014-2018 + Forecast",
    subtitle = "Comparison of benchmark methods",
    x = "Trading Day Index",
    y = "Price"
  ) +
  guides(x = guide_axis(minor.ticks = TRUE)) +
  theme_classic() +
  theme(
    plot.title = element_text(face = "bold"),
    axis.title = element_text(face = "bold")
  )

Exercise 5.3

Apply a seasonal naïve method to the quarterly Australian beer production data from 1992. Check if the residuals look like white noise, and plot the forecasts.

Answer: The residuals do not appear to be white noise. This means that the forecasting method could be improved.

Explanation:

The time series plot (top plot in output from gg_tsresiduals) shows that the variation in the residuals is not constant (ie, there are peaks and valleys for some time points, eg 1996).
The autocorrelation plot (lower left) shows a single significant spike at lag=4, which suggests that there is seasonal-related correlation (see subseries plot below). For white noise, no significant autocorrelation is expected at any non-zero lag (about 95% of the spikes should lie within the dashed bounds), so the significant spike at lag=4 indicates that the residuals are not white noise.
The histogram (lower right) shows that the distribution of the residuals is not quite normal, and the mean is a little less than zero.

# Adapted from the textbook code.
# This chunk produces warnings about missing values, which were suppressed.

# Quarterly production data from 1992 onward
recent_production <- filter(aus_production, year(Quarter) >= 1992)

# Seasonal naive model for beer production
fit <- model(recent_production, SNAIVE(Beer))

# Residual diagnostics: time plot, ACF and histogram
gg_tsresiduals(fit)

The forecast looks like this:

# Adapted from the textbook code.
# Forecast with the default horizon and plot over the recent history
autoplot(forecast(fit), recent_production)

I also applied the Box-Pierce and Ljung-Box tests. The time series is seasonal with a quarterly period, so \(l = 2m = 2 \times 4 = 8\) for these tests.

# Subseries plot: one panel per quarter, used to judge seasonality
recent_production %>% ggtime::gg_subseries(Beer)

Both tests had significant P-values (P<0.05), so the null hypothesis that the data are from a white noise series is rejected.

# Fitted values and residuals of the seasonal naive model
aug <- augment(fit)

# Box-Pierce portmanteau test on the innovation residuals (lag = 8)
features(aug, .innov, box_pierce, lag = 8)

## # A tibble: 1 × 3
##   .model       bp_stat bp_pvalue
##   <chr>          <dbl>     <dbl>
## 1 SNAIVE(Beer)    29.7  0.000234

# Ljung-Box portmanteau test on the innovation residuals (lag = 8)
features(aug, .innov, ljung_box, lag = 8)

## # A tibble: 1 × 3
##   .model       lb_stat lb_pvalue
##   <chr>          <dbl>     <dbl>
## 1 SNAIVE(Beer)    32.3 0.0000834

Exercise 5.4

Repeat the previous exercise using the Australian Exports series from global_economy and the Bricks series from aus_production. Use whichever of NAIVE() or SNAIVE() is more appropriate in each case.

Australian exports (`global_economy`)

Conclusion: The residuals of the NAIVE() model of the Australian exports series appear to be white noise. This means that the forecasting method performs well.

Explanation:

I first examined the time series and assessed seasonality to decide which benchmark function to use. The time series interval is one year, so there is no seasonality, which rules out SNAIVE().

# Australian exports: keep only the year index and the Exports column
aus_exports <- select(filter(global_economy, Country == "Australia"), Year, Exports)

# Time plot of the annual series
autoplot(aus_exports, Exports) +
  labs(
    title = "Australian Exports, 1960-2017",
    y = "Exports (% of GDP)",
    caption = "GDP, gross domestic product."
  ) +
  xlim(1960, 2020) +
  ylim(10, 25) +
  guides(x = guide_axis(minor.ticks = TRUE)) +
  theme_classic() +
  theme(
    plot.title = element_text(face = "bold"),
    axis.title = element_text(face = "bold"),
    plot.caption = element_text(color = "gray", hjust = 0)
  )

I fit the data using the NAIVE() model and analyzed the residuals. All three plots suggest that the residuals are white noise.

The time series plot shows that the residuals fluctuate around zero with similar variance and no apparent pattern.
The autocorrelation plot shows a single significant spike at lag=0.
The histogram shows that the residuals are distributed normally, and the mean is zero.

# Naive model for the annual exports series
exports_fit <- model(aus_exports, NAIVE(Exports))

# Residual diagnostics: time plot, ACF and histogram
gg_tsresiduals(exports_fit)

The results above were in agreement with the Box-Pierce and Ljung-Box tests with \(l=10\), which is the recommended value for non-seasonal data. Neither test had a significant P-value (both P>0.05), so the null hypothesis that the data are white noise was not rejected.

# Fitted values and residuals of the naive model
exports_fit_aug <- augment(exports_fit)

# Box-Pierce portmanteau test on the innovation residuals (lag = 10)
features(exports_fit_aug, .innov, box_pierce, lag = 10)

## # A tibble: 1 × 3
##   .model         bp_stat bp_pvalue
##   <chr>            <dbl>     <dbl>
## 1 NAIVE(Exports)    14.6     0.148

# Ljung-Box portmanteau test on the innovation residuals (lag = 10)
features(exports_fit_aug, .innov, ljung_box, lag = 10)

## # A tibble: 1 × 3
##   .model         lb_stat lb_pvalue
##   <chr>            <dbl>     <dbl>
## 1 NAIVE(Exports)    16.4    0.0896

The forecast looks like this:

# Forecast with the default horizon and plot over the history
autoplot(forecast(exports_fit), aus_exports)

Bricks production (`aus_production`)

Conclusion: The residuals of the SNAIVE() model of the Bricks production series do not appear to be white noise. This means the forecasting method could be improved.

Explanation:

This series was analyzed in Exercise 5.1, which showed the presence of seasonality, so I applied the SNAIVE() method. All three residual plots indicate that the residuals are not white noise.

The time series plot shows that the variation in the residuals is not constant.
The autocorrelation plot shows that a large proportion of spikes exceed the dashed blue lines, which indicates significant correlations at multiple lags.
The histogram shows that the residuals are not distributed normally, and the mean is greater than zero.

# Seasonal naive model for the quarterly brick series
# (this reassigns bricks_fit, with the default model name)
bricks_fit <- model(aus_bricks, SNAIVE(Bricks))

# Residual diagnostics: time plot, ACF and histogram
gg_tsresiduals(bricks_fit)

The results above were in agreement with the Box-Pierce and Ljung-Box tests. Both P-values were reported as zero (ie, too small to display), so the null hypothesis that the data are white noise was rejected.

# Fitted values and residuals of the seasonal naive model
bricks_fit_aug <- augment(bricks_fit)

# Box-Pierce portmanteau test on the innovation residuals (lag = 8)
features(bricks_fit_aug, .innov, box_pierce, lag = 8)

## # A tibble: 1 × 3
##   .model         bp_stat bp_pvalue
##   <chr>            <dbl>     <dbl>
## 1 SNAIVE(Bricks)    267.         0

# Ljung-Box portmanteau test on the innovation residuals (lag = 8)
features(bricks_fit_aug, .innov, ljung_box, lag = 8)

## # A tibble: 1 × 3
##   .model         lb_stat lb_pvalue
##   <chr>            <dbl>     <dbl>
## 1 SNAIVE(Bricks)    274.         0

The forecast looks like this:

# Forecast with the default horizon and plot over the history
autoplot(forecast(bricks_fit), aus_bricks)

Exercise 5.7

For the retail time series from Exercise 7 in Section 2.10:

a. Create a training dataset consisting of observations before 2011

# Fix the seed so the randomly chosen series is reproducible
set.seed(144)

# Pick one Series ID at random and keep only that series
myseries <- filter(
  aus_retail,
  `Series ID` == sample(aus_retail$`Series ID`, 1)
)

# Adapted from the textbook code: training data is everything before 2011
myseries_train <- filter(myseries, year(Month) < 2011)

b. Check that your data have been split appropriately by producing the following plot

The plot shows that the training dataset is limited to data from before 2011.

# Adapted from the textbook code.
# Full series, with the training portion drawn on top in red
myseries %>%
  autoplot(Turnover) +
  autolayer(myseries_train, Turnover, color = "red")

c. Fit a seasonal naïve model using SNAIVE() applied to your training data (myseries_train)

# Adapted from the textbook code, with Turnover as the response variable.
# Seasonal naive model fitted to the training data only
fit <- model(myseries_train, SNAIVE(Turnover))

d. Check the residuals. Do the residuals appear to be uncorrelated and normally distributed?

Answer: No. The residuals do not appear to be white noise. This implies that the forecasting method could be improved.

Explanation:

The time series plot shows that the variation in the residuals increases with the level of the series.
The autocorrelation plot shows that a large proportion of spikes exceed the upper dashed blue line, which indicates significant correlations at multiple lags.
The histogram shows that the residuals are not distributed normally, and the mean is greater than zero.

# Adapted from the textbook code.
# Residual diagnostics: time plot, ACF and histogram
gg_tsresiduals(fit)

e. Produce forecasts for the test data

# Adapted from the textbook code.
# The test data is every row of myseries that is not in the training data
fc <- forecast(fit, new_data = anti_join(myseries, myseries_train))

# Forecasts drawn over the full series
autoplot(fc, myseries)

f. Compare the accuracy of your forecasts against the actual values

As expected (from development of other machine learning models), the forecasts are more accurate on the training dataset than on the test dataset, as shown by the higher error values for the test dataset.

# Accuracy on the training data (computed from the fitted model), with the
# key columns and model name removed and .type renamed to 'dataset'
accuracy_train <- accuracy(fit) %>%
  select(-State, -Industry, -.model) %>%
  rename(dataset = .type)

# Accuracy of the forecasts against the full series, tidied the same way
accuracy_test <- accuracy(fc, myseries) %>%
  select(-State, -Industry, -.model) %>%
  rename(dataset = .type)

# Stack the two rows and render them as a styled table
kable_styling(kbl(rbind(accuracy_train, accuracy_test)))

dataset	ME	RMSE	MAE	MPE	MAPE	MASE	RMSSE	ACF1
Training	4.514	6.388	5.047	5.871	6.542	1.000	1.000	0.6470
Test	21.283	29.027	22.550	11.113	11.978	4.468	4.544	0.9301

g. How sensitive are the accuracy measures to the amount of training data used?

Answer: The forecast accuracy measures are very sensitive to the amount of training data. In general, errors decrease as the fraction of data used in the training dataset increases. For four common accuracy metrics (MAE, MAPE, MASE, and RMSE), the difference in the magnitude of error between low and high training fractions was 4 to 7-fold.

Explanation:

I analyzed the effect of the amount of training data on forecast accuracy measures by generating nine different models using 10% to 90% of the data for training, and then plotting the results for four accuracy metrics (MAE, MAPE, MASE, and RMSE).

# Constants to subset time period: first calendar year of the series and
# the number of calendar years it spans
start_year <- min(year(myseries$Month))
n_years <- max(year(myseries$Month)) - start_year + 1

# Vector of training fractions (share of calendar years used for training)
split_ratio_vec <- seq(from = 0.1, to = 0.9, by = 0.1)

# List to store results, one slot per training fraction
result_list <- vector(mode = 'list', length = length(split_ratio_vec))

# Iterate through the training fractions and collate accuracy metrics.
# The slot is addressed by integer position: deriving it from 10 * ratio
# relies on floating-point arithmetic landing on a whole number.
# Loop-local names (train_i, fit_i, fc_i, acc_i) are used so that the
# objects created in parts (a)-(f) (myseries_train, fit, fc, accuracy_test)
# are not overwritten.
for (i in seq_along(split_ratio_vec)) {
  ratio <- split_ratio_vec[[i]]
  # Calculate end year (exclusive) for the training data split
  end_year <- start_year + floor(ratio * n_years)
  # Extract the training data for that period
  train_i <- filter(myseries, year(Month) < end_year)
  # Fit the model
  fit_i <- train_i %>% model(SNAIVE(Turnover))
  # Generate forecast for the remaining (test) rows
  fc_i <- fit_i %>% forecast(new_data = anti_join(myseries, train_i))
  # Calculate accuracy metrics on the test rows
  acc_i <- fc_i %>% accuracy(myseries)
  # Keep the four metrics of interest for this split
  result_list[[i]] <- list(MASE = acc_i$MASE,
                           MAE = acc_i$MAE,
                           RMSE = acc_i$RMSE,
                           MAPE = acc_i$MAPE)
}

# Flatten the list of per-split metric lists into a data frame.
# NOTE(review): lol_to_df is not defined in this section; presumably it
# comes from a setup chunk -- confirm
results_df <- lol_to_df(result_list)
colnames(results_df) <- c("MASE", "MAE", "RMSE", "MAPE")
rownames(results_df) <- NULL

# Prepend the training fraction that each row corresponds to
results_df <- cbind(ratio = split_ratio_vec, results_df)

# Long format (one row per fraction and metric) for faceted plotting
results_long_df <- pivot_longer(
  results_df,
  cols = -ratio,
  names_to = "metric",
  values_to = "value"
)

# One panel per metric, each with its own y scale
ggplot(results_long_df, aes(x = ratio, y = value, color = metric)) +
  geom_line() +
  facet_wrap(~ metric, scales = "free_y") +
  scale_x_continuous(limits = c(0, 1), breaks = seq(0, 1, 0.2)) +
  labs(
    title = "Effect of amount of training data on forecast accuracy metrics",
    subtitle = "Australia retail turnover (1982-2018) with SNAIVE forecast",
    x = "Fraction of data in training dataset"
  ) +
  theme_minimal() +
  theme(
    legend.position = "none",
    plot.title = element_text(face = "bold"),
    axis.title = element_text(face = "bold"),
    strip.text = element_text(face = "bold")
  )

Across the four metrics, the difference in the magnitude of error for low vs high training fractions varied from 4 to 7.2-fold.

# Each ratio below is the largest value of the metric divided by the
# smallest value across all training fractions.

# MAE
sprintf('MAE has a %.1f fold difference from 10%% to 90%% training fraction',
        with(results_df, max(MAE) / min(MAE)))

## [1] "MAE has a 4.0 fold difference from 10% to 90% training fraction"

# MAPE
sprintf('MAPE has a %.1f fold difference from 10%% to 90%% training fraction',
        with(results_df, max(MAPE) / min(MAPE)))

## [1] "MAPE has a 6.6 fold difference from 10% to 90% training fraction"

# MASE
sprintf('MASE has a %.1f fold difference from 10%% to 90%% training fraction',
        with(results_df, max(MASE) / min(MASE)))

## [1] "MASE has a 7.2 fold difference from 10% to 90% training fraction"

# RMSE
sprintf('RMSE has a %.1f fold difference from 10%% to 90%% training fraction',
        with(results_df, max(RMSE) / min(RMSE)))

## [1] "RMSE has a 4.0 fold difference from 10% to 90% training fraction"

Session Details

# Print the R version, platform and package versions of the current session
sessionInfo()

## R version 4.5.2 (2025-10-31)
## Platform: aarch64-apple-darwin20
## Running under: macOS Tahoe 26.2
## 
## Matrix products: default
## BLAS:   /System/Library/Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/Versions/A/libBLAS.dylib 
## LAPACK: /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.12.1
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## time zone: America/New_York
## tzcode source: internal
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] kableExtra_1.4.0  ggtime_0.1.0      fable_0.5.0       feasts_0.4.2     
##  [5] fabletools_0.5.1  fpp3_1.0.2        tsibbledata_0.4.1 tsibble_1.1.6    
##  [9] lubridate_1.9.4   forcats_1.0.1     stringr_1.6.0     dplyr_1.1.4      
## [13] purrr_1.2.1       readr_2.1.6       tidyr_1.3.2       tibble_3.3.1     
## [17] ggplot2_4.0.1     tidyverse_2.0.0  
## 
## loaded via a namespace (and not attached):
##  [1] ggdist_3.3.3         utf8_1.2.6           rappdirs_0.3.4      
##  [4] sass_0.4.10          generics_0.1.4       anytime_0.3.12      
##  [7] xml2_1.5.2           stringi_1.8.7        hms_1.1.4           
## [10] digest_0.6.39        magrittr_2.0.4       evaluate_1.0.5      
## [13] grid_4.5.2           timechange_0.3.0     RColorBrewer_1.1-3  
## [16] fastmap_1.2.0        jsonlite_2.0.0       viridisLite_0.4.2   
## [19] scales_1.4.0         textshaping_1.0.4    jquerylib_0.1.4     
## [22] cli_3.6.5            crayon_1.5.3         rlang_1.1.7         
## [25] ellipsis_0.3.2       withr_3.0.2          cachem_1.1.0        
## [28] yaml_2.3.12          otel_0.2.0           tools_4.5.2         
## [31] tzdb_0.5.0           vctrs_0.7.1          R6_2.6.1            
## [34] lifecycle_1.0.5      pkgconfig_2.0.3      progressr_0.18.0    
## [37] pillar_1.11.1        bslib_0.10.0         gtable_0.3.6        
## [40] glue_1.8.0           Rcpp_1.1.1           systemfonts_1.3.1   
## [43] xfun_0.56            tidyselect_1.2.1     rstudioapi_0.18.0   
## [46] knitr_1.51           farver_2.1.2         htmltools_0.5.9     
## [49] labeling_0.4.3       svglite_2.2.2        rmarkdown_2.30      
## [52] compiler_4.5.2       S7_0.2.1             distributional_0.6.0

DATA624 Homework 3

Alexander Simon

2026-02-22

Exercise 5.1

Australian Population ( `global_economy` )

Bricks ( `aus_production` )

NSW Lambs ( `aus_livestock` )

Household wealth ( `hh_budget` )

Australian takeaway food turnover ( `aus_retail` )

Exercise 5.2

Exercise 5.3

Exercise 5.4

Australian exports (`global_economy`)

Bricks production (`aus_production`)

Exercise 5.7

Session Details

DATA624 Homework 3

Alexander Simon

2026-02-22

Exercise 5.1

Australian Population ( global_economy )

Bricks ( aus_production )

NSW Lambs ( aus_livestock )

Household wealth ( hh_budget )

Australian takeaway food turnover ( aus_retail )

Exercise 5.2

Exercise 5.3

Exercise 5.4

Australian exports (global_economy)

Bricks production (aus_production)

Exercise 5.7

Session Details

Australian Population ( `global_economy` )

Bricks ( `aus_production` )

NSW Lambs ( `aus_livestock` )

Household wealth ( `hh_budget` )

Australian takeaway food turnover ( `aus_retail` )

Australian exports (`global_economy`)

Bricks production (`aus_production`)