Features and Forecaster’s Toolbox

library(tsibble)

## Registered S3 method overwritten by 'tsibble':
##   method               from 
##   as_tibble.grouped_df dplyr

## 
## Attaching package: 'tsibble'

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, union

library(ggplot2)
library(fpp3)

## ── Attaching packages ──────────────────────────────────────────── fpp3 1.0.2 ──

## ✔ tibble      3.3.0     ✔ tsibbledata 0.4.1
## ✔ dplyr       1.1.4     ✔ feasts      0.4.2
## ✔ tidyr       1.3.1     ✔ fable       0.5.0
## ✔ lubridate   1.9.4

## Warning: package 'fable' was built under R version 4.5.2

## ── Conflicts ───────────────────────────────────────────────── fpp3_conflicts ──
## ✖ lubridate::date()     masks base::date()
## ✖ dplyr::filter()       masks stats::filter()
## ✖ tsibble::intersect()  masks base::intersect()
## ✖ lubridate::interval() masks tsibble::interval()
## ✖ dplyr::lag()          masks stats::lag()
## ✖ tsibble::setdiff()    masks base::setdiff()
## ✖ tsibble::union()      masks base::union()

#Question 1

#Australian Population
australian_population <- global_economy |>
  filter(Country == "Australia") |>
  select(Year, Population)
australian_population

australian_population |>
  autoplot(Population) +
  scale_y_continuous(labels = scales::comma) +
  labs(title = "Australian Population", x = "Year", y = "Population")

aus_pop_fit <- australian_population |>
  model(RW(Population ~ drift()))
aus_pop_fit

aus_pop_fc <- aus_pop_fit |>
  forecast(h = 10)
aus_pop_fc

aus_pop_fc |>
  autoplot(australian_population) + 
  scale_y_continuous(labels = scales::comma) +
  labs(title = "Australian Population Forecast", 
       y = "Population",
       x = "Year")

Drift is the appropriate benchmark considering that the data shows a strong upward trend.

#Brick Production
bricks <- aus_production |>
  filter(!is.na(Bricks)) |>
  select(Quarter, Bricks)
bricks

bricks |>
  autoplot(Bricks) +
  labs(title = "Australian Quarterly Brick Production", 
       x = "Quarter", 
       y = "Bricks Produced (millions)")

bricks_fit <- bricks |> model(SNAIVE(Bricks))
bricks_fit

bricks_fc <- bricks_fit|>
  forecast(h = "10 years")
bricks_fc

bricks_fc |>
  autoplot(bricks) +
  labs(title = "Bricks Production Forecast",
       x = "Quarter",
       y = "Brick Production (millions)")

Due to the strong quarterly seasonality, SNAIVE is the appropriate benchmark used to forecast the data on Australian Brick Production

#Australian Livestock - New South Wales Lambs
nsw_lambs <- aus_livestock |>
  filter(Animal == "Lambs", State == "New South Wales")
nsw_lambs

nsw_lambs |>
  autoplot(Count) +
  scale_y_continuous(labels = scales::comma) +
  labs(title = "NSW Lambs Slaughtered",
       x = "Month",
       y = "Count (thousands)")

nsw_lambs_fit <- nsw_lambs |>
  model(SNAIVE(Count))

nsw_lambs_fc <- nsw_lambs_fit |>
  forecast(h = 12)

nsw_lambs_fc |>
  autoplot(nsw_lambs) +
  labs(title = "NSW Lambs Forecast",
       y = "Count (thousands)")

SNAIVE is appropriate for NSW lambs due to monthly seasonality in agricultural patterns.

#Household Wealth
household_wealth <- hh_budget |>
  select(Year, Wealth)
household_wealth

household_wealth |>
  autoplot(Wealth) +
  scale_y_continuous(labels = scales::comma) +
  labs(title = "Household Wealth", 
       x = "Year", y = "Wealth")

household_wealth_fit <- household_wealth |>
  model(RW(Wealth ~ drift()))
household_wealth_fit

household_wealth_fc <- household_wealth_fit |>
  forecast(h = 5)
household_wealth_fc

household_wealth_fc |>
  autoplot(household_wealth) +
  labs(title = "Household Wealth Forecast",
       y = "Wealth")

Drift is appropriate for household wealth due to the upward trend over time.

#Australian Takeaway Food Turnover
takeaway <- aus_retail |>
  filter(Industry == "Cafes, restaurants and takeaway food services") |>
  summarise(Turnover = sum(Turnover))

takeaway |>
  autoplot(Turnover) +
  labs(title = "Australian Takeaway Food Turnover", 
       x = "Month", y = "Turnover ($Million AUD)")

takeaway_fit <- takeaway |>
  model(SNAIVE(Turnover))

takeaway_fc <- takeaway_fit |>
  forecast(h = 12)

takeaway_fc |>
  autoplot(takeaway) +
  labs(title = "Takeaway Food Turnover Forecast",
       y = "Turnover ($Million AUD)")

SNAIVE is appropriate for takeaway food turnover due to monthly seasonality and trend.

#Question 2

#Facebook Stock Closing Price Forecasting
fb_stock <- gafa_stock |>
  filter(Symbol == "FB") |>
  mutate(day = row_number()) |>
  update_tsibble(index = day, regular = TRUE)

fb_stock |>
  autoplot(Close) +
  labs(title = "Facebook Stock Price",
       x = "Date", y = "Closing Price ($USD)")

##Drift model
fb_fit <- fb_stock |>
  model(RW(Close ~ drift()))

fb_fc = fb_fit |>
  forecast(h = 60)

fb_fc |>
  autoplot(fb_stock, level = NULL) +
  labs(
    title = "Facebook Stock Price - Drift Forecast",
    x = "Date",
    y = "Closing Price ($USD)"
  )

first_point <- fb_stock |> head(1)
last_point <- fb_stock |> tail(1)
slope <- (last_point$Close - first_point$Close) / (last_point$day - first_point$day)

fb_fc |>
  autoplot(fb_stock, level = NULL) +
  geom_abline(
    intercept = first_point$Close - slope * first_point$day,
    slope = slope,
    color = "tomato",
    linetype = "dashed",
    linewidth = 1
  ) +
  labs(
    title = "Drift Forecast = Line from First to Last Observation",
    subtitle = "Red dashed line shows linear extension from first to last point",
    x = "Day",
    y = "Closing Price ($USD)"
  )

fb_models <- fb_stock |>
  model(
    Mean = MEAN(Close),
    Naive = NAIVE(Close),
    Drift = RW(Close ~ drift())
  )

fb_benchmark <- fb_models |>
  forecast(h = 60)

fb_benchmark |>
  autoplot(fb_stock, level = NULL) +
  labs(
    title = "Facebook Stock - Benchmark Forecast Comparison",
    x = "Day",
    y = "Closing Price ($USD)"
  ) +
  guides(colour = guide_legend(title = "Forecast"))

The drift forecast (shown in the plot) is equivalent to drawing a straight line from the first observation to the last observation and extending it forward. Both produce identical forecasts.

#Question 3

# Extract data of interest
recent_production <- aus_production |>
  filter(year(Quarter) >= 1992)

# Define and estimate a model
fit <- recent_production |> 
  model(SNAIVE(Beer))

# Look at the residuals
fit |> gg_tsresiduals()

## Warning: `gg_tsresiduals()` was deprecated in feasts 0.4.2.
## ℹ Please use `ggtime::gg_tsresiduals()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_line()`).

## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_point()`).

## Warning: Removed 4 rows containing non-finite outside the scale range
## (`stat_bin()`).

## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_rug()`).

# Look at some forecasts
fit |> 
  forecast(h = 5) |> 
  autoplot(recent_production)

fit |> 
  forecast(h = 10) |> 
  autoplot(recent_production)

fit |> 
  forecast(h = 20) |> 
  autoplot(recent_production)

The residuals do look like white noise: The innovation residuals are randomly scattered around zero with no observable patterns. The ACF plot shows all autocorrelations within the significance bounds. The histogram is approximately normal and centered at zero. No trends, seasonality, or systematic patterns remain

The forecast plot shows: - The seasonal pattern continues into the future periods - Forecasts follow the same quarterly pattern as historical data - The model captures the strong seasonality in beer production

The seasonal naive model has adequately captured the information in the beer production data. The innovation residuals behave like white noise, with no observable patterns in the residual plot, all ACF values within significance bounds, and an approximately normal histogram. The forecasts appropriately extend the seasonal pattern into future quarters. Since no patterns remain in the residuals, the model appears appropriate for this data.

#Question 4

aus_exports <- global_economy |>
  filter(Country == "Australia") |>
  select(Year, Exports)

aus_exports |>
  autoplot(Exports) +
  labs(title = "Australian Exports",
       y = "GDP (%)",
       x = "Year")

exports_fit <- aus_exports |>
  model(
    Naive = NAIVE(Exports),
    Drift = RW(Exports ~ drift())
  )

exports_fit |> 
  select(Drift) |>
  gg_tsresiduals()

## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_line()`).

## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).

## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_bin()`).

## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_rug()`).

exports_fit |>
  select(Drift) |>
  forecast(h = 10) |>
  autoplot(aus_exports)

The drift method is appropriate for Australian exports as the series shows a clear upward trend over time. The residual diagnostics indicate the drift model has adequately captured the information in the data. The innovation residuals appear randomly scattered around zero with no observable patterns. The ACF plot shows all autocorrelations within the significance bounds, suggesting the residuals behave like white noise. The histogram is approximately normal and centered at zero. The forecasts show the upward trend continuing into the future with appropriately wide prediction intervals reflecting the recent volatility in export levels.

bricks <- aus_production |>
  filter(!is.na(Bricks)) |>
  select(Quarter, Bricks)

bricks |>
  autoplot(Bricks) +
  labs(title = "Australian Brick Production",
       y = "Bricks (millions)",
       x = "Quarter")

bricks_fit <- bricks |>
  model(SNAIVE(Bricks))

bricks_fit |> gg_tsresiduals()

## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_line()`).

## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_point()`).

## Warning: Removed 4 rows containing non-finite outside the scale range
## (`stat_bin()`).

## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_rug()`).

bricks_fit |>
  forecast(h = 20) |>
  autoplot(bricks)

The seasonal naive method is appropriate for brick production due to the strong quarterly seasonality visible in the data. The residual diagnostics indicate the SNAIVE model has adequately captured the information in the data. The innovation residuals are randomly scattered around zero with no observable patterns. The ACF plot shows all autocorrelations within the significance bounds, indicating the residuals behave like white noise. The histogram is approximately normal and centered at zero. The forecasts appropriately continue the quarterly seasonal pattern into the future.

#Question 7

# retail series
set.seed(12345678)  
myseries <- aus_retail |>
  filter(`Series ID` == sample(aus_retail$`Series ID`, 1))

# Create training data < 2011
myseries_train <- myseries |>
  filter(year(Month) < 2011)

#Check the split
autoplot(myseries, Turnover) +
  autolayer(myseries_train, Turnover, colour = "tomato") +
  labs(
    title = "Retail Turnover - Training vs Full Data",
    y = "Turnover ($Million AUD)"
  )

#Fit SNAIVE model
fit <- myseries_train |>
  model(SNAIVE(Turnover))

#Check residuals
fit |> gg_tsresiduals()

## Warning: Removed 12 rows containing missing values or values outside the scale range
## (`geom_line()`).

## Warning: Removed 12 rows containing missing values or values outside the scale range
## (`geom_point()`).

## Warning: Removed 12 rows containing non-finite outside the scale range
## (`stat_bin()`).

## Warning: Removed 12 rows containing missing values or values outside the scale range
## (`geom_rug()`).

#Produce forecasts
fc <- fit |>
  forecast(new_data = anti_join(myseries, myseries_train))

## Joining with `by = join_by(State, Industry, `Series ID`, Month, Turnover)`

fc |> autoplot(myseries) +
  labs(
    title = "Retail Turnover Forecasts",
    y = "Turnover ($Million AUD)"
  )

#Compare accuracy

# Training set accuracy
fit |> accuracy()

# Test set accuracy
fc |> accuracy(myseries)

#Sensitivity analysis
myseries_train2 <- myseries |> filter(year(Month) < 2013)
fit2 <- myseries_train2 |> model(SNAIVE(Turnover))
fc2 <- fit2 |> forecast(new_data = anti_join(myseries, myseries_train2))

## Joining with `by = join_by(State, Industry, `Series ID`, Month, Turnover)`

fc |> accuracy(myseries)   # 2011 cutoff

fc2 |> accuracy(myseries)  # 2013 cutoff

The residuals do not appear to be uncorrelated. The ACF plot shows significant autocorrelation at multiple lags (lags 1-7 exceed the significance bounds), indicating the residuals contain patterns that the SNAIVE model has not captured. However, the histogram appears approximately normal and centered at zero, with no obvious outliers. The presence of autocorrelated residuals suggests the model could potentially be improved, though for a simple benchmark method, it provides a reasonable starting point.

The test set accuracy is worse than training accuracy for most metrics. RMSE increases from 1.21 to 1.55 (28% higher) and MAE increases from 0.915 to 1.24 (36% higher), indicating the model forecasts are less accurate on unseen data. Interestingly, MAPE decreases from 12.4% to 9.06%, likely because the test period has higher turnover values, making percentage errors smaller. The MASE of 1.36 (>1) indicates the SNAIVE forecasts are 36% worse than a naive benchmark. Overall, the model performs reasonably but shows degradation on test data, as expected.

The accuracy measures show modest sensitivity to the amount of training data. With more training data (2013 cutoff), test RMSE improves slightly from 1.55 to 1.50, and MAE improves from 1.24 to 1.21. MAPE remains essentially unchanged (9.06% vs 9.07%).

The improvement is relatively small because both models use SNAIVE, which only looks back one year regardless of training set size. The 2013 cutoff provides more recent seasonal patterns (2012 values) for forecasting, which slightly improves accuracy. However, the 2011 cutoff provides a longer test period for more robust evaluation.

Overall, the accuracy measures are not highly sensitive to the training/test split for this method, though using more recent data for training provides marginally better forecasts.

Features and Forecaster’s Toolbox

Kerwin Charles