pacman::p_load(
tidyverse,
janitor,
fpp3,
USgas,
readxl,
feasts,
ggplot2,
scales,
seasonal,
lubridate,
skimr,
forecast,
zoo
)
set.seed(9219)
Homework 3
5.1
- Produce forecasts for the following series using whichever of NAIVE(y), SNAIVE(y) or RW(y ~ drift()) is more appropriate in each case:
- Australian Population (global_economy)
First, I explore the population data to assess which model to use.
#| label: global-econ
#| warning: false
#| message: false
# Convert the population to millions so the large numbers on the y-axis are easier to read
australian_pop <- global_economy |>
filter(Country == "Australia") |>
mutate(Population = Population / 1000000) |>
select(Population) |>
drop_na()
australian_pop |>
autoplot() +
theme_classic() +
theme(legend.position = "none") +
labs(title = "Australian population appears to increase linearly.",
x = "Year",
y = "Population (millions)") +
scale_x_continuous(breaks = seq(1950, 2100, 10))
Since the rise in the Australian population appears linear, I will use the drift method.
#| label: aus-pop-model
#| warning: false
#| message: false
# Fit the model
australian_pop_model <- australian_pop |>
model(RW(Population ~ drift()))
# Forecast the next 10 years
australian_pop_fc <- australian_pop_model |>
forecast(h = 10)
# Plot the forecast
australian_pop_fc |>
autoplot() +
autolayer(australian_pop, Population) +
labs(
title = "Australian Population Forecast",
subtitle = "Drift method",
x = "Year",
y = "Population (millions)"
) +
theme_classic() +
scale_x_continuous(breaks = seq(1950, 2100, 10))
- Bricks (aus_production)
First, I explore the bricks data to assess what model to use.
#| label: bricks-exploratory
#| warning: false
#| message: false
# get bricks
bricks <- aus_production |>
select(Bricks) |>
drop_na()
# Autoplot
bricks |> autoplot() +
theme_classic() +
theme(axis.line = element_blank()) +
labs(
title = "Brick production trends upwards until the 80's, then drops off.",
subtitle = "Cyclical booms and crashes in 1975, 1983, 1991, 1996, and 2001.",
x = "",
y = "Clay brick production (in millions of bricks)"
) +
scale_x_yearquarter(breaks = seq(yearquarter("1950 Q1"), yearquarter("2010 Q1"), by = 20))
# Seasonal plot
bricks |> gg_season() +
theme_classic() +
theme(axis.line = element_blank()) +
labs(title = "Production is seasonal, starting low in Q1 and peaking in Q3.", x = "", y = "Clay brick production (in millions of bricks)")Plot variable not specified, automatically selected `y = Bricks`
# Subseries
bricks |> gg_subseries() +
theme_classic() +
theme(axis.line = element_blank()) +
labs(title = "Bricks are produced seasonally, peaking in Q3.", x = "", y = "Production (millions of bricks)")
Since brick production appears strongly seasonal, I will use the seasonal naïve method.
#| label: bricks-model
#| warning: false
#| message: false
# Fit the model
bricks_model <- bricks |>
model(SNAIVE(Bricks))
# Forecast the next 10 quarters
bricks_fc <- bricks_model |>
forecast(h = 10)
# Plot the forecast
bricks_fc |>
autoplot() +
autolayer(bricks, Bricks) +
labs(
title = "Brick Production Forecast",
subtitle = "SNAIVE method",
x = "Year",
y = "Clay brick production (in millions of bricks)"
) +
theme_classic() +
scale_x_yearquarter(breaks = seq(yearquarter("1950 Q1"), yearquarter("2030 Q1"), by = 20))
- NSW Lambs (aus_livestock)
First, I explore the lambs data to assess which model to use.
#| label: nsw-lambs-exploratory
#| message: false
#| warning: false
# get lambs
lambs <- aus_livestock |>
filter(Animal == "Lambs", State == "New South Wales") |>
select(-Animal) |>
mutate(Count = Count / 10000) |>
drop_na()
# plot lambs
lambs |>
autoplot() +
theme_classic() +
labs(title = "Lambs slaughtered in New South Wales", x = "Year", y = "10k lambs slaughtered")Plot variable not specified, automatically selected `.vars = Count`
# Decompose
lambs |>
model(x11 = X_13ARIMA_SEATS(Count ~ x11())) |>
components() |>
autoplot() +
labs(
title = "X-11 decomposition of lambs doesn't make it clear which forecast method to use.",
subtitle = "Seasonality has variance over time, and trend is not linear.",
x = "Year",
y = "10k lambs slaughtered"
) +
theme_classic() +
scale_x_yearmonth(breaks = "2 year", labels = label_date("%Y")) +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
It’s unclear from the plot and the decomposition which forecast method will fit best, so I will fit all three and compare their accuracy.
#| label: lambs-model
#| warning: false
#| message: false
# Fit the model
lambs_model <- lambs |>
model(
`Naïve` = NAIVE(Count),
`SNAIVE` = SNAIVE(Count),
`RW` = RW(Count ~ drift())
)
# Forecast the next 10 months
lambs_fc <- lambs_model |>
forecast(h = 10)
# Assess model accuracy w/ function
calculate_accuracy <- function(model) {
accuracy_df <- model |>
accuracy() |>
select(.model, MAE, RMSE, MAPE, MASE, RMSSE) |>
pivot_longer(
cols = where(is.numeric),
names_to = "score",
values_to = "value"
) |>
group_by(score) |>
summarize(min_value = min(value), # lowest value of each metric
model = .model[which.min(value)]) # model that achieves the lowest value
print(accuracy_df)
}
# calculate best models for accuracy
calculate_accuracy(lambs_model)
# A tibble: 5 × 3
score min_value model
<chr> <dbl> <chr>
1 MAE 4.01 RW
2 MAPE 9.96 RW
3 MASE 0.958 RW
4 RMSE 5.03 RW
5 RMSSE 0.910 RW
Based on the accuracy measures, the drift method performs best on every metric, so I plot the drift forecast below.
#| label: lambs-plot-model
#| warning: false
#| message: false
# Plot the forecast
lambs_fc |>
filter(.model == "RW") |>
autoplot() +
autolayer(lambs, Count) +
labs(
title = "New South Wales Lambs Slaughtered Forecast",
subtitle = "Drift method",
x = "Year",
y = "10k lambs slaughtered"
) +
theme_classic() +
scale_x_yearmonth(breaks = "5 year", labels = label_date("%Y")) +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
- Household wealth (hh_budget)
First, I plot the data. Then, I compare the accuracy scores of all three forecasting methods to determine which will be the best model.
#| label: hh-wealth-explor
#| message: false
#| warning: false
# get hh
wealth <- hh_budget |>
select(Wealth) |>
drop_na()
# plot hh
wealth |>
autoplot() +
theme_classic() +
labs(title = "Household wealth",
x = "Year",
y = "Wealth as a percentage of net disposable income")Plot variable not specified, automatically selected `.vars = Wealth`
# Fit the model
wealth_model <- wealth |>
model(
`Naïve` = NAIVE(Wealth),
`SNAIVE` = SNAIVE(Wealth),
`RW` = RW(Wealth ~ drift())
)
Warning: 4 errors (1 unique) encountered for SNAIVE
[4] Non-seasonal model specification provided, use RW() or provide a different lag specification.
# get accuracy
calculate_accuracy(wealth_model)
# A tibble: 5 × 3
score min_value model
<chr> <dbl> <chr>
1 MAE 12.1 RW
2 MAPE 2.28 RW
3 MASE 0.761 RW
4 RMSE 16.4 RW
5 RMSSE 0.877 RW
Since the data are annual, the seasonal naïve model cannot be fit (hence the warning above), so the comparison is effectively between the naïve and drift methods. Based on the accuracy scores, the drift method appears best.
#| label: hh-forecast
#| message: false
#| warning: false
# Forecast the next 10 years
wealth_fc <- wealth_model |>
forecast(h = 10)
# Plot the forecast
wealth_fc |>
filter(.model == "RW") |>
autoplot() +
autolayer(wealth, Wealth) +
labs(title = "Household wealth forecast", x = "Year", y = "Wealth as a percentage of net disposable income") +
theme_classic()
- Australian takeaway food turnover (aus_retail)
First, I plot the data. Then, I compare the accuracy scores of all three forecasting methods to determine which will be the best model.
#| label: aus-takeaway-explor
#| message: false
#| warning: false
# get takeaway food turnover
takeaway <- aus_retail |>
filter(Industry == "Takeaway food services") |>
select(-c(Industry), Turnover) |>
drop_na()
# plot takeaway food turnover
takeaway |>
autoplot() +
theme_classic() +
labs(title = "Australian Takeaway Food Turnover", x = "Year", y = "Turnover in million AUD")Plot variable not specified, automatically selected `.vars = Turnover`
# Fit the model
takeaway_model <- takeaway |>
model(
`Naïve` = NAIVE(Turnover),
`SNAIVE` = SNAIVE(Turnover),
`RW` = RW(Turnover ~ drift())
)
# get accuracy
calculate_accuracy(takeaway_model)
# A tibble: 5 × 3
score min_value model
<chr> <dbl> <chr>
1 MAE 0.666 Naïve
2 MAPE 5.29 RW
3 MASE 0.410 Naïve
4 RMSE 0.950 RW
5 RMSSE 0.399 RW
Based on the MAPE, RMSE, and RMSSE, the drift method has the best accuracy scores (although the naïve method performs best on MAE and MASE). I plot the drift forecast below.
#| label: aus-takeaway-forecast
#| message: false
#| warning: false
# Forecast the next 10 months
takeaway_model |>
forecast(h = 10) |>
filter(.model == "RW") |>
autoplot() +
autolayer(takeaway, Turnover) +
labs(title = "Australian Takeaway Food Turnover Forecast", x = "Year", y = "Turnover in million AUD") +
theme_classic()
5.2
Use the Facebook stock price (data set gafa_stock) to do the following:
- Produce a time plot of the series.
#| label: time-plot
#| message: false
#| warning: false
# get fb closing prices
fb <- gafa_stock |>
filter(Symbol == "FB") |>
select(Date, Close) |>
# Re-index based on trading days
mutate(day = row_number()) |>
# Divide day by the number of trading days in a year to plot years since 2014-01-02
mutate(years_since = (day - 1) / 252) |>
update_tsibble(index = years_since, regular = TRUE)
# Get the minimum date in the tsibble
min_date <- min(fb$Date)
# plot fb
fb |>
autoplot(Close) +
theme_classic() +
labs(title = "Time plot of Facebook stock price",
y = "Closing price",
x = paste("Years since", min_date))- Produce forecasts using the drift method and plot them.
#| label: forecast-drift-fb
#| message: false
#| warning: false
# Fit the model
fb_model <- fb |>
model(RW = RW(Close ~ drift()))
# Forecast the next year of trading days
fb_fc <- fb_model |> forecast(h = 252)
# Plot the forecast
plot <- fb_fc |>
autoplot() +
autolayer(fb, Close) +
labs(title = "Forecast of Facebook stock price using Drift Method",
x = paste("Years since", min_date),
y = "Closing price") +
theme_classic() +
scale_x_continuous(breaks = seq(0, max(fb$years_since) + 1, 1))
plot
- Show that the forecasts are identical to extending the line drawn between the first and last observations.
plot +
# plot straight line through first and last observations
geom_abline(
# get slope
slope = (fb$Close[nrow(fb)] - fb$Close[1]) / (max(fb$years_since) - min(fb$years_since)),
# get intercept
intercept = fb$Close[1] - (fb$Close[nrow(fb)] - fb$Close[1]) / (max(fb$years_since) - min(fb$years_since)) * min(fb$years_since),
color = "red",
linetype = "dotted"
)
The dotted red line through the first and last observations overlaps exactly with the drift forecast, confirming the equivalence.
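This is expected: the drift forecast h steps ahead equals the last observation plus h times the average historical change, y_T + h * (y_T - y_1) / (T - 1), which is exactly the line through the first and last points extended forward. Below is a minimal numeric sketch of that check, assuming the fb and fb_fc objects created above; T_obs, slope_per_day, line_extension, and drift_endpoint are helper names introduced here purely for illustration.
# Numeric check of the drift/line equivalence (sketch)
T_obs <- nrow(fb)
slope_per_day <- (fb$Close[T_obs] - fb$Close[1]) / (T_obs - 1)
line_extension <- fb$Close[T_obs] + 252 * slope_per_day
drift_endpoint <- tail(fb_fc$.mean, 1)
# should return TRUE (up to numerical tolerance) if the forecast ends on the extended line
all.equal(line_extension, drift_endpoint)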
- Try using some of the other benchmark functions to forecast the same data set. Which do you think is best? Why?
# Fit the model
fb_models <- fb |>
model(
`Naïve` = NAIVE(Close),
`Mean` = MEAN(Close),
`RW` = RW(Close ~ drift())
)
# get accuracy
calculate_accuracy(fb_models)
# A tibble: 5 × 3
score min_value model
<chr> <dbl> <chr>
1 MAE 1.46 RW
2 MAPE 1.26 RW
3 MASE 0.998 RW
4 RMSE 2.41 RW
5 RMSSE 1.00 RW
# Forecast the next year of trading days
fb_fc <- fb_model |> forecast(h = 252)
I compare the naïve, mean, and drift methods (since the model index is expressed in years of trading days, a non-seasonal specification, I do not use the seasonal naïve method). The drift method (RW) has the lowest MAE, MAPE, MASE, RMSE, and RMSSE of the three benchmarks, so I think the drift method is best.
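The comparison can also be checked visually by plotting all three benchmark forecasts together. A quick sketch, assuming the fb_models, fb, and min_date objects from above (level = NULL simply hides the prediction intervals):
# Plot all benchmark forecasts on the same axes (sketch)
fb_models |>
forecast(h = 252) |>
autoplot(fb, level = NULL) +
labs(title = "Benchmark forecasts of Facebook closing price",
x = paste("Years since", min_date),
y = "Closing price") +
theme_classic()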
5.3
- Apply a seasonal naïve method to the quarterly Australian beer production data from 1992. Check if the residuals look like white noise, and plot the forecasts. The following code will help.
#| label: q3-help
#| warning: false
# Extract data of interest
recent_production <- aus_production |>
filter(year(Quarter) >= 1992)
# Define and estimate a model
fit <- recent_production |> model(SNAIVE(Beer))
# Look at the residuals
fit |> gg_tsresiduals()
# Look at some forecasts
fit |> forecast() |> autoplot(recent_production)
What do you conclude?
The mean of the residuals appears close to zero in both the time plot and the histogram. The residual series shows no obvious autocorrelation and its variance looks roughly constant (homoskedastic). The histogram of the residuals appears approximately normal, and the ACF shows no significant correlation apart from one spike, so the residuals resemble white noise and the seasonal naïve forecasts look reasonable. The forecast also aligns well with the seasonality in the data.
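The visual impression can be backed up with a Ljung-Box test on the innovation residuals. A short sketch is below; the choice of lag = 8 (two seasonal periods for quarterly data) is an assumption, not the only reasonable value.
# Ljung-Box portmanteau test on the SNAIVE residuals (sketch)
fit |>
augment() |>
features(.innov, ljung_box, lag = 8)
A large p-value would be consistent with the residuals behaving like white noise.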
5.4
- Repeat the previous exercise using the Australian Exports series from global_economy and the Bricks series from aus_production. Use whichever of NAIVE() or SNAIVE() is more appropriate in each case.
For the Australian Exports, I first plot the data to determine whether it has strong seasonality.
#| label: aus-exports-exploratory
#| message: false
# Extract data of interest
aus_exports <- global_economy |>
filter(Country == "Australia") |>
select(Exports)
# Plot
aus_exports |>
autoplot() +
theme_classic() +
labs(title = "Australian Exports", x = "Year", y = "Exports of goods and services (% of GDP)")Plot variable not specified, automatically selected `.vars = Exports`
Since the data are annual and show no seasonality, I choose the naïve method.
#| label: aus-exports-model
# Define and estimate a model
fit <- aus_exports |> model(NAIVE(Exports))
# Look at the residuals
fit |> gg_tsresiduals()
# Look at some forecasts
fit |> forecast() |> autoplot(aus_exports) + theme_classic() + labs(title = "Australian Exports", x = "Year", y = "Exports of goods and services (% of GDP)")
The residuals appear to have constant variance, show no obvious autocorrelation, and look approximately normally distributed, all of which suggests the naïve forecasts are reasonable.
For the Bricks series, I determined in Exercise 5.1 that the seasonal naïve method is appropriate.
#| label: bricks-resids
#| message: false
# Define and estimate a model
fit <- bricks |> model(SNAIVE(Bricks))
# Look at the residuals
fit |> gg_tsresiduals()
# Look at some forecasts
fit |> forecast() |> autoplot(bricks) +
labs(
title = "Brick Production Forecast",
subtitle = "SNAIVE method",
x = "Year",
y = "Clay brick production (in millions of bricks)"
) +
theme_classic() +
scale_x_yearquarter(
breaks = seq(yearquarter("1950 Q1"), yearquarter("2030 Q1"), by = 20))
The residuals have a long lower tail, so they are not normally distributed, and they show strong autocorrelation in the ACF plot. Since there is still a detectable pattern in the residuals, there is information in the Bricks series that this model has not captured, and a better forecast should be possible.
5.7
For your retail time series (from Exercise 7 in Section 2.10):
- Create a training dataset consisting of observations before 2011 using the code below.
# get random retail series
random_id <- sample(aus_retail$`Series ID`, 1)
myseries <- aus_retail |>
filter(`Series ID` == random_id)
industry_name <- myseries$Industry[1]
myseries_train <- myseries |>
filter(year(Month) < 2011)
- Check that your data have been split appropriately by producing the following plot.
autoplot(myseries, Turnover) +
autolayer(myseries_train, Turnover, colour = "red")
Yes, the data have been split appropriately: the training data (in red) run up to the end of 2010 and the test data cover 2011 onward.
- Fit a seasonal naïve model using SNAIVE() applied to your training data (myseries_train).
fit <- myseries_train |>
model(SNAIVE(Turnover))
- Check the residuals.
fit |> gg_tsresiduals()
Do the residuals appear to be uncorrelated and normally distributed?
No, the residuals do not appear to be uncorrelated. The ACF plot shows a strong negative correlation at the 12-month lag and a positive correlation at the 24-month lag, suggesting there is remaining seasonality that a better model could capture. The histogram of the residuals is not normally distributed; it has a long upper tail.
- Produce forecasts for the test data
# Use the model trained on the training data to forecast the test data
fc <- fit |>
forecast(new_data = anti_join(myseries, myseries_train))
# Plot the forecast of the test data using the model trained on the training data
fc |> autoplot(myseries)
- Compare the accuracy of your forecasts against the actual values.
# Get the accuracy of the model on the training data
fit |> accuracy()
# A tibble: 1 × 12
State Industry .model .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
<chr> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 Victoria Cafes, … SNAIV… Trai… 22.1 37.6 27.6 6.81 8.76 1 1 0.829
# Get the accuracy of the model on the test data
fc |> accuracy(myseries)
# A tibble: 1 × 12
.model State Industry .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
<chr> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 SNAIVE(T… Vict… Cafes, … Test 69.0 116. 89.9 7.68 10.8 3.25 3.09 0.921
The accuracy measures of the model on the training data are all lower (i.e., more accurate) than those for the test data. This suggests that the model may be overfitting the training data.
- How sensitive are the accuracy measures to the amount of training data used?
To answer this, I will repeat the process with a smaller training set. I will use the data before 2000 as my training data set.
#| label: sensitivity
myseries_train_smaller <- myseries |>
filter(year(Month) < 2000)
autoplot(myseries, Turnover) +
autolayer(myseries_train_smaller, Turnover, colour = "red")
fit_smaller <- myseries_train_smaller |>
model(SNAIVE(Turnover))
# Use the model trained on the training data to forecast the test data
fc_smaller <- fit_smaller |>
forecast(new_data = anti_join(myseries, myseries_train_smaller))
# Plot the forecast of the test data using the model trained on the training data
fc_smaller |> autoplot(myseries)
# Get the accuracy of the model on the training data
fit |> accuracy()
# A tibble: 1 × 12
State Industry .model .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
<chr> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 Victoria Cafes, … SNAIV… Trai… 22.1 37.6 27.6 6.81 8.76 1 1 0.829
# Get the accuracy of the first model on the test data
fc |> accuracy(myseries)
# A tibble: 1 × 12
.model State Industry .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
<chr> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 SNAIVE(T… Vict… Cafes, … Test 69.0 116. 89.9 7.68 10.8 3.25 3.09 0.921
# Get the accuracy of the smaller model on the test data
fc_smaller |> accuracy(myseries)
# A tibble: 1 × 12
.model State Industry .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
<chr> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 SNAIVE(T… Vict… Cafes, … Test 294. 343. 295. 43.5 44.0 17.7 14.3 0.970
The accuracy measures are sensitive to the amount of training data used. The model trained on the smaller set of training data has much higher (worse) accuracy scores than the model trained on the larger set of training data.
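To make this more systematic, the same fit-and-evaluate steps can be repeated over several training cutoffs. Below is a sketch assuming the myseries object from above; the cutoff years and the cutoff_year column name are illustrative choices.
# Refit SNAIVE for several training cutoffs and compare test-set accuracy (sketch)
cutoffs <- c(1995, 2000, 2005, 2010)
map_dfr(cutoffs, function(cutoff) {
train <- myseries |> filter(year(Month) < cutoff)
test <- myseries |> filter(year(Month) >= cutoff)
train |>
model(SNAIVE(Turnover)) |>
forecast(new_data = test) |>
accuracy(myseries) |>
mutate(cutoff_year = cutoff) |>
select(cutoff_year, RMSE, MAE, MAPE, MASE)
})
Tabulating the test-set accuracy against the cutoff year makes the sensitivity to the amount of training data explicit.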