library(fpp3)
library(patchwork)
library(USgas)
library(lubridate)
library(scales)
options(scipen=999)
Hyndman Chapter 5 Homework
5.1
Produce forecasts for the following series using whichever of NAIVE(y), SNAIVE(y) or RW(y ~ drift()) is more appropriate in each case:
- Australian Population (global_economy)
# first we filter.
aus <- global_economy |>
  filter(Code == "AUS") |>
  select("Population") |>
  filter(!is.na(Population))

## then we check, probably drift.
fit_mable <- aus |>
  model(
    # seasonal_naive = SNAIVE(Population),  # annual data, so no seasonal model
    naive = NAIVE(Population),
    drift = RW(Population ~ drift()),
    mean = MEAN(Population)
  )

fc <- fit_mable |>
  forecast(h = 10)
# accuracy(fit_mable) |>
#   arrange(.model) |>
#   select(.model, .type, RMSE, MAE, MAPE, MASE, RMSSE)
# # A tibble: 3 × 7
#   .model .type        RMSE      MAE  MAPE   MASE  RMSSE
#   <chr>  <chr>       <dbl>    <dbl> <dbl>  <dbl>  <dbl>
# 1 drift  Training   76870.   59124. 0.347  0.235  0.293
# 2 mean   Training 3979165. 3388690. 21.7  13.5   15.1
# 3 naive  Training  262766.  251271. 1.52   1      1
## drift definitely performs better.
fc_drift <- fc |> filter(.model == "drift")

actual_data_plot <- ggplot(aus, aes(x = Year, y = Population)) +
  geom_line(colour = "grey")

# forecast on top of the actual data plot
actual_data_plot +
  geom_line(data = fc_drift, aes(x = Year, y = .mean, group = .model), colour = "blue") +
  labs(title = "Australia's Population (Drift)", y = "# of People") +
  guides(colour = guide_legend(title = "Forecast")) +
  scale_y_continuous(labels = scales::comma)  # format the y-axis labels with commas
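As an aside, fable's autoplot() method produces the same overlay in one step, with 80%/95% prediction intervals drawn by default (a sketch, assuming the fc_drift and aus objects from above):

# equivalent overlay using fable's built-in forecast plotting
fc_drift |>
  autoplot(aus) +
  labs(title = "Australia's Population (Drift)", y = "# of People") +
  scale_y_continuous(labels = scales::comma)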
- Bricks (aus_production)
# first we filter.
bricks <- aus_production |>
  select("Bricks") |>
  filter(!is.na(Bricks))

## then we check, probably drift.
fit_mable <- bricks |>
  model(
    seasonal_naive = SNAIVE(Bricks),
    naive = NAIVE(Bricks),
    drift = RW(Bricks ~ drift()),
    mean = MEAN(Bricks)
  )

fc <- fit_mable |>
  forecast(h = 10)
# accuracy(fit_mable) |>
#   arrange(.model) |>
#   select(.model, .type, RMSE, MAE, MAPE, MASE, RMSSE)
# # A tibble: 4 × 7
#   .model         .type     RMSE   MAE  MAPE  MASE RMSSE
#   <chr>          <chr>    <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 drift          Training  40.2  32.9  8.29 0.927 0.831
# 2 mean           Training  95.9  76.3 22.4  2.15  1.98
# 3 naive          Training  40.2  32.9  8.28 0.926 0.832
# 4 seasonal_naive Training  48.3  35.5  8.84 1     1

# naive works best, if only ever so slightly.
fc_chosen <- fc |> filter(.model == "naive")

actual_data_plot <- ggplot(bricks, aes(x = Quarter, y = Bricks)) +
  geom_line(colour = "grey")

# forecast on top of the actual data plot
actual_data_plot +
  geom_line(data = fc_chosen, aes(x = Quarter, y = .mean, group = .model), colour = "blue") +
  labs(title = "Australia's Brick Production (Naive)", y = "# of Bricks in Millions") +
  guides(colour = guide_legend(title = "Forecast")) +
  scale_y_continuous(labels = scales::comma)  # format the y-axis labels with commas
- NSW Lambs (aus_livestock)
# first we filter.
lambs <- aus_livestock |>
  filter(Animal == "Lambs", State == "New South Wales") |>
  filter(!is.na(Count)) |>
  select("Count")

## then we check, probably drift.
fit_mable <- lambs |>
  model(
    seasonal_naive = SNAIVE(Count),
    naive = NAIVE(Count),
    drift = RW(Count ~ drift()),
    mean = MEAN(Count)
  )

fc <- fit_mable |>
  forecast(h = 50)
# accuracy(fit_mable) |>
#   arrange(.model) |>
#   select(.model, .type, RMSE, MAE, MAPE, MASE, RMSSE)
# # A tibble: 4 × 7
#   .model         .type      RMSE    MAE  MAPE  MASE RMSSE
#   <chr>          <chr>     <dbl>  <dbl> <dbl> <dbl> <dbl>
# 1 drift          Training 50291. 40124.  9.96 0.958 0.910
# 2 mean           Training 72452. 57618. 14.8  1.38  1.31
# 3 naive          Training 50293. 40134.  9.97 0.958 0.910
# 4 seasonal_naive Training 55281. 41900  10.4  1     1.00

# drift works best.
fc_chosen <- fc |> filter(.model == "drift")

actual_data_plot <- ggplot(lambs, aes(x = Month, y = Count)) +
  geom_line(colour = "grey")

# forecast on top of the actual data plot
actual_data_plot +
  geom_line(data = fc_chosen, aes(x = Month, y = .mean, group = .model), colour = "blue") +
  labs(title = "NSW Lambs Slaughtered (Drift)", y = "# of lambs") +
  guides(colour = guide_legend(title = "Forecast")) +
  scale_y_continuous(labels = scales::comma)
- Household wealth (hh_budget)
# first we filter.
wealth <- hh_budget |>
  filter(Country == "USA") |>
  select(Wealth, Year) |>
  filter(!is.na(Wealth))

## then we check, probably drift.
fit_mable <- wealth |>
  model(
    # seasonal_naive = SNAIVE(Wealth),  # annual data, so no seasonal model
    naive = NAIVE(Wealth),
    drift = RW(Wealth ~ drift()),
    mean = MEAN(Wealth)
  )

fc <- fit_mable |>
  forecast(h = 10)
accuracy(fit_mable) |>
  arrange(.model) |>
  select(.model, .type, RMSE, MAE, MAPE, MASE, RMSSE)
# # A tibble: 3 × 7
#   .model .type     RMSE   MAE  MAPE  MASE RMSSE
#   <chr>  <chr>    <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 drift  Training  32.7  24.7  4.75 0.927 0.981
# 2 mean   Training  42.1  35.8  6.73 1.34  1.26
# 3 naive  Training  33.4  26.6  5.10 1     1
# drift is best.
fc_chosen <- fc |> filter(.model == "drift")

actual_data_plot <- ggplot(wealth, aes(x = Year, y = Wealth)) +
  geom_line(colour = "grey")

# forecast on top of the actual data plot
actual_data_plot +
  geom_line(data = fc_chosen, aes(x = Year, y = .mean, group = .model), colour = "blue") +
  labs(title = "US Wealth as a Percentage of Net Disposable Income (Drift)",
       y = "Wealth as a percentage of net disposable income") +
  guides(colour = guide_legend(title = "Forecast"))
- Australian takeaway food turnover (aus_retail)
# first we filter.
takeaway <- aus_retail |>
  filter(Industry == "Takeaway food services") |>
  index_by(Month) |>
  summarise(total_turnover = sum(Turnover, na.rm = TRUE)) |>
  ungroup()

## then we check, probably seasonal.
fit_mable <- takeaway |>
  model(
    seasonal_naive = SNAIVE(total_turnover),
    naive = NAIVE(total_turnover),
    drift = RW(total_turnover ~ drift()),
    mean = MEAN(total_turnover)
  )

fc <- fit_mable |>
  forecast(h = 20)
# accuracy(fit_mable) |>
#   arrange(.model) |>
#   select(.model, .type, RMSE, MAE, MAPE, MASE, RMSSE)
# # A tibble: 4 × 7
#   .model         .type     RMSE   MAE  MAPE  MASE RMSSE
#   <chr>          <chr>    <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 drift          Training  52.4  34.1  4.51 0.767 0.930
# 2 mean           Training 401.  340.  65.5  7.64  7.12
# 3 naive          Training  52.6  34.2  4.53 0.769 0.932
# 4 seasonal_naive Training  56.4  44.5  6.48 1     1

# drift is best.
fc_chosen <- fc |> filter(.model == "drift")

actual_data_plot <- ggplot(takeaway, aes(x = Month, y = total_turnover)) +
  geom_line(colour = "grey")

# forecast on top of the actual data plot
actual_data_plot +
  geom_line(data = fc_chosen, aes(x = Month, y = .mean, group = .model), colour = "blue") +
  labs(title = "Australian Takeaway Food Turnover (Drift)", y = "Turnover in Australian dollars") +
  guides(colour = guide_legend(title = "Forecast"))
5.2
Use the Facebook stock price (data set gafa_stock) to do the following:
- Produce a time plot of the series.
- Produce forecasts using the drift method and plot them.
- Show that the forecasts are identical to extending the line drawn between the first and last observations.
- Try using some of the other benchmark functions to forecast the same data set. Which do you think is best? Why?
Tasks 1-3 are covered in the plot below; a numerical check of #3 follows the code. For #4, it's almost a toss-up between drift and naive, but the further out you go in decimal places, the better drift looks.
# accuracy(fit_mable) |>
#   arrange(.model) |>
#   select(.model, .type, RMSE, MAE, MAPE, MASE, RMSSE)
# # A tibble: 3 × 7
#   .model .type     RMSE   MAE  MAPE   MASE  RMSSE
#   <chr>  <chr>    <dbl> <dbl> <dbl>  <dbl>  <dbl>
# 1 drift  Training  2.60  1.56  1.18  0.998  1.00
# 2 mean   Training 35.8  30.9  25.7  19.7   13.8
# 3 naive  Training  2.60  1.57  1.18  1      1
fb <- gafa_stock |>
  filter(Symbol == "FB", year(Date) >= 2015) |>
  mutate(day = row_number()) |>
  update_tsibble(index = day, regular = TRUE) |>
  select(day, Close)
# Modeling
fit_mable <- fb |>
  model(
    naive = NAIVE(Close),
    drift = RW(Close ~ drift()),
    mean = MEAN(Close)
  )

fc <- fit_mable |>
  forecast(h = 20)

fc_chosen <- fc |> filter(.model == "drift")
# endpoints of the line between the first and last observations
drift_start <- fb |>
  as_tibble() |>
  summarise(day = min(day), close = first(Close))

drift_end <- fb |>
  as_tibble() |>
  summarise(day = max(day), close = last(Close))
actual_data_plot <- ggplot(fb, aes(x = day, y = Close)) +
  geom_line(colour = "grey") +
  # dashed red line through the first and last observations
  geom_segment(data = NULL, aes(x = drift_start$day, y = drift_start$close,
                                xend = drift_end$day, yend = drift_end$close),
               colour = "red", linetype = "dashed") +
  geom_line(data = fc_chosen, aes(x = day, y = .mean, group = .model), colour = "blue") +
  labs(title = "FB Stock", y = "Close") +
  guides(colour = guide_legend(title = "Forecast"))

print(actual_data_plot)
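As a numerical check of #3 (a sketch, assuming the fb and fc_chosen objects from above): the drift forecast at horizon h is y_T + h * (y_T - y_1) / (T - 1), i.e. the line through the first and last observations extended forward, so recomputing it by hand should match fable's output.

# recompute the drift forecasts by hand and compare against fable
T_len <- nrow(fb)
slope <- (last(fb$Close) - first(fb$Close)) / (T_len - 1)
manual <- last(fb$Close) + (1:20) * slope
all.equal(manual, fc_chosen$.mean)  # expect TRUE (up to floating point)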
5.3
Apply a seasonal naïve method to the quarterly Australian beer production data from 1992. Check if the residuals look like white noise, and plot the forecasts. What can you conclude?
The residuals hover around zero without much discernible pattern, the ACF shows no obvious pattern either, and the histogram looks fairly normal. I think the fit is OK.
# Extract data of interest
recent_production <- aus_production |>
  filter(year(Quarter) >= 1992)

# Define and estimate a model
fit <- recent_production |> model(SNAIVE(Beer))

# Look at the residuals
fit |> gg_tsresiduals()

# Look at some forecasts
fit |> forecast() |> autoplot(recent_production)
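To go beyond the visual check, a Ljung-Box test on the innovation residuals gives a formal test of the white-noise hypothesis (a sketch, assuming the fit object from above; lag = 8, twice the seasonal period, is a common choice for quarterly data). A large p-value means white noise cannot be rejected.

# formal white-noise check on the innovation residuals
augment(fit) |>
  features(.innov, ljung_box, lag = 8)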
5.4
Repeat the previous exercise using the Australian Exports series from global_economy and the Bricks series from aus_production. Use whichever of NAIVE() or SNAIVE() is more appropriate in each case.
- Exports from global_economy
Same as in 5.3: the residuals don't seem to follow a pattern, the ACF isn't unusual, and the residuals look normally distributed. I think this fit is OK.
aus <- global_economy |>
  filter(Code == "AUS") |>
  select("Exports") |>
  filter(!is.na(Exports))

# Annual data, so it has to be NAIVE.
fit <- aus |> model(NAIVE(Exports))
fit |> gg_tsresiduals()
fit |> forecast() |> autoplot(aus)
- Bricks from aus_production
Interesting. NAIVE is the better predictor, as shown in 5.1 above, but I plotted both anyway (not shown here; a sketch of the comparison follows the code below). With SNAIVE you can definitely see patterns in the residual plots, far more than with NAIVE. Since this is quarterly data, I figured the seasonal method would predict better, but NAIVE is the better fit regardless.
bricks <- aus_production |>
  select("Bricks") |>
  filter(!is.na(Bricks))

fit <- bricks |> model(NAIVE(Bricks))
fit |> gg_tsresiduals()
fit |> forecast() |> autoplot(bricks)
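The comparison mentioned above could be reproduced along these lines (a sketch, assuming the bricks object from above; gg_tsresiduals() needs a mable with a single model, hence the select() calls):

# fit both benchmarks in one mable, then inspect residuals per model
fit_both <- bricks |>
  model(naive = NAIVE(Bricks), seasonal_naive = SNAIVE(Bricks))

fit_both |> select(naive) |> gg_tsresiduals()
fit_both |> select(seasonal_naive) |> gg_tsresiduals()

# training accuracy confirms NAIVE edges out SNAIVE here
accuracy(fit_both) |> select(.model, RMSE, MAE, MAPE)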
5.7
For your retail time series (from Exercise 7 in Section 2.10):
- Create a training dataset consisting of observations before 2011 using
set.seed(12345678)
myseries <- aus_retail |>
  filter(`Series ID` == sample(aus_retail$`Series ID`, 1))

myseries_train <- myseries |>
  filter(year(Month) < 2011)
- Check that your data have been split appropriately by producing the following plot.
autoplot(myseries, Turnover) +
autolayer(myseries_train, Turnover, colour = "red")
- Fit a seasonal naïve model using SNAIVE() applied to your training data (myseries_train).
fit <- myseries_train |>
  model(SNAIVE(Turnover))
- Check the residuals. Do the residuals appear to be uncorrelated and normally distributed? No. The residual plot trends up and down, the ACF plot has a definite pattern as it approaches zero, and the histogram is skewed heavily positive.
fit |> gg_tsresiduals()
- Produce forecasts for the test data
fc <- fit |>
  forecast(new_data = anti_join(myseries, myseries_train))
# Joining with `by = join_by(State, Industry, `Series ID`, Month, Turnover)`
fc |> autoplot(myseries)
- Compare the accuracy of your forecasts against the actual values. You can see the seasonality in the forecasts, but I don't think they factor in the trend enough, and they seem to expect a downward trend.
fit |> accuracy()
# # A tibble: 1 × 12
#   State    Industry .model .type    ME  RMSE   MAE   MPE  MAPE  MASE RMSSE  ACF1
#   <chr>    <chr>    <chr>  <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 Norther… Clothin… SNAIV… Trai… 0.439  1.21 0.915  5.23  12.4      1     1 0.768

fc |> accuracy(myseries)
# # A tibble: 1 × 12
#   .model    State Industry .type    ME  RMSE   MAE   MPE  MAPE  MASE RMSSE  ACF1
#   <chr>     <chr> <chr>    <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
# 1 SNAIVE(T… Nort… Clothin… Test  0.836  1.55  1.24  5.94  9.06  1.36  1.28 0.601
- How sensitive are the accuracy measures to the amount of training data used?
I don't have enough experience to answer this definitively. The training sample has about 360 records, while the full aus_retail dataset has about 64K, so the smaller set will naturally give less stable accuracy estimates. The RMSE values suggest the forecasting error isn't too different between training and test. MAPE and MASE are off by about 30%, so perhaps the test forecasts will also be about 30% worse than the training fit suggests.
In my day job, though we strive for perfection, being off by a few million here and there might not matter much. A 30% difference seems small, especially if you're actively trying to do better all the time. I think the sample might be good enough.
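One way to probe this directly would be to refit the same SNAIVE model on progressively shorter training windows and compare accuracy on the same test set (a sketch, assuming the myseries object from above; the cutoff years are arbitrary illustrative choices):

# compare test accuracy across training windows of different lengths;
# note MASE is scaled by each model's training set, so it is not
# directly comparable across windows
test_set <- myseries |> filter(year(Month) >= 2011)

results <- list()
for (start in c(1995, 2000, 2005)) {
  train <- myseries |> filter(year(Month) >= start, year(Month) < 2011)
  results[[as.character(start)]] <- train |>
    model(SNAIVE(Turnover)) |>
    forecast(new_data = test_set) |>
    accuracy(myseries) |>
    mutate(train_start = start)
}
bind_rows(results) |> select(train_start, RMSE, MAE, MAPE, MASE)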