library(fpp3)
## Registered S3 method overwritten by 'tsibble':
## method from
## as_tibble.grouped_df dplyr
## ── Attaching packages ──────────────────────────────────────────── fpp3 1.0.1 ──
## ✔ tibble 3.2.1 ✔ tsibble 1.1.6
## ✔ dplyr 1.1.4 ✔ tsibbledata 0.4.1
## ✔ tidyr 1.3.1 ✔ feasts 0.4.1
## ✔ lubridate 1.9.3 ✔ fable 0.4.1
## ✔ ggplot2 3.5.1
## ── Conflicts ───────────────────────────────────────────────── fpp3_conflicts ──
## ✖ lubridate::date() masks base::date()
## ✖ dplyr::filter() masks stats::filter()
## ✖ tsibble::intersect() masks base::intersect()
## ✖ tsibble::interval() masks lubridate::interval()
## ✖ dplyr::lag() masks stats::lag()
## ✖ tsibble::setdiff() masks base::setdiff()
## ✖ tsibble::union() masks base::union()
library(ggplot2)
Exercises 5.1, 5.2, 5.3, 5.4 and 5.7
Exercise 5.1: Produce forecasts for the following series using whichever of NAIVE(y), SNAIVE(y) or RW(y ~ drift()) is more appropriate in each case.
Australian Population (global_economy)
Original data:
global_economy_aus <- global_economy |>
filter(Country == "Australia")
global_economy_aus |>
autoplot(Population) +
geom_point() +
labs(title= "Australian Population")
The data are annual, so there isn’t any seasonality and we won’t try the seasonal naive model.
Let’s test out the models:
# Experiment with all forecast models
pop_fit <- global_economy |>
filter(!is.na(Population), Country == "Australia") |> # just want data that contains population data
model(
Naive = NAIVE(Population),
Drift = RW(Population ~ drift()),
Mean = MEAN(Population)
)
pop_fc <- pop_fit |>
forecast(h = "5 years")
pop_fc |>
autoplot(global_economy, level = NULL) +
labs(title = "Australian Population",
y = "Population") +
guides(colour = guide_legend(title = "Forecast"))
The mean is clearly the worst option here since it sits far below the recent data. The drift method captures the steady upward trend in the population, so it looks like the best choice, and the final graph would be:
# Final
pop_fit <- global_economy |>
filter(!is.na(Population), Country == "Australia") |> # just want data that contains population data
model(
Drift = RW(Population ~ drift())
)
pop_fc <- pop_fit |>
forecast(h = "5 years")
pop_fc |>
autoplot(global_economy, level = NULL) +
labs(title = "Australian Population",
y = "Population") +
guides(colour = guide_legend(title = "Forecast"))
Bricks (aus_production)
# Experiment with all forecast models
brick_fit <- aus_production |>
filter(!is.na(Bricks)) |> # just want data that contains Bricks data
model(
Seasonal_naive = SNAIVE(Bricks),
Naive = NAIVE(Bricks),
Drift = RW(Bricks ~ drift()),
Mean = MEAN(Bricks)
)
brick_fc <- brick_fit |>
forecast(h = "5 years")
brick_fc |>
autoplot(aus_production, level = NULL) +
labs(title = "Clay brick production in Australia",
y = "Millions of bricks") +
guides(colour = guide_legend(title = "Forecast"))
## Warning: Removed 20 rows containing missing values or values outside the scale range
## (`geom_line()`).
Seasonal naive is the one that makes the most sense here, as it is the only one of these methods that accounts for seasonality.
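As a quick numerical check of that claim, the STL-based seasonal strength feature from feasts can be computed; this is just a sketch, and a value near 1 indicates strong seasonality.
# STL features: seasonal_strength_year near 1 means a strong seasonal pattern
aus_production |>
  filter(!is.na(Bricks)) |>
  features(Bricks, feat_stl) |>
  select(trend_strength, seasonal_strength_year)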
So the final forecast would be:
# Final forecast model
brick_fit <- aus_production |>
filter(!is.na(Bricks)) |> # just want data that contains Bricks data
model(
Seasonal_naive = SNAIVE(Bricks)
)
brick_fc <- brick_fit |>
forecast(h = "5 years")
brick_fc |>
autoplot(aus_production, level = NULL) +
labs(title = "Clay brick production in Australia",
y = "Millions of bricks") +
guides(colour = guide_legend(title = "Forecast"))
## Warning: Removed 20 rows containing missing values or values outside the scale range
## (`geom_line()`).
NSW Lambs (aus_livestock)
Original data:
aus_livestock_nsw_lambs <- aus_livestock |>
filter(Animal == "Lambs", State == "New South Wales")
aus_livestock_nsw_lambs |>
autoplot(Count) +
labs(title= "NSW Lambs")
This series is clearly seasonal, so the seasonal naive method is likely the best option.
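A seasonal plot makes the within-year pattern easier to judge before committing to a model; this is just a quick visual check.
# Seasonal plot: each line is one year, so a repeating shape indicates seasonality
aus_livestock_nsw_lambs |>
  gg_season(Count) +
  labs(title = "Seasonal plot: NSW lambs", y = "Number of animals slaughtered")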
# Experiment with all forecast models
aus_livestock_nsw_lambs_fit <- aus_livestock_nsw_lambs |>
model(
Seasonal_naive = SNAIVE(Count),
Naive = NAIVE(Count),
Drift = RW(Count ~ drift()),
Mean = MEAN(Count)
)
aus_livestock_nsw_lambs_fc <- aus_livestock_nsw_lambs_fit |>
forecast(h = "5 years")
aus_livestock_nsw_lambs_fc |>
autoplot(aus_livestock_nsw_lambs, level = NULL) +
labs(title = "NSW Lambs",
y = "Number of animals slaughtered") +
guides(colour = guide_legend(title = "Forecast"))
The seasonal_naive looks to be the best option, so the final graph would be:
aus_livestock_nsw_lambs_fit <- aus_livestock_nsw_lambs |>
model(
Seasonal_naive = SNAIVE(Count)
)
aus_livestock_nsw_lambs_fc <- aus_livestock_nsw_lambs_fit |>
forecast(h = "5 years")
aus_livestock_nsw_lambs_fc |>
autoplot(aus_livestock_nsw_lambs, level = NULL) +
labs(title = "NSW Lambs",
y = "Number of animals slaughtered") +
guides(colour = guide_legend(title = "Forecast"))
Household wealth (hh_budget)
Original data:
hh_budget |>
autoplot(Wealth) +
labs(title= "Household wealth")
These data are annual, so they are not seasonal; we’ll drop the seasonal naive model from the comparison:
# Experiment with all forecast models
hh_budget_fit <- hh_budget |>
model(
Naive = NAIVE(Wealth),
Drift = RW(Wealth ~ drift()),
Mean = MEAN(Wealth)
)
hh_budget_fc <- hh_budget_fit |>
forecast(h = "5 years")
hh_budget_fc |>
autoplot(hh_budget, level = NULL) +
labs(title = "NSW Lambs",
y = "Number of animals slaughtered") +
guides(colour = guide_legend(title = "Forecast"))
The drift looks to be the best option here. The naive is not a bad second option either.
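As a rough numerical cross-check of that impression, the in-sample (training) accuracy of the three models can be compared; this is not a proper hold-out comparison, just a sanity check.
# Training accuracy per country; a lower RMSE/MAE supports the visual ranking
hh_budget_fit |>
  accuracy() |>
  select(Country, .model, RMSE, MAE) |>
  arrange(Country, RMSE)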
The final graph would be:
hh_budget_fit <- hh_budget |>
model(
Drift = RW(Wealth ~ drift())
)
hh_budget_fc <- hh_budget_fit |>
forecast(h = "5 years")
hh_budget_fc |>
autoplot(hh_budget, level = NULL) +
labs(title = "NSW Lambs",
y = "Number of animals slaughtered") +
guides(colour = guide_legend(title = "Forecast"))
Australian takeaway food turnover (aus_retail)
Original data:
aus_retail_takeaway_food <- aus_retail |>
filter(Industry == "Takeaway food services")
aus_retail_takeaway_food |>
autoplot(Turnover) +
labs(title= "Australian takeaway food turnover")
# Experiment with all forecast models
aus_retail_takeaway_food_fit <- aus_retail_takeaway_food |>
model(
Seasonal_naive = SNAIVE(Turnover),
Naive = NAIVE(Turnover),
Drift = RW(Turnover ~ drift()),
Mean = MEAN(Turnover)
)
aus_retail_takeaway_food_fc <- aus_retail_takeaway_food_fit |>
forecast(h = "5 years")
aus_retail_takeaway_food_fc |>
autoplot(aus_retail_takeaway_food, level = NULL) +
facet_grid(vars(State), scales = "free_y") +
labs(title= "Australian takeaway food turnover") +
guides(colour = guide_legend(title = "Forecast"))
Drift is not too bad here, but since these graphs show seasonality, the
seasonal naive would most likely be best.
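To back this up with numbers rather than just the plot, here is a small time-series cross-validation sketch for a single state (Victoria, chosen arbitrarily; the .init and .step values are arbitrary choices too).
# Stretching the tsibble creates successive training sets; accuracy() then
# averages the out-of-sample errors over all folds
aus_retail_takeaway_food |>
  filter(State == "Victoria") |>
  stretch_tsibble(.init = 200, .step = 12) |>
  model(
    Seasonal_naive = SNAIVE(Turnover),
    Drift = RW(Turnover ~ drift())
  ) |>
  forecast(h = 12) |>
  accuracy(aus_retail_takeaway_food) |>
  select(.model, RMSE, MAE)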
So the final graph would be:
aus_retail_takeaway_food_fit <- aus_retail_takeaway_food |>
model(
Seasonal_naive = SNAIVE(Turnover)
)
aus_retail_takeaway_food_fc <- aus_retail_takeaway_food_fit |>
forecast(h = "5 years")
aus_retail_takeaway_food_fc |>
autoplot(aus_retail_takeaway_food, level = NULL) +
facet_grid(vars(State), scales = "free_y") +
labs(title= "Australian takeaway food turnover") +
guides(colour = guide_legend(title = "Forecast"))
Below is some of my own experimenting.
Some of the original series show a lot of variance, so let’s see what a Box-Cox transformation does, just for Victoria:
# Experimentation code
aus_retail_takeaway_food_cap <- aus_retail_takeaway_food |>
filter(State == "Victoria")
aus_retail_takeaway_food_cap |>
autoplot(Turnover)
lambda_aus_retail_takeaway_food <- aus_retail_takeaway_food_cap |>
features(Turnover, features = guerrero) |>
pull(lambda_guerrero)
# lambda_aus_retail_takeaway_food
aus_retail_takeaway_food_cap |>
autoplot(box_cox(Turnover, lambda_aus_retail_takeaway_food))
fc <- aus_retail_takeaway_food_cap |>
model(SNAIVE(box_cox(Turnover, lambda_aus_retail_takeaway_food))) |>
forecast(h = 50) |>
mutate(.median = median(Turnover))
fc |>
autoplot(aus_retail_takeaway_food_cap |> filter(!is.na(Turnover)), level = 80) +
geom_line(aes(y = .median), data = fc, linetype = 2, col = "blue")
Seasonal naive still looks like a good model here. The mean and median forecasts are close to one another, with the mean sitting slightly above the median; that gap is the bias adjustment that comes from back-transforming the forecast distribution.
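To see that numerically rather than just on the plot, here is a quick look at the gap between the back-transformed mean and the median over the first few horizons (a sketch using the fc object created above).
# The bias-adjusted mean should sit slightly above the back-transformed median
fc |>
  as_tibble() |>
  mutate(mean_minus_median = .mean - .median) |>
  select(Month, .mean, .median, mean_minus_median) |>
  head()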
Exercise 5.2: Use the Facebook stock price (data set gafa_stock) to do the following: produce a time plot of the series, produce forecasts using the drift method and plot them, show that the forecasts are identical to extending the line drawn between the first and last observations, and try some of the other benchmark functions.
fb_stock <- gafa_stock |>
filter(Symbol == "FB") |>
mutate(trading_day = row_number()) |>
update_tsibble(index = trading_day, regular = TRUE)
fb_stock |>
autoplot(Close)
fb_stock |>
model(RW(Close ~ drift())) |>
forecast(h = 42) |>
autoplot(fb_stock, level = NULL) +
labs(title = "Facebook closing stock price", y = "$US") +
guides(colour = guide_legend(title = "Forecast"))
Let’s draw the line through the first and last observations using geom_abline(); fitting lm() through just those two points gives its intercept and slope:
# grab the first and last observation
last_row <- tail(fb_stock, n = 1)
first_row <- head(fb_stock, n = 1)
# create the data points
x <- c(first_row$trading_day, last_row$trading_day)
y <- c(first_row$Close, last_row$Close)
# create a data frame
data <- data.frame(x, y)
# create a linear model
regression_line <- lm(formula = y ~ x,
data=data)
# get the parameters
coeff <- coefficients(regression_line)
intercept <- coeff[1]
slope <- coeff[2]
fb_stock |>
model(RW(Close ~ drift())) |>
forecast(h = 42) |>
autoplot(fb_stock, level = NULL) +
labs(title = "Facebook closing stock price", y = "$US") +
guides(colour = guide_legend(title = "Forecast")) +
geom_abline(intercept = intercept, slope = slope, color="red", linetype="dashed", linewidth=1.5)
As you can see, the red dashed line runs right along the blue forecast line, showing that the drift forecast is identical to extending the line drawn between the first and last observations.
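The same thing can be checked numerically; this sketch reuses first_row and last_row from above, since the estimated drift term should equal the slope of the line joining the first and last observations, (y_T - y_1) / (T - 1).
# Estimated drift coefficient from the fitted model
fb_stock |>
  model(Drift = RW(Close ~ drift())) |>
  tidy() |>
  select(term, estimate)
# Slope of the line between the first and last observations
(last_row$Close - first_row$Close) /
  (last_row$trading_day - first_row$trading_day)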
fb_stock_fit <- fb_stock |>
model(
Seasonal_naive = SNAIVE(Close),
Naive = NAIVE(Close),
Mean = MEAN(Close)
)
## Warning: 1 error encountered for Seasonal_naive
## [1] Non-seasonal model specification provided, use RW() or provide a different lag specification.
fb_stock_fit_fc <- fb_stock_fit |>
forecast(h = 42)
fb_stock_fit_fc |>
autoplot(fb_stock, level = NULL) +
labs(title = "Facebook closing stock price", y = "$US") +
guides(colour = guide_legend(title = "Forecast"))
## Warning: Removed 42 rows containing missing values or values outside the scale range
## (`geom_line()`).
The seasonal naive model failed to fit, which makes sense: after re-indexing by trading day the series has no defined seasonal period, and there is no clear seasonality to exploit anyway. The naive forecast is reasonable, but the drift method is probably best because it captures the overall trend; this series shows sustained increases and decreases, so drift should be slightly more accurate than the plain naive. The mean is far too low and ignores the trend entirely, so it is the worst option here.
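As a rough numerical tiebreaker (training accuracy only, so take it with a grain of salt), the benchmark methods can be compared directly, adding drift back in alongside the models fitted above.
# In-sample accuracy of the benchmark methods on the Facebook series
fb_stock |>
  model(
    Naive = NAIVE(Close),
    Drift = RW(Close ~ drift()),
    Mean = MEAN(Close)
  ) |>
  accuracy() |>
  select(.model, RMSE, MAE)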
Exercise 5.3: Apply a seasonal naive method to the quarterly Australian beer production data from 1992. Check if the residuals look like white noise, and plot the forecasts.
# Extract data of interest
recent_production <- aus_production |>
filter(year(Quarter) >= 1992)
# Define and estimate a model
fit <- recent_production |> model(SNAIVE(Beer))
# Look at the residuals
fit |> gg_tsresiduals()
## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 4 rows containing non-finite outside the scale range
## (`stat_bin()`).
# Look at some forecasts
fit |> forecast() |> autoplot(recent_production)
What do you conclude?
Apart from a spike around 1996 Q1, the time plot of the residuals shows fairly constant variance over time. The ACF of the residuals shows little autocorrelation at most lags, and the histogram looks roughly normal.
Now let’s check the residuals for white noise. Since the series is seasonal, we use l = 2m, where m is the seasonal period; with quarterly data m = 4, so l = 2 × 4 = 8.
# Box-Pierce test
augment(fit) |> features(.innov, box_pierce, lag = 8)
# Ljung-Box test
augment(fit) |> features(.innov, ljung_box, lag = 8)
Both tests give a p-value below 0.05, so we reject the null hypothesis that the residuals are white noise: the remaining autocorrelation is unlikely to be due to chance, and there may be information left in the residuals that the forecasts are not using.
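Just to make the decision rule explicit in code (a small sketch), the Ljung-Box p-value can be pulled out and compared against the 5% threshold directly.
# TRUE would mean the residuals are consistent with white noise at the 5% level
augment(fit) |>
  features(.innov, ljung_box, lag = 8) |>
  mutate(consistent_with_white_noise = lb_pvalue >= 0.05)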
Exercise 5.4: Repeat the previous exercise using the Australian Exports series from global_economy and the Bricks series from aus_production. Use whichever of NAIVE() or SNAIVE() is more appropriate in each case.
Australian Exports (global_economy)
Original data:
global_economy_export_aus <- global_economy |>
filter(Country == "Australia")
global_economy_export_aus |>
autoplot(Exports) +
geom_point() +
labs(title= "Australian Exports")
There does not seem to be any seasonality here, so let’s use NAIVE().
# Define and estimate a model
fit <- global_economy_export_aus |> model(NAIVE(Exports))
# Look at the residuals
fit |> gg_tsresiduals()
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_bin()`).
# Look at some forecasts
fit |> forecast() |> autoplot(global_economy_export_aus)
# Box-Pierce test
augment(fit) |> features(.innov, box_pierce, lag = 10)
# Ljung-Box test
augment(fit) |> features(.innov, ljung_box, lag = 10)
For both tests, the p-value is larger than 0.05, so the autocorrelation in the residuals could easily be due to chance. These residuals are not easily distinguished from white noise; we fail to reject the white noise hypothesis.
Bricks (aus_production)
Original data:
autoplot(aus_production, Bricks) + geom_point()
## Warning: Removed 20 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 20 rows containing missing values or values outside the scale range
## (`geom_point()`).
There looks to be seasonality here, so let’s use SNAIVE():
# Experiment with all forecast models
brick_fit <- aus_production |>
filter(!is.na(Bricks)) |> # just want data that contains Bricks data
model(
Seasonal_naive = SNAIVE(Bricks)
)
# Look at the residuals
brick_fit |> gg_tsresiduals()
## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 4 rows containing non-finite outside the scale range
## (`stat_bin()`).
# Look at some forecasts
brick_fit |> forecast() |> autoplot(aus_production)
## Warning: Removed 20 rows containing missing values or values outside the scale range
## (`geom_line()`).
The time plot of the residuals shows large and changing variance, which means this forecast model needs some work. Additionally, the ACF shows strong autocorrelation, and the histogram is left-skewed with some possible outliers. The residuals fail the checks of constant variance and normality.
Now let’s check the residuals for white noise. Since the series is seasonal, we again use l = 2m; with quarterly data m = 4, so l = 2 × 4 = 8.
# Box-Pierce test
augment(brick_fit) |> features(.innov, box_pierce, lag = 8)
# Ljung-Box test
augment(brick_fit) |> features(.innov, ljung_box, lag = 8)
For both tests, the p-value is less than 0.05 (essentially zero), so the remaining autocorrelation is very unlikely to be due to chance. These residuals are easily distinguished from white noise; we reject the white noise hypothesis.
Exercise 5.7: For my retail time series (from the earlier retail exercise), create a training dataset consisting of observations before 2011, check the split, fit a seasonal naive model, and compare its accuracy on the training and test data.
set.seed(100)
myseries <- aus_retail |>
filter(`Series ID` == sample(aus_retail$`Series ID`,1))
myseries_train <- myseries |>
filter(year(Month) < 2011)
autoplot(myseries, Turnover) +
autolayer(myseries_train, Turnover, colour = "red")
Fit a seasonal naive model using SNAIVE() applied to the training data (myseries_train).
fit <- myseries_train |>
model(SNAIVE(Turnover))
# myseries_train
fit |> gg_tsresiduals()
## Warning: Removed 12 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 12 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 12 rows containing non-finite outside the scale range
## (`stat_bin()`).
Do the residuals appear to be uncorrelated and normally distributed?
No, the time plot of the residuals shows a lot of variance, and the ACF shows strong correlation. Additionally, the histogram does not show a normal distribution as the distribution is right skewed with an outlier.
fc <- fit |>
forecast(new_data = anti_join(myseries, myseries_train)) # test data
## Joining with `by = join_by(State, Industry, `Series ID`, Month, Turnover)`
fc |> autoplot(myseries)
fit |> accuracy() # myseries_train
fc |> accuracy(myseries) # test data
The errors on the test data are quite large: the test RMSE is roughly six times the training RMSE. You can also see this visually, as the forecasts sit well below the actual test values.
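That ratio can be computed directly from the two accuracy tables (a quick sketch; it assumes a single model per table, as is the case here).
# Ratio of test RMSE to training RMSE for the seasonal naive model
test_rmse <- fc |> accuracy(myseries) |> pull(RMSE)
train_rmse <- fit |> accuracy() |> pull(RMSE)
test_rmse / train_rmse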
Let’s add another 5 years of training:
set.seed(100)
myseries <- aus_retail |>
filter(`Series ID` == sample(aus_retail$`Series ID`,1))
myseries_train <- myseries |>
filter(year(Month) < 2016)
fit <- myseries_train |>
model(SNAIVE(Turnover))
fit |> gg_tsresiduals()
## Warning: Removed 12 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 12 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 12 rows containing non-finite outside the scale range
## (`stat_bin()`).
fc <- fit |>
forecast(new_data = anti_join(myseries, myseries_train)) # test data
## Joining with `by = join_by(State, Industry, `Series ID`, Month, Turnover)`
fc |> autoplot(myseries)
fit |> accuracy() # myseries_train
fc |> accuracy(myseries) # test data
Adding just five more years of training data cut the test errors by more than half. Visually, the forecasts now track the actual values much more closely.