suppressWarnings({
library(fpp3)
library(dplyr)
library(tidyr)
library(fable)
})
## Registered S3 method overwritten by 'tsibble':
## method from
## as_tibble.grouped_df dplyr
## ── Attaching packages ──────────────────────────────────────────── fpp3 1.0.0 ──
## ✔ tibble 3.2.1 ✔ tsibble 1.1.5
## ✔ dplyr 1.1.2 ✔ tsibbledata 0.4.1
## ✔ tidyr 1.3.0 ✔ feasts 0.3.2
## ✔ lubridate 1.9.2 ✔ fable 0.3.4
## ✔ ggplot2 3.5.1 ✔ fabletools 0.4.2
## ── Conflicts ───────────────────────────────────────────────── fpp3_conflicts ──
## ✖ lubridate::date() masks base::date()
## ✖ dplyr::filter() masks stats::filter()
## ✖ tsibble::intersect() masks base::intersect()
## ✖ tsibble::interval() masks lubridate::interval()
## ✖ dplyr::lag() masks stats::lag()
## ✖ tsibble::setdiff() masks base::setdiff()
## ✖ tsibble::union() masks base::union()
aus_pop <- global_economy %>%
filter(Country == "Australia")
autoplot(aus_pop, Population) +
labs(title = "Australian Population")
Since this time series follows a clear linear trend, a drift model, which extrapolates the average change between the first and last observations, will produce the most appropriate forecasts.
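As a quick check on that claim, the slope the drift method extrapolates can be computed directly from the first and last observations (a minimal sketch; first_pop and last_pop are just illustrative names):
# Average yearly change between the first and last observations -- the slope
# that RW(Population ~ drift()) extrapolates forward
first_pop <- first(aus_pop$Population)
last_pop <- last(aus_pop$Population)
(last_pop - first_pop) / (nrow(aus_pop) - 1)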
aus_pop_model <- aus_pop %>%
model(RW(Population ~ drift()))
aus_pop_model %>%
forecast() %>%
autoplot(aus_pop) +
labs(title = "Australian Population Forecast")
aus_bricks <- aus_production %>%
  select(Quarter, Bricks) %>%
  drop_na()
autoplot(aus_bricks, Bricks) +
labs(title = "Australian Brick Production")
Since the bricks time series exhibits seasonality, the SNAIVE() method will be most appropriate.
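Because the seasonal naive method repeats the most recent observation from the same quarter, the forecasts will simply recycle the last four observed quarterly values; a minimal sketch to see them:
# The last complete seasonal cycle of observations -- the values SNAIVE repeats
tail(aus_bricks$Bricks, 4)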
aus_bricks_model <- aus_bricks %>%
model(SNAIVE(Bricks))
aus_bricks_model %>%
forecast() %>%
autoplot(aus_bricks) +
labs(title = "Australian Brick Production Forecast")
nsw_lambs <- aus_livestock %>%
filter(State == "New South Wales") %>%
filter(Animal == "Lambs")
autoplot(nsw_lambs, Count) +
labs(title = "Lambs in New South Wales")
Like brick production, the count of lambs slaughtered in New South Wales exhibits seasonality, so SNAIVE() is most appropriate.
nsw_lambs_model <- nsw_lambs %>%
model(SNAIVE(Count))
nsw_lambs_model %>%
forecast() %>%
autoplot(nsw_lambs) +
labs(title = "New South Wales Lambs Forecast")
hh_wealth <- hh_budget %>%
  select(Country, Year, Wealth)
autoplot(hh_wealth, Wealth) +
labs(title = "Household Wealth", y = "Wealth (USD)")
With an upward trend, the drift method will be appropriate.
hh_wealth_model <- hh_wealth %>%
model(RW(Wealth ~ drift()))
hh_wealth_model %>%
forecast() %>%
autoplot(hh_wealth) +
labs(title = "Household Wealth Forecast")
aus_takeaway <- aus_retail %>%
filter(Industry == "Takeaway food services")
autoplot(aus_takeaway, Turnover) +
labs(title = "Turnover at Australian Takeaways")
With clear seasonality, the SNAIVE() method will be most appropriate.
aus_takeaway_model <- aus_takeaway %>%
model(SNAIVE(Turnover))
aus_takeaway_forecast <- aus_takeaway_model %>%
forecast()
ggplot() +
geom_line(data = aus_takeaway, aes(x = Month, y = Turnover, color = State)) +
geom_line(data = aus_takeaway_forecast, aes(x = Month, y = .mean, color = State), linetype = "dashed") +
labs(title = "Australian Takeaway Turnover Forecast",
y = "Turnover",
x = "Time") +
theme_minimal()
# Keep the FB closing prices, declare the daily index regular, and fill the
# non-trading days with NA so the series has no gaps
fb_close_price <- gafa_stock %>%
  select(Symbol, Date, Close) %>%
  filter(Symbol == "FB", year(Date) >= 2014) %>%
  mutate(day = row_number()) %>%
  update_tsibble(index = Date, regular = TRUE) %>%
  fill_gaps()
autoplot(fb_close_price, Close) +
labs(title = "Facebook Closing Stock Price")
fb_close_model <- fb_close_price %>%
model(RW(Close ~ drift()))
fb_close_forecast <- fb_close_model %>%
forecast(h = "3 years")
fb_close_forecast %>%
autoplot(fb_close_price) +
labs(title = "Facebook Close Price Forecast")
We can show that the drift forecast line and the line drawn between the first and last observations are the same by comparing their slopes and y-intercepts.
fb_close_forecast <- fb_close_forecast %>%
  mutate(day = row_number())
# Compare slopes: 1096 is the number of forecast days (2019-2021), 1825 is the
# last row of the gap-filled observed series, and 1258 is the number of trading
# days in the 2014-2018 sample
slope_forecast <- (fb_close_forecast$.mean[1096] - fb_close_forecast$.mean[1]) / (1096 - 1)
slope_actual <- (fb_close_price$Close[1825] - fb_close_price$Close[1]) / (1258 - 1)
round(slope_actual, 5) == round(slope_forecast, 5)
## [1] TRUE
# Compare y-intercepts: 131.1508 is the first forecast mean, 54.71 is the first
# observed closing price, and day 1259 is the first step after the 1258 observed
# trading days
forecast_b <- 131.1508 - (slope_forecast * 1259)
actual_b <- 54.71 - (slope_actual * 1)
round(forecast_b, 3) == round(actual_b, 3)
## [1] TRUE
Since both the forecast and the line drawn between the first and last observations have the same slope and y-intercept, they are the same line.
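The same comparison can be made without hard-coded row numbers; a minimal sketch, assuming the fitted RW model exposes its drift coefficient through tidy():
# Estimated drift coefficient (average change per step) from the fitted model
tidy(fb_close_model)
# Slope of the straight line joining the first and last observed closing prices
observed <- fb_close_price %>% filter(!is.na(Close))
(last(observed$Close) - first(observed$Close)) / (nrow(observed) - 1)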
fb_close_price %>%
model(NAIVE(Close)) %>%
forecast(h = "3 years") %>%
autoplot(fb_close_price) +
labs(title = "Facebook Close Price Forecast: NAIVE Model")
fb_close_price %>%
model(SNAIVE(Close)) %>%
forecast(h = "3 years") %>%
autoplot(fb_close_price) +
labs(title = "Facebook Close Price Forecast: SNAIVE Model")
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_line()`).
I think the NAIVE model is best. The drift model is not appropriate because it relies on the historical data to predict an overall upward trend, when the recent trend has been downward. I would not use the SNAIVE model because there is no apparent seasonality.
The following code will help.
# Extract data of interest
recent_production <- aus_production |>
filter(year(Quarter) >= 1992)
# Define and estimate a model
fit <- recent_production |> model(SNAIVE(Beer))
# Look at the residuals
fit |> gg_tsresiduals()
## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 4 rows containing non-finite outside the scale range
## (`stat_bin()`).
# Look at some forecasts
fit |> forecast() |> autoplot(recent_production)
The residuals do appear to be uncorrelated. They are also approximately normally distributed, with a mean of 0. This suggests that the SNAIVE model is a good fit for the data.
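A Ljung-Box test gives a more formal check of the residual autocorrelation than reading the ACF plot alone; a minimal sketch (lag = 8, two seasonal periods, is a common rule of thumb rather than anything required here):
# Portmanteau test on the innovation residuals; a large p-value is consistent
# with white-noise residuals
fit |> augment() |> features(.innov, ljung_box, lag = 8)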
Use whichever of NAIVE() or SNAIVE() is more appropriate in each case.
aus_exports <- global_economy %>%
  select(Country, Year, Exports) %>%
  filter(Country == "Australia")
autoplot(aus_exports, Exports) +
labs(title = "Australian Exports")
aus_exports_model <- aus_exports %>%
model(NAIVE(Exports))
aus_exports_model %>%
forecast() %>%
autoplot(aus_exports) +
labs(title = "Australian Exports Forecast")
aus_exports_model %>% gg_tsresiduals()
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_bin()`).
I used the NAIVE model because the data are annual, so there is no seasonality to capture. The residuals appear to be uncorrelated and approximately normally distributed with a mean of 0, so we can conclude this is a good fit.
aus_bricks_model %>%
forecast() %>%
autoplot(aus_bricks) +
labs(title = "Australian Brick Production Forecast")
gg_tsresiduals(aus_bricks_model)
## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 4 rows containing non-finite outside the scale range
## (`stat_bin()`).
I used the SNAIVE model since there is seasonality in the data. These residual plots do not suggest as good a fit as the previous two models provided for their data sets. There is some heteroscedasticity, and the residuals appear to be skewed rather than normally distributed. However, the mean is still close to 0, and there is no obvious pattern in the residuals. Still, there may be a better way to model this data.
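One option worth exploring (a sketch only, not a claim that it outperforms the plain SNAIVE model) is to stabilise the variance with a Box-Cox transformation chosen by the Guerrero method; because the transformation sits inside the model formula, fable back-transforms the forecasts automatically:
# Pick a Box-Cox lambda, fit SNAIVE on the transformed series, and forecast;
# the forecasts are returned on the original Bricks scale
lambda <- aus_bricks %>%
  features(Bricks, features = guerrero) %>%
  pull(lambda_guerrero)
aus_bricks %>%
  model(SNAIVE(box_cox(Bricks, lambda))) %>%
  forecast() %>%
  autoplot(aus_bricks) +
  labs(title = "Brick Production Forecast with Box-Cox Transformation")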
set.seed(1989)
myseries <- aus_retail |>
filter(`Series ID` == sample(aus_retail$`Series ID`,1))
autoplot(myseries, Turnover) +
labs(title = "Turnover at Australian Retail Stores")
myseries_train <- myseries |>
filter(year(Month) < 2011)
autoplot(myseries, Turnover) +
autolayer(myseries_train, Turnover, colour = "red") +
labs(title = "Australian Retail Turnover: Test/ Train Split", y = "Turnover")
fit <- myseries_train |>
model(SNAIVE(Turnover))
fit |> gg_tsresiduals()
## Warning: Removed 12 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 12 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 12 rows containing non-finite outside the scale range
## (`stat_bin()`).
fc <- fit |>
forecast(new_data = anti_join(myseries, myseries_train))
## Joining with `by = join_by(State, Industry, `Series ID`, Month, Turnover)`
fc |> autoplot(myseries)
fit |> accuracy()
## # A tibble: 1 × 12
## State Industry .model .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Norther… Food re… SNAIV… Trai… 3.10 4.99 3.99 5.28 6.99 1.00 1 0.823
fc |> accuracy(myseries)
## # A tibble: 1 × 12
## .model State Industry .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 SNAIVE(T… Nort… Food re… Test 20.1 23.4 20.6 16.6 17.0 5.15 4.69 0.932
The forecast did not produce very accurate predictions for the test data, which we can see from accuracy measures such as the root mean squared error. The SNAIVE model accounted for the seasonality but not the upward trend, leaving the forecast significantly lower than the actual data. In addition, we had previously used a Box-Cox transformation on this time series because of its unequal variance. The model may be improved by first performing the Box-Cox transformation, then creating the model and forecast, and finally back-transforming the forecast to the original scale.
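In fable this whole sequence can be written in one step, because a transformation placed inside the model formula is automatically inverted when forecasting; a minimal sketch reusing the training split above (the lambda is re-estimated here with the Guerrero method rather than carried over from the earlier exercise, and fit_bc is an illustrative name):
# Estimate a Box-Cox lambda from the training data, fit SNAIVE on the
# transformed turnover, and evaluate the back-transformed forecasts
lambda <- myseries_train |>
  features(Turnover, features = guerrero) |>
  pull(lambda_guerrero)
fit_bc <- myseries_train |>
  model(SNAIVE(box_cox(Turnover, lambda)))
fit_bc |>
  forecast(new_data = anti_join(myseries, myseries_train)) |>
  accuracy(myseries)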
Given the upward trend in this series, the amount of training data used, and hence how far ahead the model must forecast, should significantly affect the accuracy of the forecast. For example, if we use all but the last year as training data:
myseries_train_2 <- myseries |>
filter(year(Month) < 2018)
fit_2 <- myseries_train_2 |>
model(SNAIVE(Turnover))
fit_2 |> gg_tsresiduals()
## Warning: Removed 12 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 12 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 12 rows containing non-finite outside the scale range
## (`stat_bin()`).
fc_2 <- fit_2 |>
forecast(new_data = anti_join(myseries, myseries_train_2))
## Joining with `by = join_by(State, Industry, `Series ID`, Month, Turnover)`
fc_2 |> autoplot(myseries)
fit_2 |> accuracy()
## # A tibble: 1 × 12
## State Industry .model .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Norther… Food re… SNAIV… Trai… 3.38 5.47 4.22 4.89 6.34 1.00 1 0.829
fc_2 |> accuracy(myseries)
## # A tibble: 1 × 12
## .model State Industry .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 SNAIVE(T… Nort… Food re… Test 2.22 2.98 2.6 1.67 2.02 0.616 0.546 0.194
We can see from the graph that this forecast tracks the actual data much more closely, and accuracy measures such as the RMSE decrease substantially compared with the first forecast, which used less training data.
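For a compact comparison of the two experiments, the test-set accuracy tables can be stacked into one tibble; a minimal sketch (the training_end labels are illustrative):
# Side-by-side test-set accuracy for the two training windows
bind_rows(
  fc |> accuracy(myseries) |> mutate(training_end = "Dec 2010"),
  fc_2 |> accuracy(myseries) |> mutate(training_end = "Dec 2017")
) |>
  select(training_end, .model, RMSE, MAE, MAPE, MASE)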