library(tsibble)
## Warning: package 'tsibble' was built under R version 4.3.3
## Registered S3 method overwritten by 'tsibble':
## method from
## as_tibble.grouped_df dplyr
##
## Attaching package: 'tsibble'
## The following objects are masked from 'package:base':
##
## intersect, setdiff, union
library(fpp3)
## Warning: package 'fpp3' was built under R version 4.3.3
## ── Attaching packages ──────────────────────────────────────────── fpp3 1.0.1 ──
## ✔ tibble 3.2.1 ✔ ggplot2 3.5.1
## ✔ dplyr 1.1.4 ✔ tsibbledata 0.4.1
## ✔ tidyr 1.3.1 ✔ feasts 0.4.1
## ✔ lubridate 1.9.3 ✔ fable 0.4.0
## Warning: package 'feasts' was built under R version 4.3.3
## Warning: package 'fabletools' was built under R version 4.3.3
## Warning: package 'fable' was built under R version 4.3.3
## ── Conflicts ───────────────────────────────────────────────── fpp3_conflicts ──
## ✖ lubridate::date() masks base::date()
## ✖ dplyr::filter() masks stats::filter()
## ✖ tsibble::intersect() masks base::intersect()
## ✖ lubridate::interval() masks tsibble::interval()
## ✖ dplyr::lag() masks stats::lag()
## ✖ tsibble::setdiff() masks base::setdiff()
## ✖ tsibble::union() masks base::union()
Produce forecasts for the following series using whichever of NAIVE(y), SNAIVE(y) or RW(y ~ drift()) is more appropriate in each case:
pop_data <- global_economy |>
  filter(Code == 'AUS') |>
  select(Year, Population)
autoplot(pop_data, Population)
The plot shows consistent growth over time, so a drift model is most
appropriate.
pop_data |>
  model(RW(Population ~ drift())) |>
  forecast(h = 20) |>
  autoplot(pop_data, level = NULL)
bricks_data <- aus_production |>
  select(Quarter, Bricks) |>
  drop_na()
autoplot(bricks_data, Bricks)
The data shows seasonal variation, so SNAIVE is most appropriate.
bricks_data |>
  model(SNAIVE(Bricks)) |>
  forecast(h = 20) |>
  autoplot(bricks_data, level = NULL)
lambs_data <- aus_livestock |>
  filter(State == 'New South Wales') |>
  filter(Animal == 'Lambs') |>
  select(Month, Count)
autoplot(lambs_data, Count)
This series does not show seasonal variation and has no clear long-term trend up or down, so a NAIVE() model is most appropriate.
lambs_data |>
  model(NAIVE(Count)) |>
  forecast(h = 100) |>
  autoplot(lambs_data, level = NULL)
wealth_data <- hh_budget |>
  filter(Country == 'Australia') |>
  select(Year, Wealth)
autoplot(wealth_data, Wealth)
Household wealth collapsed during and after the 2008 financial crisis, but seems to have recovered roughly linearly since 2011. A drift model is most appropriate.
wealth_data |>
  filter_index('2011' ~ '2016') |>
  model(RW(Wealth ~ drift())) |>
  forecast(h = 10) |>
  autoplot(wealth_data, level = NULL)
takeaway_data <- aus_retail |>
  filter(Industry == 'Takeaway food services') |>
  filter(State == 'Australian Capital Territory') |>
  select(Month, Turnover)
autoplot(takeaway_data, Turnover)
This series shows an upward trend but does not show obvious seasonality, making a drift model most appropriate.
takeaway_data |>
  model(RW(Turnover ~ drift())) |>
  forecast(h = 100) |>
  autoplot(takeaway_data, level = NULL)
Use the Facebook stock price (data set gafa_stock) to do the following:
Produce a time plot of the series.
fb_stock_open <- gafa_stock |>
  filter(Symbol == 'FB') |>
  select(Date, Open)
autoplot(fb_stock_open, Open)
Produce forecasts using the drift method and plot them.
day_nums <- seq_len(nrow(fb_stock_open))
fb_stock_regular <- data.frame(day_num = day_nums, Open = fb_stock_open$Open) |>
  as_tsibble(index = day_num)
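An equivalent, more pipeline-friendly construction (the approach the fpp3 text itself uses for trading-day data) re-indexes the series with row_number() and update_tsibble(); the name fb_stock_regular2 is just my label for this alternative object:
# Alternative sketch: re-index by trading day within one pipeline
# ('fb_stock_regular2' is a hypothetical name for the same regular series)
fb_stock_regular2 <- gafa_stock |>
  filter(Symbol == 'FB') |>
  mutate(day_num = row_number()) |>
  update_tsibble(index = day_num, regular = TRUE) |>
  select(day_num, Open)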
fb_stock_regular |>
  model(RW(Open ~ drift())) |>
  forecast(h = 365) |>
  autoplot(fb_stock_regular, level = NULL)
Show that the forecasts are identical to extending the line drawn between the first and last observations.
# Construct the equation of the line passing through the first and last observations
first_and_last_point <- fb_stock_regular[c(1, nrow(fb_stock_regular)), ] |> as_tibble()
line_slope <- as.numeric((first_and_last_point[2, 2] - first_and_last_point[1, 2]) /
  (first_and_last_point[2, 1] - first_and_last_point[1, 1]))
line_int <- as.numeric(first_and_last_point[2, 2] - (line_slope * first_and_last_point[2, 1]))
# Use the line to create 365 days of projections after the last observation
day_num_2 <- 1:(nrow(fb_stock_regular) + 365)
linear_proj <- line_int + (day_num_2 * line_slope)
projection_df <- data.frame(day_num = day_num_2, line_val = linear_proj)
projection_df <- projection_df[(nrow(projection_df) - 364):nrow(projection_df), ]
# Create a data frame of the drift forecast
fb_forecast <- fb_stock_regular |>
  model(RW(Open ~ drift())) |>
  forecast(h = 365)
fb_forecast <- as.data.frame(fb_forecast) |>
  select(day_num, .mean) |>
  rename(forecast = .mean)
# Merge the data frames on day_num
comparison_df <- merge(projection_df, fb_forecast)
print(head(comparison_df))
## day_num line_val forecast
## 1 1259 134.5133 134.5133
## 2 1260 134.5767 134.5767
## 3 1261 134.6400 134.6400
## 4 1262 134.7034 134.7034
## 5 1263 134.7667 134.7667
## 6 1264 134.8300 134.8300
As shown in the table, the values created by extending the line passing through the first and last observations are identical to those created by the drift forecast.
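This is no coincidence. By definition (as given in fpp3), the drift method's h-step forecast is the last observation plus h times the average historical change, and that average change is exactly the slope of the line joining the first and last observations:

$$\hat{y}_{T+h|T} = y_T + h\left(\frac{y_T - y_1}{T - 1}\right)$$

Extending h beyond the end of the sample therefore traces out precisely the line computed above.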
Try using some of the other benchmark functions to forecast the same data set. Which do you think is best? Why?
fb_stock_regular |>
model(NAIVE(Open)) |>
forecast(h = 365) |>
autoplot(fb_stock_regular, level = NULL)
I think the drift model is more appropriate. It is extremely rare for a stock to stay flat for an extended period, and it is generally a safe bet that the stock market as a whole will rise over time. Absent particularly bad news for Facebook, their stock price is more likely to rise in the long term than to stay flat.
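Beyond visual comparison, one rough way to test this preference is to hold out the final stretch of trading days and compare test-set accuracy across the benchmarks. A minimal sketch, assuming an arbitrary 250-day holdout (roughly one trading year):
# Hold out the last 250 trading days and compare the benchmark methods
# (the 250-day cutoff is an arbitrary choice for illustration)
fb_train <- fb_stock_regular |>
  filter(day_num <= nrow(fb_stock_regular) - 250)
fb_train |>
  model(
    naive = NAIVE(Open),
    drift = RW(Open ~ drift()),
    mean = MEAN(Open)
  ) |>
  forecast(h = 250) |>
  accuracy(fb_stock_regular)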
Apply a seasonal naïve method to the quarterly Australian beer production data from 1992. Check if the residuals look like white noise, and plot the forecasts. The following code will help.
# Extract data of interest
recent_production <- aus_production |>
filter(year(Quarter) >= 1992)
# Define and estimate a model
fit <- recent_production |> model(SNAIVE(Beer))
# Look at the residuals
fit |> gg_tsresiduals()
## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 4 rows containing non-finite outside the scale range
## (`stat_bin()`).
# Look at some forecasts
fit |> forecast() |> autoplot(recent_production)
What do you conclude?
The ACF plot of the residuals shows a significant negative correlation at lag 4. The residuals are nearly normal, and the time plot shows no major spikes. I conclude that the model captures most of the available information, but the lag-4 autocorrelation suggests the residuals are not quite white noise, so a model that captures the seasonality better could still improve on SNAIVE.
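A more formal check than eyeballing the ACF is a Ljung-Box test on the innovation residuals. A minimal sketch, using the fpp3 rule of thumb of lag = 2m for seasonal data (m = 4 for quarterly):
# Ljung-Box test on the innovation residuals (lag = 2 * 4 for quarterly data)
augment(fit) |>
  features(.innov, ljung_box, lag = 8)
A small p-value here would confirm that the lag-4 spike is real and the residuals are not white noise.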
Repeat the previous exercise using the Australian Exports series from global_economy and the Bricks series from aus_production. Use whichever of NAIVE() or SNAIVE() is more appropriate in each case.
recent_aus_exp <- global_economy |>
  filter(Code == 'AUS' & Year >= 1992) |>
  select(Year, Exports)
autoplot(recent_aus_exp, Exports)
This series is annual, so it cannot exhibit seasonality; NAIVE() is therefore more appropriate.
aus_exp_fit <- recent_aus_exp |> model(NAIVE(Exports))
aus_exp_fit |> gg_tsresiduals()
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_bin()`).
aus_exp_fit |> forecast() |> autoplot(recent_aus_exp, level = NULL)
The residuals appear to be normally distributed with stable variance, and the ACF plot shows no significant autocorrelations at any lag. From this, I conclude that the model accounts for all available information.
recent_bricks <- aus_production |>
  filter(year(Quarter) >= 1992) |>
  select(Quarter, Bricks) |>
  drop_na()
autoplot(recent_bricks, Bricks)
Since this series shows a clear seasonal pattern, SNAIVE() is more appropriate.
bricks_fit <- recent_bricks |> model(SNAIVE(Bricks))
bricks_fit |> gg_tsresiduals()
## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 4 rows containing non-finite outside the scale range
## (`stat_bin()`).
bricks_fit |> forecast() |> autoplot(recent_bricks, level = NULL)
The residuals have a pronounced left skew: the variance is mostly stable, but there are several large negative residuals. This suggests the series contains some significant low outliers.
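To locate those low outliers, one can sort the residuals from the fitted model; a minimal sketch:
# Find the quarters with the largest negative residuals
augment(bricks_fit) |>
  as_tibble() |>
  arrange(.resid) |>
  select(Quarter, Bricks, .fitted, .resid) |>
  head()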
For your retail time series (from Exercise 7 in Section 2.10):
Create a training dataset consisting of observations before 2011 using
# Code provided by the textbook, with the seed modified
set.seed(31415)
myseries <- aus_retail |>
  filter(`Series ID` == sample(aus_retail$`Series ID`, 1))
# Select only the relevant variables
myseries <- myseries |>
  select(Month, Turnover)
# Code provided by the textbook
myseries_train <- myseries |>
  filter(year(Month) < 2011)
# My addition to view the results
print(head(myseries_train))
## # A tsibble: 6 x 2 [1M]
## Month Turnover
## <mth> <dbl>
## 1 1988 Apr 0.6
## 2 1988 May 1.1
## 3 1988 Jun 0.9
## 4 1988 Jul 0.9
## 5 1988 Aug 1.1
## 6 1988 Sep 1.2
print(tail(myseries_train))
## # A tsibble: 6 x 2 [1M]
## Month Turnover
## <mth> <dbl>
## 1 2010 Jul 6.5
## 2 2010 Aug 6
## 3 2010 Sep 6.1
## 4 2010 Oct 5.4
## 5 2010 Nov 5.1
## 6 2010 Dec 7.1
The training dataset starts at the beginning of the dataset (April 1988) and ends right before 2011 (the last observation is December 2010) as desired.
Check that your data have been split appropriately by producing the following plot.
# Code provided by the textbook
autoplot(myseries, Turnover) +
  autolayer(myseries_train, Turnover, colour = "red")
The dataset is red until the end of 2010 and then black starting in 2011, as desired.
Fit a seasonal naïve model using SNAIVE() applied to your training data (myseries_train).
# Code provided by the textbook
fit <- myseries_train |>
  model(SNAIVE(Turnover))
Check the residuals.
# Code provided by the textbook
fit |> gg_tsresiduals()
## Warning: Removed 12 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 12 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 12 rows containing non-finite outside the scale range
## (`stat_bin()`).
Do the residuals appear to be uncorrelated and normally distributed?
The residuals do appear roughly normally distributed, with mostly stable variance until the last 10% or so of the data (suggesting some recent outliers). However, the ACF plot shows significant, slowly decaying correlations at low lags, which points to a trend the seasonal naïve model has not captured; the residuals are therefore not uncorrelated.
Produce forecasts for the test data.
# Code provided by the textbook
fc <- fit |>
  forecast(new_data = anti_join(myseries, myseries_train))
## Joining with `by = join_by(Month, Turnover)`
fc |> autoplot(myseries, level = NULL)
Compare the accuracy of your forecasts against the actual values.
accuracy(fc, myseries)
## # A tibble: 1 × 10
## .model .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 SNAIVE(Turnover) Test -0.182 0.953 0.801 -6.02 14.7 1.80 1.27 0.693
How sensitive are the accuracy measures to the amount of training data used?
I believe that, on some level, this is a trick question. For monthly data, an SNAIVE() point forecast depends only on the 12 most recent observations in the training set, no matter how much training data is supplied (a quick check below confirms this). With that in mind, the relevant question is not how much training data the model gets, but how similar the final training year is to the test years.
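A quick sketch verifies this: fitting SNAIVE() on just the final two years of the training set (two years, so the model can still estimate a residual variance) produces exactly the same point forecasts as fitting it on the full training set. The object names here are mine:
# SNAIVE point forecasts depend only on the final seasonal cycle:
# a fit on 2009-2010 alone yields the same forecast means as the full fit.
# (Prediction intervals would differ, since the residual variance is
# estimated from all training residuals.)
fit_full <- myseries_train |> model(SNAIVE(Turnover))
fit_recent <- myseries_train |>
  filter(year(Month) >= 2009) |>
  model(SNAIVE(Turnover))
all.equal(
  forecast(fit_full, h = 12)$.mean,
  forecast(fit_recent, h = 12)$.mean
)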
Still, to demonstrate the effect, I will construct one model that uses less training data and another that uses more, and compare the accuracy of each to the original.
# Re-running the code with less training data
myseries_train_2 <- myseries |>
  filter(year(Month) < 2008)
fit_2 <- myseries_train_2 |>
  model(SNAIVE(Turnover))
fc_2 <- fit_2 |>
  forecast(new_data = anti_join(myseries, myseries_train_2))
## Joining with `by = join_by(Month, Turnover)`
fc_2 |> autoplot(myseries, level = NULL)
accuracy(fc_2, myseries)
## # A tibble: 1 × 10
## .model .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 SNAIVE(Turnover) Test 2.40 2.59 2.40 40.0 40.0 9.12 7.70 0.612
This forecast, which uses much less training data, is far less accurate: its RMSE of 2.59 is nearly three times the original 0.953. The shorter training set misses a substantial upward trend in the data that the original forecast, with access to more training data, was able to partially account for.
# Re-running the code with more training data
myseries_train_3 <- myseries |>
  filter(year(Month) < 2015)
fit_3 <- myseries_train_3 |>
  model(SNAIVE(Turnover))
fc_3 <- fit_3 |>
  forecast(new_data = anti_join(myseries, myseries_train_3))
## Joining with `by = join_by(Month, Turnover)`
fc_3 |> autoplot(myseries, level = NULL)
accuracy(fc_3, myseries)
## # A tibble: 1 × 10
## .model .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 SNAIVE(Turnover) Test -0.929 1.22 1.05 -18.7 20.4 2.12 1.55 0.639
This forecast, which has access to even more training data, is less accurate than the initial forecast but more accurate than the one trained on less data. Note, however, that the three forecasts are evaluated over different test periods, so the comparison is only indicative. The three accuracy rows are collected side by side below.
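A minimal sketch collecting the three test-set accuracy rows into one table (the train_end labels are my addition for readability):
# Collect the three test-set accuracy rows side by side
bind_rows(
  accuracy(fc, myseries) |> mutate(train_end = 2010),
  accuracy(fc_2, myseries) |> mutate(train_end = 2007),
  accuracy(fc_3, myseries) |> mutate(train_end = 2014)
) |>
  select(train_end, RMSE, MAE, MAPE, MASE)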