library(fpp3)
library(RColorBrewer)
library(seasonal)
Do exercises 5.1, 5.2, 5.3, 5.4 and 5.7 in the Hyndman book. Please submit your Rpubs link as well as your .pdf file showing your run code.
The Australian Population dataset from global_economy seems to be fairly linear. As such, it makes sense to forecast using the drift method, which will extrapolate a line drawn between the first and last observations.
global_economy_aus <- global_economy %>% filter(Country == "Australia")
global_economy_aus %>% autoplot(Population)
global_economy_aus %>% model(drift = RW(Population ~ drift())) %>% forecast(h = 10) %>%
autoplot(global_economy_aus, level = NULL, size = 2)
The Bricks series from aus_production appears to have a seasonal pattern that repeats over roughly 5 years. Since the index of this tsibble is quarterly, I specified a 5-year lag in the SNAIVE forecast, which repeats that 5-year seasonal pattern; in my case the forecast horizon was 40 quarters, equivalent to 10 years.
aus_production_bricks <- aus_production %>% filter(!is.na(Bricks))
aus_production_bricks %>% autoplot(Bricks)
aus_production_bricks %>% model(seasonal_naive = SNAIVE(Bricks ~ lag("5 year"))) %>%
forecast(h = 40) %>% autoplot(aus_production_bricks,level = NULL, size = 2)
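As an optional check beyond the exercise, the seasonal structure can be inspected before committing to a 5-year lag; this is a sketch using the feasts functions loaded by fpp3 (gg_season shows the within-year pattern, and the ACF over 40 quarters shows at which lags the series is most strongly correlated).
aus_production_bricks %>% gg_season(Bricks)
aus_production_bricks %>% ACF(Bricks, lag_max = 40) %>% autoplot()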
The New South Wales Lambs series from aus_livestock also appears to be seasonal, with the pattern repeating over a single year. As such, no lag was specified (SNAIVE defaults to the seasonal period of one year for this monthly data) and SNAIVE was used again for the forecast.
aus_livestock_lambs <- aus_livestock %>% filter(Animal == "Lambs", State == "New South Wales")
aus_livestock_lambs %>% autoplot(Count)
aus_livestock_lambs %>% model(seasonal_naive = SNAIVE(Count)) %>%
forecast(h = 10) %>% autoplot(aus_livestock_lambs,level = NULL)
The household wealth series from hh_budget does not show any seasonal pattern, although there are potentially cycles of wealth spikes and declines. A seasonal method with a drift component was a possibility, but I chose the drift method alone for this forecast, as the cyclic pattern is not regular enough to justify SNAIVE.
hh_budget_wealth <- hh_budget %>% filter(!is.na(Wealth))
hh_budget_wealth %>% autoplot(Wealth)
hh_budget_wealth %>%
model(drift = RW(Wealth ~ drift())) %>%
forecast(h = 10) %>%
autoplot(hh_budget_wealth, level = NULL, size = 2)
The Australian takeaway food turnover data from aus_retail shows a clear seasonal pattern as well as an upward trend. As such, I forecast with a decomposition model, using the SNAIVE method for the seasonal component and the drift method for the seasonally adjusted portion (trend-cycle plus remainder).
aus_takeaway <- aus_retail %>% filter(Industry == "Takeaway food services")
aus_takeaway %>% autoplot(Turnover)
aus_takeaway %>% filter(State == "Australian Capital Territory") %>%
model(stlf = decomposition_model(
STL(Turnover ~ trend(), robust = TRUE),
RW(season_adjust ~ drift()))) %>%
forecast(h = 120) %>% autoplot(aus_takeaway, level = NULL) +
labs(title = "Forecast of Turnover for Takeaway Food Services in Australian Capital Territory")
aus_takeaway %>% filter(State == "New South Wales") %>%
model(stlf = decomposition_model(
STL(Turnover ~ trend(), robust = TRUE),
RW(season_adjust ~ drift()))) %>%
forecast(h = 120) %>% autoplot(aus_takeaway, level = NULL) +
labs(title = "Forecast of Turnover for Takeaway Food Services in New South Wales")
aus_takeaway %>% filter(State == "Northern Territory") %>%
model(stlf = decomposition_model(
STL(Turnover ~ trend(), robust = TRUE),
RW(season_adjust ~ drift()))) %>%
forecast(h = 120) %>% autoplot(aus_takeaway, level = NULL) +
labs(title = "Forecast of Turnover for Takeaway Food Services in Northern Territory")
aus_takeaway %>% filter(State == "Queensland") %>%
model(stlf = decomposition_model(
STL(Turnover ~ trend(), robust = TRUE),
RW(season_adjust ~ drift()))) %>%
forecast(h = 120) %>% autoplot(aus_takeaway, level = NULL) +
labs(title = "Forecast of Turnover for Takeaway Food Services in Queensland")
aus_takeaway %>% filter(State == "South Australia") %>%
model(stlf = decomposition_model(
STL(Turnover ~ trend(), robust = TRUE),
RW(season_adjust ~ drift()))) %>%
forecast(h = 120) %>% autoplot(aus_takeaway, level = NULL) +
labs(title = "Forecast of Turnover for Takeaway Food Services in South Australia")
aus_takeaway %>% filter(State == "Queensland") %>%
model(stlf = decomposition_model(
STL(Turnover ~ trend(), robust = TRUE),
RW(season_adjust ~ drift()))) %>%
forecast(h = 120) %>% autoplot(aus_takeaway, level = NULL) +
labs(title = "Forecast of Turnover for Takeaway Food Services in Queensland")
aus_takeaway %>% filter(State == "Tasmania") %>%
model(stlf = decomposition_model(
STL(Turnover ~ trend(), robust = TRUE),
RW(season_adjust ~ drift()))) %>%
forecast(h = 120) %>% autoplot(aus_takeaway, level = NULL) +
labs(title = "Forecast of Turnover for Takeaway Food Services in Tasmania")
aus_takeaway %>% filter(State == "Victoria") %>%
model(stlf = decomposition_model(
STL(Turnover ~ trend(), robust = TRUE),
RW(season_adjust ~ drift()))) %>%
forecast(h = 120) %>% autoplot(aus_takeaway, level = NULL) +
labs(title = "Forecast of Turnover for Takeaway Food Services in Victoria")
aus_takeaway %>% filter(State == "Western Australia") %>%
model(stlf = decomposition_model(
STL(Turnover ~ trend(), robust = TRUE),
RW(season_adjust ~ drift()))) %>%
forecast(h = 120) %>% autoplot(aus_takeaway, level = NULL) +
labs(title = "Forecast of Turnover for Takeaway Food Services in Western Australia")
I filtered on the symbol FB. For the sake of this exercise, I chose one column, the stock price at open, to focus my analysis on.
gafa_stock_fb <- gafa_stock %>% filter(Symbol == "FB")
gafa_stock_fb %>% autoplot(Open) +
labs(title = "Facebook stock prices at open", y = "USD", x = "Year")
In order to forecast with the drift method, I needed to create a new row-number index and update the tsibble to use it, since stock prices have gaps on non-trading days and the Date index is therefore irregular. I forecast 252 days ahead, as there are on average 252 trading days per year.
gafa_stock_fb <- gafa_stock_fb %>%
mutate(new_index = row_number()) %>%
update_tsibble(index = new_index, regular = TRUE)
gafa_stock_forecasted <- gafa_stock_fb %>%
model(RW(Open ~ drift())) %>%
forecast(h = 252)
gafa_stock_forecasted %>% autoplot(gafa_stock_fb, level = NULL)
To show that the forecasts are identical to extending the line drawn between the first and last observations, I plotted the forecast and added a segment between those two observations. The two lines meet at the last observation and appear to have the same slope, so they lie on the same line.
first_open <- gafa_stock_fb %>%
filter(Date == min(Date)) %>% select(Open)
last_open <- gafa_stock_fb %>%
filter(Date == max(Date)) %>% select(Open)
proof <- gafa_stock_forecasted %>% autoplot(gafa_stock_fb, level = NULL)
proof + annotate("segment",
x = first_open$new_index,
y = first_open$Open,
xend = last_open$new_index,
yend = last_open$Open, color = "red")
SNAIVE, NAIVE, drift, and MEAN were used to forecast the series. Using SNAIVE is fairly nonsensical here, as there does not appear to be any seasonality; if seasonality were present, it would most plausibly be over a year (252 trading days). For SNAIVE I plotted both a one-year and a one-month season, and neither appears to be a good fit. MEAN and NAIVE also do not appear to be good fits, since the data show a clear upward trend. The only method that makes sense in this case is drift, and even then it does not capture the day-to-day fluctuation, which does not appear to be seasonal.
gafa_stock_fb %>% model(
seasonal_naive_month = SNAIVE(Open ~ lag(21)),
seasonal_naive_year = SNAIVE(Open ~ lag(252)),
naive = NAIVE(Open),
drift = RW(Open ~ drift()),
mean = MEAN(Open)) %>%
forecast(h = 252) %>%
autoplot(gafa_stock_fb, level = NULL)
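As a supplementary check beyond the exercise, the benchmark methods can also be compared numerically rather than only visually by holding out the final 252 trading days; fb_train below is an illustrative name for that training subset.
fb_train <- gafa_stock_fb %>% filter(new_index <= max(new_index) - 252)
fb_train %>%
  model(naive = NAIVE(Open), drift = RW(Open ~ drift()), mean = MEAN(Open)) %>%
  forecast(h = 252) %>%
  accuracy(gafa_stock_fb)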
The residual plot shows there might be some slight heteroscedasticity, as the variance of the residuals is not constant; it appears to increase and decrease a couple of times, with notable decreases around 1996-1997 and 2007-2008. The ACF plot shows a large spike at lag 4 and minor spikes at lags 1 and 3. Since several spikes fall outside the autocorrelation bounds, there is some remaining autocorrelation and the residuals therefore do not look like white noise. This would imply the model is not a good fit.
# Extract data of interest
recent_production <- aus_production |>
filter(year(Quarter) >= 1992)
# Define and estimate a model
fit <- recent_production |> model(SNAIVE(Beer))
# Look at the residuals
fit |> gg_tsresiduals()
## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 4 rows containing non-finite outside the scale range
## (`stat_bin()`).
# Look at some forecasts
fit |> forecast() |> autoplot(recent_production)
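The visual impression can be backed up with a Ljung-Box test on the innovation residuals (a sketch; lag = 8 is twice the quarterly seasonal period, and the same check applies to the later NAIVE and SNAIVE fits). A small p-value indicates the residuals are distinguishable from white noise.
# Formal test of residual autocorrelation
augment(fit) |> features(.innov, ljung_box, lag = 8)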
For the Australian Exports series from global_economy, the residuals appear generally homoscedastic, though their variance increases a bit towards the end of the series. The ACF values are almost all within the bounds, which supports the residuals being white noise. This indicates the NAIVE model is a good fit, likely because the series does not change drastically over the 60 years of data collection.
# Extract data of interest
GE_Australian_Exports <- global_economy |>
filter(Country == "Australia")
GE_Australian_Exports %>% autoplot(Exports)
# Define and estimate a model
fit <- GE_Australian_Exports |> model(NAIVE(Exports))
# Look at the residuals
fit |> gg_tsresiduals()
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_bin()`).
# Look at some forecasts
fit |> forecast() |> autoplot(GE_Australian_Exports) + ylim(0,25)
For the Bricks series from aus_production, I used SNAIVE to model and forecast the data. The residual plot is clearly heteroscedastic and the ACF values are all beyond the bounds. This indicates the model is not a good fit.
# Extract data of interest
aus_production |> autoplot(Bricks)
## Warning: Removed 20 rows containing missing values or values outside the scale range
## (`geom_line()`).
# Define and estimate a model
fit <- aus_production_bricks %>% model(seasonal_naive = SNAIVE(Bricks ~ lag("5 year")))
# Look at the residuals
fit |> gg_tsresiduals()
## Warning: Removed 20 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 20 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 20 rows containing non-finite outside the scale range
## (`stat_bin()`).
# Look at some forecasts
fit |> forecast(h = 40) |> autoplot(aus_production_bricks, level = NULL, size = 2)
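For comparison, and as a sketch beyond the exercise, the training accuracy of the default one-year seasonal naive can be set against the five-year lag used above:
aus_production_bricks %>%
  model(snaive_1yr = SNAIVE(Bricks),
        snaive_5yr = SNAIVE(Bricks ~ lag("5 year"))) %>%
  accuracy()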
set.seed(32)
myseries <- aus_retail |>
filter(`Series ID` == sample(aus_retail$`Series ID`,1))
myseries_train <- myseries |>
filter(year(Month) < 2011)
From the following plot, I can see the data has been split appropriately, with the colour change marking the training period.
autoplot(myseries, Turnover) +
autolayer(myseries_train, Turnover, colour = "red")
Since the data potentially has some seasonality, SNAIVE is applied.
fit <- myseries_train |>
model(SNAIVE(Turnover))
The residuals do not appear to be normally distributed, nor uncorrelated: the ACF plot is again largely outside the bounds and the residuals are heteroscedastic.
fit |> gg_tsresiduals()
## Warning: Removed 12 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 12 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 12 rows containing non-finite outside the scale range
## (`stat_bin()`).
The forecasts were plotted and the model does not seem to be a good predictor: the test data lie beyond the 95% prediction interval in multiple places.
fc <- fit |>
forecast(new_data = anti_join(myseries, myseries_train))
fc |> autoplot(myseries)
The forecast is not very accurate, as shown by the Root Mean Squared Scaled Error (RMSSE) on the test data. Because the errors are scaled against the training data, an RMSSE under 1 would indicate good accuracy; here the test value is 3.43.
fit |> accuracy()
## # A tibble: 1 × 12
## State Industry .model .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 South A… Pharmac… SNAIV… Trai… 3.13 6.61 4.55 7.89 10.9 1 1 0.795
fc |> accuracy(myseries)
## # A tibble: 1 × 12
## .model State Industry .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 SNAIVE(T… Sout… Pharmac… Test 18.5 22.7 18.8 15.1 15.5 4.13 3.43 0.912
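To make the RMSSE scaling concrete, here is a rough sketch of where the test value comes from, assuming a monthly seasonal period of m = 12: the test RMSE is divided by the in-sample RMSE of the one-step seasonal naive errors on the training data.
m <- 12
scale_rmse <- myseries_train |>
  as_tibble() |>
  summarise(s = sqrt(mean((Turnover - lag(Turnover, m))^2, na.rm = TRUE))) |>
  pull(s)
test_actual <- myseries |> filter(year(Month) >= 2011)
sqrt(mean((test_actual$Turnover - fc$.mean)^2)) / scale_rmse # should roughly reproduce the test RMSSE above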
To assess the sensitivity of the accuracy measures, I repeated the above exercise with different training/test splits, moving the split five years later and five years earlier than the original (2011 +/- 5 years). Given the large fluctuation in accuracy depending on the split, the accuracy measures do seem very sensitive to the amount of training data used. I would suggest this sensitivity is due to the shape of the dataset: observations closer in time to the test period are more informative, so including (or excluding) them in the training data contributes more to the sensitivity than the sheer amount of training data per se. For instance, cutting the data at 2016 instead of 2011 brings the test RMSSE down to 0.895, suggesting the model could be an acceptable fit, whereas cutting at 2006 results in a much higher RMSSE of 9.52.
set.seed(32)
myseries2 <- aus_retail |>
filter(`Series ID` == sample(aus_retail$`Series ID`,1))
myseries2_train <- myseries2 |>
filter(year(Month) < 2016)
fit2 <- myseries2_train |>
model(SNAIVE(Turnover))
fit2 |> gg_tsresiduals()
## Warning: Removed 12 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 12 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 12 rows containing non-finite outside the scale range
## (`stat_bin()`).
fc2 <- fit2 |>
forecast(new_data = anti_join(myseries2, myseries2_train))
fc2 |> autoplot(myseries2)
fit2 |> accuracy()
## # A tibble: 1 × 12
## State Industry .model .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 South A… Pharmac… SNAIV… Trai… 3.55 7.13 4.95 7.45 10.2 1 1 0.777
fc2 |> accuracy(myseries2)
## # A tibble: 1 × 12
## .model State Industry .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 SNAIVE(T… Sout… Pharmac… Test 0.900 6.38 5.17 0.580 4.07 1.04 0.895 0.536
set.seed(32)
myseries2 <- aus_retail |>
filter(`Series ID` == sample(aus_retail$`Series ID`,1))
myseries2_train <- myseries2 |>
filter(year(Month) < 2006)
fit2 <- myseries2_train |>
model(SNAIVE(Turnover))
fit2 |> gg_tsresiduals()
## Warning: Removed 12 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 12 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 12 rows containing non-finite outside the scale range
## (`stat_bin()`).
fc2 <- fit2 |>
forecast(new_data = anti_join(myseries2, myseries2_train))
fc2 |> autoplot(myseries2)
fit2 |> accuracy()
## # A tibble: 1 × 12
## State Industry .model .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 South A… Pharmac… SNAIV… Trai… 2.25 4.95 3.57 7.74 11.0 1 1 0.773
fc2 |> accuracy(myseries2)
## # A tibble: 1 × 12
## .model State Industry .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 SNAIVE(T… Sout… Pharmac… Test 42.4 47.1 42.4 39.0 39.0 11.9 9.52 0.932
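A more systematic way to probe this sensitivity (a sketch, with illustrative .init and .step values) is rolling-origin cross-validation via stretch_tsibble(), which averages the accuracy over many different training windows instead of a single split:
myseries |>
  stretch_tsibble(.init = 120, .step = 12) |>
  model(SNAIVE(Turnover)) |>
  forecast(h = 12) |>
  accuracy(myseries)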