library(fpp3)
library(latex2exp)
library(seasonal)
Produce forecasts for the following series using whichever of NAIVE(y), SNAIVE(y) or RW(y ~ drift()) is more appropriate in each case:
Since population usually increases over time, RW(y ~ drift()) would be the best choice for this dataset. We can fit all three models and compare the forecast accuracy measures; the model with the lowest RMSE is usually preferred.
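For reference, over a test set of n observations,

$$\text{RMSE} = \sqrt{\frac{1}{n}\sum_{t=1}^{n}(y_t - \hat{y}_t)^2},$$

so smaller values mean point forecasts closer to the actual values.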
pop_train <- global_economy %>%
  filter(Country == "Australia") %>%
  filter_index("1960" ~ "2000")
pop_fit <- pop_train %>%
  model(
    `Naïve` = NAIVE(Population),
    `Seasonal naïve` = SNAIVE(Population),
    Drift = RW(Population ~ drift())
  )
## Warning: 1 error encountered for Seasonal naïve
## [1] Non-seasonal model specification provided, use RW() or provide a different lag specification.
pop_fc <- pop_fit %>%
  forecast(h = 10)
# The forecast horizon (2001-2010) lies outside pop_train, so scoring against
# the training set alone yields NaN for every measure. Score against the full
# global_economy series instead, which contains those years. (The seasonal
# naïve rows remain empty because that model failed to fit, as the warning
# above shows.)
accuracy(pop_fc, global_economy)
pop_fc %>%
  autoplot(pop_train) +
  labs(title = "Forecasts of the Australian population")
Since brick production has seasonal patterns, SNAIVE() would be the best choice for this dataset. We can fit all three models and compare the forecast accuracy measures; the model with the lowest RMSE is usually preferred.
bricks <- aus_production %>%
  select(Quarter, Bricks)
bricks_train <- bricks %>%
  filter_index("1992 Q1" ~ "2004 Q4")
bricks_test <- bricks %>%
  filter_index("2005 Q1" ~ .)
bricks_fit <- bricks_train %>%
  model(
    `Naïve` = NAIVE(Bricks),
    `Seasonal naïve` = SNAIVE(Bricks),
    Drift = RW(Bricks ~ drift())
  )
bricks_fc <- bricks_fit %>%
  forecast(h = 10)
accuracy(bricks_fc, bricks_test)
## # A tibble: 3 × 10
## .model .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Drift Test -2.41 39.9 39.9 -1.65 10.3 NaN NaN -0.5
## 2 Naïve Test -2 40.0 40 -1.55 10.3 NaN NaN -0.5
## 3 Seasonal naïve Test -21 39.1 33 -6.23 8.98 NaN NaN -0.5
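The MASE and RMSSE columns are NaN because only the test window was supplied to accuracy(); the scaled errors are benchmarked against the training observations, which must be present in the truth data. A minimal fix (a sketch, output not shown) is to pass the full series instead:

# Supplying the full series (training period included) lets accuracy()
# compute the scaled errors MASE and RMSSE as well.
accuracy(bricks_fc, bricks)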
SNAIVE() is the better model here, as it has the lowest RMSE.
bricks_fc %>%
  autoplot(bricks_train, level = NULL) +
  autolayer(bricks_test, Bricks, colour = "black") +
  labs(title = "Brick production forecasts") +
  guides(colour = guide_legend(title = "Forecast"))
Since livestock slaughter counts have seasonal patterns, SNAIVE() would be the best choice for this dataset. We can fit all three models and compare the forecast accuracy measures; the model with the lowest RMSE is usually preferred.
lambs_data <- aus_livestock %>%
  filter(Animal == "Lambs", State == "New South Wales") %>%
  select(Month, Count)
lambs_train <- lambs_data %>%
  filter_index("2005 Jan" ~ "2017 Dec")
lambs_test <- lambs_data %>%
  filter_index("2018 Jan" ~ "2019 Feb")
lambs_fit <- lambs_train %>%
  model(
    `Naïve` = NAIVE(Count),
    `Seasonal naïve` = SNAIVE(Count),
    Drift = RW(Count ~ drift())
  )
lambs_fc <- lambs_fit %>%
  forecast(h = 14)
accuracy(lambs_fc, lambs_test)
## Warning: The future dataset is incomplete, incomplete out-of-sample data will be treated as missing.
## 2 observations are missing between 2019 Jan and 2019 Feb
## # A tibble: 3 × 10
## .model .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Drift Test 47547. 69580. 62076. 9.90 14.5 NaN NaN 0.206
## 2 Naïve Test 49242. 70397. 62858. 10.3 14.6 NaN NaN 0.196
## 3 Seasonal naïve Test 9367. 52218. 42733. 0.957 10.7 NaN NaN 0.519
SNAIVE() is the better model here, as it has the lowest RMSE.
lambs_fc %>%
  autoplot(lambs_train, level = NULL) +
  autolayer(lambs_test, Count, colour = "black") +
  labs(title = "New South Wales lamb slaughter forecasts") +
  guides(colour = guide_legend(title = "Forecast"))
Since household wealth usually trends upward and is recorded annually, RW(y ~ drift()) would be the best choice for this dataset. We can fit all three models and compare the forecast accuracy measures; the model with the lowest RMSE is usually preferred.
wealth_data <- hh_budget %>%
  select(Year, Wealth)
wealth_train <- wealth_data %>%
  filter(Year <= 2010)
wealth_test <- wealth_data %>%
  filter(Year > 2010)
wealth_fit <- wealth_train %>%
  model(
    `Naïve` = NAIVE(Wealth),
    `Seasonal naïve` = SNAIVE(Wealth),
    Drift = RW(Wealth ~ drift())
  )
## Warning: 4 errors (1 unique) encountered for Seasonal naïve
## [4] Non-seasonal model specification provided, use RW() or provide a different lag specification.
wealth_fc <- wealth_fit %>%
  forecast(h = 10)
accuracy(wealth_fc, wealth_test)
## Warning: The future dataset is incomplete, incomplete out-of-sample data will be treated as missing.
## 4 observations are missing between 2017 and 2020
## # A tibble: 12 × 11
## .model Country .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Drift Austra… Test 21.7 36.6 30.5 5.20 7.95 NaN NaN 0.346
## 2 Drift Canada Test 34.9 43.4 36.1 6.44 6.72 NaN NaN 0.371
## 3 Drift Japan Test 28.3 31.7 28.3 4.77 4.77 NaN NaN 0.273
## 4 Drift USA Test 36.0 51.9 46.8 5.87 8.00 NaN NaN 0.377
## 5 Naïve Austra… Test 28.1 43.0 36.2 6.86 9.42 NaN NaN 0.363
## 6 Naïve Canada Test 50.0 59.9 50.0 9.30 9.30 NaN NaN 0.414
## 7 Naïve Japan Test 54.7 59.9 54.7 9.25 9.25 NaN NaN 0.432
## 8 Naïve USA Test 47.1 63.2 54.8 7.78 9.28 NaN NaN 0.404
## 9 Seasonal na… Austra… Test NaN NaN NaN NaN NaN NaN NaN NA
## 10 Seasonal na… Canada Test NaN NaN NaN NaN NaN NaN NaN NA
## 11 Seasonal na… Japan Test NaN NaN NaN NaN NaN NaN NaN NA
## 12 Seasonal na… USA Test NaN NaN NaN NaN NaN NaN NaN NA
The drift method gives the lowest RMSE for every country, supporting the initial choice.
wealth_fc %>%
  autoplot(wealth_train, level = NULL) +
  autolayer(wealth_test, Wealth, colour = "black") +
  labs(title = "Household wealth forecasts") +
  guides(colour = guide_legend(title = "Forecast"))
## Warning: Removed 40 rows containing missing values or values outside the scale range
## (`geom_line()`).
Since takeaway food turnover has seasonal patterns, SNAIVE() would seem the best choice for this dataset. We can fit all three models and compare the forecast accuracy measures; the model with the lowest RMSE is usually preferred.
takeaway_data <- aus_retail %>%
  filter(Industry == "Takeaway food services",
         State == "Australian Capital Territory") %>%
  select(Month, Turnover)
takeaway_train <- takeaway_data %>%
  filter_index("2000 Jan" ~ "2014 Dec")
takeaway_test <- takeaway_data %>%
  filter_index("2015 Jan" ~ "2018 Dec")
takeaway_fit <- takeaway_train %>%
  model(
    `Naïve` = NAIVE(Turnover),
    `Seasonal naïve` = SNAIVE(Turnover),
    Drift = RW(Turnover ~ drift())
  )
takeaway_fc <- takeaway_fit %>%
  forecast(h = 10) # covers only the first 10 months of the 4-year test window
accuracy(takeaway_fc, takeaway_test)
## # A tibble: 3 × 10
## .model .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Drift Test -1.54 1.98 1.54 -8.23 8.23 NaN NaN 0.569
## 2 Naïve Test -1.11 1.80 1.23 -6.14 6.70 NaN NaN 0.606
## 3 Seasonal naïve Test 3.68 4.08 3.68 18.3 18.3 NaN NaN 0.562
Contrary to the expectation above, the naïve method has the lowest RMSE over this short horizon, and the seasonal naïve method performs worst.
takeaway_fc %>%
  autoplot(takeaway_train, level = NULL) +
  autolayer(takeaway_test, Turnover, colour = "black") +
  labs(title = "Australian takeaway food turnover forecasts") +
  guides(colour = guide_legend(title = "Forecast"))
Use the Facebook stock price (data set gafa_stock) to do the following:
Produce a time plot of the series.
facebook_stock <- gafa_stock %>%
  filter(Symbol == "FB") %>%
  # re-index by trading day so the irregular daily series becomes regular
  mutate(day = row_number()) %>%
  update_tsibble(index = day, regular = TRUE)
facebook_stock %>%
  autoplot(Close) +
  labs(title = "Facebook daily closing stock prices")
Produce forecasts using the drift method and plot them.
facebook_2015 <- facebook_stock |>
  filter(year(Date) == 2015)
facebook_fit <- facebook_2015 %>%
  model(Drift = RW(Close ~ drift()))
facebook_jan_2016 <- facebook_stock |>
  filter(yearmonth(Date) == yearmonth("2016 Jan"))
facebook_fc <- facebook_fit |>
  forecast(new_data = facebook_jan_2016)
# Plot the forecasts
facebook_fc |>
  autoplot(facebook_2015) +
  autolayer(facebook_jan_2016, Close, colour = "black") +
  labs(y = "$US",
       title = "Facebook daily closing stock prices",
       subtitle = "(Jan 2015 - Jan 2016)") +
  guides(colour = guide_legend(title = "Forecast"))
Show that the forecasts are identical to extending the line drawn between the first and last observations.
# The drift forecast is the straight line through the first and last
# observations extended into the future, so overlaying that line on the
# drift forecasts from above shows they coincide.
facebook_fc |>
  autoplot(facebook_2015) +
  geom_line(
    data = slice(facebook_2015, c(1, n())),
    aes(y = Close),
    linetype = "dashed", colour = "blue"
  ) +
  labs(y = "$US",
       title = "Facebook daily closing stock prices",
       subtitle = "(Jan 2015 - Jan 2016)") +
  guides(colour = guide_legend(title = "Forecast"))
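Numerically, the drift point forecast at horizon h is the last observation plus h times the average historical change, which is exactly the slope of the first-to-last line. A quick check (a sketch reusing the objects above; output not shown):

# Recompute the drift forecasts by hand: last value plus h times the
# slope of the line joining the first and last observations.
y <- facebook_2015$Close
n_obs <- length(y)
slope <- (y[n_obs] - y[1]) / (n_obs - 1)
manual_fc <- y[n_obs] + seq_len(nrow(facebook_jan_2016)) * slope
# Should agree with the model's point forecasts
all.equal(manual_fc, facebook_fc$.mean)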
Try using some of the other benchmark functions to forecast the same data set. Which do you think is best? Why?
To capture the overall trend, the drift method is the best choice for the Facebook stock series, because it projects the average historical change forward. For short-term forecasts, the naïve method may be a good choice, as it assumes the next value will be the same as the most recent one.
facebook_fit <- facebook_2015 %>%
  model(Mean = MEAN(Close))
facebook_fc <- facebook_fit |>
  forecast(new_data = facebook_jan_2016)
# Plot the forecasts
facebook_fc |>
  autoplot(facebook_2015) +
  autolayer(facebook_jan_2016, Close, colour = "black") +
  labs(y = "$US",
       title = "Facebook daily closing stock prices",
       subtitle = "(Jan 2015 - Jan 2016)") +
  guides(colour = guide_legend(title = "Forecast"))
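To make the comparison concrete, a minimal sketch (reusing the objects defined above; output not shown) fits the three benchmarks together and scores them against the January 2016 actuals:

# Fit all three benchmark methods at once and compare test accuracy.
# MASE/RMSSE will be NaN here because only the test window is supplied.
fb_bench <- facebook_2015 %>%
  model(
    Mean = MEAN(Close),
    `Naïve` = NAIVE(Close),
    Drift = RW(Close ~ drift())
  )
fb_bench %>%
  forecast(new_data = facebook_jan_2016) %>%
  accuracy(facebook_jan_2016)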
Apply a seasonal naïve method to the quarterly Australian beer production data from 1992. Check if the residuals look like white noise, and plot the forecasts. What do you conclude?
The residuals are centred around 0 and show no obvious trend. The histogram of the residuals is roughly symmetric and close to normal. The autocorrelation plot shows some significant spikes, especially at lag 4, which means some patterns are left unexplained. The seasonal naïve method does a good job of capturing seasonality, since it repeats the value from the same quarter of the previous year, but it does not capture all of the structure, as the remaining significant spikes in the ACF plot show.
# Extract data of interest
recent_production <- aus_production |>
filter(year(Quarter) >= 1992)
# Define and estimate a model
fit <- recent_production |> model(SNAIVE(Beer))
# Look at the residuals
fit |> gg_tsresiduals()
## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 4 rows containing non-finite outside the scale range
## (`stat_bin()`).
# Look at some forecasts
fit |> forecast() |> autoplot(recent_production)
The Ljung-Box test on the residuals of the seasonal naïve model gives a test statistic of 32.3 with a p-value of about 0.00008. Since the p-value is less than 0.05, we conclude the residuals are not white noise.
augment(fit) |> features(.innov, ljung_box, lag = 8) # lag = 2m, with m = 4 for quarterly data
## # A tibble: 1 × 3
## .model lb_stat lb_pvalue
## <chr> <dbl> <dbl>
## 1 SNAIVE(Beer) 32.3 0.0000834
augment(fit) |> features(.innov, box_pierce, lag = 8)
## # A tibble: 1 × 3
## .model bp_stat bp_pvalue
## <chr> <dbl> <dbl>
## 1 SNAIVE(Beer) 29.7 0.000234
Repeat the previous exercise using the Australian Exports series from global_economy and the Bricks series from aus_production. Use whichever of NAIVE() or SNAIVE() is more appropriate in each case.
Since the exports data is annual and has no seasonal pattern, NAIVE() is more appropriate. The residual distribution appears spread out and not exactly bell-shaped, with some large spikes, especially around 2009-2010. The p-value for the Ljung-Box test is 0.194 and for the Box-Pierce test is 0.414; both are greater than 0.05, suggesting the residuals behave like white noise.
export <- global_economy %>%
  filter(Country == "Australia", Year >= 1992) %>%
  select(Year, Exports)
export_fit <- export %>%
  model(NAIVE(Exports))
# Look at the residuals
export_fit %>%
  gg_tsresiduals()
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_bin()`).
# Look at some forecasts
export_fit %>%
  forecast(h = 10) %>%
  autoplot(export)
augment(export_fit) |> features(.innov, ljung_box, lag = 8)
## # A tibble: 1 × 3
## .model lb_stat lb_pvalue
## <chr> <dbl> <dbl>
## 1 NAIVE(Exports) 11.1 0.194
augment(export_fit) |> features(.innov, box_pierce, lag = 8)
## # A tibble: 1 × 3
## .model bp_stat bp_pvalue
## <chr> <dbl> <dbl>
## 1 NAIVE(Exports) 8.20 0.414
The residuals fluctuate, with large negative values around 1994-1995 and the early 2000s. There are strong positive autocorrelations at lags 1 and 2, and the remaining lags are negative. The distribution is skewed rather than normal, and the p-values of both tests are less than 0.05. All of this indicates that the residuals are not white noise.
# modified from code given by text
# Extract data of interest
recent_production <- aus_production |>
filter(year(Quarter) >= 1992, !is.na(Bricks))
# Define and estimate a model
fit <- recent_production |> model(SNAIVE(Bricks))
# Look at the residuals
fit |> gg_tsresiduals()
## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 4 rows containing non-finite outside the scale range
## (`stat_bin()`).
# Look at some forecasts
fit |> forecast() |> autoplot(recent_production)
augment(fit) |> features(.innov, ljung_box, lag=8)
## # A tibble: 1 × 3
## .model lb_stat lb_pvalue
## <chr> <dbl> <dbl>
## 1 SNAIVE(Bricks) 62.2 1.74e-10
augment(fit) |> features(.innov, box_pierce, lag=8)
## # A tibble: 1 × 3
## .model bp_stat bp_pvalue
## <chr> <dbl> <dbl>
## 1 SNAIVE(Bricks) 56.2 0.00000000256
For your retail time series (from Exercise 7 in Section 2.10):
Create a training dataset consisting of observations before 2011 using:
set.seed(12345678)
myseries <- aus_retail |>
filter(`Series ID` == sample(aus_retail$`Series ID`,1))
myseries_train <- myseries |>
filter(year(Month) < 2011)
Check that your data have been split appropriately by producing the following plot.
autoplot(myseries, Turnover) +
autolayer(myseries_train, Turnover, colour = "red")
Fit a seasonal naïve model using SNAIVE() applied to your training data (myseries_train).
fit <- myseries_train |>
model(SNAIVE(Turnover))
Check the residuals. Do the residuals appear to be uncorrelated and normally distributed?
The residuals fluctuate around 0, but the ACF plot shows significant autocorrelations at several lags, which means the residuals are correlated. The histogram is close to normal. Overall, the residuals are not white noise.
fit |> gg_tsresiduals()
## Warning: Removed 12 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 12 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 12 rows containing non-finite outside the scale range
## (`stat_bin()`).
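A formal portmanteau test (a sketch following the same pattern as above, with lag = 2m = 24 for monthly data; output not shown) backs up the visual impression:

# Ljung-Box test on the innovation residuals (lag = 2m = 24 for monthly data)
augment(fit) |>
  features(.innov, ljung_box, lag = 24)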
Produce forecasts for the test data
fc <- fit |>
forecast(new_data = anti_join(myseries, myseries_train))
## Joining with `by = join_by(State, Industry, `Series ID`, Month, Turnover)`
fc |> autoplot(myseries)
Compare the accuracy of your forecasts against the actual values.
The errors (RMSE, MAE) are higher on the test set than on the training set, which is expected.
fit |> accuracy()
## # A tibble: 1 × 12
## State Industry .model .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Norther… Clothin… SNAIV… Trai… 0.439 1.21 0.915 5.23 12.4 1 1 0.768
fc |> accuracy(myseries)
## # A tibble: 1 × 12
## .model State Industry .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 SNAIVE(T… Nort… Clothin… Test 0.836 1.55 1.24 5.94 9.06 1.36 1.28 0.601
How sensitive are the accuracy measures to the amount of training data used?
With less training data, the accuracy measures become less stable and typically worse; with more training data they usually improve. However, a very long training window can include old patterns that no longer hold, which can also hurt test accuracy.
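As a rough illustration (a sketch reusing the myseries object above; the window start years are arbitrary choices and the output is not shown), we can refit the seasonal naïve model on training windows of different lengths and compare test-set accuracy:

# Refit SNAIVE on progressively shorter training windows and compare
# test-set accuracy. (MASE/RMSSE are NaN because only the test window
# is supplied as the truth data.)
test <- myseries |> filter(year(Month) >= 2011)
window_starts <- c(1990, 1995, 2000, 2005)

bind_rows(lapply(window_starts, function(start_year) {
  myseries |>
    filter(year(Month) >= start_year, year(Month) < 2011) |>
    model(SNAIVE(Turnover)) |>
    forecast(new_data = test) |>
    accuracy(test) |>
    mutate(train_start = start_year, .before = 1)
}))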