#5.1 Produce forecasts for the following series using whichever of NAIVE(y), SNAIVE(y) or RW(y ~ drift()) is more appropriate in each case:
- Australian Population (global_economy)
- Bricks (aus_production)
- NSW Lambs (aus_livestock)
- Household wealth (hh_budget)
- Australian takeaway food turnover (aus_retail)
# AUS population
pop_aus <- global_economy %>%
filter(Country == "Australia")
#head(pop_aus)
aus_plot1 <- pop_aus %>%
autoplot(Population) +
labs(title = "Australian Pop. over time")
aus_fit <- pop_aus %>%
model(RW(Population ~ drift()))
aus_fc <- aus_fit %>%
forecast(h = 10)
aus_plot2 <- aus_fc %>%
autoplot(pop_aus)
# plot_grid() comes from the cowplot package
plot_grid(aus_plot1, aus_plot2, ncol = 2)
There is a clear upward trend in Australia's population, so the drift method, RW(y ~ drift()), is the appropriate benchmark here.
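For reference, the drift point forecast simply extends the line drawn between the first and last observations:

$$\hat{y}_{T+h|T} = y_T + h\,\frac{y_T - y_1}{T - 1}$$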
#bricks
bricks <- aus_production %>%
filter(Quarter >= yearquarter("1970 Q1"), Quarter <= yearquarter("2004 Q4"))
bricks_fit <- bricks %>% model(SNAIVE(Bricks ~ lag("year")))
bricks_fc <- bricks_fit %>% forecast(h = 4)
bricks_fc
## # A fable: 4 x 4 [1Q]
## # Key: .model [1]
## .model Quarter Bricks .mean
## <chr> <qtr> <dist> <dbl>
## 1 "SNAIVE(Bricks ~ lag(\"year\"))" 2005 Q1 N(409, 3026) 409
## 2 "SNAIVE(Bricks ~ lag(\"year\"))" 2005 Q2 N(423, 3026) 423
## 3 "SNAIVE(Bricks ~ lag(\"year\"))" 2005 Q3 N(428, 3026) 428
## 4 "SNAIVE(Bricks ~ lag(\"year\"))" 2005 Q4 N(397, 3026) 397
The SNAIVE() model uses the most recent value from the same season of the previous year (i.e., the same quarter from the prior year) to predict brick production for each quarter of 2005.
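As a quick check, a small sketch reusing the filtered bricks data from above: the 2004 quarterly values should equal the forecast means shown in the fable.

# sketch: the SNAIVE point forecasts repeat the corresponding quarters of 2004
bricks %>%
filter(year(Quarter) == 2004) %>%
select(Quarter, Bricks)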
#NSW lambs
nsw_lambs <- aus_livestock %>%
filter(State == "New South Wales", str_detect(Animal, "Lambs"))
lambs_plot2 <- nsw_lambs %>%
model(NAIVE(Count)) %>%
forecast(h = 15) %>%
autoplot(nsw_lambs) +
labs(title = "NSW lambs Forecast")
lambs_plot2
NAIVE() is appropriate here, as the series shows no discernible trend or obvious seasonal/cyclic pattern, so the last observed value is a reasonable point forecast.
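A small sanity check (sketch): the NAIVE() point forecast simply repeats the last observed count.

# sketch: NAIVE point forecast equals the most recent observation
tail(nsw_lambs$Count, 1)
nsw_lambs %>% model(NAIVE(Count)) %>% forecast(h = 1)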
#household wealth
# I opted to filter for Australia so that the forecast would apply to only one country. Below I train the model and run the forecast.
hh_aus <- hh_budget %>%
filter(Country == "Australia") %>%
filter(Year >= 1995, Year <= 2016)
hh_fit <- hh_aus %>% model(`RW` = RW(Wealth ~ drift()))
hh_forecast <- hh_fit %>% forecast(h = 2)
hh_forecast
## # A fable: 2 x 5 [1Y]
## # Key: Country, .model [1]
## Country .model Year Wealth .mean
## <chr> <chr> <dbl> <dist> <dbl>
## 1 Australia RW 2017 N(427, 561) 427.
## 2 Australia RW 2018 N(432, 1172) 432.
The RW(y ~ drift()) model suits the general upward trend in household wealth and forecasts continued growth from 2017 into 2018.
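As a sketch (reusing hh_aus from above), the estimated drift is just the average historical change, i.e. the slope of the line joining the first and last observations:

# sketch: drift = (last value - first value) / (number of changes)
(last(hh_aus$Wealth) - first(hh_aus$Wealth)) / (nrow(hh_aus) - 1)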
#Australian takeaway food turnover (aus_retail).
#head(aus_retail)
aus_retail %>%
# case-insensitive match so the "Takeaway food services" series is captured
filter(str_detect(Industry, regex("takeaway", ignore_case = TRUE))) %>%
model(RW(Turnover ~ drift())) %>%
forecast(h = 8) %>%
autoplot(aus_retail) +
facet_wrap(~State, scales = "free")
Here again, a general upward trend in takeaway food turnover across the Australian states justifies using the drift method.
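If a single national series is wanted (the exercise refers to Australian takeaway food turnover overall), the state series can be summed first. A sketch, assuming the exact industry name "Takeaway food services"; aus_takeaway is a new name introduced here:

# sketch: aggregate takeaway turnover across states, then forecast with drift
aus_takeaway <- aus_retail %>%
filter(Industry == "Takeaway food services") %>%
summarise(Turnover = sum(Turnover))
aus_takeaway %>%
model(RW(Turnover ~ drift())) %>%
forecast(h = 8) %>%
autoplot(aus_takeaway)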
#5.2 Use the Facebook stock price (data set gafa_stock) to do the following:
- Produce a time plot of the series.
- Produce forecasts using the drift method and plot them.
- Show that the forecasts are identical to extending the line drawn between the first and last observations.
- Try using some of the other benchmark functions to forecast the same data set. Which do you think is best? Why?
#time plot of series
gafa_stock %>%
filter(Symbol == 'FB') %>%
autoplot(Open) +
labs(title= "Facebook opening share price", y = "USD")
#tail(gafa_stock)
#Produce forecasts using the drift method and plot them.
#get year and fit models
fb_stock <- gafa_stock %>%
filter(Symbol == "FB")
fb_2015 <- fb_stock %>% filter(year(Date) == 2015)
fb_fit <- fb_2015 |>
model(
Drift = NAIVE(Close ~ drift())
)
## Warning: 1 error encountered for Drift
## [1] .data is an irregular time series, which this model does not support. You should consider if your data can be made regular, and use `tsibble::update_tsibble(.data, regular = TRUE)` if appropriate.
fb_q1_2016 <- fb_stock |>
filter(Date >= as.Date("2016-01-01") & Date <= as.Date("2016-03-31"))
fb_fc <- fb_fit |>
forecast(new_data = fb_q1_2016)
#forecasts
fb_fc |>
autoplot(fb_2015, level = NULL) +
autolayer(fb_q1_2016, Close, colour = "red") +
labs(y = "USD",
title = "FB daily stock prices with Q1 2016 forecast") +
guides(color = guide_legend(title = "Forecast"))
## Warning: Removed 61 rows containing missing values or values outside the scale range
## (`geom_line()`).
Because the drift model failed to fit on the irregular daily index (see the warning above), its forecasts come out as missing values and are dropped from the plot; the red line is the actual Q1 2016 closing prices rather than a forecast. The underlying issue, and a fix, are discussed at the end of this exercise.
#Show that the forecasts are identical to extending the line drawn between the first and last observations.
fb_fc%>%
autoplot(fb_2015, level = NULL) +
geom_line(data = slice(fb_2015, range(cumsum(!is.na(Close)))),
aes(y=Close), linetype = 'dashed')
## Warning: Removed 61 rows containing missing values or values outside the scale range
## (`geom_line()`).
#Try using some of the other benchmark functions to forecast the same data set. Which do you think is best? Why?
# My first attempt showed that the data are irregular. Here I retry after marking the series as regular (note: this does not fill the gaps left by non-trading days).
fb_2015 <- fb_2015 %>%
as_tsibble(index = Date) %>%
update_tsibble(regular = TRUE)
fb_fit2 <- fb_2015 %>%
model(
Mean = MEAN(Open),
Naive = NAIVE(Open),
`Seasonal naive` = SNAIVE(Open ~ lag("month")),
Drift = NAIVE(Open ~ drift())
)
## Warning: 1 error encountered for Naive
## [1] .data contains implicit gaps in time. You should check your data and convert implicit gaps into explicit missing values using `tsibble::fill_gaps()` if required.
## Warning: 1 error encountered for Seasonal naive
## [1] .data contains implicit gaps in time. You should check your data and convert implicit gaps into explicit missing values using `tsibble::fill_gaps()` if required.
## Warning: 1 error encountered for Drift
## [1] .data contains implicit gaps in time. You should check your data and convert implicit gaps into explicit missing values using `tsibble::fill_gaps()` if required.
# Q1 2016 forecasts (reusing fb_q1_2016 defined above)
fb_fc2 <- fb_fit2 %>%
forecast(new_data = fb_q1_2016)
fb_fc2 %>%
autoplot(fb_2015, level = NULL) +
autolayer(fb_q1_2016, Open, color = "purple") +
labs(y = "$USD",
title = "FB Opening Stock Prices (Daily)") +
guides(color = guide_legend(title = "Price Forecast"))
## Warning: Removed 183 rows containing missing values or values outside the scale range
## (`geom_line()`).
Only the Mean model produced forecasts here, and it is clearly inadequate given how far it sits from the actual Q1 2016 prices. The underlying problem is that stock prices are observed only on trading days, so the daily index contains implicit gaps; update_tsibble(regular = TRUE) does not remove those gaps, which is why NAIVE, SNAIVE and the drift model still error. The usual remedy is to re-index the series by trading day (or use fill_gaps()), as sketched below.
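A minimal sketch of the fix, following the re-indexing idea from the textbook: replace the calendar-date index with a trading-day counter so the tsibble is regular with no gaps, then fit the benchmarks. The names fb_days, fb_train, fb_test and fb_fit3 are new here, and SNAIVE is omitted because a trading-day index has no natural seasonal period.

# sketch: re-index by trading day so the series is regular with no implicit gaps
fb_days <- fb_stock %>%
filter(year(Date) >= 2015) %>%
mutate(day = row_number()) %>%
update_tsibble(index = day, regular = TRUE)
fb_train <- fb_days %>% filter(year(Date) == 2015)
fb_test <- fb_days %>% filter(year(Date) == 2016, month(Date) <= 3)
fb_fit3 <- fb_train %>%
model(
Mean = MEAN(Close),
Naive = NAIVE(Close),
Drift = RW(Close ~ drift())
)
fb_fit3 %>%
forecast(new_data = fb_test) %>%
autoplot(fb_train, level = NULL) +
autolayer(fb_test, Close, colour = "black") +
labs(y = "USD", title = "FB benchmark forecasts on a trading-day index")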
#5.3 Apply a seasonal naïve method to the quarterly Australian beer production data from 1992. Check if the residuals look like white noise, and plot the forecasts. The following code will help.
# Extract data of interest
recent_production <- aus_production |>
filter(year(Quarter) >= 1992)
# Define and estimate a model
fit <- recent_production |> model(SNAIVE(Beer))
# Look at the residuals
fit |> gg_tsresiduals()
## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 4 rows containing non-finite outside the scale range
## (`stat_bin()`).
# Look at some forecasts
fit |> forecast() |> autoplot(recent_production)
Reviewing the residual and autocorrelation plots, the residuals appear largely uncorrelated and homoscedastic, with a mean near zero and a roughly normal distribution. This suggests the seasonal naïve method captures most of the structure in the series, and the forecast plot, which tracks the recent seasonal pattern closely, supports that conclusion.
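A portmanteau test can back up the visual check; a small sketch using the Ljung-Box statistic on the innovation residuals (lag = 8 covers two seasonal periods of quarterly data):

# sketch: a large p-value is consistent with the residuals being white noise
augment(fit) %>%
features(.innov, ljung_box, lag = 8)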
#5.4 Repeat the previous exercise using the Australian Exports series from global_economy and the Bricks series from aus_production. Use whichever of NAIVE() or SNAIVE() is more appropriate in each case.
# Exports
# train model
aus_exports <- global_economy %>%
filter(Country == "Australia") %>%
filter(Year >= 1990)
fit <- aus_exports %>% model(NAIVE(Exports))
# review residuals and forecasts
fit %>% gg_tsresiduals()
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_bin()`).
fit %>% forecast() %>% autoplot(aus_exports)
Reviewing the same diagnostics as before, the residuals are largely uncorrelated and homoscedastic, with a mean near zero and a roughly normal distribution. However, the NAIVE forecast simply carries the last observed value forward, so it is flat and does not follow the trend in Australian exports; there is clearly room for improvement on this benchmark.
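One possible refinement, sketched here only: the drift method projects the average historical change forward, which should track the exports trend better than a flat NAIVE forecast.

# sketch: drift benchmark as an alternative to NAIVE for Australian exports
aus_exports <- global_economy %>%
filter(Country == "Australia", Year >= 1990)
aus_exports %>%
model(Drift = RW(Exports ~ drift())) %>%
forecast(h = 10) %>%
autoplot(aus_exports)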
# Bricks
# fit model and plot residuals
# Note: Bricks is missing from 2005 Q3 onwards, which produces the warnings below
aus_prod_fit <- aus_production %>%
model(NAIVE(Bricks))
aus_prod_fit %>%
gg_tsresiduals()
## Warning: Removed 21 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 21 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 21 rows containing non-finite outside the scale range
## (`stat_bin()`).
# Look at some forecasts
aus_prod_fit %>%
forecast() %>% autoplot(aus_production)
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning
## -Inf
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning
## -Inf
## Warning: Removed 8 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 20 rows containing missing values or values outside the scale range
## (`geom_line()`).
Reviewing the ACF, the residuals show strong periodic (seasonal) autocorrelation rather than white noise, and the residual time plot supports this. The NAIVE() model therefore leaves seasonal structure unexplained; SNAIVE() would be the more appropriate benchmark here (a sketch follows).
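A sketch of the seasonal naïve alternative, dropping the trailing missing quarters first so the forecast starts from observed data; bricks_obs and bricks_snaive are new names introduced here:

# sketch: SNAIVE benchmark for Bricks, excluding the quarters with missing values
bricks_obs <- aus_production %>%
filter(!is.na(Bricks))
bricks_snaive <- bricks_obs %>%
model(SNAIVE(Bricks ~ lag("year")))
bricks_snaive %>% gg_tsresiduals()
bricks_snaive %>% forecast(h = 8) %>% autoplot(bricks_obs)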
#5.7 For your retail time series (from Exercise 7 in Section 2.10):
# Create a training dataset consisting of observations before 2011 using filter()
set.seed(432)
myseries <- aus_retail %>%
filter(`Series ID` == sample(aus_retail$`Series ID`,1))
myseries_train <- myseries |>
filter(year(Month) < 2011)
# Check that your data have been split appropriately by producing the following plot.
autoplot(myseries, Turnover) +
autolayer(myseries_train, Turnover, colour = "purple")
# Fit a seasonal naïve model using SNAIVE() applied to your training data (myseries_train).
fit <- myseries_train |>
model(SNAIVE(Turnover))
# Check the residuals.
fit |> gg_tsresiduals()
## Warning: Removed 12 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 12 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 12 rows containing non-finite outside the scale range
## (`stat_bin()`).
The ACF indicates that the residuals are strongly autocorrelated, so the seasonal naïve model has not captured all the information in the series. The residual time plot also suggests the variation is not constant over time (some heteroscedasticity), and the histogram shows the residuals are not normally distributed.
# Produce forecasts for the test data
fc <- fit %>%
forecast(new_data = anti_join(myseries, myseries_train))
## Joining with `by = join_by(State, Industry, `Series ID`, Month, Turnover)`
fc |> autoplot(myseries)
# Compare the accuracy of your forecasts against the actual values.
fit |> accuracy()
## # A tibble: 1 × 12
## State Industry .model .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Austral… Food re… SNAIV… Trai… 4.99 6.56 5.34 7.62 8.24 1 1 0.768
fc |> accuracy(myseries)
## # A tibble: 1 × 12
## .model State Industry .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 SNAIVE(T… Aust… Food re… Test 35.9 39.1 35.9 18.3 18.3 6.72 5.96 0.914
The forecasts perform much worse on the test data than on the training data: RMSE rises from about 6.6 to 39 and MASE from 1 to about 6.7, so the seasonal naïve model does not generalise well beyond the training period.
#How sensitive are the accuracy measures to the amount of training data used?
Accuracy measures are sensitive to how much data is used for training and how the series is split between training and test sets. With a short or unrepresentative training window, the model sees too few seasonal cycles and the test errors become large and unstable (underfitting); with a very long window, older observations may no longer reflect current behaviour, and training accuracy can overstate how well the model will forecast. The aim is a split that balances these effects, giving enough history to estimate the seasonal pattern while keeping a test set that genuinely measures out-of-sample performance. The sketch below illustrates this sensitivity by refitting the model with different training cutoffs.
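A minimal sketch, assuming fpp3 and purrr are loaded and reusing myseries from above: refit SNAIVE with several training cutoffs and compare test-set accuracy. The cutoff years and helper names here are illustrative, not part of the original solution.

# sketch: test-set accuracy of SNAIVE for different training cutoffs
library(purrr)
cutoffs <- c(2005, 2008, 2011)
map_dfr(cutoffs, function(cut_year) {
train <- myseries %>% filter(year(Month) < cut_year)
test <- myseries %>% filter(year(Month) >= cut_year)
train %>%
model(SNAIVE(Turnover)) %>%
forecast(new_data = test) %>%
accuracy(myseries) %>%
mutate(cutoff = cut_year)
})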