library(fpp3)
## Registered S3 method overwritten by 'tsibble':
## method from
## as_tibble.grouped_df dplyr
## ── Attaching packages ──────────────────────────────────────────── fpp3 1.0.0 ──
## ✔ tibble 3.2.1 ✔ tsibble 1.1.5
## ✔ dplyr 1.1.4 ✔ tsibbledata 0.4.1
## ✔ tidyr 1.3.1 ✔ feasts 0.3.2
## ✔ lubridate 1.9.3 ✔ fable 0.3.4
## ✔ ggplot2 3.5.1 ✔ fabletools 0.4.2
## ── Conflicts ───────────────────────────────────────────────── fpp3_conflicts ──
## ✖ lubridate::date() masks base::date()
## ✖ dplyr::filter() masks stats::filter()
## ✖ tsibble::intersect() masks base::intersect()
## ✖ tsibble::interval() masks lubridate::interval()
## ✖ dplyr::lag() masks stats::lag()
## ✖ tsibble::setdiff() masks base::setdiff()
## ✖ tsibble::union() masks base::union()
5.1 Produce forecasts for the following series using whichever of NAIVE(y), SNAIVE(y) or RW(y ~ drift()) is more appropriate in each case:
Australian Population (global_economy): RW(y ~ drift()) method because the growth is increasing in a trend
Bricks (aus_production): attempted use all models to see what happens, although I had a feeling due to the seasonality, sNaive is best and has lowest RMSE
NSW Lambs (aus_livestock): also has seasonality so sNaive is best
Household wealth (hh_budget): there were some peaks and troughs but overal a trend so RW(y ~ drift())
Australian takeaway food turnover (aus_retail): initally I wasnt sure between SNAIVE and Drift, there seemed to be seasonality and trends for some countries, so I checked all models and sNaive is best
aus_pop <- global_economy %>%
filter(Country == "Australia")
autoplot(aus_pop, Population) +
labs(title = "Australian Population")
aus_pop_model <- aus_pop %>%
model(RW(Population ~ drift()))
aus_pop_model %>%
forecast() %>%
autoplot(aus_pop) +
labs(title = "Australian Population Forecast")
aus_bricks <- aus_production[,c(1,4)] %>%
drop_na()
autoplot(aus_bricks, Bricks) +
labs(title = "Australian Brick Production")
bricks_fit <- aus_bricks %>%
model(
Naive = NAIVE(Bricks),
sNaive = SNAIVE(Bricks),
Drift = RW(Bricks ~ drift())
)
# Generate forecasts for the next 10 periods
bricks_fc <- bricks_fit %>%
forecast(h = 10)
bricks_fc %>%
autoplot(aus_production) +
labs(x = "Year",
y = "Millions of bricks",
title = "Forecast for quarterly bricks production") +
guides(color = guide_legend(title = "Forecast"))
## Warning: Removed 20 rows containing missing values or values outside the scale range
## (`geom_line()`).
accuracy(bricks_fc, aus_production)
## # A tibble: 3 × 10
## .model .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Drift Test NaN NaN NaN NaN NaN NaN NaN NA
## 2 Naive Test NaN NaN NaN NaN NaN NaN NaN NA
## 3 sNaive Test NaN NaN NaN NaN NaN NaN NaN NA
nsw_lambs <- aus_livestock %>%
filter(State == "New South Wales") %>%
filter(Animal == "Lambs")
autoplot(nsw_lambs, Count) +
labs(title = "Lambs in New South Wales")
nsw_lambs_model <- nsw_lambs %>%
model(SNAIVE(Count))
nsw_lambs_model %>%
forecast() %>%
autoplot(nsw_lambs) +
labs(title = "New South Wales Lambs Forecast")
hh_wealth <- hh_budget[,c(1,2,7)] %>%
as_tsibble(index = "Year")
autoplot(hh_wealth, Wealth) +
labs(title = "Household Wealth", y = "Wealth (USD)")
hh_wealth_model <- hh_wealth %>%
model(RW(Wealth ~ drift()))
hh_wealth_model %>%
forecast() %>%
autoplot(hh_wealth) +
labs(title = "Household Wealth Forecast")
aus_takeaway <- aus_retail %>%
filter(Industry == "Takeaway food services")
autoplot(aus_takeaway, Turnover) +
labs(title = "Turnover at Australian Takeaways")
aus_retail_food <- aus_retail %>%
filter(Industry == "Takeaway food services") %>%
index_by() %>%
group_by(Industry) %>%
summarise(Turnover = sum(Turnover)) %>%
ungroup()
food_tr <- aus_retail_food %>%
filter(year(Month) <= 2017)
food_fit <- food_tr %>%
model(
Naive = NAIVE(Turnover),
SNaive = SNAIVE(Turnover),
Drift = RW(Turnover ~ drift())
)
food_fc <- food_fit %>%
forecast(h = 12)
food_fc %>%
autoplot(food_tr, level = NULL)
accuracy(food_fc, aus_retail_food)
## # A tibble: 3 × 11
## .model Industry .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Drift Takeaway food … Test -175. 195. 176. -11.6 11.7 3.94 3.43 0.255
## 2 Naive Takeaway food … Test -152. 179. 160. -10.2 10.6 3.58 3.16 0.353
## 3 SNaive Takeaway food … Test 33.4 43.1 37.9 2.10 2.40 0.849 0.760 0.269
5.2 Use the Facebook stock price (data set gafa_stock) to do the following:
Produce a time plot of the series. First reindex the timeseries since it does not trade everyday.
Produce forecasts using the drift method and plot them. Initially unsure whether to use close or open price
Show that the forecasts are identical to extending the line drawn between the first and last observations. Try using some of the other benchmark functions to forecast the same data set. Which do you think is best? Why? Historically its been increasing, in the more recent data it is decreasing so the model should take that into account therefore Naive is best.
fb_stock <- gafa_stock %>%
filter(Symbol == "FB") %>%
mutate(day = row_number()) %>%
update_tsibble(index = day, regular = TRUE)
fb_stock%>%
autoplot(Open) +
labs(title= "Daily Open Price of Facebook", y = "USD")
fb_stock %>%
model(Drift = RW(Close ~ drift())) %>%
forecast(h = 30) %>%
autoplot(fb_stock) +
labs(title = "Facebook Close Price Forcast")
fb_stock %>%
model(RW(Close ~ drift())) %>%
forecast(h = 63) %>%
autoplot(fb_stock) +
labs(title = "Daily Close Price of Facebook", y = "USD") +
geom_segment(aes(x = 1, y = 54.83, xend = 1258, yend = 134.45),
colour = "blue", linetype = "dashed")
## Warning in geom_segment(aes(x = 1, y = 54.83, xend = 1258, yend = 134.45), : All aesthetics have length 1, but the data has 1258 rows.
## ℹ Please consider using `annotate()` or provide this layer with data containing
## a single row.
fb_fit <- fb_stock %>%
model(
Naive = NAIVE(Close),
SNaive = SNAIVE(Close ~ lag(30)),
Drift = RW(Close ~ drift())
)
# Generate forecasts for the next 63 periods
fb_fc <- fb_fit %>%
forecast(h = 63)
# Plot the forecasts using autoplot
autoplot(fb_fc, level = NULL) +
labs(title = "Daily Close Price of Facebook", y = "USD")
5.3 Apply a seasonal naïve method to the quarterly Australian beer production data from 1992. Check if the residuals look like white noise, and plot the forecasts. The following code will help. What do you conclude? The residuals are normally distributed, q1 is mostly in the blue line areas however lag 4 is clearly not, and lag 3 and 10 are very close suggesting it is not white noise
# Extract data of interest
recent_production <- aus_production |>
filter(year(Quarter) >= 1992)
# Define and estimate a model
fit <- recent_production |> model(SNAIVE(Beer))
# Look at the residuals
fit |> gg_tsresiduals()
## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 4 rows containing non-finite outside the scale range
## (`stat_bin()`).
# Look a some forecasts
fit |> forecast() |> autoplot(recent_production)
5.4 Repeat the previous exercise using the Australian Exports series from global_economy and the Bricks series from aus_production. Use whichever of NAIVE() or SNAIVE() is more appropriate in each case.
Global_economy: used NAIVE due to trend, the forecast looks good. The residuals are normally distributed; however it is likely white noise.
Bricks: used SNAIVE due to seasonality, the residuals show that they are not white noise and they are not normally distributed.
# Extract data of interest
aus_economy <- global_economy %>%
filter(Country == "Australia")
# Define and estimate a model
fit <- aus_economy %>% model(NAIVE(Exports))
# Look at the residuals
fit %>% gg_tsresiduals()
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_bin()`).
# Look a some forecasts
fit %>% forecast() %>% autoplot(aus_economy)
fit <- aus_production %>%
filter(!is.na(Bricks)) %>%
model(SNAIVE(Bricks ~ lag("year")))
# Look at the residuals
fit %>% gg_tsresiduals()
## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 4 rows containing non-finite outside the scale range
## (`stat_bin()`).
# Look a some forecasts
fit %>% forecast() %>% autoplot(aus_production)
## Warning: Removed 20 rows containing missing values or values outside the scale range
## (`geom_line()`).
5.7 For your retail time series (from Exercise 7 in Section 2.10):
Create a training dataset consisting of observations before 2011 using
set.seed(12345678)
myseries <- aus_retail |>
filter(`Series ID` == sample(aus_retail$`Series ID`,1))
myseries_train <- myseries |>
filter(year(Month) < 2011)
autoplot(myseries, Turnover) +
autolayer(myseries_train, Turnover, colour = "red") +
labs(title = "Turnover Data with Training Set Highlighted",
y = "Turnover",
x = "Time")
fit <- myseries_train %>%
model(SNAIVE(Turnover))
fit |> gg_tsresiduals()
## Warning: Removed 12 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 12 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 12 rows containing non-finite outside the scale range
## (`stat_bin()`).
#Produce forecasts for the test data
fc <- fit |>
forecast(new_data = anti_join(myseries, myseries_train))
## Joining with `by = join_by(State, Industry, `Series ID`, Month, Turnover)`
fc |> autoplot(myseries)
Compare the accuracy of your forecasts against the actual values. The
training data has smaller errors but they are close in values, for
exmaple RMSE is lower for the training dataset.
fit |> accuracy()
## # A tibble: 1 × 12
## State Industry .model .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Norther… Clothin… SNAIV… Trai… 0.439 1.21 0.915 5.23 12.4 1 1 0.768
fc |> accuracy(myseries)
## # A tibble: 1 × 12
## .model State Industry .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 SNAIVE(T… Nort… Clothin… Test 0.836 1.55 1.24 5.94 9.06 1.36 1.28 0.601
How sensitive are the accuracy measures to the amount of training data used? The amount of data used impacts the sensitivity, ideally want to find the optimal amount, like with kfold validation, as addding to much lease to overfit and too little to underfitting.