library(fpp3)
## Warning: package 'fpp3' was built under R version 4.4.3
## Registered S3 method overwritten by 'tsibble':
## method from
## as_tibble.grouped_df dplyr
## ── Attaching packages ──────────────────────────────────────────── fpp3 1.0.2 ──
## ✔ tibble 3.2.1 ✔ tsibble 1.1.6
## ✔ dplyr 1.1.4 ✔ tsibbledata 0.4.1
## ✔ tidyr 1.3.1 ✔ feasts 0.4.2
## ✔ lubridate 1.9.3 ✔ fable 0.5.0
## ✔ ggplot2 3.5.1
## Warning: package 'dplyr' was built under R version 4.4.2
## Warning: package 'ggplot2' was built under R version 4.4.2
## Warning: package 'tsibble' was built under R version 4.4.3
## Warning: package 'tsibbledata' was built under R version 4.4.3
## Warning: package 'feasts' was built under R version 4.4.3
## Warning: package 'fabletools' was built under R version 4.4.3
## Warning: package 'fable' was built under R version 4.4.3
## ── Conflicts ───────────────────────────────────────────────── fpp3_conflicts ──
## ✖ lubridate::date() masks base::date()
## ✖ dplyr::filter() masks stats::filter()
## ✖ tsibble::intersect() masks base::intersect()
## ✖ tsibble::interval() masks lubridate::interval()
## ✖ dplyr::lag() masks stats::lag()
## ✖ tsibble::setdiff() masks base::setdiff()
## ✖ tsibble::union() masks base::union()
library(tsibble)
Produce forecasts for the following series using whichever of NAIVE(y), SNAIVE(y) or RW(y ~ drift()) is more appropriate in each case:
Australian Population (global_economy)
Bricks (aus_production)
NSW Lambs (aus_livestock)
Household wealth (hh_budget).
Australian takeaway food turnover (aus_retail).
aus_pop <- global_economy |>
filter(Country == "Australia") |>
select(Year, Population)
autoplot(aus_pop)
## Plot variable not specified, automatically selected `.vars = Population`
Since this plot shows linear growth, a drift model is appropriate.
aus_pop |>
model(RW(Population ~ drift())) |>
forecast(h=20) |>
autoplot(aus_pop, level=NULL)
bricks_data <- aus_production |>
select(Quarter, Bricks ) |>
drop_na()
autoplot(bricks_data, Bricks)
Since this data has seasonality, SNAIVE should be utilized.
bricks_data |>
model(SNAIVE(Bricks)) |>
forecast(h=20) |>
autoplot(bricks_data, level=NULL)
lambs <- aus_livestock |>
filter(Animal == "Lambs") |>
filter(State == "New South Wales") |>
select(Month, Count)
autoplot(lambs, Count)
This dataset doesn’t appear to have seasonality nor does it have a clear
trend so NAIVE model should be used.
lambs |>
model(NAIVE(Count)) |>
forecast(h=50) |>
autoplot(lambs, level=NULL)
For this question, the instructions don’t specify a country so I will select Australia to match the theme of the other excercises.
wealth_data <-hh_budget |>
filter(Country == "Australia") |>
select(Year, Wealth)
autoplot(wealth_data, Wealth)
While this dataset doesn’t appear to have a clear trend to me initially,
it’s known that there was a market crash in 2008 that impacted many and
likely caused the steep decline in the chart. Since we can see that
growth has returned, I believe a drift model should be used in this
case.
wealth_data |>
filter_index("2011" ~ "2016") |>
model(RW(Wealth ~ drift())) |>
forecast(h=10) |>
autoplot(wealth_data, level=NULL)
The question did not specify the region so I will select Australian Capital Territory for this excercise.
turnover_data <- aus_retail |>
filter(Industry =="Takeaway food services") |>
filter(State =="Australian Capital Territory") |>
select(Month, Turnover)
autoplot(turnover_data, Turnover)
This data has a trend but no seasonality so drift model is
appropriate.
turnover_data |>
model(RW(Turnover ~ drift())) |>
forecast(h=50) |>
autoplot(turnover_data, level = NULL)
Use the Facebook stock price (data set gafa_stock) to do the following:
Produce a time plot of the series
fb_data <- gafa_stock |>
filter(Symbol == "FB") |>
select(Date, Close)
autoplot(fb_data, Close)
Produce forecasts using the drift method and plot them.
# Convert dates to row number to remove date gaps (I believe this is due to lack of data on weekends)
fb_data_reg <- fb_data |>
mutate(Date = row_number()) |>
as_tsibble(index = Date)
# Plot forecast
fb_data_reg |>
model(RW(Close ~ drift())) |>
forecast(h = 100) |>
autoplot(fb_data_reg, level = NULL)
Show that the forecasts are identical to extending the line drawn between the first and last observations.
Note: This question was a bit unclear to me but I will assume the equation for the drift is (Yt - Y1)/(T-1).
# Calculate drift
initial_y <- head(fb_data_reg$Close, 1)
final_y <- tail(fb_data_reg$Close, 1)
n <- nrow(fb_data_reg)
fit <- fb_data_reg |> model(RW(Close ~ drift()))
tidy(fit)
cat("Model's drift:",tidy(fit)$estimate)
## Model's drift: 0.06076372
cat("Manual drift:", (final_y - initial_y) / (n - 1))
## Manual drift: 0.06076372
The drift/slopes are the same in both showing that an extended line would be the same through the first and last observation.
Try using some of the other benchmark functions to forecast the same data set. Which do you think is best? Why?
fb_data_reg |>
model(NAIVE(Close)) |>
forecast(h=100) |>
autoplot(fb_data_reg, level = NULL)
I think the drift model is best for this data. The naive model forecast
shows relative stagnant pricing but it is unlikely for stock prices for
a tech giant to be stable like this. It is more likely that the price to
trend upwards over time.
Apply a seasonal naïve method to the quarterly Australian beer production data from 1992. Check if the residuals look like white noise, and plot the forecasts. The following code will help.
# Extract data of interest
recent_production <- aus_production |>
filter(year(Quarter) >= 1992)
# Define and estimate a model
fit <- recent_production |> model(SNAIVE(Beer))
# Look at the residuals
fit |> gg_tsresiduals()
## Warning: `gg_tsresiduals()` was deprecated in feasts 0.4.2.
## ℹ Please use `ggtime::gg_tsresiduals()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 4 rows containing non-finite outside the scale range
## (`stat_bin()`).
# Look a some forecasts
fit |> forecast() |> autoplot(recent_production)
What do you conclude?
The residuals seem to be normally distributed without time spikes. The acf residuals shows lag 4 negative correlation. It appears that the model is likely accounting for all data but a different model may capture the seasonal trend better.
Repeat the previous exercise using the Australian Exports series from global_economy and the Bricks series from aus_production. Use whichever of NAIVE() or SNAIVE() is more appropriate in each case.
recent_exp <- global_economy |>
filter(Country == 'Australia' & Year >= 1992) |>
select(Year, Exports)
autoplot(recent_exp, Exports)
Since there is no seasonality, NAIVE seems more appropriate.
aus_exp_fit <- recent_exp |> model(NAIVE(Exports))
aus_exp_fit |> gg_tsresiduals()
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_bin()`).
aus_exp_fit |> forecast() |> autoplot(recent_exp, level = NULL)
The residuals are normally distributed and the acf does not show
significant lag correlation.
recent_bricks <- aus_production |>
filter(year(Quarter) >= 1992) |>
select(Quarter, Bricks) |>
drop_na()
autoplot(recent_bricks, Bricks)
There seems to be seasonality in this dataset so SNAIVE is more
appropriate.
bricks_fit <- recent_bricks |> model(SNAIVE(Bricks))
bricks_fit |> gg_tsresiduals()
## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 4 rows containing non-finite outside the scale range
## (`stat_bin()`).
bricks_fit |> forecast() |> autoplot(recent_bricks, level = NULL)
The residuals are skewed and show outliers.
# Provided from exercise
set.seed(12345678)
myseries <- aus_retail |>
filter(`Series ID` == sample(aus_retail$`Series ID`,1))
# Select necessary variables
myseries <- myseries |>
select(Month, Turnover)
# Provided from exercise
myseries_train <- myseries |>
filter(year(Month) < 2011)
print(tail(myseries_train))
## # A tsibble: 6 x 2 [1M]
## Month Turnover
## <mth> <dbl>
## 1 2010 Jul 16.1
## 2 2010 Aug 13.8
## 3 2010 Sep 13.6
## 4 2010 Oct 12.3
## 5 2010 Nov 11.7
## 6 2010 Dec 17.9
# Provided from exercise
autoplot(myseries, Turnover) +
autolayer(myseries_train, Turnover, colour = "red")
Yes, the dataset is red until the end of 2010 and switches to black
afterwards.
fit <- myseries_train |>
model(SNAIVE(Turnover))
# Provided by exercise
fit |> gg_tsresiduals()
## Warning: Removed 12 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 12 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 12 rows containing non-finite outside the scale range
## (`stat_bin()`).
The residuals seem to be normally distributed. There is more variance
between Jan 1995 and 1997 but the overall seems to be stable.
fc <- fit |>
forecast(new_data = anti_join(myseries, myseries_train))
## Joining with `by = join_by(Month, Turnover)`
fc |> autoplot(myseries, level =NULL)
fit |> accuracy()
fc |> accuracy(myseries)
To answer this question, I think we will need to repeat with additional training set sizes.
# less data (stopping at 2002)
train_2002 <- myseries |>
filter(year(Month) < 2002)
fit_2002 <- train_2002 |>
model(SNAIVE(Turnover))
fc2 <- fit_2002|>
forecast(new_data = anti_join(myseries, train_2002))
## Joining with `by = join_by(Month, Turnover)`
fc2 |> autoplot(myseries, level=NULL)
accuracy(fc2, myseries)
# more data (through 2018)
train_2018 <- myseries |>
filter(year(Month) < 2018)
fit_2018 <- train_2018 |>
model(SNAIVE(Turnover))
fc3 <- fit_2018|>
forecast(new_data = anti_join(myseries, train_2018))
## Joining with `by = join_by(Month, Turnover)`
fc3 |> autoplot(myseries, level=NULL)
accuracy(fc3, myseries)
The inclusion of more data seems to improve accuracy in this model suggesting that it is sensitive to the amount of training data.