# Load the packages used throughout; fpp3 also attaches fable, feasts, and fabletools
library(tidyverse)
library(tsibble)
library(lubridate)
library(fpp3)
global_economy %>%
filter(Country == "Australia") %>%
autoplot(Population) +
labs(title = "Population of Australia")
# The trend is steadily rising, so the drift method is a sensible benchmark.
global_economy %>%
filter(Country == "Australia") %>%
model(RW(Population ~ drift())) %>%
forecast(h = "5 years") %>%
autoplot(global_economy) +
labs(y = "Number of People", title = "Population of Australia")
aus_production %>%
autoplot(Bricks)
## Warning: Removed 20 rows containing missing values (`geom_line()`).
# The series is clearly seasonal, so the seasonal naive model is a good fit; first, drop the trailing NA quarters.
bricks <- aus_production %>%
select(Bricks) %>%
filter(!is.na(Bricks))
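As a quick sanity check (not part of the original exercise): the missing Bricks values are all trailing quarters, so dropping them leaves a gap-free tsibble rather than one with internal holes.
# List the quarters where Bricks is NA and confirm the filtered series has no gaps
aus_production %>%
  filter(is.na(Bricks)) %>%
  pull(Quarter)
bricks %>% has_gaps()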
bricks %>%
model(SNAIVE(Bricks ~ lag("year"))) %>%
forecast(h = "5 years") %>%
autoplot(bricks) +
labs(title = "SNAIVE Forecast",
     subtitle = "Five Year Forecast",
     x = "Quarter")
nswl <- aus_livestock %>%
filter(State=="New South Wales" & Animal=="Lambs" & !is.na(Count)) %>%
select(Count)
nswl %>%
autoplot(Count)
# Either the naive or the seasonal naive method could work here; we will try the naive method.
nswl %>%
model(NAIVE(Count)) %>%
forecast(h = 1) %>%
autoplot(nswl) +
labs(title = "NAIVE Forecast",
     x = "Month")
hh_budget %>%
autoplot(Wealth)
# Wealth trends upward for each country, so the drift method is appropriate again.
hh_budget %>%
model(RW(Wealth ~ drift())) %>%
forecast(h = "5 years") %>%
autoplot(hh_budget) +
labs(y = "Budget", title = "Budget By Company")
aus_retail %>%
filter(Industry=="Takeaway food services") %>%
autoplot(Turnover)
unique(aus_retail$State)
## [1] "Australian Capital Territory" "New South Wales"
## [3] "Northern Territory" "Queensland"
## [5] "South Australia" "Tasmania"
## [7] "Victoria" "Western Australia"
# For this one, I want to compare all four benchmark methods; since there are so many states, we will choose one.
train_fit <- aus_retail %>%
  filter(State == "Victoria" & Industry == "Takeaway food services") %>%
  model(Mean = MEAN(Turnover),
        `Seasonal Naïve` = SNAIVE(Turnover),
        `Naïve` = NAIVE(Turnover),
        Drift = RW(Turnover ~ drift()))
train_fit %>%
forecast(h = 1) %>%
autoplot(aus_retail)
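The h = 1 plot above only shows one step ahead; to actually rank the four benchmarks, one option is to refit them on a training window and score the rest with accuracy(). A sketch, assuming an arbitrary cut-off at the end of 2016:
# Refit the four benchmarks on data through 2016 and compare test accuracy
vic_takeaway <- aus_retail %>%
  filter(State == "Victoria" & Industry == "Takeaway food services")
vic_takeaway %>%
  filter_index(. ~ "2016 Dec") %>%
  model(Mean = MEAN(Turnover),
        `Seasonal Naïve` = SNAIVE(Turnover),
        `Naïve` = NAIVE(Turnover),
        Drift = RW(Turnover ~ drift())) %>%
  forecast(h = "2 years") %>%
  accuracy(vic_takeaway)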
Produce a time plot of the series.
gafa_stock %>%
filter(Symbol=="FB") %>%
autoplot(Close) +
labs(y="$", title="Facebook Closing prices")
Produce forecasts using the drift method and plot them.
gafa_stock %>%
  filter(Symbol == "FB" & !is.na(Close)) %>%
  update_tsibble(index = Date, regular = TRUE) %>%
  fill_gaps() %>%
  # fill_gaps() adds NA rows for non-trading days; carry the last price forward
  tidyr::fill(Close) %>%
  model(RW(Close ~ drift())) %>%
  forecast(h = 90) %>%
  autoplot(gafa_stock) +
  labs(y = "$", title = "Forecasted Facebook Stock Prices")
Show that the forecasts are identical to extending the line drawn between the first and last observations.
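One way to verify this numerically: the estimated drift is the mean of the first differences, (y_T − y_1)/(T − 1), which is exactly the slope of the line joining the first and last observations, so the h-step forecast is the last value plus h times that slope. A minimal sketch, reusing the fb_stock tsibble defined just above:
# Hand-computed drift forecast vs fable's RW(Close ~ drift()) point forecasts
y <- fb_stock$Close
slope <- (last(y) - first(y)) / (length(y) - 1)   # slope of the first-to-last line
manual <- last(y) + (1:90) * slope
fable_fc <- fb_stock %>%
  model(RW(Close ~ drift())) %>%
  forecast(h = 90)
all.equal(manual, as.numeric(fable_fc$.mean))     # TRUE, up to numerical tolerance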
Try using some of the other benchmark functions to forecast the same data set. Which do you think is best? Why?
gafa_stock2 <- gafa_stock %>%
  filter(Symbol == "FB" & !is.na(Close)) %>%
  update_tsibble(index = Date, regular = TRUE) %>%
  fill_gaps() %>%
  tidyr::fill(Close) %>%   # again, carry prices across non-trading days
  filter_index("2017-01-01" ~ .)
train_fits <- gafa_stock2 %>%
  model(Mean = MEAN(Close),
        `Seasonal Naïve` = SNAIVE(Close),
        `Naïve` = NAIVE(Close),
        Drift = RW(Close ~ drift()))
train_fits %>%
forecast(h = 30) %>%
autoplot(gafa_stock2)
I’m most inclined to trust the naïve method: stock prices are famously hard to predict, show no meaningful seasonality, and certainly do not revert to their historical mean.
Apply a seasonal naïve method to the quarterly Australian beer production data from 1992. Check if the residuals look like white noise, and plot the forecasts. The following code will help.
# Extract data of interest
recent_production <- aus_production |>
filter(year(Quarter) >= 1992)
# Define and estimate a model
fit <- recent_production |> model(SNAIVE(Beer))
# Look at the residuals
fit |> gg_tsresiduals()
## Warning: Removed 4 rows containing missing values (`geom_line()`).
## Warning: Removed 4 rows containing missing values (`geom_point()`).
## Warning: Removed 4 rows containing non-finite values (`stat_bin()`).
# Look at some forecasts
fit |> forecast() |> autoplot(recent_production)
We can see that the residuals follow a roughly normal distribution, though not a perfect one, which is one of the requirements for a good model. In the ACF we can see a few spikes that cross the significance bounds at some lags, but no cyclic pattern.
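The eyeball check can be made concrete with a Ljung–Box test on the innovation residuals; a minimal sketch (lag = 8, i.e. twice the seasonal period for quarterly data, following the fpp3 convention):
# A small p-value means the residuals are distinguishable from white noise
fit %>%
  augment() %>%
  features(.innov, ljung_box, lag = 8)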
Repeat the previous exercise using the Australian Exports series from global_economy and the Bricks series from aus_production. Use whichever of NAIVE() or SNAIVE() is more appropriate in each case.
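The Exports half of this exercise is not worked above. The Australian Exports series in global_economy is annual, so there is no seasonal period and NAIVE() is the appropriate benchmark; a minimal sketch (the Bricks analysis follows below):
# Naive benchmark for the annual Australian Exports series
aus_exports <- global_economy %>%
  filter(Country == "Australia")
fit_exports <- aus_exports %>% model(NAIVE(Exports))
fit_exports %>% gg_tsresiduals()
fit_exports %>% forecast(h = 5) %>% autoplot(aus_exports)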
bricks <- aus_production %>%
select(Bricks) %>%
filter(!is.na(Bricks))
fit <- bricks %>% model(SNAIVE(Bricks ~ lag("year")))
fit |> gg_tsresiduals()
## Warning: Removed 4 rows containing missing values (`geom_line()`).
## Warning: Removed 4 rows containing missing values (`geom_point()`).
## Warning: Removed 4 rows containing non-finite values (`stat_bin()`).
We can see that this one follows an even more normal distribution than the Australian beer production residuals, but the ACF shows a consistent seasonal cycle, so the residuals are still autocorrelated rather than white noise.
For your retail time series (from Exercise 8 in Section 2.10):
set.seed(43)
myseries <- aus_retail %>%
filter(`Series ID` == sample(aus_retail$`Series ID`,1))
#a Create a training dataset consisting of observations before 2011
myseries_train <- myseries |>
filter(year(Month) < 2011)
#b Check that your data have been split appropriately by producing the following plot.
autoplot(myseries, Turnover) +
autolayer(myseries_train, Turnover, colour = "red")
#c Fit a seasonal naïve model using SNAIVE() applied to your training data
fit <- myseries_train |>
model(SNAIVE(Turnover ~ lag("year")))
#d Check the residuals.
fit |> gg_tsresiduals()
## Warning: Removed 12 rows containing missing values (`geom_line()`).
## Warning: Removed 12 rows containing missing values (`geom_point()`).
## Warning: Removed 12 rows containing non-finite values (`stat_bin()`).
# The ACF shows the residuals are autocorrelated, though they roughly follow a normal distribution
#e Produce forecasts for the test data
fc <- fit |>
forecast(new_data = anti_join(myseries, myseries_train))
## Joining with `by = join_by(State, Industry, `Series ID`, Month, Turnover)`
fc |> autoplot(myseries)
#f Compare the accuracy of your forecasts against the actual values.
fit |> accuracy()
## # A tibble: 1 × 12
## State Indus…¹ .model .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Western … Hardwa… "SNAI… Trai… 3.39 11.5 8.77 6.65 13.9 1 1 0.823
## # … with abbreviated variable name ¹Industry
fc |> accuracy(myseries)
## # A tibble: 1 × 12
## .model State Indus…¹ .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 "SNAIVE(T… West… Hardwa… Test 59.2 64.2 59.2 34.7 34.7 6.76 5.60 0.923
## # … with abbreviated variable name ¹Industry
#g How sensitive are the accuracy measures to the amount of training data used?
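Part (g) is left unanswered above. One way to probe it is to refit the same seasonal naive model on training windows of different lengths and compare the resulting test accuracy; a minimal sketch (the cut-off years are arbitrary):
# Compare test-set accuracy across several training cut-offs
for (yr in c(2005, 2008, 2011)) {
  train <- myseries %>% filter(year(Month) < yr)
  test  <- myseries %>% filter(year(Month) >= yr)
  fc <- train %>%
    model(SNAIVE(Turnover ~ lag("year"))) %>%
    forecast(new_data = test)
  print(accuracy(fc, myseries))
}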