Do exercises 5.1, 5.2, 5.3, 5.4 and 5.7 in the Hyndman book.
Produce forecasts for the following series using whichever of NAIVE(y), SNAIVE(y) or RW(y ~ drift()) is more appropriate in each case:
library(fpp3)
## Registered S3 method overwritten by 'tsibble':
## method from
## as_tibble.grouped_df dplyr
## ── Attaching packages ──────────────────────────────────────────── fpp3 1.0.1 ──
## ✔ tibble 3.2.1 ✔ tsibble 1.1.6
## ✔ dplyr 1.1.4 ✔ tsibbledata 0.4.1
## ✔ tidyr 1.3.1 ✔ feasts 0.4.1
## ✔ lubridate 1.9.4 ✔ fable 0.4.1
## ✔ ggplot2 3.5.1
## ── Conflicts ───────────────────────────────────────────────── fpp3_conflicts ──
## ✖ lubridate::date() masks base::date()
## ✖ dplyr::filter() masks stats::filter()
## ✖ tsibble::intersect() masks base::intersect()
## ✖ tsibble::interval() masks lubridate::interval()
## ✖ dplyr::lag() masks stats::lag()
## ✖ tsibble::setdiff() masks base::setdiff()
## ✖ tsibble::union() masks base::union()
?global_economy
head(global_economy)
## # A tsibble: 6 x 9 [1Y]
## # Key: Country [1]
## Country Code Year GDP Growth CPI Imports Exports Population
## <fct> <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Afghanistan AFG 1960 537777811. NA NA 7.02 4.13 8996351
## 2 Afghanistan AFG 1961 548888896. NA NA 8.10 4.45 9166764
## 3 Afghanistan AFG 1962 546666678. NA NA 9.35 4.88 9345868
## 4 Afghanistan AFG 1963 751111191. NA NA 16.9 9.17 9533954
## 5 Afghanistan AFG 1964 800000044. NA NA 18.1 8.89 9731361
## 6 Afghanistan AFG 1965 1006666638. NA NA 21.4 11.3 9938414
australia_population <- global_economy %>%
filter(Country == "Australia") %>%
select(Population)
australia_population %>%
autoplot() +
labs(title = "Australian Population")
## Plot variable not specified, automatically selected `.vars = Population`
Since there is a clear increasing trend without much seasonality, I will
use RW(y ~ drift()).
forecast_population <- australia_population %>% model(RW(Population ~ drift()))
fc_pop <- forecast_population %>% forecast(h = "10 years")
autoplot(fc_pop) +
labs(title = "Forecast for Australian Population Using RW(y ~ drift())")
?aus_production
head(aus_production)
## # A tsibble: 6 x 7 [1Q]
## Quarter Beer Tobacco Bricks Cement Electricity Gas
## <qtr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1956 Q1 284 5225 189 465 3923 5
## 2 1956 Q2 213 5178 204 532 4436 6
## 3 1956 Q3 227 5297 208 561 4806 7
## 4 1956 Q4 308 5681 197 570 4418 6
## 5 1957 Q1 262 5577 187 529 4339 5
## 6 1957 Q2 228 5651 214 604 4811 7
brick_production <- aus_production %>%
select(Bricks)
brick_production %>%
autoplot() +
labs(title = "Brick Production Rates in Australia")
## Plot variable not specified, automatically selected `.vars = Bricks`
## Warning: Removed 20 rows containing missing values or values outside the scale range
## (`geom_line()`).
Since there is a lot of seasonality in this graph, using SNAIVE(y)
would be best in this case.
fit_bricks <- aus_production %>%
model(SNAIVE(Bricks))
fc_bricks <- forecast(fit_bricks, h = "4 years")
autoplot(fc_bricks, aus_production) +
labs(title = "Bricks Production in Australia using SNAIVE(y)",
y = "Bricks", x = "Time")
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning
## -Inf
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning
## -Inf
## Warning: Removed 16 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 20 rows containing missing values or values outside the scale range
## (`geom_line()`).
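The warnings above come from the trailing missing values in Bricks: the tsibble runs to 2010 Q2, but brick production is only recorded through 2005 Q2 (the 20 removed rows). A minimal sketch of one way to avoid them, assuming we are happy to drop the incomplete quarters before fitting:
# Drop the trailing NA quarters (Bricks is NA from 2005 Q3 onward)
bricks_complete <- aus_production %>%
  filter(!is.na(Bricks)) %>%
  select(Bricks)
fit_bricks2 <- bricks_complete %>% model(SNAIVE(Bricks))
forecast(fit_bricks2, h = "4 years") %>% autoplot(bricks_complete)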
?aus_livestock
head(aus_livestock)
## # A tsibble: 6 x 4 [1M]
## # Key: Animal, State [1]
## Month Animal State Count
## <mth> <fct> <fct> <dbl>
## 1 1976 Jul Bulls, bullocks and steers Australian Capital Territory 2300
## 2 1976 Aug Bulls, bullocks and steers Australian Capital Territory 2100
## 3 1976 Sep Bulls, bullocks and steers Australian Capital Territory 2100
## 4 1976 Oct Bulls, bullocks and steers Australian Capital Territory 1900
## 5 1976 Nov Bulls, bullocks and steers Australian Capital Territory 2100
## 6 1976 Dec Bulls, bullocks and steers Australian Capital Territory 1800
nsw_lambs <- aus_livestock %>%
filter(State == "New South Wales", Animal == "Lambs") %>%
select(Count)
nsw_lambs %>%
autoplot() +
labs(title = "Count of Lambs in NSW (New South Wales)")
## Plot variable not specified, automatically selected `.vars = Count`
Since this series shows a lot of seasonality, I will use SNAIVE(y).
fit_lambs <- nsw_lambs %>%
model(SNAIVE(Count))
fc_lambs <- forecast(fit_lambs, h = "2 years")
autoplot(fc_lambs, nsw_lambs) +
labs(title = "Count of Lambs in New South Wales",
y = "Count", x = "Time")
?hh_budget
head(hh_budget)
## # A tsibble: 6 x 8 [1Y]
## # Key: Country [1]
## Country Year Debt DI Expenditure Savings Wealth Unemployment
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Australia 1995 95.7 3.72 3.40 5.24 315. 8.47
## 2 Australia 1996 99.5 3.98 2.97 6.47 315. 8.51
## 3 Australia 1997 108. 2.52 4.95 3.74 323. 8.36
## 4 Australia 1998 115. 4.02 5.73 1.29 339. 7.68
## 5 Australia 1999 121. 3.84 4.26 0.638 354. 6.87
## 6 Australia 2000 126. 3.77 3.18 1.99 350. 6.29
household_wealth <- hh_budget %>%
select(Wealth)
household_wealth %>%
autoplot()
## Plot variable not specified, automatically selected `.vars = Wealth`
This graph shows a positive, steadily increasing trend over time
without much seasonality, so I will use the drift method.
fit_wealth <- household_wealth %>%
model(RW(Wealth ~ drift()))
fc_wealth <- forecast(fit_wealth, h = "5 years")
autoplot(fc_wealth, household_wealth) +
labs(title = "Household Wealth Forecase using RW(y ~ drift())",
y = "Wealth", x = "Year")
?aus_retail
head(aus_retail)
## # A tsibble: 6 x 5 [1M]
## # Key: State, Industry [1]
## State Industry `Series ID` Month Turnover
## <chr> <chr> <chr> <mth> <dbl>
## 1 Australian Capital Territory Cafes, restaurants… A3349849A 1982 Apr 4.4
## 2 Australian Capital Territory Cafes, restaurants… A3349849A 1982 May 3.4
## 3 Australian Capital Territory Cafes, restaurants… A3349849A 1982 Jun 3.6
## 4 Australian Capital Territory Cafes, restaurants… A3349849A 1982 Jul 4
## 5 Australian Capital Territory Cafes, restaurants… A3349849A 1982 Aug 3.6
## 6 Australian Capital Territory Cafes, restaurants… A3349849A 1982 Sep 4.2
australian_takeaway <- aus_retail %>%
filter(Industry == "Takeaway food services", State == "Australian Capital Territory") %>%
select(Turnover)
australian_takeaway %>%
autoplot()
## Plot variable not specified, automatically selected `.vars = Turnover`
Since this graph shows a positive increasing trend, I will use
the drift method for this forecast.
fit_retail <- australian_takeaway %>%
model(RW(Turnover ~ drift()))
fc_retail <- forecast(fit_retail, h = "5 years")
autoplot(fc_retail, australian_takeaway) +
labs(title = "Australian Takeaway Food Turnover using Drift Method",
y = "Turnover", x = "Time")
Use the Facebook stock price (data set gafa_stock) to do the following:
head(gafa_stock)
## # A tsibble: 6 x 8 [!]
## # Key: Symbol [1]
## Symbol Date Open High Low Close Adj_Close Volume
## <chr> <date> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 AAPL 2014-01-02 79.4 79.6 78.9 79.0 67.0 58671200
## 2 AAPL 2014-01-03 79.0 79.1 77.2 77.3 65.5 98116900
## 3 AAPL 2014-01-06 76.8 78.1 76.2 77.7 65.9 103152700
## 4 AAPL 2014-01-07 77.8 78.0 76.8 77.1 65.4 79302300
## 5 AAPL 2014-01-08 77.0 77.9 77.0 77.6 65.8 64632400
## 6 AAPL 2014-01-09 78.1 78.1 76.5 76.6 65.0 69787200
facebook_stock <- gafa_stock %>%
filter(Symbol == "FB")
autoplot(facebook_stock, Close) +
labs(title = "Facebook Stock Price Time Series", y = "Close Price", x = "Date")
The times for this series are irregular (trading days only), so we first need to re-index them. The textbook shows how to do this:
facebook_stock <- facebook_stock %>%
mutate(day = row_number()) %>%
update_tsibble(index = day, regular = TRUE) %>%
select(Close)
After re-indexing the series by trading day, we can model the data using the drift method.
fit_facebook <- facebook_stock %>%
model(drift = RW(Close ~ drift()))
fc_facebook <- forecast(fit_facebook, h = 50)
autoplot(fc_facebook, facebook_stock) +
labs(title = "Facebook Stock Price using Drift method", y = "Close Price", x = "Date")
ggplot(facebook_stock, aes(x = day, y = Close)) +
  geom_line() +
  # annotate() draws the segment once instead of repeating it for every row
  annotate("segment",
           x = min(facebook_stock$day), xend = max(facebook_stock$day),
           y = first(facebook_stock$Close), yend = last(facebook_stock$Close),
           colour = "blue", linetype = "dotted") +
  labs(title = "Facebook Stock First & Last Observation")
Next, I’m going to try the NAIVE method.
fit_naive <- facebook_stock %>%
model(NAIVE(Close))
fc_naive <- forecast(fit_naive, h = 50)
autoplot(fc_naive, facebook_stock) +
labs(title = "Facebook Stock Price using NAIVE Method", y = "Close Price", x = "Date")
The drift method produces a forecast line that follows the positive
trend, whereas the NAIVE method produces a constant line at the last
observed value. The drift method is able to incorporate the overall
trend because it extrapolates the change between the first and last
observations.
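As a quick check of that last point, the drift forecast can be reproduced by hand: the h-step forecast is the last observation plus h times the slope of the line joining the first and last observations. A minimal sketch, assuming the re-indexed facebook_stock and fc_facebook from above:
# Drift forecast by hand: y_T + h * (y_T - y_1) / (T - 1)
y <- facebook_stock$Close
n <- length(y)
slope <- (y[n] - y[1]) / (n - 1)
manual_drift <- y[n] + (1:50) * slope
head(manual_drift)  # should match the point forecasts in fc_facebook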
Apply a seasonal naïve method to the quarterly Australian beer production data from 1992. Check if the residuals look like white noise, and plot the forecasts. The following code will help.
# Extract data of interest
recent_production <- aus_production |>
filter(year(Quarter) >= 1992)
recent_production %>%
autoplot()
## Plot variable not specified, automatically selected `.vars = Beer`
# Define and estimate a model
fit <- recent_production |> model(SNAIVE(Beer))
# Look at the residuals
fit |> gg_tsresiduals()
## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 4 rows containing non-finite outside the scale range
## (`stat_bin()`).
# Look at some forecasts
fit |> forecast() |> autoplot(recent_production)
Applying the seasonal naive method to the quarterly Australian beer
production data from 1992, the residuals appear to be centered around
0, though they still show a fairly large variance. The ACF plot does
not show strong, significant autocorrelations, so I believe the
seasonal naive method does a good job of capturing the seasonal
structure in the beer production data.
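To back up this visual impression numerically, a Ljung-Box test can be applied to the innovation residuals (a sketch; lag = 2m = 8 is the usual choice for quarterly data):
# Ljung-Box test on the SNAIVE residuals (lag = 2m = 8 for quarterly data)
augment(fit) |>
  features(.innov, ljung_box, lag = 8)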
Repeat the previous exercise using the Australian Exports series from global_economy and the Bricks series from aus_production. Use whichever of NAIVE() or SNAIVE() is more appropriate in each case.
australian_exports <- global_economy %>%
filter(Country == "Australia") %>%
select(Exports)
australian_exports %>%
autoplot()
## Plot variable not specified, automatically selected `.vars = Exports`
This graph shows annual data without much seasonal fluctuation, so I
think using NAIVE() would be appropriate.
fit_exports <- australian_exports %>%
model(NAIVE(Exports))
fc_exports <- forecast(fit_exports)
autoplot(fc_exports, australian_exports) +
labs(title = "Naive Forecast for Australian Exports",
y = "Exports", x = "Year")
fit_exports %>%
gg_tsresiduals()
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_bin()`).
The residuals in this case are well centered around 0 and look roughly normally distributed, although the ACF plot still shows some autocorrelation.
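The same Ljung-Box check can be run here; for annual (non-seasonal) data a lag of 10 is the usual suggestion:
# Ljung-Box test on the NAIVE residuals (lag = 10 for annual data)
augment(fit_exports) %>%
  features(.innov, ljung_box, lag = 10)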
brick_forecast <- aus_production %>%
select(Bricks)
brick_forecast %>%
autoplot()
## Plot variable not specified, automatically selected `.vars = Bricks`
## Warning: Removed 20 rows containing missing values or values outside the scale range
## (`geom_line()`).
fit_brick <- brick_forecast %>%
model(SNAIVE(Bricks))
fit_brick %>%
gg_tsresiduals()
## Warning: Removed 24 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 24 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 24 rows containing non-finite outside the scale range
## (`stat_bin()`).
The residuals show low variation early in the series but much larger
variation after the 1980s. This changing variance, together with the
autocorrelation visible in the ACF plot, suggests the residuals are
not white noise.
For your retail time series (from Exercise 7 in Section 2.10):
set.seed(242424)
myseries <- aus_retail %>%
filter(`Series ID` == 'A3349849A')
myseries_train <- myseries |>
filter(year(Month) < 2011)
autoplot(myseries, Turnover) +
autolayer(myseries_train, Turnover, colour = "red")
fit <- myseries_train |>
model(SNAIVE(Turnover))
fit |> gg_tsresiduals()
## Warning: Removed 12 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 12 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 12 rows containing non-finite outside the scale range
## (`stat_bin()`).
Do the residuals appear to be uncorrelated and normally distributed?
The residuals are not well centered around 0 and show a slight skew to the right, so they do not appear to be normally distributed. The ACF plot also shows that the residuals are correlated.
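A Ljung-Box test makes the correlation part of this answer concrete (a sketch; lag = 2m = 24 for monthly data):
# Ljung-Box test on the SNAIVE residuals (lag = 2m = 24 for monthly data)
augment(fit) |>
  features(.innov, ljung_box, lag = 24)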
fc <- fit |>
forecast(new_data = anti_join(myseries, myseries_train))
## Joining with `by = join_by(State, Industry, `Series ID`, Month, Turnover)`
fc |> autoplot(myseries)
fit |> accuracy()
## # A tibble: 1 × 12
## State Industry .model .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Austral… Cafes, … SNAIV… Trai… 0.985 3.37 2.53 5.05 16.1 1 1 0.826
fc |> accuracy(myseries)
## # A tibble: 1 × 12
## .model State Industry .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 SNAIVE(T… Aust… Cafes, … Test 5.86 8.42 7.38 13.4 19.0 2.92 2.50 0.847
The accuracy measures are quite sensitive to the amount of training data used: the test-set errors (RMSE 8.42, MASE 2.92) are much larger than the training-set errors (RMSE 3.37, MASE 1), and changing the training/test split changes these measures accordingly.
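One way to see this sensitivity directly is to refit with a different training cutoff and compare the test accuracy; the 2016 cutoff below is an arbitrary choice for illustration:
# Refit with a longer training window (cutoff at 2016 instead of 2011)
myseries_train2 <- myseries |> filter(year(Month) < 2016)
fit2 <- myseries_train2 |> model(SNAIVE(Turnover))
fc2 <- fit2 |> forecast(new_data = anti_join(myseries, myseries_train2))
fc2 |> accuracy(myseries)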