library(fpp3)
## Registered S3 method overwritten by 'tsibble':
## method from
## as_tibble.grouped_df dplyr
## ── Attaching packages ──────────────────────────────────────────── fpp3 1.0.1 ──
## ✔ tibble 3.2.1 ✔ tsibble 1.1.6
## ✔ dplyr 1.1.4 ✔ tsibbledata 0.4.1
## ✔ tidyr 1.3.1 ✔ feasts 0.4.1
## ✔ lubridate 1.9.4 ✔ fable 0.4.1
## ✔ ggplot2 3.5.1
## ── Conflicts ───────────────────────────────────────────────── fpp3_conflicts ──
## ✖ lubridate::date() masks base::date()
## ✖ dplyr::filter() masks stats::filter()
## ✖ tsibble::intersect() masks base::intersect()
## ✖ tsibble::interval() masks lubridate::interval()
## ✖ dplyr::lag() masks stats::lag()
## ✖ tsibble::setdiff() masks base::setdiff()
## ✖ tsibble::union() masks base::union()
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ readr 2.1.5
## ✔ purrr 1.0.4 ✔ stringr 1.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ tsibble::interval() masks lubridate::interval()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
vic_pigs <- aus_livestock |>
filter(Animal == "Pigs") |>
filter(State == "Victoria")
fit <- vic_pigs |>
model(ses = ETS(Count ~ error("A") + trend("N") + season("N")))
report(fit)
## Series: Count
## Model: ETS(A,N,N)
## Smoothing parameters:
## alpha = 0.3221247
##
## Initial states:
## l[0]
## 100646.6
##
## sigma^2: 87480760
##
## AIC AICc BIC
## 13737.10 13737.14 13750.07
Running the ETS function for SES, we get values of .3221 for alpha and 100646.6 for l0.
fc <- fit |>
forecast(h = 4)
fc
## # A fable: 4 x 6 [1M]
## # Key: Animal, State, .model [1]
## Animal State .model Month
## <fct> <fct> <chr> <mth>
## 1 Pigs Victoria ses 2019 Jan
## 2 Pigs Victoria ses 2019 Feb
## 3 Pigs Victoria ses 2019 Mar
## 4 Pigs Victoria ses 2019 Apr
## # ℹ 2 more variables: Count <dist>, .mean <dbl>
This table contains the predicted values using the SES model for the next four periods, which can be shown in the graph below.
fc |>
autoplot(vic_pigs) +
geom_line(aes(y = .fitted), col="#D55E00",
data = augment(fit)) +
labs(y="Count", title="Pigs Slaughtered in Victoria") +
guides(colour = "none")
yHat <- fc %>%
pull(Count) %>%
head(1)
stdev <- augment(fit) %>%
pull(.resid) %>%
sd()
CIlow <- yHat - stdev*1.96
CIhi <- yHat + 1.96*stdev
CIlow
## <distribution[1]>
## [1] N(76871, 8.7e+07)
CIhi
## <distribution[1]>
## [1] N(113502, 8.7e+07)
The Confidence Interval formed 1.96 standard deviations from the first forecast value ranges from 76,871 to 113,502.
We can extract the confidence intervals formed in our earlier model with the hilo() function
hilo(fc, 95)
## # A tsibble: 4 x 7 [1M]
## # Key: Animal, State, .model [1]
## Animal State .model Month
## <fct> <fct> <chr> <mth>
## 1 Pigs Victoria ses 2019 Jan
## 2 Pigs Victoria ses 2019 Feb
## 3 Pigs Victoria ses 2019 Mar
## 4 Pigs Victoria ses 2019 Apr
## # ℹ 3 more variables: Count <dist>, .mean <dbl>, `95%` <hilo>
Here, the 95% confidence interval for the first month ranges from 76,855 to 113,518. This is very close to the values that we calculated, but forming a slightly wider interval.
I will choose Italy since there is a map of it in my apartment.
it_exp <- global_economy |>
filter(Code == "ITA")
it_exp |>
autoplot(Exports)
There is mostly an upwards trend here, with a downwards period in the 1980s. There does not appear to be any seasonality (which makes sense, since this data is collected annually).
fit2 <- it_exp |>
model(ANN = ETS(Exports ~ error("A") + trend("N") + season("N")))
fc2 <- fit2 |>
forecast(h = 4)
fc2 |>
autoplot(it_exp) +
geom_line(aes(y = .fitted), col="#D55E00",
data = augment(fit2)) +
labs(y="Exports", title="Annual Italian Exports") +
guides(colour = "none")
accuracy(fit2)
## # A tibble: 1 × 11
## Country .model .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <fct> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Italy ANN Training 0.324 1.34 1.00 1.39 4.75 0.983 0.991 -0.00701
The RMSE for this data is 1.3351.
fit3 <- it_exp |>
model(AAN = ETS(Exports ~ error("A") + trend("A") + season("N")))
accuracy(fit3)
## # A tibble: 1 × 11
## Country .model .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <fct> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Italy AAN Training -0.00841 1.30 0.934 -0.291 4.47 0.916 0.962 0.0375
The Root Mean Squared Error is slightly lower for the AAN model suggesting that this model is a better fit. This also makes sense considering there was a significant overall trend observed in the data. In this case, a model that takes trend into account would make stronger predictions than one that does not.
fc3 <- fit3 |>
forecast(h = 4)
fc3 |>
autoplot(it_exp) +
geom_line(aes(y = .fitted), col="#D55E00",
data = augment(fit3)) +
labs(y="Exports", title="Annual Italian Exports (AAN)") +
guides(colour = "none")
RMSE1 <- 1.335065
RMSE2 <- 1.295316
yHat2 <- fc2 |>
pull(.mean) |>
head(1)
yHat3 <- fc3 |>
pull(.mean) |>
head(1)
CIlow2 <- yHat2 - RMSE1 * 1.96
CIlow3 <- yHat3 - RMSE2 * 1.96
CIhi2 <- yHat2 + RMSE1 * 1.96
CIhi3 <- yHat3 + RMSE2 * 1.96
CIlow2
## [1] 28.67965
CIhi2
## [1] 33.91311
CIlow3
## [1] 29.03553
CIhi3
## [1] 34.11317
The 95% confidence interval we calculated for the ANN model is 28.70 - 33.91. For the AAN model, that interval is 29.04 - 34.11. The interval is slightly thinner for the second model.
hilo(fc2, 95)
## # A tsibble: 4 x 6 [1Y]
## # Key: Country, .model [1]
## Country .model Year
## <fct> <chr> <dbl>
## 1 Italy ANN 2018
## 2 Italy ANN 2019
## 3 Italy ANN 2020
## 4 Italy ANN 2021
## # ℹ 3 more variables: Exports <dist>, .mean <dbl>, `95%` <hilo>
hilo(fc3, 95)
## # A tsibble: 4 x 6 [1Y]
## # Key: Country, .model [1]
## Country .model Year
## <fct> <chr> <dbl>
## 1 Italy AAN 2018
## 2 Italy AAN 2019
## 3 Italy AAN 2020
## 4 Italy AAN 2021
## # ℹ 3 more variables: Exports <dist>, .mean <dbl>, `95%` <hilo>
For 2018, the 95% confidence interval of the ANN model that R calculated ranges from 28.63 to 33.96, just slightly wider than the range just calculated. For the AAN model in that same year, the calculated range is from 28.94 to 34.21, which is also slightly wider than the calculated range.
First to isolate the Chinese GDP
china_gdp <- global_economy |>
filter(Code == "CHN")
china_gdp |>
autoplot(GDP)
We are seeing exponential growth in our data, so let us investigate which models can accommodate this best.
lambda <- china_gdp |>
features(GDP, features = guerrero) |>
pull(lambda_guerrero)
china_gdp_models <-
china_gdp |>
model("ANN" = ETS(GDP ~ error("A") + trend("N") + season("N")),
"AAN" = ETS(GDP ~ error("A") + trend("A") + season("N")),
"AAN (Damped)" = ETS(GDP ~ error("A") + trend("Ad") + season("N")),
"Box-Cox" = ETS(box_cox(GDP,lambda) ~ error("A") + trend("A") + season("N")),
"Box-Cox (Damped)" = ETS(box_cox(GDP,lambda) ~ error("A") + trend("Ad") + season("N")))
china_gdp_forecasts <-
china_gdp_models |>
forecast(h = 25)
china_gdp_forecasts |>
autoplot(china_gdp, level=NULL) +
labs(title = "Forecasting China's GDP Over 25 Years", x="")
The ANN model does not account for the trend, so its trajectory is much lower than expected. On the other hand, the Box-Cox transformation overemphasizes the exponential nature of the growth, resulting in unrealistically high predictions. The damping applied to the Box-Cox transformation makes the prediction much more realistic. Compared to the scale of difference that damping makes on the Box-Cox transformed prediction, the damped AAN model closely matches the undamped AAN model.
aus_production |>
autoplot(Gas)
We see strong seasonality in this data as well as a general upwards trend, so we should pick between models that take this into consideration.
aus_gas_models <-
aus_production |>
model("AAA" = ETS(Gas ~ error("A") + trend("A") + season("A")),
"MAM" = ETS(Gas ~ error("M") + trend("A") + season("M")),
"MAM (Damped)" = ETS(Gas ~ error("M") + trend("Ad") + season("M")))
gas_forecasts <-
aus_gas_models |>
forecast(h = "5 years")
gas_forecasts |>
autoplot(aus_production, level=NULL) +
facet_wrap(~ .model) +
labs(title = "Forecasting Gas Production in Australia (5 Years)", x="")
accuracy(aus_gas_models)
## # A tibble: 3 × 10
## .model .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 AAA Training 0.00525 4.76 3.35 -4.69 10.9 0.600 0.628 0.0772
## 2 MAM Training -0.115 4.60 3.02 0.199 4.08 0.542 0.606 -0.0131
## 3 MAM (Damped) Training -0.00439 4.59 3.03 0.326 4.10 0.544 0.606 -0.0217
As expected, the multiplicative models produce the strongest predictions, as shown by their lower RMSE. This makes sense because the seasonal variation is not constant, we see more variance as values rise. From these options, the damped multiplicative model seems the strongest.
set.seed(464319)
myseries <-aus_retail |>
filter(`Series ID` == sample(aus_retail$`Series ID`,1))
autoplot(myseries)
## Plot variable not specified, automatically selected `.vars = Turnover`
Just like the last data set, the variance is not constant through the time series. As the value of turnover are increasing, the variance increases as well.
retail_models <-
myseries |>
model("MAM" = ETS(Turnover ~ error("M") + trend("A") + season("M")),
"MAM (Damped)" = ETS(Turnover ~ error("M") + trend("Ad") + season("M")))
retail_forecasts <-
retail_models |>
forecast(h = "5 years")
retail_forecasts |>
autoplot(myseries, level=NULL) +
labs(title = "Predicted Turnover in South Australian Footwear Retail (5 Years)", x="")
accuracy(retail_models)
## # A tibble: 2 × 12
## State Industry .model .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 South… Footwea… MAM Trai… 0.0668 1.80 1.13 -0.252 4.46 0.539 0.605 0.133
## 2 South… Footwea… MAM (… Trai… 0.134 1.80 1.14 0.169 4.47 0.541 0.606 0.108
The two methods have almost the same RMSE, but the Holt-Winters’ Multiplicative method produced the slightly lower error.
myseries |>
model(multiplicative = ETS(Turnover ~ error("M") + trend("A") + season("M"))) |>
gg_tsresiduals() +
ggtitle("Multiplicative Method Residuals")
Looking at the Autocorrelation plot, I’m not as confident that I can say that the residuals are white noise, since we have lines that cross the significance threshold six times. At least the claim seems to be supposed in the other two graphs.
myseries_train <- myseries |>
filter(year(Month) < 2011)
train_fit <- myseries_train |>
model(multiplicative = ETS(Turnover ~ error("M") + trend("A") + season("M")),
snaive = SNAIVE(Turnover))
fc4 <- train_fit |>
forecast(new_data = anti_join(myseries, myseries_train))
## Joining with `by = join_by(State, Industry, `Series ID`, Month, Turnover)`
fc4 |>
autoplot(myseries)
accuracy(train_fit)
## # A tibble: 2 × 12
## State Industry .model .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Sout… Footwea… multi… Trai… 0.0186 1.14 0.793 -0.279 4.12 0.490 0.519 0.0851
## 2 Sout… Footwea… snaive Trai… 0.959 2.19 1.62 5.08 8.23 1 1 0.689
We can see that the naive model under performed in its predictions, predicting values lower than the actual observed ones. The multiplicative model produced values closer to those observed. Also the RMSE and MASE are much lower for the multiplicative model, nearly half of those from the naive model.
lambda <- myseries |>
features(Turnover, features = guerrero) |>
pull(lambda_guerrero)
myseries |>
model(STL(box_cox(Turnover,lambda) ~ season(window = "periodic"), robust = TRUE)) %>%
components() %>%
autoplot() +
ggtitle("STL with Box-Cox for South Australian Footwear Turnover Data")
ts_bc <- myseries |>
mutate(bc = box_cox(Turnover, lambda))
bc_fit <- ts_bc |>
model('STL (BoxCox)' = STL(bc ~ season(window = "periodic"), robust = T),
'ETS (BoxCox)' = ETS(bc))
accuracy(bc_fit)
## # A tibble: 2 × 12
## State Industry .model .type ME RMSE MAE MPE MAPE MASE RMSSE
## <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 South … Footwea… STL (… Trai… 7.16e-5 0.0275 0.0214 -0.0282 0.973 0.425 0.437
## 2 South … Footwea… ETS (… Trai… -2.09e-3 0.0329 0.0258 -0.117 1.17 0.513 0.522
## # ℹ 1 more variable: ACF1 <dbl>
In this case, our error increased after the ETS is applied.