library(fpp3)
## Registered S3 method overwritten by 'tsibble':
## method from
## as_tibble.grouped_df dplyr
## ── Attaching packages ──────────────────────────────────────────── fpp3 1.0.1 ──
## ✔ tibble 3.2.1 ✔ tsibble 1.1.6
## ✔ dplyr 1.1.4 ✔ tsibbledata 0.4.1
## ✔ tidyr 1.3.1 ✔ feasts 0.4.1
## ✔ lubridate 1.9.3 ✔ fable 0.4.1
## ✔ ggplot2 3.5.1
## ── Conflicts ───────────────────────────────────────────────── fpp3_conflicts ──
## ✖ lubridate::date() masks base::date()
## ✖ dplyr::filter() masks stats::filter()
## ✖ tsibble::intersect() masks base::intersect()
## ✖ tsibble::interval() masks lubridate::interval()
## ✖ dplyr::lag() masks stats::lag()
## ✖ tsibble::setdiff() masks base::setdiff()
## ✖ tsibble::union() masks base::union()
Do exercises 9.1, 9.2, 9.3, 9.5, 9.6, 9.7, and 9.8 in Hyndman.
Figure 9.32 shows the ACFs for 36 random numbers, 360 random numbers and 1,000 random numbers.
ACF for a white noise series of 36 numbers:
ACF for a white noise series of 360 numbers:
ACF for a white noise series of 1,000 numbers:
All three series are consistent with white noise. With more observations, the sample autocorrelations lie closer to zero and the critical-value bounds (±1.96/√T) become narrower, so the estimates are more precise around the true mean of zero.
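For reference, a minimal sketch (my own code, not from the text) that reproduces ACF plots like Figure 9.32; the ±1.96/√T significance bounds narrow as T grows:
set.seed(1)  # arbitrary seed, for reproducibility only
# helper to build a white-noise tsibble of length n (hypothetical name)
wn <- function(n) tsibble(idx = seq_len(n), y = rnorm(n), index = idx)
wn(36) |> ACF(y) |> autoplot()    # T = 36: widest bounds
wn(360) |> ACF(y) |> autoplot()   # T = 360
wn(1000) |> ACF(y) |> autoplot()  # T = 1000: narrowest bounds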
A classic example of a non-stationary series is stock prices. Plot the daily closing prices for Amazon stock (contained in gafa_stock), along with the ACF and PACF. Explain how each plot shows that the series is non-stationary and should be differenced.
amazon_stock <- gafa_stock |>
filter(Symbol == "AMZN")
amazon_stock |>
autoplot(Close) +
labs(y = "Amazon closing stock price ($USD)")
ACF:
amazon_stock |>
ACF(Close) |>
autoplot()
## Warning: Provided data has an irregular interval, results should be treated
## with caution. Computing ACF by observation.
The ACF decays to zero very slowly and the first autocorrelations are large and positive, which is characteristic of non-stationary data: the series should be differenced.
PACF:
amazon_stock |>
PACF(Close) |>
autoplot()
## Warning: Provided data has an irregular interval, results should be treated
## with caution. Computing ACF by observation.
The PACF has one dominant spike at lag 1 (close to 1), with a few smaller significant spikes at higher lags. A near-unit spike at lag 1 in the PACF, combined with the slowly decaying ACF, indicates the series is non-stationary and should be differenced.
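As a supplementary check (a sketch, not part of the original answer; the irregular trading-day index may trigger the same warning as above), we can let the KPSS-based unitroot_ndiffs feature estimate the required order of differencing and plot the differenced prices:
amazon_stock |>
  features(Close, unitroot_ndiffs)  # KPSS-based estimate of d

amazon_stock |>
  autoplot(difference(Close)) +
  labs(y = "Daily change in Amazon closing price ($USD)")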
For the following series, find an appropriate Box-Cox transformation and order of differencing in order to obtain stationary data.
Turkish GDP from global_economy.
Original data:
global_economy_turkey <- global_economy |>
filter(Country == "Turkey")
global_economy_turkey |>
autoplot(GDP)
Box-Cox transformation:
# guerrero box cox
lambda_global_economy_turkey <- global_economy_turkey |>
features(GDP, features = guerrero) |>
pull(lambda_guerrero)
lambda_global_economy_turkey
## [1] 0.1572187
global_economy_turkey |>
autoplot(box_cox(GDP, lambda_global_economy_turkey))
global_economy_turkey |>
autoplot(difference(box_cox(GDP, lambda_global_economy_turkey)))
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_line()`).
KPSS test:
global_economy_turkey |>
features(difference(box_cox(GDP, lambda_global_economy_turkey)), unitroot_kpss)
## # A tibble: 1 × 3
## Country kpss_stat kpss_pvalue
## <fct> <dbl> <dbl>
## 1 Turkey 0.0889 0.1
The p-value is reported as 0.1 (reported p-values are capped at 0.1), so we fail to reject the null hypothesis of stationarity: the differenced data appear stationary.
global_economy_turkey |>
features(box_cox(GDP, lambda_global_economy_turkey), unitroot_ndiffs)
## # A tibble: 1 × 2
## Country ndiffs
## <fct> <int>
## 1 Turkey 1
One difference is required to make the global_economy_turkey data stationary.
Accommodation takings in the state of Tasmania from aus_accommodation.
Original data:
aus_accommodation_tasmania <- aus_accommodation |>
filter(State == "Tasmania")
aus_accommodation_tasmania |>
autoplot(Takings)
Box-Cox transformation:
# guerrero box cox
lambda_aus_accommodation_tasmania <- aus_accommodation_tasmania |>
features(Takings, features = guerrero) |>
pull(lambda_guerrero)
lambda_aus_accommodation_tasmania
aus_accommodation_tasmania |>
autoplot(box_cox(Takings, lambda_aus_accommodation_tasmania))
aus_accommodation_tasmania |>
features(box_cox(Takings, lambda_aus_accommodation_tasmania), unitroot_ndiffs)
## # A tibble: 1 × 2
## State ndiffs
## <chr> <int>
## 1 Tasmania 1
One difference is required to make the aus_accommodation_tasmania data stationary.
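For completeness, a sketch mirroring the checks applied to the Turkish GDP above (a differenced plot and a KPSS test):
aus_accommodation_tasmania |>
  autoplot(difference(box_cox(Takings, lambda_aus_accommodation_tasmania)))

aus_accommodation_tasmania |>
  features(difference(box_cox(Takings, lambda_aus_accommodation_tasmania)),
           unitroot_kpss)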
Monthly sales from souvenirs.
Original data:
souvenirs |>
autoplot(Sales)
Box-Cox transformation:
# guerrero box cox
lambda_souvenirs <- souvenirs |>
features(Sales, features = guerrero) |>
pull(lambda_guerrero)
lambda_souvenirs
## [1] 0.002118221
souvenirs |>
autoplot(box_cox(Sales, lambda_souvenirs))
souvenirs |>
features(box_cox(Sales, lambda_souvenirs), unitroot_ndiffs)
## # A tibble: 1 × 1
## ndiffs
## <int>
## 1 1
One difference is required to make the souvenirs data stationary.
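Since souvenirs is monthly with strong seasonality, it is also worth checking whether a seasonal difference is suggested (a quick sketch, not part of the original answer):
souvenirs |>
  features(box_cox(Sales, lambda_souvenirs), unitroot_nsdiffs)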
For your retail data (from Exercise 7 in Section 2.10), find the appropriate order of differencing (after transformation if necessary) to obtain stationary data.
Original data:
set.seed(100)
myseries <- aus_retail |>
filter(`Series ID` == sample(aus_retail$`Series ID`,1))
Transformation:
lambda <- myseries |>
features(Turnover, features = guerrero) |>
pull(lambda_guerrero)
lambda
## [1] 0.1571524
myseries |>
autoplot(box_cox(Turnover, lambda))
myseries |>
features(box_cox(Turnover, lambda), unitroot_ndiffs)
## # A tibble: 1 × 3
## State Industry ndiffs
## <chr> <chr> <int>
## 1 New South Wales Supermarket and grocery stores 1
Check for seasonality:
myseries_total <- myseries |>
summarise(Turnover = sum(Turnover))
myseries_total |>
mutate(log_turnover = log(Turnover)) |>
features(log_turnover, unitroot_nsdiffs)
## # A tibble: 1 × 1
## nsdiffs
## <int>
## 1 1
myseries_total |>
mutate(log_turnover = difference(log(Turnover), 12)) |>
features(log_turnover, unitroot_ndiffs)
## # A tibble: 1 × 1
## ndiffs
## <int>
## 1 1
Using log, these functions suggest we should do both a seasonal difference and a first difference.
Using Box-Cox:
myseries |>
features(box_cox(Turnover, lambda), unitroot_nsdiffs)
## # A tibble: 1 × 3
## State Industry nsdiffs
## <chr> <chr> <int>
## 1 New South Wales Supermarket and grocery stores 1
myseries |>
features(difference(box_cox(Turnover, lambda)), unitroot_ndiffs)
## # A tibble: 1 × 3
## State Industry ndiffs
## <chr> <chr> <int>
## 1 New South Wales Supermarket and grocery stores 0
Using Box-Cox, these functions suggest only a seasonal difference is needed.
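To verify, a sketch plotting the Box-Cox transformed turnover after a lag-12 seasonal difference; the result should look roughly stationary:
myseries |>
  autoplot(difference(box_cox(Turnover, lambda), 12))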
Simulate and plot some data from simple ARIMA models.
AR(1) with ϕ1 = 0.6 and σ² = 1:
y <- numeric(100)
e <- rnorm(100)
for(i in 2:100)
  y[i] <- 0.6*y[i-1] + e[i]
sim <- tsibble(idx = seq_len(100), y = y, index = idx)
head(sim)
## # A tsibble: 6 x 2 [1]
## idx y
## <int> <dbl>
## 1 1 0
## 2 2 -0.0789
## 3 3 0.839
## 4 4 0.621
## 5 5 0.691
## 6 6 -0.167
sim |>
autoplot(y)
Increase ϕ1:
y <- numeric(100)
e <- rnorm(100)
for(i in 2:100)
  y[i] <- 0.9*y[i-1] + e[i]
sim2 <- tsibble(idx = seq_len(100), y = y, index = idx)
head(sim2)
## # A tsibble: 6 x 2 [1]
## idx y
## <int> <dbl>
## 1 1 0
## 2 2 -0.469
## 3 3 0.421
## 4 4 -1.08
## 5 5 -1.37
## 6 6 -2.01
sim2 |>
autoplot(y)
With ϕ1 increased to 0.9, the series is smoother and more persistent: successive values are more strongly correlated, so there is less period-to-period variation, though the series wanders further from its mean.
Decrease ϕ1:
y <- numeric(100)
e <- rnorm(100)
for(i in 2:100)
  y[i] <- 0.1*y[i-1] + e[i]
sim3 <- tsibble(idx = seq_len(100), y = y, index = idx)
head(sim3)
sim3 |>
autoplot(y)
With ϕ1 decreased to 0.1, the series is much choppier: successive values are nearly uncorrelated, so it looks close to white noise.
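To avoid repeating the loop for each ϕ1, the simulation could be wrapped in a small helper function (a sketch; the name sim_ar1 is mine, not from the text):
sim_ar1 <- function(phi, n = 100) {
  y <- numeric(n)
  e <- rnorm(n)
  for (i in 2:n)
    y[i] <- phi * y[i-1] + e[i]  # AR(1) recursion
  tsibble(idx = seq_len(n), y = y, index = idx)
}

sim_ar1(0.9) |> autoplot(y)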
MA(1) with θ1 = 0.6 and σ² = 1:
y <- numeric(100)
e <- rnorm(100)
for(i in 2:100)
  y[i] <- e[i] + 0.6 * e[i-1]
sim_ma <- tsibble(idx = seq_len(100), y = y, index = idx)
head(sim_ma)
## # A tsibble: 6 x 2 [1]
## idx y
## <int> <dbl>
## 1 1 0
## 2 2 -1.66
## 3 3 -1.28
## 4 4 -0.00809
## 5 5 1.26
## 6 6 0.750
Plot:
sim_ma |>
autoplot(y)
Increase θ1:
y <- numeric(100)
e <- rnorm(100)
for(i in 2:100)
  y[i] <- e[i] + 0.9 * e[i-1]
sim_ma <- tsibble(idx = seq_len(100), y = y, index = idx)
head(sim_ma)
## # A tsibble: 6 x 2 [1]
## idx y
## <int> <dbl>
## 1 1 0
## 2 2 1.31
## 3 3 0.255
## 4 4 -1.96
## 5 5 0.173
## 6 6 -1.03
sim_ma |>
autoplot(y)
With θ1 increased to 0.9 the variation is slightly smoother, but since an MA(1) process only carries one period of memory, the plot looks very similar to the θ1 = 0.6 case.
Decrease θ1:
y <- numeric(100)
e <- rnorm(100)
for(i in 2:100)
  y[i] <- e[i] + 0.1 * e[i-1]
sim_ma <- tsibble(idx = seq_len(100), y = y, index = idx)
head(sim_ma)
## # A tsibble: 6 x 2 [1]
## idx y
## <int> <dbl>
## 1 1 0
## 2 2 -0.311
## 3 3 -1.97
## 4 4 0.0497
## 5 5 -0.338
## 6 6 2.40
sim_ma |>
autoplot(y)
With θ1 decreased to 0.1 the series is almost indistinguishable from white noise.
ARMA(1,1) with ϕ1 = 0.6, θ1 = 0.6 and σ² = 1:
y <- numeric(100)
e <- rnorm(100)
for(i in 2:100)
  y[i] <- 0.6*y[i-1] + 0.6*e[i-1] + e[i]
sim_arma <- tsibble(idx = seq_len(100), y = y, index = idx)
head(sim_arma)
## # A tsibble: 6 x 2 [1]
## idx y
## <int> <dbl>
## 1 1 0
## 2 2 -1.26
## 3 3 -0.732
## 4 4 0.830
## 5 5 -0.519
## 6 6 -0.625
AR(2) with ϕ1 = −0.8, ϕ2 = 0.3 and σ² = 1 (these parameters give a non-stationary series):
y <- numeric(100)
e <- rnorm(100)
for(i in 3:100)
  y[i] <- -0.8*y[i-1] + 0.3*y[i-2] + e[i]
sim_ar_2 <- tsibble(idx = seq_len(100), y = y, index = idx)
head(sim_ar_2)
## # A tsibble: 6 x 2 [1]
## idx y
## <int> <dbl>
## 1 1 0
## 2 2 0
## 3 3 0.891
## 4 4 -2.19
## 5 5 4.03
## 6 6 -2.97
sim_arma |>
autoplot(y)
sim_ar_2 |>
autoplot(y)
Graph 1 (the ARMA(1,1) series) shows fairly constant variation around its mean. Graph 2 (the AR(2) series) oscillates between positive and negative values with exponentially growing amplitude (note the y-axis scale): these parameters violate the stationarity conditions, so the series is non-stationary.
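The ACFs make the contrast explicit (a quick sketch): the stationary ARMA(1,1) autocorrelations should die out quickly, while the non-stationary AR(2) series should produce large autocorrelations that alternate in sign.
sim_arma |> ACF(y) |> autoplot()
sim_ar_2 |> ACF(y) |> autoplot()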
Consider aus_airpassengers, the total number of passengers (in millions) from Australian air carriers for the period 1970–2011. Use ARIMA() to find an appropriate ARIMA model. What model was selected? Check that the residuals look like white noise. Plot forecasts for the next 10 periods.
Original data:
aus_airpassengers_filtered <- aus_airpassengers |>
filter(Year <= 2011)
aus_airpassengers_filtered |>
autoplot(Passengers)
The data show a clear upward trend but no seasonality.
fit <- aus_airpassengers_filtered |>
model(ARIMA(Passengers))
report(fit)
## Series: Passengers
## Model: ARIMA(0,2,1)
##
## Coefficients:
## ma1
## -0.8756
## s.e. 0.0722
##
## sigma^2 estimated as 4.671: log likelihood=-87.8
## AIC=179.61 AICc=179.93 BIC=182.99
An ARIMA(0,2,1) model was selected.
Residuals:
fit |>
gg_tsresiduals()
# Ljung-Box test
augment(fit) |> features(.innov, ljung_box)
## # A tibble: 1 × 3
## .model lb_stat lb_pvalue
## <chr> <dbl> <dbl>
## 1 ARIMA(Passengers) 2.38 0.123
The p-value is larger than 0.05, which suggests the residuals are white noise.
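The call above uses the default lag = 1; the textbook convention for non-seasonal data is lag = 10, which is worth checking too (a sketch):
augment(fit) |> features(.innov, ljung_box, lag = 10)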
Plot forecasts:
fit |> forecast(h=10) |>
autoplot(aus_airpassengers_filtered) +
labs(title = "Total number of passengers (in millions) from Australian air carriers")
Since d = 2, the model involves the second-order difference, denoted (1 − B)²:
y″t = yt − 2yt−1 + yt−2 = (1 − B)²yt
In backshift notation the fitted ARIMA(0,2,1) model is therefore (1 − B)²yt = (1 + θ1B)εt, with θ1 = −0.8756.
Now plot forecasts from an ARIMA(0,1,0) model with drift and compare:
fit2 <- aus_airpassengers_filtered |>
model(ARIMA(Passengers ~ pdq(0,1,0)))
report(fit2)
## Series: Passengers
## Model: ARIMA(0,1,0) w/ drift
##
## Coefficients:
## constant
## 1.3669
## s.e. 0.3319
##
## sigma^2 estimated as 4.629: log likelihood=-89.08
## AIC=182.17 AICc=182.48 BIC=185.59
fit2 |> forecast(h=10) |>
autoplot(aus_airpassengers_filtered) +
labs(title = "Total number of passengers (in millions) from Australian air carriers")
These forecasts have a less steep slope than those from the ARIMA(0,2,1) model: passenger numbers do not increase as rapidly. The AICc is also larger for this model.
Now plot forecasts from an ARIMA(2,1,2) model with drift:
fit3 <- aus_airpassengers_filtered |>
model(ARIMA(Passengers ~ pdq(2,1,2)))
report(fit3)
## Series: Passengers
## Model: ARIMA(2,1,2) w/ drift
##
## Coefficients:
## ar1 ar2 ma1 ma2 constant
## 1.4694 -0.5103 -1.5736 0.6780 0.0650
## s.e. 0.3780 0.3558 0.3081 0.2688 0.0294
##
## sigma^2 estimated as 4.748: log likelihood=-87.74
## AIC=187.47 AICc=189.94 BIC=197.75
fit3 |> forecast(h=10) |>
autoplot(aus_airpassengers_filtered) +
labs(title = "Total number of passengers (in millions) from Australian air carriers")
This model has the largest AICc value of the three, and its forecasts look very similar to those from part (c).
Without a constant:
fit4 <- aus_airpassengers_filtered |>
model(ARIMA(Passengers ~ 0 + pdq(2,1,2)))
## Warning: 1 error encountered for ARIMA(Passengers ~ 0 + pdq(2, 1, 2))
## [1] non-stationary AR part from CSS
report(fit4)
## Series: Passengers
## Model: NULL model
## NULL model
fit4 |> forecast(h=10) |>
autoplot(aus_airpassengers_filtered) +
labs(title = "Total number of passengers (in millions) from Australian air carriers")
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning
## -Inf
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning
## -Inf
## Warning: Removed 10 rows containing missing values or values outside the scale range
## (`geom_line()`).
Removing the constant causes estimation to fail ("non-stationary AR part from CSS"), so a NULL model is returned and no forecasts are produced.
Now fit an ARIMA(0,2,1) model with a constant:
fit5 <- aus_airpassengers_filtered |>
model(ARIMA(Passengers ~ 1 + pdq(0,2,1)))
## Warning: Model specification induces a quadratic or higher order polynomial trend.
## This is generally discouraged, consider removing the constant or reducing the number of differences.
report(fit5)
## Series: Passengers
## Model: ARIMA(0,2,1) w/ poly
##
## Coefficients:
## ma1 constant
## -1.0000 0.0721
## s.e. 0.0709 0.0260
##
## sigma^2 estimated as 4.086: log likelihood=-85.74
## AIC=177.48 AICc=178.15 BIC=182.55
fit5 |> forecast(h=10) |>
autoplot(aus_airpassengers_filtered) +
labs(title = "Total number of passengers (in millions) from Australian air carriers")
Including a constant when d = 2 induces a quadratic polynomial trend in the forecasts, which is why the software warns against this specification.
For the United States GDP series (from global_economy):
Original data:
global_economy_us <- global_economy |>
filter(Country == "United States")
global_economy_us |>
autoplot(GDP)
The variation does not change much over the series, so a Box-Cox transformation does not appear necessary.
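As a quick check (a sketch, not part of the original answer), the Guerrero feature estimates the Box-Cox λ; a value near 1 would confirm that little transformation is needed:
global_economy_us |>
  features(GDP, features = guerrero)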
fit <- global_economy_us |>
model(ARIMA(GDP))
report(fit)
## Series: GDP
## Model: ARIMA(0,2,2)
##
## Coefficients:
## ma1 ma2
## -0.4206 -0.3048
## s.e. 0.1197 0.1078
##
## sigma^2 estimated as 2.615e+22: log likelihood=-1524.08
## AIC=3054.15 AICc=3054.61 BIC=3060.23
fit |> forecast(h=25) |>
autoplot(global_economy_us)
ARIMA(0,2,2) was selected.
global_economy_us |>
features(GDP, unitroot_ndiffs)
## # A tibble: 1 × 2
## Country ndiffs
## <fct> <int>
## 1 United States 2
So d should be 2. Let's examine the double-differenced data:
global_economy_us |>
gg_tsdisplay(difference(GDP) |> difference(), plot_type='partial')
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).
The PACF is sinusoidal and the ACF has a significant spike at lag 7 but none beyond it. This indicates an ARIMA(0,d,q) model.
Let’s try (0, 2, q) combinations:
fit <- global_economy_us |>
  model(arima020 = ARIMA(GDP ~ pdq(0,2,0)),
        arima021 = ARIMA(GDP ~ pdq(0,2,1)),
        arima022 = ARIMA(GDP ~ pdq(0,2,2)))
fit |> pivot_longer(!Country, names_to = "Model name",
                    values_to = "Orders")
## # A mable: 3 x 3
## # Key: Country, Model name [3]
## Country `Model name` Orders
## <fct> <chr> <model>
## 1 United States arima020 <ARIMA(0,2,0)>
## 2 United States arima021 <ARIMA(0,2,1)>
## 3 United States arima022 <ARIMA(0,2,2)>
glance(fit) |> arrange(AICc) |> select(.model:BIC)
## # A tibble: 3 × 6
## .model sigma2 log_lik AIC AICc BIC
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 arima022 2.61e22 -1524. 3054. 3055. 3060.
## 2 arima021 2.92e22 -1528. 3059. 3059. 3063.
## 3 arima020 3.10e22 -1530. 3061. 3061. 3063.
The AICc values are fairly close, but arima022 has the lowest, so that is the model we will use.
fit |>
  select(arima022) |>
  gg_tsresiduals()
augment(fit) |>
  filter(.model == 'arima022') |>
  features(.innov, ljung_box)
## # A tibble: 1 × 4
## Country .model lb_stat lb_pvalue
## <fct> <chr> <dbl> <dbl>
## 1 United States arima022 0.0352 0.851
The p-value is large (greater than 0.05), suggesting the residuals are white noise.
fit |>
forecast(h=25) |>
filter(.model=='arima022') |>
autoplot(global_economy_us)
This forecast does look reasonable.
Compare the results with what you would obtain using ETS() (with no transformation):
global_economy_us |>
  slice(-n()) |>
  stretch_tsibble(.init = 10) |>
  model(
    ETS(GDP),
    ARIMA(GDP)
  ) |>
  forecast(h = 25) |>
  accuracy(global_economy_us) |>
  select(.model, RMSE:MAPE)
## Warning: The future dataset is incomplete, incomplete out-of-sample data will be treated as missing.
## 24 observations are missing between 2018 and 2041
## # A tibble: 2 × 5
## .model RMSE MAE MPE MAPE
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 ARIMA(GDP) 2.09e12 1.51e12 15.9 17.4
## 2 ETS(GDP) 1.95e12 1.43e12 14.4 15.8
fit <- global_economy_us |>
model(ets = ETS(GDP))
report(fit)
## Series: GDP
## Model: ETS(M,A,N)
## Smoothing parameters:
## alpha = 0.9990876
## beta = 0.5011949
##
## Initial states:
## l[0] b[0]
## 448093333334 64917355687
##
## sigma^2: 7e-04
##
## AIC AICc BIC
## 3190.787 3191.941 3201.089
fit |>
  forecast(h = 25) |>
  autoplot(global_economy_us, level = NULL)
ETS has the smaller RMSE (and MAE and MAPE) in the cross-validated comparison, and its forecast plot looks very similar to the ARIMA forecasts.
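For a direct visual comparison, a sketch overlaying the two sets of forecasts on the full series:
global_economy_us |>
  model(ETS(GDP), ARIMA(GDP)) |>
  forecast(h = 25) |>
  autoplot(global_economy_us, level = NULL)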