library(fpp3)
library(tidyverse)
All of the figures indicate that the data are white noise: in every ACF plot the spikes stay within the blue dashed bounds. The plots share the same axes and scales; they differ in the heights of the spikes and the width of the significance bounds. As the number of observations increases, both the spikes and the bounds shrink, which is exactly what we expect for white noise.
The critical values that set the bounds are ±1.96/√T, where T is the length of the time series, so the bounds tighten toward zero as T grows.
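To make this concrete, here is a quick computation of the bounds at a few illustrative series lengths (the values of T are assumptions, not from the original exercise):
n_obs <- c(36, 360, 1000)
tibble(T = n_obs, bound = 1.96 / sqrt(n_obs))   # bounds: about 0.327, 0.103, 0.062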
amazon <- gafa_stock |>
filter(Symbol == "AMZN")
amazon |>
gg_tsdisplay(Close, plot_type='partial') +
labs(title = "Amazon Closing Stock Price")
## Warning: Provided data has an irregular interval, results should be treated with caution. Computing ACF by observation.
We can tell this series is non-stationary: there is a clear positive trend, every spike in the ACF rises above the blue dashed bound, and the PACF has a very large spike at lag 1 with a few smaller ones elsewhere. This is all consistent with strong correlation between successive observations.
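As a formal check (not part of the original analysis), a KPSS unit root test should reject stationarity; feasts reports a p-value of 0.01 to mean p ≤ 0.01.
amazon |>
  features(Close, unitroot_kpss)   # a small p-value indicates non-stationarity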
turk <- global_economy |>
filter(Country == "Turkey")
turk |>
gg_tsdisplay(GDP, plot_type='partial') +
labs(title = "Turkish GDP (no transformation)")
Looking at the data as-is, the series is clearly not stationary: there is a strong positive trend and each observation is highly correlated with the previous one. A Box-Cox transformation can stabilise the growing variance, and differencing can then remove the trend.
lambda <- turk |>
features(GDP, features = guerrero) |>
pull(lambda_guerrero)
turk |>
features(box_cox(GDP,lambda), unitroot_ndiffs)
## # A tibble: 1 × 2
## Country ndiffs
## <fct> <int>
## 1 Turkey 1
turk |>
gg_tsdisplay(difference(box_cox(GDP,lambda)), plot_type='partial') +
labs(title = "Turkish GDP Differentiated where lambda = .16")
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).
taz <- aus_accommodation |>
filter(State == "Tasmania")
taz |>
gg_tsdisplay(Takings, plot_type='partial') +
labs(title = "Tasmanian Accomodation Taking (no transformation)")
This series is also non-stationary: there is an upward trend and a clear seasonal pattern that repeats each year in the quarterly data.
lambda <- taz |>
features(Takings, features = guerrero) |>
pull(lambda_guerrero)
taz |>
features(box_cox(Takings,lambda), unitroot_ndiffs)
## # A tibble: 1 × 2
## State ndiffs
## <chr> <int>
## 1 Tasmania 1
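Given the strong seasonal pattern, it is also worth checking how many seasonal differences are suggested (an extra check, not in the original) before taking the lag-4 seasonal difference below:
taz |>
  features(box_cox(Takings, lambda), unitroot_nsdiffs)   # suggested number of seasonal differences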
taz |>
gg_tsdisplay(difference(box_cox(Takings,lambda), 4), plot_type='partial') +
labs(title = "Tasmanian Accomodation Taking Transformed where lambda = .002")
## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_point()`).
souvenirs |>
gg_tsdisplay(Sales, plot_type='partial') +
labs(title = "Souvenir Sales (no transformation)")
Again we see strong seasonality, with high autocorrelations, particularly at lags 1 and 12.
lambda <- souvenirs |>
features(Sales, features = guerrero) |>
pull(lambda_guerrero)
souvenirs |>
features(box_cox(Sales,lambda), unitroot_ndiffs)
## # A tibble: 1 × 1
## ndiffs
## <int>
## 1 1
souvenirs |>
gg_tsdisplay(difference(box_cox(Sales,lambda), 12), plot_type='partial') +
labs(title = "Souvenir Sales Transformed where lambda = .002")
## Warning: Removed 12 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 12 rows containing missing values or values outside the scale range
## (`geom_point()`).
set.seed(464319)
myseries <- aus_retail |>
  filter(`Series ID` == sample(aus_retail$`Series ID`, 1))
autoplot(myseries) +
labs(title = "South Australian Footwear & Accessory Retail")
## Plot variable not specified, automatically selected `.vars = Turnover`
This data has clear seasonality and an overall positive trend, so it is not stationary.
lambda <- myseries |>
features(Turnover, features = guerrero) |>
pull(lambda_guerrero)
myseries |>
features(box_cox(Turnover,lambda), unitroot_ndiffs)
## # A tibble: 1 × 3
## State Industry ndiffs
## <chr> <chr> <int>
## 1 South Australia Footwear and other personal accessory retailing 1
myseries |>
gg_tsdisplay(difference(box_cox(Turnover,lambda), 12), plot_type='partial') +
labs(title = "Footwear & Accessory Turnover in South Australia Transformed where lambda = -.188")
## Warning: Removed 12 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 12 rows containing missing values or values outside the scale range
## (`geom_point()`).
With the ACF of the transformed, seasonally differenced series falling quickly toward zero, the transformation appears to have helped. The unit root test returns 1, suggesting that only one difference is necessary.
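As an extra check (not in the original), a KPSS test on the transformed, seasonally differenced series should fail to reject stationarity; feasts reports a p-value of 0.1 to mean p ≥ 0.1.
myseries |>
  features(difference(box_cox(Turnover, lambda), 12), unitroot_kpss)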
y <- numeric(100)   # storage for the simulated series
e <- rnorm(100)     # one set of white-noise errors, reused in every simulation below
for(i in 2:100)
  y[i] <- 0.6 * y[i-1] + e[i]   # AR(1) with phi = 0.6
sim <- tsibble(idx = seq_len(100), y = y, index = idx)
autoplot(sim) +
labs(title = "phi = .6")
## Plot variable not specified, automatically selected `.vars = y`
Now vary phi to see how the plot is affected.
for(i in 2:100)
y[i] <- 0.1*y[i-1] + e[i]
sim2 <- tsibble(idx = seq_len(100), y = y, index = idx)
autoplot(sim2) +
labs(title = "phi = .1")
## Plot variable not specified, automatically selected `.vars = y`
for(i in 2:100)
y[i] <- 1*y[i-1] + e[i]
sim3 <- tsibble(idx = seq_len(100), y = y, index = idx)
autoplot(sim3) +
labs(title = "phi = 1")
## Plot variable not specified, automatically selected `.vars = y`
Changing phi changes the appearance of the series: the smaller phi is, the more the series resembles white noise, while phi = 1 gives a random walk that wanders without reverting to a mean. To confirm the pattern, let's check a negative value of phi.
for(i in 2:100)
y[i] <- -0.6*y[i-1] + e[i]
sim4 <- tsibble(idx = seq_len(100), y = y, index = idx)
autoplot(sim4) +
labs(title = "phi = -.6")
## Plot variable not specified, automatically selected `.vars = y`
With a negative phi, the series oscillates around zero, tending to flip sign from one observation to the next.
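These experiments could be wrapped in a small helper function (a hypothetical refactor, not in the original code), reusing one set of errors so the plots stay comparable:
sim_ar1 <- function(phi, e) {
  y <- numeric(length(e))
  for (i in 2:length(e))
    y[i] <- phi * y[i-1] + e[i]   # AR(1): y_t = phi * y_{t-1} + e_t
  tsibble(idx = seq_along(e), y = y, index = idx)
}
# e.g. autoplot(sim_ar1(0.9, e), y)
Next, simulate an MA(1) process, y_t = e_t + theta * e_{t-1}, starting with theta = 0.6: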
y[1] <- e[1]   # MA(1): y_t = e_t + theta * e_{t-1}, taking e[0] as zero
for(i in 2:100)
  y[i] <- e[i] + 0.6 * e[i-1]
sim5 <- tsibble(idx = seq_len(100), y = y, index = idx)
autoplot(sim5) +
labs(title = "theta = .6")
## Plot variable not specified, automatically selected `.vars = y`
And some other values of theta to compare:
for(i in 2:100)
y[i] <- e[i] + 0.1 * e[i-1]
sim6 <- tsibble(idx = seq_len(100), y = y, index = idx)
autoplot(sim6) +
labs(title = "theta = .1")
## Plot variable not specified, automatically selected `.vars = y`
for(i in 2:100)
y[i] <- e[i] + 1 * e[i-1]
sim7 <- tsibble(idx = seq_len(100), y = y, index = idx)
autoplot(sim7) +
labs(title = "theta = 1")
## Plot variable not specified, automatically selected `.vars = y`
for(i in 2:100)
y[i] <- e[i] + -0.6 * e[i-1]
sim8 <- tsibble(idx = seq_len(100), y = y, index = idx)
autoplot(sim8) +
labs(title = "theta = -.6")
## Plot variable not specified, automatically selected `.vars = y`
As theta increases from 0 toward 1, successive values become more positively correlated and the series looks smoother, while a negative theta makes the series flip sign more rapidly. The magnitude of the spikes is broadly similar across these values of theta, since the variance of an MA(1) is (1 + theta^2) * sigma^2.
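The theoretical lag-1 autocorrelation of an MA(1) process is theta / (1 + theta^2), which makes the smoothing effect of a larger positive theta concrete (computed here for the thetas used above):
theta <- c(0.1, 0.6, 1, -0.6)
tibble(theta = theta, rho1 = theta / (1 + theta^2))   # rho1 of about 0.099, 0.441, 0.5, -0.441
Next, generate an ARMA(1,1) series and an AR(2) series for comparison: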
y <- numeric(100)
y[1] <- e[1]   # ARMA(1,1) with phi = 0.6, theta = 0.6
for(i in 2:100)
  y[i] <- 0.6 * y[i-1] + e[i] + 0.6 * e[i-1]
arma1_1 <- tsibble(idx = seq_len(100), y = y, index = idx)
y <- numeric(100)
y[1] <- e[1]
y[2] <- -0.8 * y[1] + e[2]
for(i in 3:100)
  y[i] <- -0.8 * y[i-1] + 0.3 * y[i-2] + e[i]   # AR(2): y_t = -0.8 y_{t-1} + 0.3 y_{t-2} + e_t
ar2 <- tsibble(idx = seq_len(100), y=y, index = idx)
gg_tsdisplay(arma1_1, plot_type='partial') +
labs(title = "ARMA (1,1)")
## Plot variable not specified, automatically selected `y = y`
gg_tsdisplay(ar2, plot_type='partial') +
labs(title = "AR2")
## Plot variable not specified, automatically selected `y = y`
The ARMA(1,1) series looks stationary, but the AR(2) series clearly is not: its oscillations grow over time, and its autocorrelations are large and alternate in sign (a cyclic pattern rather than true seasonality).
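The non-stationarity of the AR(2) series can be verified against the standard AR(2) stationarity conditions (phi1 + phi2 < 1, phi2 − phi1 < 1, |phi2| < 1):
phi1 <- -0.8; phi2 <- 0.3
c(phi1 + phi2 < 1, phi2 - phi1 < 1, abs(phi2) < 1)   # TRUE FALSE TRUE: the second condition fails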
fit <- aus_airpassengers |>
filter(Year < 2012) |>
model(ARIMA(Passengers))
report(fit)
## Series: Passengers
## Model: ARIMA(0,2,1)
##
## Coefficients:
## ma1
## -0.8756
## s.e. 0.0722
##
## sigma^2 estimated as 4.671: log likelihood=-87.8
## AIC=179.61 AICc=179.93 BIC=182.99
An ARIMA(0,2,1) model was selected by the function as the best fitting option.
fit |>
forecast(h=10) |>
autoplot(aus_airpassengers) +
labs(title = "Australian Aircraft Passengers with ARIMA(0,2,1)", y = "Passengers (in millions)")
This plot shows the forecast for the next 10 years.
fit |>
gg_tsresiduals() +
labs(title = "Residuals for ARIMA(0,2,1)")
The residuals look like white noise: the residual plot shows no pattern and no ACF spike crosses the dashed bounds.
In backshift notation, the fitted model is (1 − B)^2 y_t = (1 − 0.876B) ε_t.
pass010 <- aus_airpassengers |>
filter(Year < 2012) |>
model(ARIMA(Passengers ~ pdq(0,1,0)))
pass010 |>
forecast(h=10) |>
autoplot(aus_airpassengers) +
labs(title = "Australian Aircraft Passengers with ARIMA(0,1,0)")
pass010 |>
gg_tsresiduals() +
labs(title = "Residuals for ARIMA(0,1,0)")
The two models produce broadly similar forecasts and residuals, though the ACF spikes are slightly higher for the ARIMA(0,1,0) model, which supports the automatically selected ARIMA(0,2,1) as the better fit.
pass212 <- aus_airpassengers |>
filter(Year < 2012) |>
model(ARIMA(Passengers ~ pdq(2,1,2)))
pass212 |>
forecast(h=10) |>
autoplot(aus_airpassengers) +
labs(title = "Australian Aircraft Passengers with ARIMA(2,1,2)")
pass212 |>
gg_tsresiduals() +
labs(title = "Residuals for ARIMA(2,1,2)")
The residual ACF makes this model look at least as good as the automatically selected one, so why wasn't it chosen? ARIMA() first chooses the number of differences d with a unit root test and only then compares p and q by AICc, so this d = 1 model was never in the running against the d = 2 choice; AICc values are not comparable across different amounts of differencing anyway.
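One defensible way to compare models with different orders of differencing (a check not run in the original) is out-of-sample accuracy on the held-out years:
aus_airpassengers |>
  filter(Year < 2012) |>
  model(arima021 = ARIMA(Passengers ~ pdq(0,2,1)),
        arima212 = ARIMA(Passengers ~ pdq(2,1,2))) |>
  forecast(h = 5) |>
  accuracy(aus_airpassengers) |>
  select(.model, RMSE, MAPE)   # lower values indicate better test-set accuracy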
fit <- aus_airpassengers |>
  filter(Year < 2012) |>
  model(ARIMA(Passengers ~ 0 + pdq(2,1,2)))
## Warning: 1 error encountered for ARIMA(Passengers ~ 0 + pdq(2, 1, 2))
## [1] non-stationary AR part from CSS
Removing the constant produces an error: without a constant the AR part is forced to absorb the trend, and the conditional-sum-of-squares (CSS) estimates of the AR coefficients fall outside the stationary region, so estimation fails.
pass021 <- aus_airpassengers |>
filter(Year < 2012) |>
model(ARIMA(Passengers ~ 1 + pdq(0,2,1)))
## Warning: Model specification induces a quadratic or higher order polynomial trend.
## This is generally discouraged, consider removing the constant or reducing the number of differences.
pass021 |>
forecast(h=10) |>
autoplot(aus_airpassengers) +
labs(title = "Australian Aircraft Passengers with ARIMA(0,2,1)")
Although this specification did produce forecasts, it came with the warning shown above: with two differences, including a constant induces a quadratic (or higher order) polynomial trend, which is generally discouraged. Compare with the same model without the constant:
pass021 <- aus_airpassengers |>
filter(Year < 2012) |>
model(ARIMA(Passengers ~ pdq(0,2,1)))
pass021 |>
forecast(h=10) |>
autoplot(aus_airpassengers) +
labs(title = "Australian Aircraft Passengers with ARIMA(0,2,1)")
With d = 2, the constant induces a quadratic trend in the forecasts, so they curve upward more steeply than the data warrant; without the constant, the forecasts follow a locally linear trend.
us <- global_economy |>
filter(Code == "USA")
us |>
autoplot(GDP) +
labs(title = "USA GDP Undifferentiated")
There is a consistent trend and non-constant variance, so a Box-Cox transformation would be helpful.
lambda <- us |>
features(GDP, features = guerrero) |>
pull(lambda_guerrero)
us |>
features(box_cox(GDP,lambda), unitroot_ndiffs)
## # A tibble: 1 × 2
## Country ndiffs
## <fct> <int>
## 1 United States 1
us |>
gg_tsdisplay(difference(box_cox(GDP,lambda)), plot_type='partial') +
labs(title = "USA GDP Differentiated where lambda = .281")
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).
The unit root test recommends a single first difference, and the ACF and PACF of the differenced series look much closer to those of a stationary series.
usmodel <- us |>
model(ARIMA(box_cox(GDP, lambda)))
report(usmodel)
## Series: GDP
## Model: ARIMA(1,1,0) w/ drift
## Transformation: box_cox(GDP, lambda)
##
## Coefficients:
## ar1 constant
## 0.4586 118.1822
## s.e. 0.1198 9.5047
##
## sigma^2 estimated as 5479: log likelihood=-325.32
## AIC=656.65 AICc=657.1 BIC=662.78
ARIMA() selects an ARIMA(1,1,0) model with drift for the Box-Cox-transformed data.
Since the unit root test indicated that one difference is ideal, we focus on models with d = 1, but include a d = 2 model for comparison.
usa_fit <- us |>
model(arima110 = ARIMA(box_cox(GDP,lambda) ~ pdq(1,1,0)),
arima120 = ARIMA(box_cox(GDP,lambda) ~ pdq(1,2,0)),
arima210 = ARIMA(box_cox(GDP,lambda) ~ pdq(2,1,0)),
arima212 = ARIMA(box_cox(GDP,lambda) ~ pdq(2,1,2)),
arima111 = ARIMA(box_cox(GDP,lambda) ~ pdq(1,1,1)),
arima310 = ARIMA(box_cox(GDP,lambda) ~ pdq(3,1,0)))
glance(usa_fit) |> arrange(AICc) |> select(.model:BIC)
## # A tibble: 6 × 6
## .model sigma2 log_lik AIC AICc BIC
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 arima120 6780. -326. 656. 656. 660.
## 2 arima110 5479. -325. 657. 657. 663.
## 3 arima111 5580. -325. 659. 659. 667.
## 4 arima210 5580. -325. 659. 659. 667.
## 5 arima310 5676. -325. 661. 662. 671.
## 6 arima212 5734. -325. 662. 664. 674.
The lowest AICc belongs to the ARIMA(1,2,0) model; with two differences, the trend is handled by the differencing itself, so no drift term is needed. Note, though, that AICc values are only comparable between models with the same amount of differencing, so this ranking does not fairly pit the d = 2 model against the d = 1 models.
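Restricting the ranking to the models that share d = 1, which can be compared fairly (a follow-up check, not in the original), arima110 comes out on top; we nonetheless proceed with the ARIMA(1,2,0) model selected above.
glance(usa_fit) |>
  filter(.model != "arima120") |>   # drop the only d = 2 model
  arrange(AICc) |>
  select(.model:BIC)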
usa_fit |>
select(arima120) |>
gg_tsresiduals() +
ggtitle("Residuals for ARIMA(1,2,0)")
One spike crosses the dotted line in the ACF, but otherwise these residuals look like white noise, which is the desired outcome.
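A Ljung-Box test (an additional check, not in the original) can back up the visual impression; dof = 1 accounts for the single AR coefficient:
usa_fit |>
  select(arima120) |>
  augment() |>
  features(.innov, ljung_box, lag = 10, dof = 1)   # a large p-value is consistent with white noise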
usa_fit |>
forecast(h=15) |>
filter(.model=='arima120') |>
autoplot(us) +
labs(title = "US GDP Plus Prediction from ARIMA (1,2,0)")
These do seem like reasonable forecasts. They continue the observed trend, but there is a wide range of possibilities.
fit_ets <- us |>
model(ETS(GDP))
report(fit_ets)
## Series: GDP
## Model: ETS(M,A,N)
## Smoothing parameters:
## alpha = 0.9990876
## beta = 0.5011949
##
## Initial states:
## l[0] b[0]
## 448093333334 64917355687
##
## sigma^2: 7e-04
##
## AIC AICc BIC
## 3190.787 3191.941 3201.089
The ETS model has an AIC of 3190.787 versus roughly 656 for the chosen ARIMA(1,2,0) model (see the table above), but these numbers cannot be compared: AIC is only meaningful between models of the same class fitted to the same data, and here the ARIMA model was fitted to the Box-Cox-transformed GDP while the ETS model used the raw series. A fairer comparison would use a hold-out test set or time series cross-validation.
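A sketch of such a hold-out comparison, assuming the last ten years as a test set (the split point and accuracy measures are illustrative choices):
train <- us |> filter(Year <= 2007)
train |>
  model(arima = ARIMA(box_cox(GDP, lambda) ~ pdq(1,2,0)),
        ets = ETS(GDP)) |>
  forecast(h = 10) |>
  accuracy(us) |>
  select(.model, RMSE, MAPE)   # lower values indicate better out-of-sample accuracy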