9. ARIMA Models

Exercises 9.1, 9.2, 9.3, 9.5, 9.6, 9.7, 9.8

#Load Libraries
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tsibble)
## Registered S3 method overwritten by 'tsibble':
##   method               from 
##   as_tibble.grouped_df dplyr
## 
## Attaching package: 'tsibble'
## 
## The following object is masked from 'package:lubridate':
## 
##     interval
## 
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, union
library(fabletools)
library(feasts)
library(tsibbledata)
library(fable)
library(dplyr)
library(fpp3)
library(ggplot2)

9.1

  1. Explain the differences among these figures. Do they all indicate that the data are white noise? Yes all three figures indicate that the data are white noise but differ primarily due to the varying sample sizes. The variability in the estimated autocorrelations decreases with increasing sample size.

  2. Why are the critical values at different distances from the mean of zero? Why are the autocorrelations different in each figure when they each refer to white noise? The critical values are diffeent distances from the mean of zero because smaller sample sizes lead to wider bounds, making it more likely for a spike to cross the threshold even by chance. With smaller n, this random variation is larger so the differences in autocorrelations and critical values are purely due to sample size.

9.2

# GAFA stock dataset
data("gafa_stock")

# Daily Closing Prices: Amazon

amazon_data <- gafa_stock %>%
  filter(Symbol == "AMZN") %>%
  select(Date, Close)

amazon_data %>%
  ggplot(aes(x = Date, y = Close)) +
  geom_line(color = "steelblue") +
  labs(title = "Amazon Daily Closing Prices",
       y = "Closing Price (USD)",
       x = "Date") +
  theme_minimal()

amazon_tsibble <- gafa_stock %>%
  filter(Symbol == "AMZN") %>%
  select(Date, Close) %>%
  as_tsibble(index = Date)

# ACF and PACF plots
amazon_tsibble %>%
  ACF(Close) %>%
  autoplot() +
  labs(title = "ACF of Amazon Closing Prices")
## Warning: Provided data has an irregular interval, results should be treated
## with caution. Computing ACF by observation.

amazon_tsibble %>%
  PACF(Close) %>%
  autoplot() +
  labs(title = "PACF of Amazon Closing Prices")
## Warning: Provided data has an irregular interval, results should be treated
## with caution. Computing ACF by observation.

The slow, gradual decline of the ACF suggests the presence of a trend and non-stationarity. The strong initial spike suggests that the series is highly dependent on its most recent past value, again hinting that is is non-stationary.

9.3

# Turkish GDP from global_economy
turkey_gdp <- global_economy %>%
  filter(Country == "Turkey") %>%
  select(Year, GDP) %>%
  as_tsibble(index = Year)

turkey_gdp %>% autoplot(GDP) + ggtitle("Turkish GDP")

# Find Box-Cox lambda
turkey_lambda <- turkey_gdp %>% features(GDP, features = guerrero) %>% pull(lambda_guerrero)

turkey_lambda
## [1] 0.1572187
turkey_gdp %>%
  mutate(transformed = box_cox(GDP, turkey_lambda)) %>%
  features(transformed, unitroot_kpss)
## # A tibble: 1 × 2
##   kpss_stat kpss_pvalue
##       <dbl>       <dbl>
## 1      1.50        0.01
# Plot ACF/PACF after transformation
turkey_gdp %>%
  mutate(transformed = box_cox(GDP, turkey_lambda)) %>%
  ACF(transformed) %>% autoplot()

turkey_gdp %>%
  mutate(transformed = box_cox(GDP, turkey_lambda)) %>%
  PACF(transformed) %>% autoplot()

glimpse(aus_accommodation)
## Rows: 592
## Columns: 5
## Key: State [8]
## $ Date      <qtr> 1998 Q1, 1998 Q2, 1998 Q3, 1998 Q4, 1999 Q1, 1999 Q2, 1999 Q…
## $ State     <chr> "Australian Capital Territory", "Australian Capital Territor…
## $ Takings   <dbl> 24.27889, 22.32354, 22.52939, 24.39096, 23.72143, 25.38245, …
## $ Occupancy <dbl> 65.0, 59.0, 58.0, 59.0, 58.0, 61.0, 66.0, 60.0, 60.9, 64.7, …
## $ CPI       <dbl> 67.0, 67.4, 67.5, 67.8, 67.8, 68.1, 68.7, 69.1, 69.7, 70.2, …
# Accommodation takings in Tasmania from aus_accommodation
tas_accom <- aus_accommodation %>%
  filter(State == "Tasmania") %>%
  select(Date, Takings) %>%
  as_tsibble(index = Date)

tas_accom %>%
  autoplot(Takings) +
  labs(title = "Accommodation Takings in Tasmania", y = "Takings (Million AUD)", x = "Quarter")

# Estimate Box-Cox lambda
lambda_tas <- tas_accom %>%
  features(Takings, features = guerrero) %>%
  pull(lambda_guerrero)

# ACF & PACF after transformation
tas_accom %>%
  mutate(transformed = box_cox(Takings, lambda_tas)) %>%
  ACF(transformed) %>%
  autoplot()

tas_accom %>%
  mutate(transformed = box_cox(Takings, lambda_tas)) %>%
  PACF(transformed) %>%
  autoplot()

# Monthly sales from souvenirs
souvenirs %>% autoplot(Sales) + ggtitle("Monthly Souvenir Sales")

# Estimate Box-Cox lambda
sales_lambda <- souvenirs %>% features(Sales, features = guerrero) %>% pull(lambda_guerrero)

souvenirs %>%
  mutate(transformed = box_cox(Sales, sales_lambda)) %>%
  features(transformed, unitroot_kpss)
## # A tibble: 1 × 2
##   kpss_stat kpss_pvalue
##       <dbl>       <dbl>
## 1      1.79        0.01
# Visualize ACF/PACF
souvenirs %>%
  mutate(transformed = box_cox(Sales, sales_lambda)) %>%
  ACF(transformed) %>% autoplot()

souvenirs %>%
  mutate(transformed = box_cox(Sales, sales_lambda)) %>%
  PACF(transformed) %>% autoplot()

### 9.5

set.seed(123) 
myseries <- aus_retail %>%
  filter(`Series ID` == sample(aus_retail$`Series ID`, 1)) %>%
  select(Month, Turnover) %>%
  as_tsibble(index = Month)

myseries %>%
  autoplot(Turnover) +
  labs(title = "Randomly Sampled Retail Series", y = "Turnover", x = "Month")

lambda <- myseries %>%
  features(Turnover, features = guerrero) %>%
  pull(lambda_guerrero)

lambda  
## [1] 0.2151641
# Apply transformation
myseries_trans <- myseries %>%
  mutate(transformed = box_cox(Turnover, lambda))

# ACF and PACF
myseries_trans %>% ACF(transformed) %>% autoplot()

myseries_trans %>% PACF(transformed) %>% autoplot()

# KPSS test
myseries_trans %>%
  features(transformed, unitroot_kpss)
## # A tibble: 1 × 2
##   kpss_stat kpss_pvalue
##       <dbl>       <dbl>
## 1      7.35        0.01
myseries_trans %>%
  mutate(diff1 = difference(transformed)) %>%
  features(diff1, unitroot_kpss)
## # A tibble: 1 × 2
##   kpss_stat kpss_pvalue
##       <dbl>       <dbl>
## 1    0.0157         0.1

9.6

#a
set.seed(123)

y <- numeric(100)
e <- rnorm(100, mean = 0, sd = 1)
for (i in 2:100) {
  y[i] <- 0.6 * y[i - 1] + e[i]
}

sim_ar1 <- tsibble(idx = seq_len(100), y = y, index = idx)

#b
autoplot(sim_ar1, y) +
  labs(title = "AR(1) Simulation (ϕ₁ = 0.6)", y = "y", x = "Time")

#c 
set.seed(123)

e <- rnorm(101)  
y <- numeric(100)
for (i in 1:100) {
  y[i] <- e[i + 1] + 0.6 * e[i]
}

sim_ma1 <- tsibble(idx = seq_len(100), y = y, index = idx)

#d
autoplot(sim_ma1, y) +
  labs(title = "MA(1) Simulation (θ₁ = 0.6)", y = "y", x = "Time")

#e
set.seed(123)

e <- rnorm(101)
y <- numeric(100)
for (i in 2:100) {
  y[i] <- 0.6 * y[i - 1] + e[i] + 0.6 * e[i - 1]
}

sim_arma11 <- tsibble(idx = seq_len(100), y = y, index = idx)

autoplot(sim_arma11, y) +
  labs(title = "ARMA(1,1) Simulation (ϕ₁ = 0.6, θ₁ = 0.6)", y = "y", x = "Time")

#f
set.seed(123)

e <- rnorm(100)
y <- numeric(100)
y[1:2] <- e[1:2]  

for (i in 3:100) {
  y[i] <- -0.8 * y[i - 1] + 0.3 * y[i - 2] + e[i]
}

sim_ar2_nonstat <- tsibble(idx = seq_len(100), y = y, index = idx)

autoplot(sim_ar2_nonstat, y) +
  labs(title = "AR(2) Non-Stationary Simulation (ϕ₁ = -0.8, ϕ₂ = 0.3)", y = "y", x = "Time")

#g
sim_arma11_tbl <- sim_arma11 %>%
  as_tibble() %>%
  mutate(type = "ARMA(1,1)")

sim_ar2_tbl <- sim_ar2_nonstat %>%
  as_tibble() %>%
  mutate(type = "AR(2) Non-Stationary")

combined <- bind_rows(sim_arma11_tbl, sim_ar2_tbl)

ggplot(combined, aes(x = idx, y = y, color = type)) +
  geom_line() +
  labs(
    title = "Comparison of ARMA(1,1) and AR(2) Non-Stationary Series",
    x = "Time",
    y = "y"
  ) +
  theme_minimal()

### 9.7

#a
fit_auto <- aus_airpassengers %>%
  model(auto = ARIMA(Passengers))

report(fit_auto)
## Series: Passengers 
## Model: ARIMA(0,2,1) 
## 
## Coefficients:
##           ma1
##       -0.8963
## s.e.   0.0594
## 
## sigma^2 estimated as 4.308:  log likelihood=-97.02
## AIC=198.04   AICc=198.32   BIC=201.65
fit_auto %>%
  gg_tsresiduals()

fit_auto %>%
  forecast(h = 10) %>%
  autoplot(aus_airpassengers) +
  labs(title = "Forecast: Auto ARIMA", y = "Passengers (millions)")

#b
tidy(fit_auto)
## # A tibble: 1 × 6
##   .model term  estimate std.error statistic  p.value
##   <chr>  <chr>    <dbl>     <dbl>     <dbl>    <dbl>
## 1 auto   ma1     -0.896    0.0594     -15.1 3.24e-19
#c
fit_rw_drift <- aus_airpassengers %>%
  model(rw_drift = ARIMA(Passengers ~ drift()))
## Warning: 1 error encountered for rw_drift
## [1] could not find function "drift"
fit_rw_drift %>%
  forecast(h = 10) %>%
  autoplot(aus_airpassengers) +
  labs(title = "Forecast: ARIMA(0,1,0) with Drift")
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning
## -Inf
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning
## -Inf
## Warning: Removed 10 rows containing missing values or values outside the scale range
## (`geom_line()`).

#d
fit_212 <- aus_airpassengers %>%
  model(arima212 = ARIMA(Passengers ~ pdq(2,1,2) + drift()))
## Warning: 1 error encountered for arima212
## [1] could not find function "drift"
fit_212 %>%
  forecast(h = 10) %>%
  autoplot(aus_airpassengers) +
  labs(title = "Forecast: ARIMA(2,1,2) with Drift")
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning
## -Inf
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning
## -Inf
## Warning: Removed 10 rows containing missing values or values outside the scale range
## (`geom_line()`).

fit_212_nodrift <- aus_airpassengers %>%
  model(arima212_nodrift = ARIMA(Passengers ~ pdq(2,1,2)))
## Warning: It looks like you're trying to fully specify your ARIMA model but have not said if a constant should be included.
## You can include a constant using `ARIMA(y~1)` to the formula or exclude it by adding `ARIMA(y~0)`.
## Warning: 1 error encountered for arima212_nodrift
## [1] Could not find an appropriate ARIMA model.
## This is likely because automatic selection does not select models with characteristic roots that may be numerically unstable.
## For more details, refer to https://otexts.com/fpp3/arima-r.html#plotting-the-characteristic-roots
fit_212_nodrift %>%
  forecast(h = 10) %>%
  autoplot(aus_airpassengers) +
  labs(title = "Forecast: ARIMA(2,1,2) without Drift")
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning
## -Inf
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning
## -Inf
## Warning: Removed 10 rows containing missing values or values outside the scale range
## (`geom_line()`).

#e
fit_021 <- aus_airpassengers %>%
  model(arima021 = ARIMA(Passengers ~ pdq(0,2,1) + drift()))
## Warning: 1 error encountered for arima021
## [1] could not find function "drift"
fit_021 %>%
  forecast(h = 10) %>%
  autoplot(aus_airpassengers) +
  labs(title = "Forecast: ARIMA(0,2,1) with Constant")
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning
## -Inf
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning
## -Inf
## Warning: Removed 10 rows containing missing values or values outside the scale range
## (`geom_line()`).

### 9.8

#a
data(global_economy)

us_gdp <- global_economy %>%
  filter(Country == "United States") %>%
  select(Year, GDP) %>%
  as_tsibble(index = Year)

lambda_us <- us_gdp %>%
  features(GDP, features = guerrero) %>%
  pull(lambda_guerrero)

lambda_us  
## [1] 0.2819443
#b
fit_auto_arima <- us_gdp %>%
  model(auto_arima = ARIMA(box_cox(GDP, lambda_us)))

report(fit_auto_arima)
## Series: GDP 
## Model: ARIMA(1,1,0) w/ drift 
## Transformation: box_cox(GDP, lambda_us) 
## 
## Coefficients:
##          ar1  constant
##       0.4586  118.1822
## s.e.  0.1198    9.5047
## 
## sigma^2 estimated as 5479:  log likelihood=-325.32
## AIC=656.65   AICc=657.1   BIC=662.78
#c
fit_manual <- us_gdp %>%
  model(
    arima111 = ARIMA(box_cox(GDP, lambda_us) ~ pdq(1,1,1)),
    arima012 = ARIMA(box_cox(GDP, lambda_us) ~ pdq(0,1,2))
  )

glance(fit_manual)
## # A tibble: 2 × 8
##   .model   sigma2 log_lik   AIC  AICc   BIC ar_roots  ma_roots 
##   <chr>     <dbl>   <dbl> <dbl> <dbl> <dbl> <list>    <list>   
## 1 arima111  5580.   -325.  659.  659.  667. <cpl [1]> <cpl [1]>
## 2 arima012  5634.   -326.  659.  660.  667. <cpl [0]> <cpl [2]>
#d
fit_manual %>% select(arima111) %>% gg_tsresiduals()

fit_manual %>% select(arima012) %>% gg_tsresiduals()

#e
fit_best <- fit_manual 

fit_best %>%
  forecast(h = 10) %>%
  autoplot(us_gdp) +
  labs(title = "Forecasts from Selected ARIMA Model", y = "GDP (millions USD)")

#f
fit_ets <- us_gdp %>%
  model(ets = ETS(GDP))

fit_ets %>%
  forecast(h = 10) %>%
  autoplot(us_gdp) +
  labs(title = "Forecasts from ETS Model", y = "GDP (millions USD)")

bind_models <- bind_rows(
  forecast(fit_auto_arima, h = 10) %>% mutate(Model = "ARIMA"),
  forecast(fit_ets, h = 10) %>% mutate(Model = "ETS")
)

autoplot(us_gdp, level = NULL) +
  geom_line(data = bind_models, aes(y = .mean, color = Model)) +
  labs(title = "ARIMA vs ETS Forecast Comparison", y = "GDP")
## Plot variable not specified, automatically selected `.vars = GDP`
## Warning in geom_line(...): Ignoring unknown parameters: `level`