library(tidyverse)  # data wrangling and ggplot2
library(fpp3)       # tsibble, feasts and fable (tidy forecasting)
library(forecast)   # auto.arima(), Arima(), BoxCox.lambda(), ndiffs()
library(urca)       # ur.df() augmented Dickey-Fuller test
library(gridExtra)  # grid.arrange() for side-by-side plots
library(patchwork)  # plot composition
All three figures indicate that the data are white noise, as most of the autocorrelations fall within the blue dashed bounds. The exception is the 36-number white noise series, where a few spikes exceed the bounds, as is expected by chance in a short sample.
The blue dashed lines mark the critical bounds, at approximately ±1.96/√T for a series of length T, so they narrow as the sample size grows, making it easier to confirm that the autocorrelations are close to zero. White noise is a completely random process, so each sample exhibits different random variation, and this variation is more pronounced in smaller samples, producing larger spikes in the ACF.
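These bounds can be reproduced directly. A minimal sketch, assuming series lengths of 36, 360, and 1,000 (only the 36-number series is named above; the other two lengths are illustrative):
# Simulate white noise of three lengths; the ACF bounds at +/-1.96/sqrt(T)
# visibly narrow as T grows
set.seed(123)
wn <- bind_rows(
  tibble(len = "T = 36", idx = 1:36, y = rnorm(36)),
  tibble(len = "T = 360", idx = 1:360, y = rnorm(360)),
  tibble(len = "T = 1000", idx = 1:1000, y = rnorm(1000))
) %>%
  as_tsibble(index = idx, key = len)
wn %>% ACF(y) %>% autoplot()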
# Load and filter Amazon stock data
amazon_stock <- gafa_stock %>%
filter(Symbol == "AMZN")
# Plot the time series
autoplot(amazon_stock, Close) +
ggtitle("Amazon Stock Prices (Closing)") +
ylab("Closing Price") + xlab("Year")
The closing price shows a strong upward trend, but in late 2018 it fell sharply amid increased volatility.
# ACF plots
amazon_stock %>%
ACF(Close) %>%
autoplot() +
ggtitle("Autocorrelation Function (ACF) of Amazon Closing Prices")
The ACF decays very slowly as the lag increases, a classic sign of non-stationarity: past values continue to influence the series far into the future.
# PACF plots
amazon_stock %>%
PACF(Close) %>%
autoplot() +
ggtitle("Partial Autocorrelation Function (PACF) of Amazon Closing Prices")
The PACF is dominated by a large spike at lag 1 (close to 1), consistent with random-walk behaviour and a further sign of non-stationarity.
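As a quick follow-up sketch (not part of the original analysis), first-differencing the closing price should remove most of this autocorrelation; if the irregular trading-day index triggers a warning, re-index with row_number() first:
# ACF of the differenced closing price: a random-walk-like series
# should leave roughly white-noise differences
amazon_stock %>%
  mutate(Close_diff = difference(Close)) %>%
  ACF(Close_diff) %>%
  autoplot() +
  ggtitle("ACF of Differenced Amazon Closing Prices")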
# Load Turkish GDP data
turkey_gdp <- global_economy %>%
filter(Country == "Turkey")
# Check for missing values
sum(is.na(turkey_gdp$GDP))
## [1] 0
# Identify Box-Cox transformation parameter
lambda_gdp <- turkey_gdp %>%
features(GDP, features = guerrero) %>%
pull(lambda_guerrero)
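Note that lambda_gdp is estimated here but the differencing below is applied to the raw series. A minimal sketch of applying the transformation first and letting unitroot_ndiffs confirm the required order of differencing:
# Box-Cox transform, then check how many differences are needed
turkey_gdp %>%
  mutate(GDP_bc = box_cox(GDP, lambda_gdp)) %>%
  features(GDP_bc, unitroot_ndiffs)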
# Plot the original GDP series
autoplot(turkey_gdp, GDP) + ggtitle("Turkish GDP Time Series")
# Difference the GDP series
turkey_gdp <- turkey_gdp %>%
mutate(GDP_diff = difference(GDP))
# Plot differenced series
autoplot(turkey_gdp, GDP_diff) + ggtitle("Differenced Turkish GDP")
# ACF of the differenced series
turkey_gdp %>%
  ACF(GDP_diff) %>%
  autoplot()
# PACF of the differenced series
turkey_gdp %>%
  PACF(GDP_diff) %>%
  autoplot()
# Load data
tas_acc <- aus_accommodation %>%
filter(State == "Tasmania")
# Check for missing values
sum(is.na(tas_acc$Takings))
## [1] 0
# Identify Box-Cox lambda
lambda_tas <- tas_acc %>%
features(Takings, features = guerrero) %>%
pull(lambda_guerrero)
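Takings are quarterly with pronounced seasonality, so a seasonal difference of the transformed series is worth checking alongside the plain first difference used below. A sketch:
# Seasonal difference (lag 4) of the Box-Cox-transformed quarterly takings
tas_acc %>%
  mutate(takings_sdiff = difference(box_cox(Takings, lambda_tas), lag = 4)) %>%
  autoplot(takings_sdiff) +
  ggtitle("Seasonally Differenced Box-Cox Takings (lag 4)")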
# Plot series
autoplot(tas_acc, Takings) + ggtitle("Accommodation Takings in Tasmania")
# Difference the Takings series
tas_acc <- tas_acc %>%
mutate(takings_diff = difference(Takings))
# Plot differenced series
autoplot(tas_acc, takings_diff) + ggtitle("Differenced Accommodation Takings in the State of Tasmania")
# ACF of the differenced series
tas_acc %>%
  ACF(takings_diff) %>%
  autoplot()
# PACF of the differenced series
tas_acc %>%
  PACF(takings_diff) %>%
  autoplot()
# Load souvenirs sales data
souvenir_sales <- souvenirs
# Check for missing values
sum(is.na(souvenir_sales$Sales))
## [1] 0
# Identify Box-Cox transformation
lambda_sales <- souvenir_sales %>%
features(Sales, features = guerrero) %>%
pull(lambda_guerrero)
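Souvenir sales are monthly with strongly multiplicative seasonality (lambda_sales typically comes out close to zero here, i.e. near a log transformation), so a lag-12 seasonal difference of the transformed series is a natural complement to the first difference below. A sketch:
# Seasonal difference (lag 12) of the Box-Cox-transformed monthly sales
souvenir_sales %>%
  mutate(sales_sdiff = difference(box_cox(Sales, lambda_sales), lag = 12)) %>%
  autoplot(sales_sdiff) +
  ggtitle("Seasonally Differenced Box-Cox Sales (lag 12)")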
# Plot original series
autoplot(souvenir_sales, Sales) + ggtitle("Monthly Souvenir Sales")
# Difference the sales series
souvenir_sales <- souvenir_sales %>%
mutate(sales_diff = difference(Sales))
# Plot differenced series
autoplot(souvenir_sales, sales_diff) + ggtitle("Differenced Monthly Souvenir Sales")
# ACF of the differenced series
souvenir_sales %>%
  ACF(sales_diff) %>%
  autoplot()
# PACF of the differenced series
souvenir_sales %>%
  PACF(sales_diff) %>%
  autoplot()
# KPSS test for stationarity
turkey_gdp %>%
features(GDP_diff, unitroot_kpss)
## # A tibble: 1 × 3
## Country kpss_stat kpss_pvalue
## <fct> <dbl> <dbl>
## 1 Turkey 0.386 0.0834
# KPSS test for stationarity
tas_acc %>%
features(takings_diff, unitroot_kpss)
## # A tibble: 1 × 3
## State kpss_stat kpss_pvalue
## <chr> <dbl> <dbl>
## 1 Tasmania 0.231 0.1
# KPSS test for stationarity
souvenir_sales %>%
features(sales_diff, unitroot_kpss)
## # A tibble: 1 × 2
## kpss_stat kpss_pvalue
## <dbl> <dbl>
## 1 0.240 0.1
All three differenced series appear stationary. The KPSS null hypothesis is stationarity, so a p-value above 0.05 means the null is not rejected, which is the case for each series here.
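For contrast, running the same test on the undifferenced series should reject stationarity (a sketch; output not shown):
# KPSS on the raw series: a small p-value here would confirm that
# differencing was needed in the first place
turkey_gdp %>% features(GDP, unitroot_kpss)
tas_acc %>% features(Takings, unitroot_kpss)
souvenir_sales %>% features(Sales, unitroot_kpss)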
# Load data
set.seed(1225)
myseries <- aus_retail |>
filter(`Series ID` == sample(aus_retail$`Series ID`,1))
# Plot the original time series
autoplot(myseries, Turnover) + ggtitle("Original Time Series")
# Apply Box-Cox transformation if necessary
lambda <- BoxCox.lambda(myseries$Turnover) # Estimate lambda
myseries <- myseries |> mutate(Transformed = BoxCox(Turnover, lambda))
d <- ndiffs(myseries$Transformed)
d
## [1] 1
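ndiffs() is the forecast package's helper; the equivalent fpp3-style check (a sketch) uses the unitroot_ndiffs feature on the tsibble directly:
# fpp3 equivalent of ndiffs(): should also return 1 here
myseries %>% features(Transformed, unitroot_ndiffs)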
# Apply first-order differencing
myseries <- myseries |> mutate(Diffed = difference(Transformed, lag = 1, differences = d))
# Remove NAs created by differencing
myseries <- myseries |> drop_na(Diffed)
# Plot the differenced series
autoplot(myseries, Diffed) + ggtitle("Differenced Time Series")
# ADF test to confirm stationarity
adf_test_diffed <- ur.df(myseries$Diffed, type = "drift", selectlags = "AIC")
summary(adf_test_diffed)
##
## ###############################################
## # Augmented Dickey-Fuller Test Unit Root Test #
## ###############################################
##
## Test regression drift
##
##
## Call:
## lm(formula = z.diff ~ z.lag.1 + 1 + z.diff.lag)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.058513 -0.010064 0.000949 0.011327 0.051101
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.0021346 0.0009005 2.371 0.0182 *
## z.lag.1 -1.6152555 0.0739053 -21.856 < 2e-16 ***
## z.diff.lag 0.2635997 0.0462383 5.701 2.2e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.01874 on 435 degrees of freedom
## Multiple R-squared: 0.6643, Adjusted R-squared: 0.6628
## F-statistic: 430.5 on 2 and 435 DF, p-value: < 2.2e-16
##
##
## Value of test-statistic is: -21.8557 238.8412
##
## Critical values for test statistics:
## 1pct 5pct 10pct
## tau2 -3.44 -2.87 -2.57
## phi1 6.47 4.61 3.79
# KPSS test for stationarity
myseries %>%
features(Diffed, unitroot_kpss)
## # A tibble: 1 × 4
## State Industry kpss_stat kpss_pvalue
## <chr> <chr> <dbl> <dbl>
## 1 New South Wales Takeaway food services 0.0177 0.1
The appropriate order of differencing is d = 1. Both tests agree: the ADF statistic (-21.86) is far below the 1% critical value (-3.44), rejecting the unit-root null, and the KPSS p-value (0.1) exceeds 0.05, so stationarity is not rejected after first differencing.
# Simulate an AR(1) process with phi1 = 0.6
set.seed(1048)
y <- numeric(100)
e <- rnorm(100, mean = 0, sd = 1) # White noise
for(i in 2:100) {
y[i] <- 0.6 * y[i - 1] + e[i]
}
sim_ar1 <- tsibble(idx = seq_len(100), y = y, index = idx)
ggplot(sim_ar1, aes(x = idx, y = y)) +
geom_line() +
ggtitle("Simulated AR(1) Process (phi1 = 0.6)") +
xlab("Time") +
ylab("Value")
Increasing phi1 toward 1 makes the series more persistent, approaching a random walk at phi1 = 1, while values near 0 make it resemble white noise.
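A sketch comparing a few phi1 values (0.1, 0.6, and 0.95, chosen for illustration) with the same error sequence makes this visible:
# Simulate AR(1) for several phi values, reusing one error sequence
# so the only difference between panels is the persistence
simulate_ar1 <- function(phi, n = 100, seed = 1225) {
  set.seed(seed)
  e <- rnorm(n)
  y <- numeric(n)
  for (i in 2:n) y[i] <- phi * y[i - 1] + e[i]
  tibble(idx = seq_len(n), y = y, label = paste0("phi = ", phi))
}
ar1_runs <- bind_rows(lapply(c(0.1, 0.6, 0.95), simulate_ar1))
ggplot(ar1_runs, aes(idx, y)) +
  geom_line() +
  facet_wrap(~label, ncol = 1) +
  labs(x = "Time", y = "Value", title = "AR(1) for Different phi1 Values")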
# Simulate an MA(1) process with theta1 = 0.6
y_ma <- numeric(100)
e_ma <- rnorm(100, mean = 0, sd = 1)
y_ma[1] <- e_ma[1]  # no lagged error is available at t = 1
for(i in 2:100) {
y_ma[i] <- e_ma[i] + 0.6 * e_ma[i - 1]
}
sim_ma1 <- tsibble(idx = seq_len(100), y = y_ma, index = idx)
ggplot(sim_ma1, aes(x = idx, y = y)) +
geom_line() +
ggtitle("Simulated MA(1) Process (theta1 = 0.6)") +
xlab("Time") +
ylab("Value")
Higher values of theta1 increase the short-run autocorrelation of the process, while lower values reduce it.
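This follows from the theoretical lag-1 autocorrelation of an MA(1) process, theta1 / (1 + theta1^2); a quick numeric sketch:
# Lag-1 autocorrelation of an MA(1): theta / (1 + theta^2);
# it grows with |theta| but is bounded by 0.5 (attained at theta = 1)
theta <- c(0.1, 0.6, 0.9)
round(theta / (1 + theta^2), 3)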
# Simulate an ARMA(1,1) process with phi1 = 0.6 and theta1 = 0.6
y_arma <- numeric(100)
e_arma <- rnorm(100, mean = 0, sd = 1)
y_arma[1] <- e_arma[1]
for(i in 2:100) {
y_arma[i] <- 0.6 * y_arma[i - 1] + e_arma[i] + 0.6 * e_arma[i - 1]
}
sim_arma11 <- tsibble(idx = seq_len(100), y = y_arma, index = idx)
# Simulate an AR(2) process with phi1 = -0.8 and phi2 = 0.3
# (these coefficients violate the AR(2) stationarity conditions)
y_ar2 <- numeric(100)
e_ar2 <- rnorm(100, mean = 0, sd = 1)
for(i in 3:100) {
y_ar2[i] <- -0.8 * y_ar2[i - 1] + 0.3 * y_ar2[i - 2] + e_ar2[i]
}
sim_ar2 <- tsibble(idx = seq_len(100), y = y_ar2, index = idx)
p1 <- ggplot(sim_arma11, aes(x = idx, y = y)) +
geom_line() +
ggtitle("Simulated ARMA(1,1) Process") +
xlab("Time") +
ylab("Value")
p2 <- ggplot(sim_ar2, aes(x = idx, y = y)) +
geom_line() +
ggtitle("Simulated AR(2) Process") +
xlab("Time") +
ylab("Value")
# Display plots side by side
grid.arrange(p1, p2, ncol = 2)
The ARMA(1,1) series is stationary, fluctuating around zero with moderate short-run persistence. The AR(2) series, by contrast, is non-stationary: its coefficients violate the stationarity conditions, so it oscillates with steadily growing amplitude instead of settling around a mean.
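The instability of the AR(2) can be confirmed from its characteristic polynomial 1 - phi1*z - phi2*z^2, whose roots must all lie outside the unit circle for stationarity. A sketch:
# Roots of 1 - phi1*z - phi2*z^2 with phi1 = -0.8, phi2 = 0.3
roots <- polyroot(c(1, 0.8, -0.3))
Mod(roots)
# One root has modulus below 1, so the process is non-stationary:
# the simulation oscillates with steadily growing amplitude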
# Load the dataset
data("aus_airpassengers")
# Convert to a ts object, preserving the annual time index
ts_data <- ts(aus_airpassengers$Passengers,
start = min(aus_airpassengers$Year), frequency = 1)
auto_model <- auto.arima(ts_data)
print(auto_model)
## Series: ts_data
## ARIMA(0,2,1)
##
## Coefficients:
## ma1
## -0.8963
## s.e. 0.0594
##
## sigma^2 = 4.308: log likelihood = -97.02
## AIC=198.04 AICc=198.32 BIC=201.65
# Check residuals
checkresiduals(auto_model)
##
## Ljung-Box test
##
## data: Residuals from ARIMA(0,2,1)
## Q* = 6.5774, df = 8, p-value = 0.5828
##
## Model df: 1. Total lags used: 9
# Forecast for the next 10 periods
forecast_auto <- forecast(auto_model, h=10)
autoplot(forecast_auto)
cat("The selected model in backshift notation is: \n")
## The selected model in backshift notation is:
print(auto_model)
## Series: ts_data
## ARIMA(0,2,1)
##
## Coefficients:
## ma1
## -0.8963
## s.e. 0.0594
##
## sigma^2 = 4.308: log likelihood = -97.02
## AIC=198.04 AICc=198.32 BIC=201.65
arima_drift <- Arima(ts_data, order=c(0,1,0), include.drift=TRUE)
forecast_drift <- forecast(arima_drift, h=10)
autoplot(forecast_drift) + ggtitle("ARIMA(0,1,0) with Drift")
Compared with the automatically selected ARIMA(0,2,1) model, the ARIMA(0,1,0) model with drift extrapolates a straight line at the historical average growth rate, producing lower forecasts.
arima_212_drift <- Arima(ts_data, order=c(2,1,2), include.drift=TRUE)
forecast_212_drift <- forecast(arima_212_drift, h=10)
autoplot(forecast_212_drift) + ggtitle("ARIMA(2,1,2) with Drift")
Compared with the two previous models, the ARIMA(2,1,2) forecasts show more curvature in the first few periods, reflecting the additional AR and MA dynamics, before settling onto the drift trend.
arima_212_no_const <- Arima(ts_data, order=c(2,1,2), include.constant=FALSE, method="ML")
forecast_212_no_const <- forecast(arima_212_no_const, h=10)
autoplot(forecast_212_no_const) + ggtitle("ARIMA(2,1,2) without Constant (ML Method)")
Without the constant there is no drift term, so once the AR dynamics die out the point forecasts level off instead of continuing the historical upward trend.
arima_021_constant <- Arima(ts_data, order=c(0,2,1), include.constant=TRUE)
forecast_021_constant <- forecast(arima_021_constant, h=10)
autoplot(forecast_021_constant) + ggtitle("ARIMA(0,2,1) with Constant")
The resulting plot is identical to the automatically selected ARIMA(0,2,1) model: with d = 2, forecast::Arima() does not fit a constant (which would imply a quadratic trend), so the include.constant argument has no effect here.
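To summarise the candidates numerically, their AICc values can be pulled from the fitted forecast::Arima objects (a sketch; note that models with different orders of differencing are not directly comparable on AICc):
# Collect AICc for the fitted candidate models
data.frame(
  model = c("ARIMA(0,2,1)", "ARIMA(0,1,0)+drift", "ARIMA(2,1,2)+drift"),
  AICc = c(auto_model$aicc, arima_drift$aicc, arima_212_drift$aicc)
)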
# Load the dataset
data("global_economy")
# Filter for United States GDP data
us_gdp <- global_economy %>% filter(Country == "United States") %>% select(Year, GDP)
# Check for transformations using Guerrero's method
lambda <- us_gdp %>% features(GDP, guerrero) %>% pull(lambda_guerrero)
# Display lambda value
lambda
## [1] 0.2819443
# Apply Box-Cox transformation if needed
us_gdp <- us_gdp %>% mutate(GDP_transformed = box_cox(GDP, lambda))
# Plot transformed series
autoplot(us_gdp, GDP_transformed)
# Fit an automatic ARIMA model
fit_arima <- us_gdp %>% model(ARIMA(GDP_transformed))
# Summarize the ARIMA model
report(fit_arima)
## Series: GDP_transformed
## Model: ARIMA(1,1,0) w/ drift
##
## Coefficients:
## ar1 constant
## 0.4586 118.1822
## s.e. 0.1198 9.5047
##
## sigma^2 estimated as 5479: log likelihood=-325.32
## AIC=656.65 AICc=657.1 BIC=662.78
# Try alternative ARIMA models manually
fit_arima2 <- us_gdp %>% model(ARIMA(GDP_transformed ~ pdq(2,1,2) + PDQ(0,1,1)))
# Compare models using AICc
glance(fit_arima)
## # A tibble: 1 × 8
## .model sigma2 log_lik AIC AICc BIC ar_roots ma_roots
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <list> <list>
## 1 ARIMA(GDP_transformed) 5479. -325. 657. 657. 663. <cpl [1]> <cpl [0]>
glance(fit_arima2)
## # A tibble: 1 × 8
## .model sigma2 log_lik AIC AICc BIC ar_roots ma_roots
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <list> <list>
## 1 ARIMA(GDP_transformed ~ pd… 5734. -325. 662. 664. 674. <cpl> <cpl>
# Choose the model with the lower AICc (the automatic fit) and check residuals
best_model <- fit_arima
best_model %>% gg_tsresiduals()
# Forecast using the best model
fc_arima <- best_model %>% forecast(h = 10)
autoplot(fc_arima, us_gdp)
The forecasts look reasonable: they continue the smooth upward trajectory of the historical GDP series.
# Compare with ETS model (without transformation)
fit_ets <- us_gdp %>% model(ETS(GDP))
fc_ets <- fit_ets %>% forecast(h = 10)
# Plot ETS forecasts
autoplot(fc_ets, us_gdp)
The forecasts generated by the ETS() model are noticeably higher than those from the ARIMA model, producing a steeper trajectory, and its prediction intervals fan out far more widely.
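To make the contrast explicit, both models can be refitted in a single model() call (with the Box-Cox transformation inside the ARIMA formula so its forecasts are back-transformed to the GDP scale) and their forecasts overlaid; a sketch:
# Fit ARIMA (on the Box-Cox scale, back-transformed automatically)
# and ETS side by side, then plot both forecast sets together
us_gdp %>%
  model(
    arima = ARIMA(box_cox(GDP, lambda)),
    ets = ETS(GDP)
  ) %>%
  forecast(h = 10) %>%
  autoplot(us_gdp, level = 80) +
  ggtitle("US GDP: ARIMA vs ETS Forecasts")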