library(tidyverse)  # data wrangling and ggplot2
library(fpp3)       # tsibble, feasts and fable (tidy forecasting)
library(forecast)   # auto.arima(), Arima(), BoxCox.lambda(), ndiffs()
library(urca)       # ur.df() augmented Dickey-Fuller test
library(gridExtra)  # grid.arrange() for side-by-side plots
library(patchwork)  # plot composition
All three figures indicate that the data are white noise, as most of the autocorrelations fall within the blue dashed bounds. The exception is the 36-number white noise series, where a few spikes exceed the bounds, as is expected by chance in a short sample.
The blue dashed lines mark the critical bounds, at approximately ±1.96/√T for a series of length T, so they narrow as the sample size grows, making it easier to confirm that the autocorrelations are close to zero. White noise is a completely random process, so each sample exhibits different random variation, and this variation is more pronounced in smaller samples, producing larger spikes in the ACF.
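These bounds can be reproduced directly. A minimal sketch, assuming series lengths of 36, 360, and 1,000 (only the 36-number series is named above; the other two lengths are illustrative):
# Simulate white noise of three lengths; the ACF bounds at +/-1.96/sqrt(T)
# visibly narrow as T grows
set.seed(123)
wn <- bind_rows(
  tibble(len = "T = 36", idx = 1:36, y = rnorm(36)),
  tibble(len = "T = 360", idx = 1:360, y = rnorm(360)),
  tibble(len = "T = 1000", idx = 1:1000, y = rnorm(1000))
) %>%
  as_tsibble(index = idx, key = len)
wn %>% ACF(y) %>% autoplot()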
# Load and filter Amazon stock data
amazon_stock <- gafa_stock %>%
filter(Symbol == "AMZN")
# Plot the time series
autoplot(amazon_stock, Close) +
ggtitle("Amazon Stock Prices (Closing)") +
ylab("Closing Price") + xlab("Year")
The closing price shows a strong upward trend, but in late 2018 it fell sharply amid increased volatility.
# ACF plots
amazon_stock %>%
ACF(Close) %>%
autoplot() +
ggtitle("Autocorrelation Function (ACF) of Amazon Closing Prices")
The ACF decays very slowly as the lag increases, a classic sign of non-stationarity: past values continue to influence the series far into the future.
# PACF plots
amazon_stock %>%
PACF(Close) %>%
autoplot() +
ggtitle("Partial Autocorrelation Function (PACF) of Amazon Closing Prices")
The PACF is dominated by a large spike at lag 1 (close to 1), consistent with random-walk behaviour and a further sign of non-stationarity.
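As a quick follow-up sketch (not part of the original analysis), first-differencing the closing price should remove most of this autocorrelation; if the irregular trading-day index triggers a warning, re-index with row_number() first:
# ACF of the differenced closing price: a random-walk-like series
# should leave roughly white-noise differences
amazon_stock %>%
  mutate(Close_diff = difference(Close)) %>%
  ACF(Close_diff) %>%
  autoplot() +
  ggtitle("ACF of Differenced Amazon Closing Prices")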
# Load Turkish GDP data
turkey_gdp <- global_economy %>%
filter(Country == "Turkey")
# Check for missing values
sum(is.na(turkey_gdp$GDP))
## [1] 0
# Identify Box-Cox transformation parameter
lambda_gdp <- turkey_gdp %>%
features(GDP, features = guerrero) %>%
pull(lambda_guerrero)
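Note that lambda_gdp is estimated here but the differencing below is applied to the raw series. A minimal sketch of applying the transformation first and letting unitroot_ndiffs confirm the required order of differencing:
# Box-Cox transform, then check how many differences are needed
turkey_gdp %>%
  mutate(GDP_bc = box_cox(GDP, lambda_gdp)) %>%
  features(GDP_bc, unitroot_ndiffs)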
# Plot the original GDP series
autoplot(turkey_gdp, GDP) + ggtitle("Turkish GDP Time Series")
# Difference the GDP series
turkey_gdp <- turkey_gdp %>%
mutate(GDP_diff = difference(GDP))
# Plot differenced series
autoplot(turkey_gdp, GDP_diff) + ggtitle("Differenced Turkish GDP")
# ACF of the differenced series
turkey_gdp %>%
  ACF(GDP_diff) %>%
  autoplot()
# PACF of the differenced series
turkey_gdp %>%
  PACF(GDP_diff) %>%
  autoplot()
# Load data
tas_acc <- aus_accommodation %>%
filter(State == "Tasmania")
# Check for missing values
sum(is.na(tas_acc$Takings))
## [1] 0
# Identify Box-Cox lambda
lambda_tas <- tas_acc %>%
features(Takings, features = guerrero) %>%
pull(lambda_guerrero)
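Takings are quarterly with pronounced seasonality, so a seasonal difference of the transformed series is worth checking alongside the plain first difference used below. A sketch:
# Seasonal difference (lag 4) of the Box-Cox-transformed quarterly takings
tas_acc %>%
  mutate(takings_sdiff = difference(box_cox(Takings, lambda_tas), lag = 4)) %>%
  autoplot(takings_sdiff) +
  ggtitle("Seasonally Differenced Box-Cox Takings (lag 4)")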
# Plot series
autoplot(tas_acc, Takings) + ggtitle("Accommodation Takings in Tasmania")
# Difference the Takings series
tas_acc <- tas_acc %>%
mutate(takings_diff = difference(Takings))
# Plot differenced series
autoplot(tas_acc, takings_diff) + ggtitle("Differenced Accommodation Takings in the State of Tasmania")
# ACF of the differenced series
tas_acc %>%
  ACF(takings_diff) %>%
  autoplot()
# PACF of the differenced series
tas_acc %>%
  PACF(takings_diff) %>%
  autoplot()
# Load souvenirs sales data
souvenir_sales <- souvenirs
# Check for missing values
sum(is.na(souvenir_sales$Sales))
## [1] 0
# Identify Box-Cox transformation
lambda_sales <- souvenir_sales %>%
features(Sales, features = guerrero) %>%
pull(lambda_guerrero)
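Souvenir sales are monthly with strongly multiplicative seasonality (lambda_sales typically comes out close to zero here, i.e. near a log transformation), so a lag-12 seasonal difference of the transformed series is a natural complement to the first difference below. A sketch:
# Seasonal difference (lag 12) of the Box-Cox-transformed monthly sales
souvenir_sales %>%
  mutate(sales_sdiff = difference(box_cox(Sales, lambda_sales), lag = 12)) %>%
  autoplot(sales_sdiff) +
  ggtitle("Seasonally Differenced Box-Cox Sales (lag 12)")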
# Plot original series
autoplot(souvenir_sales, Sales) + ggtitle("Monthly Souvenir Sales")
# Difference the sales series
souvenir_sales <- souvenir_sales %>%
mutate(sales_diff = difference(Sales))
# Plot differenced series
autoplot(souvenir_sales, sales_diff) + ggtitle("Differenced Monthly Souvenir Sales")
# ACF of the differenced series
souvenir_sales %>%
  ACF(sales_diff) %>%
  autoplot()
# PACF of the differenced series
souvenir_sales %>%
  PACF(sales_diff) %>%
  autoplot()
# KPSS test for stationarity
turkey_gdp %>%
features(GDP_diff, unitroot_kpss)
## # A tibble: 1 × 3
## Country kpss_stat kpss_pvalue
## <fct> <dbl> <dbl>
## 1 Turkey 0.386 0.0834
# KPSS test for stationarity
tas_acc %>%
features(takings_diff, unitroot_kpss)
## # A tibble: 1 × 3
## State kpss_stat kpss_pvalue
## <chr> <dbl> <dbl>
## 1 Tasmania 0.231 0.1
# KPSS test for stationarity
souvenir_sales %>%
features(sales_diff, unitroot_kpss)
## # A tibble: 1 × 2
## kpss_stat kpss_pvalue
## <dbl> <dbl>
## 1 0.240 0.1
All three differenced series appear stationary. The KPSS null hypothesis is stationarity, so a p-value above 0.05 means the null is not rejected, which is the case for each series here.
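For contrast, running the same test on the undifferenced series should reject stationarity (a sketch; output not shown):
# KPSS on the raw series: a small p-value here would confirm that
# differencing was needed in the first place
turkey_gdp %>% features(GDP, unitroot_kpss)
tas_acc %>% features(Takings, unitroot_kpss)
souvenir_sales %>% features(Sales, unitroot_kpss)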
# Load data
set.seed(1225)
myseries <- aus_retail |>
filter(`Series ID` == sample(aus_retail$`Series ID`,1))
# Plot the original time series
autoplot(myseries, Turnover) + ggtitle("Original Time Series")
# Apply Box-Cox transformation if necessary
lambda <- BoxCox.lambda(myseries$Turnover) # Estimate lambda
myseries <- myseries |> mutate(Transformed = BoxCox(Turnover, lambda))
d <- ndiffs(myseries$Transformed)
d
## [1] 1
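ndiffs() is the forecast package's helper; the equivalent fpp3-style check (a sketch) uses the unitroot_ndiffs feature on the tsibble directly:
# fpp3 equivalent of ndiffs(): should also return 1 here
myseries %>% features(Transformed, unitroot_ndiffs)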
# Apply first-order differencing
myseries <- myseries |> mutate(Diffed = difference(Transformed, lag = 1, differences = d))
# Remove NAs created by differencing
myseries <- myseries |> drop_na(Diffed)
# Plot the differenced series
autoplot(myseries, Diffed) + ggtitle("Differenced Time Series")
# ADF test to confirm stationarity
adf_test_diffed <- ur.df(myseries$Diffed, type = "drift", selectlags = "AIC")
summary(adf_test_diffed)
##
## ###############################################
## # Augmented Dickey-Fuller Test Unit Root Test #
## ###############################################
##
## Test regression drift
##
##
## Call:
## lm(formula = z.diff ~ z.lag.1 + 1 + z.diff.lag)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.058513 -0.010064 0.000949 0.011327 0.051101
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.0021346 0.0009005 2.371 0.0182 *
## z.lag.1 -1.6152555 0.0739053 -21.856 < 2e-16 ***
## z.diff.lag 0.2635997 0.0462383 5.701 2.2e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.01874 on 435 degrees of freedom
## Multiple R-squared: 0.6643, Adjusted R-squared: 0.6628
## F-statistic: 430.5 on 2 and 435 DF, p-value: < 2.2e-16
##
##
## Value of test-statistic is: -21.8557 238.8412
##
## Critical values for test statistics:
## 1pct 5pct 10pct
## tau2 -3.44 -2.87 -2.57
## phi1 6.47 4.61 3.79
# KPSS test for stationarity
myseries %>%
features(Diffed, unitroot_kpss)
## # A tibble: 1 × 4
## State Industry kpss_stat kpss_pvalue
## <chr> <chr> <dbl> <dbl>
## 1 New South Wales Takeaway food services 0.0177 0.1
The appropriate order of differencing is d = 1. Both tests agree: the ADF statistic (-21.86) is far below the 1% critical value (-3.44), rejecting the unit-root null, and the KPSS p-value (0.1) exceeds 0.05, so stationarity is not rejected after first differencing.
# Simulate an AR(1) process with phi1 = 0.6
set.seed(1048)
y <- numeric(100)
e <- rnorm(100, mean = 0, sd = 1) # White noise
for(i in 2:100) {
y[i] <- 0.6 * y[i - 1] + e[i]
}
sim_ar1 <- tsibble(idx = seq_len(100), y = y, index = idx)
ggplot(sim_ar1, aes(x = idx, y = y)) +
geom_line() +
ggtitle("Simulated AR(1) Process (phi1 = 0.6)") +
xlab("Time") +
ylab("Value")
Increasing phi1 toward 1 makes the series more persistent, approaching a random walk at phi1 = 1, while values near 0 make it resemble white noise.
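A sketch comparing a few phi1 values (0.1, 0.6, and 0.95, chosen for illustration) with the same error sequence makes this visible:
# Simulate AR(1) for several phi values, reusing one error sequence
# so the only difference between panels is the persistence
simulate_ar1 <- function(phi, n = 100, seed = 1225) {
  set.seed(seed)
  e <- rnorm(n)
  y <- numeric(n)
  for (i in 2:n) y[i] <- phi * y[i - 1] + e[i]
  tibble(idx = seq_len(n), y = y, label = paste0("phi = ", phi))
}
ar1_runs <- bind_rows(lapply(c(0.1, 0.6, 0.95), simulate_ar1))
ggplot(ar1_runs, aes(idx, y)) +
  geom_line() +
  facet_wrap(~label, ncol = 1) +
  labs(x = "Time", y = "Value", title = "AR(1) for Different phi1 Values")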
# Simulate an MA(1) process with theta1 = 0.6
y_ma <- numeric(100)
e_ma <- rnorm(100, mean = 0, sd = 1)
y_ma[1] <- e_ma[1]  # no lagged error is available at t = 1
for(i in 2:100) {
y_ma[i] <- e_ma[i] + 0.6 * e_ma[i - 1]
}
sim_ma1 <- tsibble(idx = seq_len(100), y = y_ma, index = idx)
ggplot(sim_ma1, aes(x = idx, y = y)) +
geom_line() +
ggtitle("Simulated MA(1) Process (theta1 = 0.6)") +
xlab("Time") +
ylab("Value")
Higher values of theta1 increase the short-run autocorrelation of the process, while lower values reduce it.
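This follows from the theoretical lag-1 autocorrelation of an MA(1) process, theta1 / (1 + theta1^2); a quick numeric sketch:
# Lag-1 autocorrelation of an MA(1): theta / (1 + theta^2);
# it grows with |theta| but is bounded by 0.5 (attained at theta = 1)
theta <- c(0.1, 0.6, 0.9)
round(theta / (1 + theta^2), 3)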
# Simulate an ARMA(1,1) process with phi1 = 0.6 and theta1 = 0.6
y_arma <- numeric(100)
e_arma <- rnorm(100, mean = 0, sd = 1)
y_arma[1] <- e_arma[1]
for(i in 2:100) {
y_arma[i] <- 0.6 * y_arma[i - 1] + e_arma[i] + 0.6 * e_arma[i - 1]
}
sim_arma11 <- tsibble(idx = seq_len(100), y = y_arma, index = idx)
# Simulate an AR(2) process with phi1 = -0.8 and phi2 = 0.3
# (these coefficients violate the AR(2) stationarity conditions)
y_ar2 <- numeric(100)
e_ar2 <- rnorm(100, mean = 0, sd = 1)
for(i in 3:100) {
y_ar2[i] <- -0.8 * y_ar2[i - 1] + 0.3 * y_ar2[i - 2] + e_ar2[i]
}
sim_ar2 <- tsibble(idx = seq_len(100), y = y_ar2, index = idx)
p1 <- ggplot(sim_arma11, aes(x = idx, y = y)) +
geom_line() +
ggtitle("Simulated ARMA(1,1) Process") +
xlab("Time") +
ylab("Value")
p2 <- ggplot(sim_ar2, aes(x = idx, y = y)) +
geom_line() +
ggtitle("Simulated AR(2) Process") +
xlab("Time") +
ylab("Value")
# Display plots side by side
grid.arrange(p1, p2, ncol = 2)
The ARMA(1,1) series is stationary, fluctuating around zero with moderate short-run persistence. The AR(2) series, by contrast, is non-stationary: its coefficients violate the stationarity conditions, so it oscillates with steadily growing amplitude instead of settling around a mean.
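The instability of the AR(2) can be confirmed from its characteristic polynomial 1 - phi1*z - phi2*z^2, whose roots must all lie outside the unit circle for stationarity. A sketch:
# Roots of 1 - phi1*z - phi2*z^2 with phi1 = -0.8, phi2 = 0.3
roots <- polyroot(c(1, 0.8, -0.3))
Mod(roots)
# One root has modulus below 1, so the process is non-stationary:
# the simulation oscillates with steadily growing amplitude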
# Load the dataset
data("aus_airpassengers")
# Convert to a ts object, preserving the annual time index
ts_data <- ts(aus_airpassengers$Passengers,
start = min(aus_airpassengers$Year), frequency = 1)
auto_model <- auto.arima(ts_data)
print(auto_model)
## Series: ts_data
## ARIMA(0,2,1)
##
## Coefficients:
## ma1
## -0.8963
## s.e. 0.0594
##
## sigma^2 = 4.308: log likelihood = -97.02
## AIC=198.04 AICc=198.32 BIC=201.65
# Check residuals
checkresiduals(auto_model)
##
## Ljung-Box test
##
## data: Residuals from ARIMA(0,2,1)
## Q* = 6.5774, df = 8, p-value = 0.5828
##
## Model df: 1. Total lags used: 9
# Forecast for the next 10 periods
forecast_auto <- forecast(auto_model, h=10)
autoplot(forecast_auto)
cat("The selected model in backshift notation is: \n")
## The selected model in backshift notation is:
print(auto_model)
## Series: ts_data
## ARIMA(0,2,1)
##
## Coefficients:
## ma1
## -0.8963
## s.e. 0.0594
##
## sigma^2 = 4.308: log likelihood = -97.02
## AIC=198.04 AICc=198.32 BIC=201.65
arima_drift <- Arima(ts_data, order=c(0,1,0), include.drift=TRUE)
forecast_drift <- forecast(arima_drift, h=10)
autoplot(forecast_drift) + ggtitle("ARIMA(0,1,0) with Drift")
Compared with the automatically selected ARIMA(0,2,1) model, the ARIMA(0,1,0) model with drift extrapolates a straight line at the historical average growth rate, producing lower forecasts.
arima_212_drift <- Arima(ts_data, order=c(2,1,2), include.drift=TRUE)
forecast_212_drift <- forecast(arima_212_drift, h=10)
autoplot(forecast_212_drift) + ggtitle("ARIMA(2,1,2) with Drift")
Compared with the two previous models, the ARIMA(2,1,2) forecasts show more curvature in the first few periods, reflecting the additional AR and MA dynamics, before settling onto the drift trend.
arima_212_no_const <- Arima(ts_data, order=c(2,1,2), include.constant=FALSE, method="ML")
forecast_212_no_const <- forecast(arima_212_no_const, h=10)
autoplot(forecast_212_no_const) + ggtitle("ARIMA(2,1,2) without Constant (ML Method)")
Without the constant there is no drift term, so once the AR dynamics die out the point forecasts level off instead of continuing the historical upward trend.
arima_021_constant <- Arima(ts_data, order=c(0,2,1), include.constant=TRUE)
forecast_021_constant <- forecast(arima_021_constant, h=10)
autoplot(forecast_021_constant) + ggtitle("ARIMA(0,2,1) with Constant")
The resulting plot is identical to the automatically selected ARIMA(0,2,1) model: with d = 2, forecast::Arima() does not fit a constant (which would imply a quadratic trend), so the include.constant argument has no effect here.
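To summarise the candidates numerically, their AICc values can be pulled from the fitted forecast::Arima objects (a sketch; note that models with different orders of differencing are not directly comparable on AICc):
# Collect AICc for the fitted candidate models
data.frame(
  model = c("ARIMA(0,2,1)", "ARIMA(0,1,0)+drift", "ARIMA(2,1,2)+drift"),
  AICc = c(auto_model$aicc, arima_drift$aicc, arima_212_drift$aicc)
)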
# Load the dataset
data("global_economy")
# Filter for United States GDP data
us_gdp <- global_economy %>% filter(Country == "United States") %>% select(Year, GDP)
# Check for transformations using Guerrero's method
lambda <- us_gdp %>% features(GDP, guerrero) %>% pull(lambda_guerrero)
# Display lambda value
lambda
## [1] 0.2819443
# Apply Box-Cox transformation if needed
us_gdp <- us_gdp %>% mutate(GDP_transformed = box_cox(GDP, lambda))
# Plot transformed series
autoplot(us_gdp, GDP_transformed)
# Fit an automatic ARIMA model
fit_arima <- us_gdp %>% model(ARIMA(GDP_transformed))
# Summarize the ARIMA model
report(fit_arima)
## Series: GDP_transformed
## Model: ARIMA(1,1,0) w/ drift
##
## Coefficients:
## ar1 constant
## 0.4586 118.1822
## s.e. 0.1198 9.5047
##
## sigma^2 estimated as 5479: log likelihood=-325.32
## AIC=656.65 AICc=657.1 BIC=662.78
# Try alternative ARIMA models manually
fit_arima2 <- us_gdp %>% model(ARIMA(GDP_transformed ~ pdq(2,1,2) + PDQ(0,1,1)))
# Compare models using AICc
glance(fit_arima)
## # A tibble: 1 × 8
## .model sigma2 log_lik AIC AICc BIC ar_roots ma_roots
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <list> <list>
## 1 ARIMA(GDP_transformed) 5479. -325. 657. 657. 663. <cpl [1]> <cpl [0]>
glance(fit_arima2)
## # A tibble: 1 × 8
## .model sigma2 log_lik AIC AICc BIC ar_roots ma_roots
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <list> <list>
## 1 ARIMA(GDP_transformed ~ pd… 5734. -325. 662. 664. 674. <cpl> <cpl>
# Choose the model with the lower AICc (the automatic fit) and check residuals
best_model <- fit_arima
best_model %>% gg_tsresiduals()
# Forecast using the best model
fc_arima <- best_model %>% forecast(h = 10)
autoplot(fc_arima, us_gdp)
The forecasts look reasonable: they continue the smooth upward trajectory of the historical GDP series.
# Compare with ETS model (without transformation)
fit_ets <- us_gdp %>% model(ETS(GDP))
fc_ets <- fit_ets %>% forecast(h = 10)
# Plot ETS forecasts
autoplot(fc_ets, us_gdp)
The forecasts generated by the ETS() model are noticeably higher than those from the ARIMA model, producing a steeper trajectory, and its prediction intervals fan out far more widely.
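To make the contrast explicit, both models can be refitted in a single model() call (with the Box-Cox transformation inside the ARIMA formula so its forecasts are back-transformed to the GDP scale) and their forecasts overlaid; a sketch:
# Fit ARIMA (on the Box-Cox scale, back-transformed automatically)
# and ETS side by side, then plot both forecast sets together
us_gdp %>%
  model(
    arima = ARIMA(box_cox(GDP, lambda)),
    ets = ETS(GDP)
  ) %>%
  forecast(h = 10) %>%
  autoplot(us_gdp, level = 80) +
  ggtitle("US GDP: ARIMA vs ETS Forecasts")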