Q1. Consider the number of pigs slaughtered in Victoria, available in the aus_livestock dataset.
1a. Use the ETS() function to estimate the equivalent model for simple exponential smoothing. Find the optimal values of α and ℓ₀, and generate forecasts for the next four months:
The optimal α is 0.3221 and the initial level ℓ₀ is 100646.6.
library(fpp3)
library(dplyr)
library(ggplot2)
library(patchwork)
Pig_model <- aus_livestock |>
filter(State == "Victoria", Animal =="Pigs") |>
model(ETS(Count ~ error("A") + trend("N") + season("N")))
tidy_pig <- tidy(Pig_model)
alpha <- tidy_pig$estimate[1]
l0 <- tidy_pig$estimate[2]
cat("Optimal values:\n")
## Optimal values:
cat("Alpha (α):", round(alpha, 4), "\n")
## Alpha (α): 0.3221
cat("Initial level (â„“â‚€):", round(l0, 2), "\n")
## Initial level (â„“â‚€): 100646.6
pig_fc <- Pig_model |>
forecast(h = 4)
print(pig_fc)
## # A fable: 4 x 6 [1M]
## # Key: Animal, State, .model [1]
## Animal State .model Month
## <fct> <fct> <chr> <mth>
## 1 Pigs Victoria "ETS(Count ~ error(\"A\") + trend(\"N\") + season(\"… 2019 Jan
## 2 Pigs Victoria "ETS(Count ~ error(\"A\") + trend(\"N\") + season(\"… 2019 Feb
## 3 Pigs Victoria "ETS(Count ~ error(\"A\") + trend(\"N\") + season(\"… 2019 Mar
## 4 Pigs Victoria "ETS(Count ~ error(\"A\") + trend(\"N\") + season(\"… 2019 Apr
## # ℹ 2 more variables: Count <dist>, .mean <dbl>
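As a quick visual sanity check (a sketch I'd add, not required by the question), the forecasts can be plotted against recent history; the yearmonth("2015 Jan") cutoff is only for readability.
# Plot the four-month forecasts against the recent history of the series
pig_fc |>
  autoplot(aus_livestock |>
             filter(State == "Victoria", Animal == "Pigs",
                    Month >= yearmonth("2015 Jan"))) +
  labs(title = "ETS(A,N,N) forecasts of Victorian pig slaughter", y = "Count")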
1b. Compute a 95% prediction interval for the first forecast using ŷ ± 1.96s, where s is the standard deviation of the residuals. Compare your interval with the interval produced by R.
The two intervals are very close: the manual interval width is 36631.09 versus R's 36663.54. The small difference most likely comes from R basing its interval on the model's estimated forecast variance rather than the raw standard deviation of the residuals.
residuals_sd <- sd(residuals(Pig_model)$.resid)
first_forecast <- pig_fc$.mean[1]
manual_lower <- first_forecast - 1.96 * residuals_sd
manual_upper <- first_forecast + 1.96 * residuals_sd
cat("Point forecast:", round(first_forecast, 2), "\n")
## Point forecast: 95186.56
cat("Lower:", round(manual_lower, 2), "\n")
## Lower: 76871.01
cat("Upper:", round(manual_upper, 2), "\n")
## Upper: 113502.1
cat("Standard deviation residuals:", round(residuals_sd, 2), "\n")
## Standard deviation residuals: 9344.67
# R's prediction interval
r_intervals <- pig_fc |>
hilo(level = 95) |>
as_tibble()
r_first_interval <- r_intervals[1, ]
cat("\nR's 95% Prediction Interval for first forecast:\n")
##
## R's 95% Prediction Interval for first forecast:
cat("Point forecast:", round(r_first_interval$.mean, 2), "\n")
## Point forecast: 95186.56
cat("Lower:", round(r_first_interval$`95%`$lower, 2), "\n")
## Lower: 76854.79
cat("Upper:", round(r_first_interval$`95%`$upper, 2), "\n")
## Upper: 113518.3
cat("\nComparison:\n")
##
## Comparison:
cat("Manual interval width:", round(manual_upper - manual_lower, 2), "\n")
## Manual interval width: 36631.09
cat("R's interval width:", round(r_first_interval$`95%`$upper - r_first_interval$`95%`$lower, 2), "\n")
## R's interval width: 36663.54
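As a cross-check (a sketch, assuming R builds its interval from the model's estimated variance), the sigma2 reported by glance() reproduces R's interval almost exactly:
# For ETS(A,N,N) the one-step forecast variance equals the model's sigma^2
model_sigma <- sqrt(glance(Pig_model)$sigma2)
cat("Sigma-based 95% interval:",
    round(first_forecast - 1.96 * model_sigma, 2), "to",
    round(first_forecast + 1.96 * model_sigma, 2), "\n")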
Q5. Data set global_economy contains the annual Exports from many countries. Select one country to analyse.
The country I chose to focus on is Australia. The series covers 1960 to 2017. Exports ranged from roughly 12% to 23% of GDP over this period, with an overall upward drift but clear fluctuations, including stretches of both growth and decline.
aus_exports <- global_economy |>
filter(Country == "Australia") |>
select(Year, Exports)
p1 <- aus_exports |>
ggplot(aes(x = Year, y = Exports)) +
geom_line(color = "steelblue", linewidth = 1) +
geom_point(color = "steelblue", size = 1) +
labs(title = "Australian Exports (% of GDP)",
y = "Exports (% of GDP)",
x = "Year") +
theme_minimal()
print(p1)
aus_model_NN <- aus_exports |>
model(ETS(Exports ~ error("A") + trend("N") + season("N")))
fc_NN <- aus_model_NN |>
forecast(h = 10)
p2 <- aus_exports |>
ggplot(aes(x = Year, y = Exports)) +
geom_line(color = "steelblue") +
autolayer(fc_NN) +
labs(title = "Australian Exports Forecast - ETS(A,N,N) Model",
y = "Exports (% of GDP)") +
theme_minimal()
print(p2)
c. Compute the RMSE values for the training data.
Training RMSE: 1.1468
accuracy_NN <- accuracy(aus_model_NN)
rmse_NN <- accuracy_NN$RMSE
Results are shown below. For comparison, ETS(A,N,N) is the simpler model, with fewer parameters and less risk of overfitting, while ETS(A,A,N) captures a potential trend and so may fit trending patterns better.
aus_model_AN <- aus_exports |>
model(ETS(Exports ~ error("A") + trend("A") + season("N")))
accuracy_AN <- accuracy(aus_model_AN)
rmse_AN <- accuracy_AN$RMSE
cat("ETS(A,A,N) Model Results:\n")
## ETS(A,A,N) Model Results:
cat("Training RMSE:", round(rmse_AN, 4), "\n\n")
## Training RMSE: 1.1167
tidy_NN <- tidy(aus_model_NN)
tidy_AN <- tidy(aus_model_AN)
cat("ETS(A,N,N) - Parameters:", nrow(tidy_NN), "\n")
## ETS(A,N,N) - Parameters: 2
cat("ETS(A,A,N) - Parameters:", nrow(tidy_AN), "\n")
## ETS(A,A,N) - Parameters: 4
cat("RMSE Difference:", round(rmse_NN - rmse_AN, 4), "\n")
## RMSE Difference: 0.0301
ETS(A,A,N) has the lower training RMSE, but that alone does not make it the better model: the more flexible model will always fit the training data at least as well, and the AICc comparison further below in fact favours the simpler ETS(A,N,N).
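A fairer comparison would use time-series cross-validation rather than training RMSE; a minimal sketch (not part of the original analysis; the 20-year initial window is an arbitrary choice):
# One-step cross-validated accuracy for the two candidate specifications
aus_exports |>
  stretch_tsibble(.init = 20, .step = 1) |>
  model(
    ANN = ETS(Exports ~ error("A") + trend("N") + season("N")),
    AAN = ETS(Exports ~ error("A") + trend("A") + season("N"))
  ) |>
  forecast(h = 1) |>
  accuracy(aus_exports)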
fc_AN <- aus_model_AN |>
forecast(h = 10)
#compare
forecast_comparison <- bind_rows(
fc_NN |> mutate(Model = "ETS(A,N,N)"),
fc_AN |> mutate(Model = "ETS(A,A,N)")
)
p3 <- aus_exports |>
ggplot(aes(x = Year, y = Exports)) +
geom_line(color = "black") +
geom_line(data = forecast_comparison, aes(y = .mean, color = Model)) +
labs(title = "Comparison",
y = "Exports (% of GDP)",
color = "Model")
print(p3)
# forecast
fc_values_NN <- fc_NN |> as_tibble() |> head(5) |> select(Year, .mean) |> rename(NN = .mean)
fc_values_AN <- fc_AN |> as_tibble() |> head(5) |> select(Year, .mean) |> rename(AN = .mean)
fc_side_by_side <- bind_cols(fc_values_NN, fc_values_AN |> select(AN))
print(fc_side_by_side)
## # A tibble: 5 × 3
## Year NN AN
## <dbl> <dbl> <dbl>
## 1 2018 20.6 20.8
## 2 2019 20.6 21.0
## 3 2020 20.6 21.1
## 4 2021 20.6 21.3
## 5 2022 20.6 21.4
glance_NN <- glance(aus_model_NN)
glance_AN <- glance(aus_model_AN)
cat("ETS(A,N,N) - AICc:", round(glance_NN$AICc, 2), "\n")
## ETS(A,N,N) - AICc: 257.84
cat("ETS(A,A,N) - AICc:", round(glance_AN$AICc, 2), "\n")
## ETS(A,A,N) - AICc: 259.47
auto_model <- aus_exports |>
model(ETS(Exports))
cat("\nAutomatically selected ETS model is: ")
##
## Automatically selected ETS model is:
print(report(auto_model))
## Series: Exports
## Model: ETS(M,N,N)
## Smoothing parameters:
## alpha = 0.5805382
##
## Initial states:
## l[0]
## 12.94723
##
## sigma^2: 0.0048
##
## AIC AICc BIC
## 251.7036 252.1480 257.8849
## # A mable: 1 x 1
## `ETS(Exports)`
## <model>
## 1 <ETS(M,N,N)>
Q6. Forecast the Chinese GDP from the global_economy data set using an ETS model. Experiment with the various options in the ETS() function to see how much the forecasts change with damped trend, or with a Box-Cox transformation. Try to develop an intuition of what each is doing to the forecasts.
When forecasting GDP, multiplicative errors often fit better because the variability grows with the level of the series. A Box-Cox transformation stabilises the variance across different levels of the series and makes exponential growth closer to linear, which is helpful for a series like Chinese GDP. A few observations on the trend options: ANN is simple and assumes no trend, so it underestimates a growing series; AAN captures a linear trend but may overestimate long-term growth; AAdN is a compromise, keeping the trend but damping it over the forecast horizon.
china_gdp <- global_economy |>
filter(Country == "China") |>
select(Year, GDP)
p_original <- china_gdp |>
ggplot(aes(x = Year, y = GDP)) +
geom_line(color = "red", linewidth = 1) +
geom_point(color = "red", size = 1) +
labs(title = "Chinese GDP Over Time",
y = "GDP (US$)",
x = "Year")
print(p_original)
#auto ETS
auto_model <- china_gdp |>
model(ETS(GDP))
# manual ETS
models <- china_gdp |>
model(
ANN = ETS(GDP ~ error("A") + trend("N") + season("N")),
AAN = ETS(GDP ~ error("A") + trend("A") + season("N")),
AAdN = ETS(GDP ~ error("A") + trend("Ad") + season("N")),
MNN = ETS(GDP ~ error("M") + trend("N") + season("N")),
MAN = ETS(GDP ~ error("M") + trend("A") + season("N")),
MAdN = ETS(GDP ~ error("M") + trend("Ad") + season("N")),
Auto = ETS(GDP)
)
# forecast
forecasts <- models |>
forecast(h = 10)
forecast_plot <- china_gdp |>
ggplot(aes(x = Year, y = GDP)) +
geom_line(color = "black", linewidth = 1) +
autolayer(forecasts, alpha = 0.6, level = NULL) +
labs(title = "Chinese GDP Forecasts - Different ETS Models",
y = "GDP (US$)",
color = "Model") +
theme_minimal()
print(forecast_plot)
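The Box-Cox experiment mentioned above can be sketched as follows (a sketch, not part of the fitted models; the lambda comes from the Guerrero method and the AAdN specification is just one illustrative choice):
# Box-Cox transformed ETS: forecasts are automatically back-transformed
lambda_gdp <- china_gdp |>
  features(GDP, features = guerrero) |>
  pull(lambda_guerrero)
china_gdp |>
  model(BoxCox_AAdN = ETS(box_cox(GDP, lambda_gdp) ~ error("A") + trend("Ad") + season("N"))) |>
  forecast(h = 10) |>
  autoplot(china_gdp) +
  labs(title = "Chinese GDP forecast, Box-Cox transformed ETS(A,Ad,N)", y = "GDP (US$)")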
auto_result <- china_gdp |>
model(ETS(GDP)) |>
report()
## Series: GDP
## Model: ETS(M,A,N)
## Smoothing parameters:
## alpha = 0.9998998
## beta = 0.3119984
##
## Initial states:
## l[0] b[0]
## 45713434615 3288256682
##
## sigma^2: 0.0108
##
## AIC AICc BIC
## 3102.064 3103.218 3112.366
cat("\nAutomatically Selected ETS Model:\n")
##
## Automatically Selected ETS Model:
print(auto_result)
## # A mable: 1 x 1
## `ETS(GDP)`
## <model>
## 1 <ETS(M,A,N)>
Q7. Find an ETS model for the Gas data from aus_production and forecast the next few years. Why is multiplicative seasonality necessary here? Experiment with making the trend damped. Does it improve the forecasts?
The seasonal swings in the Gas data grow with the level of the series (the rolling mean and rolling standard deviation computed further below have a correlation of about 0.97), so multiplicative seasonality is the appropriate choice here. By training RMSE the best ETS model for the aus_production Gas data is ETS_AAdA, with the multiplicative-seasonal models very close behind. Forecasts are generated for the next 12 quarters (three years); over that horizon, damping the trend changes the forecasts very little (under 1% by the final quarter), so it does not noticeably improve them.
gas_data <- aus_production |>
select(Quarter, Gas) |>
filter(!is.na(Gas))
p1 <- gas_data |>
ggplot(aes(x = Quarter, y = Gas)) +
geom_line(color = "steelblue", linewidth = 1) +
labs(title = "Australian Gas Production",
y = "Gas Production",
x = "Quarter") +
theme_minimal()
print(p1)
# seasonal plots
p2 <- gas_data |>
gg_season(Gas) +
labs(title = "Seasonal Plot of Australian Gas Production") +
theme_minimal()
## Warning: `gg_season()` was deprecated in feasts 0.4.2.
## ℹ Please use `ggtime::gg_season()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
p3 <- gas_data |>
gg_subseries(Gas) +
labs(title = "Subseries Plot of Australian Gas Production") +
theme_minimal()
## Warning: `gg_subseries()` was deprecated in feasts 0.4.2.
## ℹ Please use `ggtime::gg_subseries()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
print(p2)
print(p3)
# ETS models to compare
gas_models <- gas_data |>
model(
# Additive seasonality models
ETS_AAA = ETS(Gas ~ error("A") + trend("A") + season("A")),
ETS_ANA = ETS(Gas ~ error("A") + trend("N") + season("A")),
ETS_AAdA = ETS(Gas ~ error("A") + trend("Ad") + season("A")),
# Multiplicative seasonality models
ETS_MAM = ETS(Gas ~ error("M") + trend("A") + season("M")),
ETS_MNM = ETS(Gas ~ error("M") + trend("N") + season("M")),
ETS_MAdM = ETS(Gas ~ error("M") + trend("Ad") + season("M")),
# Mixed models
ETS_MA_M = ETS(Gas ~ error("M") + trend("A") + season("M")),
ETS_MA_A = ETS(Gas ~ error("M") + trend("A") + season("A")),
ETS_auto = ETS(Gas)
)
model_accuracy <- accuracy(gas_models) |>
arrange(RMSE)
cat("Model Accuracy Comparison (sorted by RMSE):\n")
## Model Accuracy Comparison (sorted by RMSE):
print(model_accuracy)
## # A tibble: 9 × 10
## .model .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ETS_AAdA Training 0.600 4.58 3.16 0.460 7.39 0.566 0.604 0.0660
## 2 ETS_MAdM Training -0.00439 4.59 3.03 0.326 4.10 0.544 0.606 -0.0217
## 3 ETS_MAM Training -0.115 4.60 3.02 0.199 4.08 0.542 0.606 -0.0131
## 4 ETS_MA_M Training -0.115 4.60 3.02 0.199 4.08 0.542 0.606 -0.0131
## 5 ETS_auto Training -0.115 4.60 3.02 0.199 4.08 0.542 0.606 -0.0131
## 6 ETS_ANA Training 1.32 4.66 3.27 1.68 6.94 0.587 0.615 0.00247
## 7 ETS_MNM Training 0.874 4.68 3.20 1.58 4.42 0.574 0.617 -0.177
## 8 ETS_AAA Training 0.00525 4.76 3.35 -4.69 10.9 0.600 0.628 0.0772
## 9 ETS_MA_A Training 0.0508 6.39 4.03 2.02 14.5 0.723 0.843 0.226
best_model_name <- model_accuracy$.model[1]
cat("\nBest model:", best_model_name, "\n")
##
## Best model: ETS_AAdA
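Training RMSE is not the only way to rank these models; a quick sketch comparing them on AICc as well (AICc is comparable across ETS specifications fitted to the same series):
# Information-criterion comparison of the fitted ETS models
glance(gas_models) |>
  select(.model, AICc, BIC) |>
  arrange(AICc)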
# Generate forecasts for the next 3 years (12 quarters)
forecasts <- gas_models |>
forecast(h = 12)
# Plot forecasts for key models
key_models <- c("ETS_auto", "ETS_MAM", "ETS_MAdM", "ETS_AAA")
forecast_plot <- gas_data |>
ggplot(aes(x = Quarter, y = Gas)) +
geom_line(color = "black", linewidth = 1) +
autolayer(forecasts |> filter(.model %in% key_models),
alpha = 0.7, level = NULL) +
labs(title = "Gas Production Forecasts - Key ETS Models",
y = "Gas Production") +
theme_minimal()
print(forecast_plot)
# Analyze why multiplicative seasonality is necessary
# Calculate seasonal strength and variation
gas_decomposed <- gas_data |>
model(stl = STL(Gas ~ season(window = "periodic"))) |>
components()
# Plot decomposition
decomp_plot <- gas_decomposed |>
autoplot() +
labs(title = "STL Decomposition of Gas Production") +
theme_minimal()
print(decomp_plot)
# Calculate coefficient of variation by season
seasonal_variation <- gas_data |>
  as_tibble() |>  # drop the tsibble index so summarise() collapses to one row per quarter
  mutate(Quarter_num = quarter(Quarter)) |>
  group_by(Quarter_num) |>
  summarise(
    mean_gas = mean(Gas, na.rm = TRUE),
    sd_gas = sd(Gas, na.rm = TRUE),
    cv = sd_gas / mean_gas
  )
cat("\nSeasonal Variation Analysis:\n")
##
## Seasonal Variation Analysis:
print(seasonal_variation)
# Check if seasonal variation increases with level
level_vs_variation <- gas_data |>
mutate(rolling_mean = slider::slide_dbl(Gas, mean, .before = 4),
rolling_sd = slider::slide_dbl(Gas, sd, .before = 4)) |>
filter(!is.na(rolling_mean), !is.na(rolling_sd))
cor_test <- cor(level_vs_variation$rolling_mean, level_vs_variation$rolling_sd,
use = "complete.obs")
cat("\nCorrelation between level and variability:", round(cor_test, 3), "\n")
##
## Correlation between level and variability: 0.968
# Compare damped vs non-damped trends
damped_comparison <- gas_models |>
select(ETS_MAM, ETS_MAdM) |>
forecast(h = 12)
# Plot comparison
damped_plot <- gas_data |>
ggplot(aes(x = Quarter, y = Gas)) +
geom_line(color = "black", linewidth = 1) +
autolayer(damped_comparison, alpha = 0.7, level = NULL) +
labs(title = "Damped vs Non-damped Trend Comparison",
subtitle = "ETS(M,A,M) vs ETS(M,Ad,M)",
y = "Gas Production") +
theme_minimal()
print(damped_plot)
# Extract forecast values
fc_values <- damped_comparison |>
as_tibble() %>%
select(Quarter, .model, .mean) |>
pivot_wider(names_from = .model, values_from = .mean)
cat("\nForecast Comparison - Damped vs Non-damped:\n")
##
## Forecast Comparison - Damped vs Non-damped:
print(fc_values, n = 12)
## # A tibble: 12 × 3
## Quarter ETS_MAM ETS_MAdM
## <qtr> <dbl> <dbl>
## 1 2010 Q3 259. 259.
## 2 2010 Q4 214. 214.
## 3 2011 Q1 201. 201.
## 4 2011 Q2 242. 242.
## 5 2011 Q3 263. 262.
## 6 2011 Q4 217. 216.
## 7 2012 Q1 204. 203.
## 8 2012 Q2 246. 244.
## 9 2012 Q3 267. 265.
## 10 2012 Q4 220. 219.
## 11 2013 Q1 207. 205.
## 12 2013 Q2 249. 247.
# Analyze differences
if (nrow(fc_values) > 0) {
final_fc <- fc_values |>
filter(Quarter == max(Quarter)) |>
mutate(
difference = ETS_MAdM - ETS_MAM,
percent_diff = (ETS_MAdM - ETS_MAM) / ETS_MAM * 100
)
}
cat("\nFinal Forecast Difference:\n")
##
## Final Forecast Difference:
cat("Damped trend forecast:", round(final_fc$ETS_MAdM, 2), "\n")
## Damped trend forecast: 246.9
cat("Non-damped forecast:", round(final_fc$ETS_MAM, 2), "\n")
## Non-damped forecast: 249.17
cat("Absolute difference:", round(final_fc$difference, 2), "\n")
## Absolute difference: -2.27
cat("Percentage difference:", round(final_fc$percent_diff, 2), "%\n")
## Percentage difference: -0.91 %
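Damping mostly matters at longer horizons, so as a sketch (not part of the original comparison) the same two models can be projected ten years ahead to see the damped trend level off:
# Extend the damped vs non-damped comparison to 40 quarters
gas_models |>
  select(ETS_MAM, ETS_MAdM) |>
  forecast(h = 40) |>
  autoplot(gas_data, level = NULL) +
  labs(title = "Damped vs non-damped trend over a 10-year horizon",
       y = "Gas Production")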
Q8. Recall your retail time series data (from Exercise 7 in Section 2.10).
The seasonality analysis indicates that the seasonal swings grow as the overall turnover level increases, so multiplicative seasonality is appropriate: the amplitude of the seasonal pattern rises with the level of the series.
Apply Holt-Winters' multiplicative method to the data. Experiment with making the trend damped. As shown below, both the multiplicative Holt-Winters model and its damped variant are fitted.
Compare the RMSE of the one-step forecasts from the two methods. Which do you prefer? The damped method has a slightly higher one-step RMSE and a higher AICc, so I prefer the plain multiplicative method.
Check that the residuals from the best method look like white noise.
Residual diagnostics for the multiplicative method, together with a Ljung-Box test (p-value 0.0765), suggest the residuals are consistent with white noise, so the model has captured most of the pattern in the data.
On the test set, the Holt-Winters RMSE is 1.78 while the seasonal naive RMSE is 1.55, so the seasonal naive benchmark performs better. Details and a comparison plot are shown below.
# Set seed and select random series
set.seed(12345678)
myseries <- aus_retail |>
filter(`Series ID` == sample(aus_retail$`Series ID`, 1))
# Plot the series
p1 <- myseries |>
ggplot(aes(x = Month, y = Turnover)) +
geom_line(color = "steelblue", linewidth = 1) +
labs(title = paste("Retail Turnover -",
distinct(myseries, State)$State, "-",
distinct(myseries, Industry)$Industry),
y = "Turnover",
x = "Month") +
theme_minimal()
print(p1)
decomposed <- myseries |>
model(STL(Turnover)) |>
components()
# Plot decomposition
decomp_plot <- decomposed |>
autoplot() +
labs(title = "STL Decomposition of Retail Turnover") +
theme_minimal()
print(decomp_plot)
# Calculate correlation between level and variability for this retail series
level_variation <- myseries |>
  mutate(
    rolling_mean = slider::slide_dbl(Turnover, mean, .before = 12, .complete = TRUE),
    rolling_sd = slider::slide_dbl(Turnover, sd, .before = 12, .complete = TRUE)
  ) |>
  filter(!is.na(rolling_mean), !is.na(rolling_sd))
cor_test <- cor(level_variation$rolling_mean, level_variation$rolling_sd)
# Calculate seasonal strength manually from the STL decomposition
# (for monthly data the STL seasonal component is stored as `season_year`)
var_remainder <- var(decomposed$remainder, na.rm = TRUE)
var_seasonal_remainder <- var(decomposed$season_year + decomposed$remainder, na.rm = TRUE)
seasonal_strength_value <- max(0, 1 - (var_remainder / var_seasonal_remainder))
cat("a. Multiplicative Seasonality Analysis:\n")
## a. Multiplicative Seasonality Analysis:
cat("Correlation between level and variability:", round(cor_test, 3), "\n")
## Correlation between level and variability: 0.968
cat("Seasonal strength:", round(seasonal_strength_value, 3), "\n")
## Seasonal strength: NA
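Because the manual seasonal-strength calculation depends on the STL column names, a cross-check with the built-in STL features from the feasts package (a sketch):
# STL-based seasonal strength computed by feat_stl
myseries |>
  features(Turnover, feat_stl) |>
  select(State, Industry, seasonal_strength_year)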
# Holt-Winters models
hw_models <- myseries |>
model(
HW_multiplicative = ETS(Turnover ~ error("M") + trend("A") + season("M")),
HW_damped = ETS(Turnover ~ error("M") + trend("Ad") + season("M"))
)
cat("b. Holt-Winters Model Parameters:\n")
## b. Holt-Winters Model Parameters:
cat("\nModel Summaries:\n")
##
## Model Summaries:
model_summaries <- glance(hw_models)
print(model_summaries)
## # A tibble: 2 × 11
## State Industry .model sigma2 log_lik AIC AICc BIC MSE AMSE MAE
## <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Northern… Clothin… HW_mu… 0.00447 -870. 1775. 1776. 1841. 0.376 0.473 0.0511
## 2 Northern… Clothin… HW_da… 0.00457 -872. 1781. 1783. 1851. 0.379 0.479 0.0505
cat("\nModel Parameters:\n")
##
## Model Parameters:
model_params <- tidy(hw_models)
print(model_params)
## # A tibble: 35 × 5
## State Industry .model term estimate
## <chr> <chr> <chr> <chr> <dbl>
## 1 Northern Territory Clothing, footwear and personal acc… HW_mu… alpha 0.526
## 2 Northern Territory Clothing, footwear and personal acc… HW_mu… beta 0.000103
## 3 Northern Territory Clothing, footwear and personal acc… HW_mu… gamma 0.000104
## 4 Northern Territory Clothing, footwear and personal acc… HW_mu… l[0] 2.35
## 5 Northern Territory Clothing, footwear and personal acc… HW_mu… b[0] 0.0444
## 6 Northern Territory Clothing, footwear and personal acc… HW_mu… s[0] 0.840
## 7 Northern Territory Clothing, footwear and personal acc… HW_mu… s[-1] 0.762
## 8 Northern Territory Clothing, footwear and personal acc… HW_mu… s[-2] 0.834
## 9 Northern Territory Clothing, footwear and personal acc… HW_mu… s[-3] 1.37
## 10 Northern Territory Clothing, footwear and personal acc… HW_mu… s[-4] 1.00
## # ℹ 25 more rows
hw_plot <- myseries |>
ggplot(aes(x = Month, y = Turnover)) +
geom_line(color = "black", linewidth = 1) +
autolayer(fitted(hw_models), alpha = 0.7, size = 0.8) +
labs(title = "Holt-Winters Multiplicative Models - Fitted Values",
y = "Turnover") +
theme_minimal() +
theme(legend.position = "bottom")
## Plot variable not specified, automatically selected `.vars = .fitted`
print(hw_plot)
if ("HW_damped" %in% model_params$.model) {
damped_params <- model_params |>
filter(.model == "HW_damped")
phi <- damped_params |>
filter(term == "phi") |>
pull(estimate)
}
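To make the extracted damping parameter visible (a one-line sketch; phi close to 1 means the damped model behaves almost like the non-damped one):
if (exists("phi")) cat("Damping parameter (phi):", round(phi, 3), "\n")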
accuracy_measures <- accuracy(hw_models) |>
  select(.model, RMSE, MAE)  # accuracy() does not report AICc; it is added from glance() below
model_glance <- glance(hw_models) |>
select(.model, AICc)
# Combine
model_accuracy <- accuracy_measures |>
left_join(model_glance, by = ".model") |>
arrange(RMSE)
cat("c. Model Comparison (One-step Forecasts):\n")
## c. Model Comparison (One-step Forecasts):
print(model_accuracy)
## # A tibble: 2 × 4
## .model RMSE MAE AICc
## <chr> <dbl> <dbl> <dbl>
## 1 HW_multiplicative 0.613 0.450 1776.
## 2 HW_damped 0.616 0.444 1783.
# The non-damped multiplicative model has the lower one-step RMSE and AICc
best_model_name <- "HW_multiplicative"
best_model <- hw_models |>
  select(HW_multiplicative)
residual_diagnostics <- best_model |>
gg_tsresiduals() +
labs(title = paste("Residual Diagnostics for", best_model_name)) +
theme_minimal()
## Warning: `gg_tsresiduals()` was deprecated in feasts 0.4.2.
## ℹ Please use `ggtime::gg_tsresiduals()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
print(residual_diagnostics)
# Ljung-Box test
lb_test <- best_model |>
augment() |>
features(.innov, ljung_box, lag = 24, dof = 0)
cat("\nLjung-Box Test Results:\n")
##
## Ljung-Box Test Results:
cat("H0: Residuals are independently distributed (white noise)\n")
## H0: Residuals are independently distributed (white noise)
cat("Test Statistic:", round(lb_test$lb_stat, 4), "\n")
## Test Statistic: 34.4793
cat("p-value:", round(lb_test$lb_pvalue, 4), "\n")
## p-value: 0.0765
# Check for patterns in residuals
residual_time_plot <- best_model |>
augment() |>
ggplot(aes(x = Month, y = .innov)) +
geom_line() +
geom_hline(yintercept = 0, linetype = "dashed", color = "red") +
labs(title = paste("Residuals Over Time for", best_model_name),
y = "Residuals") +
theme_minimal()
print(residual_time_plot)
# Split data
train_data <- myseries |> filter(year(Month) <= 2010)
test_data <- myseries |> filter(year(Month) > 2010)
cat("Training period:", format(min(train_data$Month)), "to", format(max(train_data$Month)), "\n")
## Training period: 1988 Apr to 2010 Dec
cat("Test period:", format(min(test_data$Month)), "to", format(max(test_data$Month)), "\n")
## Test period: 2011 Jan to 2018 Dec
# find best Holt-Winters variant to use
if (best_model_name == "HW_multiplicative") {
hw_spec <- ETS(Turnover ~ error("M") + trend("A") + season("M"))
} else if (best_model_name == "HW_damped") {
hw_spec <- ETS(Turnover ~ error("M") + trend("Ad") + season("M"))
} else {
# Fallback to multiplicative if best_model_name is not recognized
hw_spec <- ETS(Turnover ~ error("M") + trend("A") + season("M"))
}
# Fit models on training data
train_models <- train_data |>
model(
Best_HW = hw_spec,
Seasonal_naive = SNAIVE(Turnover),
Mean = MEAN(Turnover),
Drift = RW(Turnover ~ drift())
)
# forecasts
test_forecasts <- train_models |>
forecast(h = nrow(test_data))
# Calculate accuracy
test_accuracy <- test_forecasts|>
accuracy(test_data)
cat("\nTest Set Performance Comparison:\n")
##
## Test Set Performance Comparison:
performance_table <- test_accuracy |>
select(.model, RMSE, MAE, MAPE) |>
arrange(RMSE)
print(performance_table)
## # A tibble: 4 × 4
## .model RMSE MAE MAPE
## <chr> <dbl> <dbl> <dbl>
## 1 Seasonal_naive 1.55 1.24 9.06
## 2 Best_HW 1.78 1.60 12.6
## 3 Mean 6.07 5.43 38.7
## 4 Drift 7.87 7.45 61.2
# seasonal naive
hw_rmse <- performance_table |>
filter(.model == "Best_HW") |>
pull(RMSE)
snaive_rmse <- performance_table |>
filter(.model == "Seasonal_naive") |>
pull(RMSE)
cat("Holt-Winters RMSE:", round(hw_rmse, 2), "\n")
## Holt-Winters RMSE: 1.78
cat("Seasonal Naive RMSE:", round(snaive_rmse, 2), "\n")
## Seasonal Naive RMSE: 1.55
comparison_plot <- test_data |>
ggplot(aes(x = Month, y = Turnover)) +
geom_line(aes(color = "Actual"), linewidth = 1) +
autolayer(test_forecasts |> filter(.model == "Best_HW"),
aes(color = "Holt-Winters"), alpha = 0.8, size = 0.8) +
autolayer(test_forecasts |> filter(.model == "Seasonal_naive"),
aes(color = "Seasonal Naive"), alpha = 0.8, size = 0.8, linetype = "dashed") +
labs(title = "Test Set: Holt-Winters vs Seasonal Naive Forecasts",
y = "Turnover",
color = "Model") +
theme_minimal() +
theme(legend.position = "bottom")
## Scale for fill_ramp is already present.
## Adding another scale for fill_ramp, which will replace the existing scale.
print(comparison_plot)
Q9. For the same retail data, try an STL decomposition applied to the Box-Cox transformed series, followed by ETS on the seasonally adjusted data. How does that compare with your best previous forecasts on the test set?
I applied an STL decomposition to the Box-Cox transformed series, using the optimal Box-Cox lambda of 0.1971, and then fitted an ETS model to the seasonally adjusted component. The STL+ETS approach gives a test RMSE of 4.02, so compared with the other methods the seasonal naive benchmark (test RMSE 1.55) remains the best model. A Ljung-Box test on the STL+ETS residuals gives a p-value of 0.2244, so the residuals appear to be white noise. I also tried other ETS specifications on the seasonally adjusted data, but seasonal naive still performs best; a comparison plot is shown below.
# Determine optimal Box-Cox lambda from training data
lambda <- train_data |>
features(Turnover, features = guerrero) |>
pull(lambda_guerrero)
cat("Optimal Box-Cox lambda:", round(lambda, 4), "\n")
## Optimal Box-Cox lambda: 0.1971
stl_ets_fable <- train_data %>%
model(
stl_ets = decomposition_model(
STL(box_cox(Turnover, lambda) ~ season(window = "periodic")),
ETS(season_adjust)
)
)
stl_ets_fc <- stl_ets_fable |>
forecast(h = nrow(test_data))
stl_ets_accuracy <- stl_ets_fc |>
accuracy(test_data)
cat("\nf. STL + ETS Model Performance:\n")
##
## f. STL + ETS Model Performance:
stl_performance <- stl_ets_accuracy %>%
select(.model, RMSE, MAE, MAPE)
print(stl_performance)
## # A tibble: 1 × 4
## .model RMSE MAE MAPE
## <chr> <dbl> <dbl> <dbl>
## 1 stl_ets 4.02 3.62 27.9
# Compare
# previous model performances from test_accuracy
previous_performance <- test_accuracy |>
filter(.model %in% c("Best_HW", "Seasonal_naive")) |>
select(.model, RMSE, MAE, MAPE)
comparison_table <- bind_rows(
previous_performance,
stl_performance
) |>
arrange(RMSE)
print(comparison_table)
## # A tibble: 3 × 4
## .model RMSE MAE MAPE
## <chr> <dbl> <dbl> <dbl>
## 1 Seasonal_naive 1.55 1.24 9.06
## 2 Best_HW 1.78 1.60 12.6
## 3 stl_ets 4.02 3.62 27.9
best_overall <- comparison_table$.model[1]
best_overall_rmse <- comparison_table$RMSE[1]
cat("\nOverall Best Model:", best_overall, "\n")
##
## Overall Best Model: Seasonal_naive
cat("Best Test RMSE:", round(best_overall_rmse, 2), "\n")
## Best Test RMSE: 1.55
# Plot comparison
all_forecasts <- bind_rows(
test_forecasts |> filter(.model == "Best_HW"),
test_forecasts |> filter(.model == "Seasonal_naive"),
stl_ets_fc
)
comparison_plot <- test_data |>
ggplot(aes(x = Month, y = Turnover)) +
geom_line(aes(color = "Actual"), linewidth = 1) +
autolayer(all_forecasts, alpha = 0.8, size = 0.8) +
labs(title = "Test Set: Comparison of All Forecasting Methods",
y = "Turnover",
color = "Series") +
theme_minimal() +
theme(legend.position = "bottom")
print(comparison_plot)
# Residual analysis for STL+ETS model
cat("\nResidual Analysis for STL+ETS Model:\n")
##
## Residual Analysis for STL+ETS Model:
stl_ets_residuals <- stl_ets_fable |>
gg_tsresiduals() +
labs(title = "Residual Diagnostics for STL+ETS Model") +
theme_minimal()
print(stl_ets_residuals)
## Warning: Removed 12 rows containing missing values or values outside the scale range
## (`geom_line()`).
## Warning: Removed 12 rows containing missing values or values outside the scale range
## (`geom_point()`).
## Warning: Removed 12 rows containing non-finite outside the scale range
## (`stat_bin()`).
## Warning: Removed 12 rows containing missing values or values outside the scale range
## (`geom_rug()`).
# Ljung-Box test for STL+ETS
stl_lb_test <- stl_ets_fable |>
augment() |>
features(.innov, ljung_box, lag = 24, dof = 0)
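To report the Ljung-Box result for the STL+ETS residuals (a one-line sketch; this is the test behind the p-value of about 0.22 cited above):
cat("STL+ETS Ljung-Box p-value:", round(stl_lb_test$lb_pvalue, 4), "\n")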
# Other ETS models on seasonally adjusted data
cat("\n--- Extended STL+ETS Comparison ---\n")
##
## --- Extended STL+ETS Comparison ---
extended_stl_models <- train_data %>%
model(
stl_ets_auto = decomposition_model(
STL(box_cox(Turnover, lambda) ~ season(window = "periodic")),
ETS(season_adjust)
),
stl_ets_aan = decomposition_model(
STL(box_cox(Turnover, lambda) ~ season(window = "periodic")),
ETS(season_adjust ~ trend("A") + season("N"))
),
stl_ets_mnn = decomposition_model(
STL(box_cox(Turnover, lambda) ~ season(window = "periodic")),
ETS(season_adjust ~ trend("M") + season("N"))
)
)
# forecasts and calculate accuracy
extended_stl_fc <- extended_stl_models |>
forecast(h = nrow(test_data))
extended_stl_accuracy <- extended_stl_fc |>
accuracy(test_data)
cat("Extended STL+ETS Models Performance:\n")
## Extended STL+ETS Models Performance:
extended_stl_comparison <- extended_stl_accuracy |>
select(.model, RMSE, MAE, MAPE) |>
arrange(RMSE)
print(extended_stl_comparison)
## # A tibble: 3 × 4
## .model RMSE MAE MAPE
## <chr> <dbl> <dbl> <dbl>
## 1 stl_ets_aan 4.02 3.62 27.9
## 2 stl_ets_auto 4.02 3.62 27.9
## 3 stl_ets_mnn 7.96 6.46 48.1
# Final comprehensive comparison
final_comparison <- bind_rows(
comparison_table,
extended_stl_comparison
) |>
arrange(RMSE)
cat("\n=== FINAL COMPREHENSIVE COMPARISON ===\n")
##
## === FINAL COMPREHENSIVE COMPARISON ===
print(final_comparison)
## # A tibble: 6 × 4
## .model RMSE MAE MAPE
## <chr> <dbl> <dbl> <dbl>
## 1 Seasonal_naive 1.55 1.24 9.06
## 2 Best_HW 1.78 1.60 12.6
## 3 stl_ets 4.02 3.62 27.9
## 4 stl_ets_aan 4.02 3.62 27.9
## 5 stl_ets_auto 4.02 3.62 27.9
## 6 stl_ets_mnn 7.96 6.46 48.1
# Summary
cat("\n=== SUMMARY ===\n")
##
## === SUMMARY ===
cat("Best model on test set:", final_comparison$.model[1], "\n")
## Best model on test set: Seasonal_naive
cat("Best test RMSE:", round(final_comparison$RMSE[1], 2), "\n")
## Best test RMSE: 1.55
# seasonal naive
if ("Seasonal_naive" %in% final_comparison$.model) {
snaive_rmse <- final_comparison |>
filter(.model == "Seasonal_naive") |>
pull(RMSE)
improvement_over_snaive <- (snaive_rmse - final_comparison$RMSE[1]) / snaive_rmse * 100
cat("Improvement over seasonal naive:", round(improvement_over_snaive, 2), "%\n")
}
## Improvement over seasonal naive: 0 %
# Plot comparison
final_plot <- test_data |>
ggplot(aes(x = Month, y = Turnover)) +
geom_line(aes(color = "Actual"), linewidth = 1.2) +
autolayer(extended_stl_fc, alpha = 0.7, size = 0.8) +
autolayer(test_forecasts %>% filter(.model == "Best_HW"),
aes(color = "Best_HW"), alpha = 0.7, size = 0.8) +
autolayer(test_forecasts %>% filter(.model == "Seasonal_naive"),
aes(color = "Seasonal_naive"), alpha = 0.7, size = 0.8, linetype = "dashed") +
labs(title = "Final Model Comparison on Test Set",
subtitle = paste("Best model:", final_comparison$.model[1],
"- RMSE:", round(final_comparison$RMSE[1], 2)),
y = "Turnover",
color = "Model") +
theme_minimal() +
theme(legend.position = "bottom")
## Scale for fill_ramp is already present.
## Adding another scale for fill_ramp, which will replace the existing scale.
## Scale for fill_ramp is already present.
## Adding another scale for fill_ramp, which will replace the existing scale.
print(final_plot)