# Load libraries
library(tidyverse)
library(fpp3)
library(imputeTS)
library(kableExtra)
library(scales)
library(gridExtra)
library(lubridate)
In part A, I want you to forecast how much cash is taken out of 4 different ATM machines for May 2010. The data is given in a single file. The variable ‘Cash’ is provided in hundreds of dollars, other than that it is straight forward. I am being somewhat ambiguous on purpose to make this have a little more business feeling. Explain and demonstrate your process, techniques used and not used, and your actual forecast. I am giving you data via an excel file, please provide your written report on your findings, visuals, discussion and your R code via an RPubs link along with the actual.rmd file Also please submit the forecast which you will put in an Excel readable file.
# Read data
atm_data <- read.csv("https://raw.githubusercontent.com/johnnydrodriguez/Data624/refs/heads/main/ATM624Data.csv", header = TRUE, sep = ',', na.strings="", fill = TRUE)
head(atm_data)
## DATE ATM Cash
## 1 5/1/2009 12:00:00 AM ATM1 96
## 2 5/1/2009 12:00:00 AM ATM2 107
## 3 5/2/2009 12:00:00 AM ATM1 82
## 4 5/2/2009 12:00:00 AM ATM2 89
## 5 5/3/2009 12:00:00 AM ATM1 85
## 6 5/3/2009 12:00:00 AM ATM2 90
The initial step involved examining ATM withdrawal data for four ATMs. I interpreted each individial ATM as one banking location and created forecasts for each individual ATM/location,
We loaded the data and converted dates to proper format. The time series visualization revealed gaps in the data and an extreme outlier in ATM4. We created separate tsibble objects for each ATM. ATM3 data was found to be unusable due to extensive missing values, so it was excluded altogether from further analysis. The data exhibited daily patterns with varying withdrawal amounts across different ATMs.
We assume that the data has seasonality - likely a weekly one to account for weekly patterns of ATM usage. The seasonality is accounted for in the analysis and forecasts.
# Convert the DATE column to Date type
atm_data <- atm_data |>
mutate(DATE = mdy_hms(DATE)) |>
mutate(DATE = as.Date(DATE))
# Plot the data
ggplot(atm_data, aes(x = DATE, y = Cash)) +
geom_line() +
facet_wrap(~ATM, scales = "free_y") +
labs(title = "Cash Withdrawals from ATMs Over Time",
x = "Date",
y = "Cash Withdrawals (Hundreds of Dollars)") +
theme_minimal()
# Remove rows where ATM is NA
atm_data_clean <- atm_data |>
filter(!is.na(ATM))
# Split data into separate dataframes for each ATM type
atm1_data <- atm_data_clean |> filter(ATM == "ATM1")
atm2_data <- atm_data_clean |> filter(ATM == "ATM2")
atm3_data <- atm_data_clean |> filter(ATM == "ATM3")
atm4_data <- atm_data_clean |> filter(ATM == "ATM4")
# Convert to tsibbles
atm1_tsibble <- atm1_data |>
as_tsibble(index = DATE)
atm2_tsibble <- atm2_data |>
as_tsibble(index = DATE)
atm4_tsibble <- atm4_data |>
as_tsibble(index = DATE)
# Get interval for each tsibble
interval_atm1 <- interval(atm1_tsibble)
interval_atm2 <- interval(atm2_tsibble)
interval_atm4 <- interval(atm4_tsibble)
# Plot for ATM1
autoplot(atm1_tsibble, Cash) +
labs(title = "Cash Withdrawals from ATM1 Over Time",
x = "Date",
y = "Cash Withdrawals (Hundreds of Dollars)") +
theme_minimal()
# Plot for ATM2
autoplot(atm2_tsibble, Cash) +
labs(title = "Cash Withdrawals from ATM2 Over Time",
x = "Date",
y = "Cash Withdrawals (Hundreds of Dollars)") +
theme_minimal()
# Plot for ATM4
autoplot(atm4_tsibble, Cash) +
labs(title = "Cash Withdrawals from ATM4 Over Time",
x = "Date",
y = "Cash Withdrawals (Hundreds of Dollars)") +
theme_minimal()
# Convert to tsibbles and check intervals
atm1_tsibble <- atm1_data |>
as_tsibble(index = DATE)
print(interval(atm1_tsibble))
## <interval[1]>
## [1] 1D
## <interval[1]>
## [1] 1D
## <interval[1]>
## [1] 1D
Missing values and outliers were identified and addressed using Kalman filtering. This method was chosen because it preserves the time series structure while estimating missing values. For ATM4, a significant outlier that is more than 3 standard deviations was identified and replaced. The imputation process filled gaps in ATM1 and ATM2 while maintaining the underlying time series patterns in the data. The visualization plots show the imputed points fall withing the historical time series pattern and the ATM4 plot shows that the outlier has been eliminated.
#Detect outliers using Z-score
z_scores <- scale(atm4_tsibble$Cash)
outlier_indices <- which(abs(z_scores) > 3)
outlier_dates <- atm4_tsibble$DATE[outlier_indices]
# Print the identified outlier
print(atm4_tsibble[outlier_indices, ])
## # A tsibble: 1 x 3 [1D]
## DATE ATM Cash
## <date> <chr> <int>
## 1 2010-02-09 ATM4 10920
# Impute the outlier using seasonal decomposition
atm4_tsibble_imputed <- atm4_tsibble |>
mutate(Cash = ifelse(DATE %in% outlier_dates, NA, Cash))
# Impute the missing values using na_kalman
atm4_tsibble_imputed <- atm4_tsibble_imputed |>
mutate(Cash = na_kalman(Cash))
# Print the imputed values
imputed_values <- atm4_tsibble_imputed |>
filter(DATE %in% outlier_dates) |>
select(DATE, Cash)
print(imputed_values)
## # A tsibble: 1 x 2 [1D]
## DATE Cash
## <date> <dbl>
## 1 2010-02-09 440.
# Plot the imputed data
autoplot(atm4_tsibble_imputed, Cash) +
labs(title = "Cash Withdrawals from ATM4 (After Kalman Imputation)",
x = "Date",
y = "Cash Withdrawals (Hundreds of Dollars)") +
theme_minimal()
# Clean the data with Kalman imputation
atm1_tsibble_clean <- atm1_tsibble |>
mutate(Cash = na_kalman(Cash))
atm2_tsibble_clean <- atm2_tsibble |>
mutate(Cash = na_kalman(Cash))
atm4_tsibble_clean <- atm4_tsibble_imputed |>
mutate(Cash = na_kalman(Cash))
# Plot ATM1
ggplot() +
geom_line(data = atm1_tsibble_clean, aes(DATE, Cash), color = "blue") +
geom_point(data = atm1_tsibble_clean[is.na(atm1_tsibble$Cash), ],
aes(DATE, Cash),
color = "red",
size = 2) +
labs(title = "ATM1 - Red Points Show Imputed Values",
x = "Date",
y = "Cash Withdrawals") +
theme_minimal()
# Plot ATM2
ggplot() +
geom_line(data = atm2_tsibble_clean, aes(DATE, Cash), color = "blue") +
geom_point(data = atm2_tsibble_clean[is.na(atm2_tsibble$Cash), ],
aes(DATE, Cash),
color = "red",
size = 2) +
labs(title = "ATM2 - Red Points Show Imputed Values",
x = "Date",
y = "Cash Withdrawals") +
theme_minimal()
# Plot ATM4
ggplot() +
geom_line(data = atm4_tsibble_clean, aes(DATE, Cash), color = "blue") +
geom_point(data = atm4_tsibble_clean[is.na(atm4_tsibble_imputed$Cash), ],
aes(DATE, Cash),
color = "red",
size = 2) +
labs(title = "ATM4 - Eliminated Outlier",
x = "Date",
y = "Cash Withdrawals") +
theme_minimal()
Seasonal and Trend decomposition using Loess (STL) with season adjustment was applied to create forecasts. This method was chosen because it can handle both seasonal patterns and trends effectively. The forecasts were generated for 30 days ahead for each ATM. The results showed reasonable predictions that captured the seasonal patterns, though with increasing uncertainty in the prediction intervals over time.
Visually, the STL forecast appears to have very wide prediction intervals.
# Create forecasts for each ATM
atm1_forecast <- atm1_tsibble_clean |>
model(
stl = decomposition_model(
STL(Cash),
NAIVE(season_adjust))) |>
forecast(h = 30)
atm2_forecast <- atm2_tsibble_clean |>
model(
stl = decomposition_model(
STL(Cash),
NAIVE(season_adjust))) |>
forecast(h = 30)
atm4_forecast <- atm4_tsibble_clean |>
model(
stl = decomposition_model(
STL(Cash),
NAIVE(season_adjust))) |>
forecast(h = 30)
# Print forecast results
print(atm1_forecast)
## # A fable: 30 x 4 [1D]
## # Key: .model [1]
## .model DATE Cash .mean
## <chr> <date> <dist> <dbl>
## 1 stl 2010-05-01 N(87, 587) 87.0
## 2 stl 2010-05-02 N(103, 1160) 103.
## 3 stl 2010-05-03 N(69, 1733) 68.9
## 4 stl 2010-05-04 N(3.7, 2305) 3.67
## 5 stl 2010-05-05 N(100, 2878) 100.
## 6 stl 2010-05-06 N(81, 3450) 81.1
## 7 stl 2010-05-07 N(85, 4023) 85
## 8 stl 2010-05-08 N(87, 4611) 87.0
## 9 stl 2010-05-09 N(103, 5183) 103.
## 10 stl 2010-05-10 N(69, 5756) 68.9
## # ℹ 20 more rows
## # A fable: 30 x 4 [1D]
## # Key: .model [1]
## .model DATE Cash .mean
## <chr> <date> <dist> <dbl>
## 1 stl 2010-05-01 N(95, 723) 94.7
## 2 stl 2010-05-02 N(105, 1435) 105.
## 3 stl 2010-05-03 N(39, 2147) 39.1
## 4 stl 2010-05-04 N(31, 2859) 31.0
## 5 stl 2010-05-05 N(129, 3571) 129.
## 6 stl 2010-05-06 N(120, 4284) 120.
## 7 stl 2010-05-07 N(90, 4996) 90
## 8 stl 2010-05-08 N(95, 5717) 94.7
## 9 stl 2010-05-09 N(105, 6430) 105.
## 10 stl 2010-05-10 N(39, 7142) 39.1
## # ℹ 20 more rows
## # A fable: 30 x 4 [1D]
## # Key: .model [1]
## .model DATE Cash .mean
## <chr> <date> <dist> <dbl>
## 1 stl 2010-05-01 N(249, 154696) 249.
## 2 stl 2010-05-02 N(293, 307490) 293.
## 3 stl 2010-05-03 N(284, 460136) 284.
## 4 stl 2010-05-04 N(-98, 612704) -98.2
## 5 stl 2010-05-05 N(448, 765223) 448.
## 6 stl 2010-05-06 N(310, 917706) 310.
## 7 stl 2010-05-07 N(482, 1070164) 482
## 8 stl 2010-05-08 N(249, 1225408) 249.
## 9 stl 2010-05-09 N(293, 1377937) 293.
## 10 stl 2010-05-10 N(284, 1530445) 284.
## # ℹ 20 more rows
# Plot the forecasts
atm1_forecast |>
autoplot(atm1_tsibble_clean) +
ggtitle("ATM1 STL Forecast") +
theme_minimal()
We also fitted ETS models to each ATM’s data. We let fable automatically select the optimal parameters. Diagnostic plots and residual analysis were performed to validate model assumptions. The ETS models showed good fit, particularly for ATMs with strong seasonal patterns.
The model selected for the ATM data was the additive ETS(A,N,A) model. The residuals were generally well behaved with ATM exhibiting white noise characters, no or limited autocorrection and normalish – although this varies by ATM. ATM2 specifically still contains some autocorrelation not captured in the model.
# Create and train the ETS model
atm1_model <- atm1_tsibble_clean |>
model(
ets = ETS(Cash))
# ATM1
# Generate forecasts
atm1_fc <- atm1_model |>
forecast(h = "30 days")
# Plot data and forecasts
autoplot(atm1_tsibble_clean, Cash) +
autolayer(atm1_fc, .mean) +
theme_minimal() +
labs(
title = "ATM1 Forecasts",
y = "Cash Withdrawals",
x = "Date")
# ATM2
# Create and train the ETS model
atm2_model <- atm2_tsibble_clean |>
model(
ets = ETS(Cash))
# Generate forecasts
atm2_fc <- atm2_model |>
forecast(h = "30 days")
# Plot data and forecasts for ATM2
autoplot(atm2_tsibble_clean, Cash) +
autolayer(atm2_fc, .mean) +
theme_minimal() +
labs(
title = "ATM2 Forecasts",
y = "Cash Withdrawals",
x = "Date")
# ATM4
# Create and train the ETS model
atm4_model <- atm4_tsibble_clean |>
model(
ets = ETS(Cash))
# Generate forecasts
atm4_fc <- atm4_model |>
forecast(h = "30 days")
# Plot data and forecasts for ATM4
autoplot(atm4_tsibble_clean, Cash) +
autolayer(atm4_fc, .mean) +
theme_minimal() +
labs(
title = "ATM4 Forecasts",
y = "Cash Withdrawals",
x = "Date")
## Series: Cash
## Model: ETS(A,N,A)
## Smoothing parameters:
## alpha = 0.01625029
## gamma = 0.3277445
##
## Initial states:
## l[0] s[0] s[-1] s[-2] s[-3] s[-4] s[-5] s[-6]
## 77.73628 -67.57616 -2.468905 13.25487 10.55854 21.02297 13.04891 12.15977
##
## sigma^2: 578.5591
##
## AIC AICc BIC
## 4485.947 4486.569 4524.946
## Series: Cash
## Model: ETS(A,N,A)
## Smoothing parameters:
## alpha = 0.0001000014
## gamma = 0.3597154
##
## Initial states:
## l[0] s[0] s[-1] s[-2] s[-3] s[-4] s[-5] s[-6]
## 66.92362 -41.45125 -25.41454 13.39997 -11.18308 13.90732 24.66123 26.08036
##
## sigma^2: 650.6844
##
## AIC AICc BIC
## 4528.829 4529.450 4567.828
## Series: Cash
## Model: ETS(A,N,A)
## Smoothing parameters:
## alpha = 0.0001002006
## gamma = 0.0001000353
##
## Initial states:
## l[0] s[0] s[-1] s[-2] s[-3] s[-4] s[-5] s[-6]
## 452.5897 -275.4528 -31.33016 22.24026 42.51331 90.23185 54.19974 97.59783
##
## sigma^2: 110744.2
##
## AIC AICc BIC
## 6403.817 6404.438 6442.816
The ARIMA models were fitted to capture complex temporal dependencies in the data. WE let fable select the best model; in these cases, it identified Seasonal ARIMAs (SARIMA) with varying pdq ARIMA(0,0,1)(0,1,2)[7] , ARIMA(2,0,2)(0,1,1)[7], and ARIMA(3,0,2)(1,0,0)[7] w/ meam for ATM1, ATM2 & ATM4 respectively.
The models were automatically selected using AIC criteria. Diagnostic checks showed generally well-behaved residuals with ATM4 indicating some patterns . The ARIMA forecasts captured both underlying trend and seasonal patterns, though with wider prediction intervals compared to ETS models.
# Create ARIMA forecasts for each ATM
atm1_model <- atm1_tsibble_clean |>
model(arima = ARIMA(Cash))
atm1_forecast <- atm1_model |>
forecast(h = 30)
atm2_model <- atm2_tsibble_clean |>
model(arima = ARIMA(Cash))
atm2_forecast <- atm2_model |>
forecast(h = 30)
atm4_model <- atm4_tsibble_clean |>
model(arima = ARIMA(Cash))
atm4_forecast <- atm4_model |>
forecast(h = 30)
# Print forecast results and model specifications
report(atm1_model)
## Series: Cash
## Model: ARIMA(0,0,1)(0,1,2)[7]
##
## Coefficients:
## ma1 sma1 sma2
## 0.1696 -0.5866 -0.0989
## s.e. 0.0552 0.0508 0.0497
##
## sigma^2 estimated as 559.7: log likelihood=-1641.1
## AIC=3290.2 AICc=3290.31 BIC=3305.72
## Series: Cash
## Model: ARIMA(2,0,2)(0,1,1)[7]
##
## Coefficients:
## ar1 ar2 ma1 ma2 sma1
## -0.4319 -0.9159 0.4741 0.8106 -0.7552
## s.e. 0.0550 0.0400 0.0859 0.0549 0.0383
##
## sigma^2 estimated as 608.9: log likelihood=-1655.57
## AIC=3323.13 AICc=3323.37 BIC=3346.42
## Series: Cash
## Model: ARIMA(3,0,2)(1,0,0)[7] w/ mean
##
## Coefficients:
## ar1 ar2 ar3 ma1 ma2 sar1 constant
## -0.8867 -0.6739 0.1667 0.9703 0.7980 0.2481 799.3745
## s.e. 0.1043 0.1233 0.0564 0.0951 0.1141 0.0555 48.8659
##
## sigma^2 estimated as 117528: log likelihood=-2645.2
## AIC=5306.41 AICc=5306.81 BIC=5337.61
# Plot the forecasts
atm1_forecast |>
autoplot(atm1_tsibble_clean) +
ggtitle("ATM1 ARIMA Forecast") +
theme_minimal()
# ATM1 residuals
atm1_model |>
select(arima) |>
gg_tsresiduals()+
ggtitle("ATM1 Residual Diagnostics")
# ATM2 residuals
atm2_model |>
select(arima) |>
gg_tsresiduals() +
ggtitle("ATM2 Residual Diagnostics")
# ATM4 residuals
atm4_model |>
select(arima) |>
gg_tsresiduals() +
ggtitle("ATM4 Residual Diagnostics")
The data was split into training (through March 31, 2010) and test (April 2010) sets to evaluate model performance. This split allows us to assess how well each model performs on unseen data. The split was performed for all three modeling approaches (STL, ETS, and ARIMA) so we can compare the accuracy results.
# Create training and test sets for all models
# ATM1
atm1_train <- atm1_tsibble_clean |>
filter_index(. ~ "2010-03-31")
atm1_test <- atm1_tsibble_clean |>
filter_index("2010-04-01" ~ "2010-04-30")
# ATM2
atm2_train <- atm2_tsibble_clean |>
filter_index(. ~ "2010-03-31")
atm2_test <- atm2_tsibble_clean |>
filter_index("2010-04-01" ~ "2010-04-30")
# ATM4
atm4_train <- atm4_tsibble_clean |>
filter_index(. ~ "2010-03-31")
atm4_test <- atm4_tsibble_clean |>
filter_index("2010-04-01" ~ "2010-04-30")
# Fit STL models on training data
atm1_fit <- atm1_train |>
model(
stl = decomposition_model(
STL(Cash),
NAIVE(season_adjust)))
atm2_fit <- atm2_train |>
model(
stl = decomposition_model(
STL(Cash),
NAIVE(season_adjust)))
atm4_fit <- atm4_train |>
model(
stl = decomposition_model(
STL(Cash),
NAIVE(season_adjust)))
# Generate forecasts for April 2010
atm1_forecast <- atm1_fit |> forecast(h = 1)
atm2_forecast <- atm2_fit |> forecast(h = 1)
atm4_forecast <- atm4_fit |> forecast(h = 1)
# Calculate accuracy metrics
atm1_accuracy_stl <- accuracy(atm1_forecast, atm1_tsibble_clean) |>
mutate(.model = "ATM1_STL")
atm2_accuracy_stl <- accuracy(atm2_forecast, atm2_tsibble_clean) |>
mutate(.model = "ATM2_STL")
atm4_accuracy_stl <- accuracy(atm4_forecast, atm4_tsibble_clean) |>
mutate(.model = "ATM4_STL")
# Save metrics
atm1_accuracy_table <- atm1_accuracy_stl
atm2_accuracy_table <- atm2_accuracy_stl
atm4_accuracy_table <- atm4_accuracy_stl
# Fit ETS models on training data
atm1_fit <- atm1_train |>
model(
ets = ETS(Cash))
atm2_fit <- atm2_train |>
model(
ets = ETS(Cash))
atm4_fit <- atm4_train |>
model(
ets = ETS(Cash))
# Generate forecasts for April 2010
atm1_forecast <- atm1_fit |> forecast(h = 1)
atm2_forecast <- atm2_fit |> forecast(h = 1)
atm4_forecast <- atm4_fit |> forecast(h = 1)
# Calculate accuracy metrics
atm1_accuracy_ets <- accuracy(atm1_forecast, atm1_tsibble_clean) |>
mutate(.model = "ATM1_ETS")
atm2_accuracy_ets <- accuracy(atm2_forecast, atm2_tsibble_clean) |>
mutate(.model = "ATM2_ETS")
atm4_accuracy_ets <- accuracy(atm4_forecast, atm4_tsibble_clean) |>
mutate(.model = "ATM4_ETS")
# Fit ARIMA models on training data
atm1_fit <- atm1_train |>
model(
arima = ARIMA(Cash))
atm2_fit <- atm2_train |>
model(
arima = ARIMA(Cash))
atm4_fit <- atm4_train |>
model(
arima = ARIMA(Cash))
# Generate forecasts for April 2010
atm1_forecast <- atm1_fit |> forecast(h = 1)
atm2_forecast <- atm2_fit |> forecast(h = 1)
atm4_forecast <- atm4_fit |> forecast(h = 1)
# Calculate accuracy metrics
atm1_accuracy_arima <- accuracy(atm1_forecast, atm1_tsibble_clean) |>
mutate(.model = "ATM1_ARIMA")
atm2_accuracy_arima <- accuracy(atm2_forecast, atm2_tsibble_clean) |>
mutate(.model = "ATM2_ARIMA")
atm4_accuracy_arima <- accuracy(atm4_forecast, atm4_tsibble_clean) |>
mutate(.model = "ATM4_ARIMA")
Model performance was evaluated using multiple accuracy metrics. The comparison revealed varying performance across ATMs and methods. For most ATMs, the STL and ARIMA models showed slightly better performance than ETS, though the differences were not dramatic. The results suggest that different ATMs might benefit from different forecasting approaches.
ATM1
For ATM1, the RMSE varies significantly across models with ARIMA showing the lowest at 2.02, followed by ETS at 5.54, and STL at 16.92. The MAPE follows a similar pattern with ARIMA performing best at 2.17%, compared to ETS at 5.96% and STL at 18.19%. Based on both RMSE and MAPE being substantially lower than the other models, the ARIMA model is clearly the best choice for ATM1 forecasting.
ATM2
For ATM2, the STL model shows the lowest RMSE at 18.32, marginally better than ETS at 18.81, while ARIMA performs worst at 21.94. The MAPE values follow a similar pattern with STL at 18.51%, ETS at 19.00%, and ARIMA at 22.16%. Based on both RMSE and MAPE metrics, the STL model is the best choice for ATM2’s forecasting, though the difference between STL and ETS is relatively small.
ATM4
For ATM4, while STL and ARIMA both show lower RMSE values (96.40 and 73.17 respectively) compared to ETS’s poor performance at 241.47, the ARIMA model is the best choice with the lowest RMSE and MAPE (18.07% vs STL’s 23.80%). The ETS model performs significantly worse across all metrics with a MAPE of 59.62%, making it unsuitable for ATM4 forecasting.
# Combine accuracy metrics for ATM1
atm1_combined_accuracy <- bind_rows(
atm1_accuracy_stl,
atm1_accuracy_ets,
atm1_accuracy_arima)
# Combine accuracy metrics for ATM2
atm2_combined_accuracy <- bind_rows(
atm2_accuracy_stl,
atm2_accuracy_ets,
atm2_accuracy_arima)
# Combine accuracy metrics for ATM4
atm4_combined_accuracy <- bind_rows(
atm4_accuracy_stl,
atm4_accuracy_ets,
atm4_accuracy_arima)
# Print combined accuracy tables
print(atm1_combined_accuracy)
## # A tibble: 3 × 10
## .model .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ATM1_STL Test -16.9 16.9 16.9 -18.2 18.2 0.914 0.588 NA
## 2 ATM1_ETS Test 5.54 5.54 5.54 5.96 5.96 0.300 0.193 NA
## 3 ATM1_ARIMA Test 2.02 2.02 2.02 2.17 2.17 0.109 0.0701 NA
## # A tibble: 3 × 10
## .model .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ATM2_STL Test 18.3 18.3 18.3 18.5 18.5 0.854 0.592 NA
## 2 ATM2_ETS Test 18.8 18.8 18.8 19.0 19.0 0.877 0.608 NA
## 3 ATM2_ARIMA Test 21.9 21.9 21.9 22.2 22.2 1.02 0.709 NA
## # A tibble: 3 × 10
## .model .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ATM4_STL Test -96.4 96.4 96.4 -23.8 23.8 0.273 0.209 NA
## 2 ATM4_ETS Test 241. 241. 241. 59.6 59.6 0.684 0.524 NA
## 3 ATM4_ARIMA Test -73.2 73.2 73.2 -18.1 18.1 0.207 0.159 NA
While the ARIMA for ATM1 performs well, we suspect that ATM2 and ATM4 model performance is being affected by remaining seasonal pattern. To see if we can improve peform model performance for ATM2 and ATM4, we applied seasonal differencing (7-day period) to address strong weekly patterns.
The differenced models were compared with the original models to determine if this extra step provided meaningful improvements in forecast accuracy.
ATM2 With Seasonal Differenced Models
When including a seasonally differenced data, the STL_SDIFF model shows a remarkably lower RMSE (0.60) compared to all other models, which have RMSE values ranging from 18.32 to 21.94. However, the ETS_SDIFF model performs poorly with an extremely high MAPE of 101.66%, while the original STL model maintains relatively stable performance with an RMSE of 18.32 and MAPE of 18.51%. Based on the significantly lower RMSE and reasonable MAPE (2.86%), the STL_SDIFF model is the best choice for ATM2 forecasting.
ATM4 With Seasonal Differenced Models
When including a seasonally differenced data, the STL_SDIFF model shows significant improvement with the lowest RMSE (40.82) among all models, and a reasonable MAPE of 10.94%. The ETS_SDIFF model performs worst with extremely high RMSE (374.10) and MAPE (100.29%), while the original ARIMA remains competitive with RMSE of 73.17 and MAPE of 18.07%. Based on having both the lowest RMSE and MAPE, the STL_SDIFF model is the best choice for ATM4 forecasting.
# Create seasonally differenced series for ATM2 and ATM4
# ATM2 seasonal differencing
atm2_seas_diff <- atm2_tsibble_clean |>
mutate(Cash_diff = difference(Cash, lag = 7))
# ATM4 seasonal differencing
atm4_seas_diff <- atm4_tsibble_clean |>
mutate(Cash_diff = difference(Cash, lag = 7))
# Create training and test sets for differenced data
# ATM2
atm2_diff_train <- atm2_seas_diff |>
filter_index(. ~ "2010-03-31") |>
filter(!is.na(Cash_diff)) # Remove NA values created by differencing
atm2_diff_test <- atm2_seas_diff |>
filter_index("2010-04-01" ~ "2010-04-30")
# ATM4
atm4_diff_train <- atm4_seas_diff |>
filter_index(. ~ "2010-03-31") |>
filter(!is.na(Cash_diff)) # Remove NA values created by differencing
atm4_diff_test <- atm4_seas_diff |>
filter_index("2010-04-01" ~ "2010-04-30")
# Fit STL models on differenced data
# ATM2
atm2_diff_stl_fit <- atm2_diff_train |>
model(
stl = decomposition_model(
STL(Cash_diff),
NAIVE(season_adjust)))
# ATM4
atm4_diff_stl_fit <- atm4_diff_train |>
model(
stl = decomposition_model(
STL(Cash_diff),
NAIVE(season_adjust)))
# Fit ETS models on differenced data
# ATM2
atm2_diff_ets_fit <- atm2_diff_train |>
model(
ets = ETS(Cash_diff))
# ATM4
atm4_diff_ets_fit <- atm4_diff_train |>
model(
ets = ETS(Cash_diff))
# Generate forecasts
# ATM2
atm2_diff_stl_fc <- atm2_diff_stl_fit |> forecast(h = 1)
atm2_diff_ets_fc <- atm2_diff_ets_fit |> forecast(h = 1)
# ATM4
atm4_diff_stl_fc <- atm4_diff_stl_fit |> forecast(h = 1)
atm4_diff_ets_fc <- atm4_diff_ets_fit |> forecast(h = 1)
# Calculate accuracy metrics for differenced models
# ATM2
atm2_diff_stl_accuracy <- accuracy(atm2_diff_stl_fc, atm2_seas_diff) |>
mutate(.model = "ATM2_STL_SDIFF")
atm2_diff_ets_accuracy <- accuracy(atm2_diff_ets_fc, atm2_seas_diff) |>
mutate(.model = "ATM2_ETS_SDIFF")
# ATM4
atm4_diff_stl_accuracy <- accuracy(atm4_diff_stl_fc, atm4_seas_diff) |>
mutate(.model = "ATM4_STL_SDIFF")
atm4_diff_ets_accuracy <- accuracy(atm4_diff_ets_fc, atm4_seas_diff) |>
mutate(.model = "ATM4_ETS_SDIFF")
# Compare all models for ATM2
atm2_all_accuracy <- bind_rows(
atm2_accuracy_stl,
atm2_accuracy_ets,
atm2_accuracy_arima,
atm2_diff_stl_accuracy,
atm2_diff_ets_accuracy
)
# Compare all models for ATM4
atm4_all_accuracy <- bind_rows(
atm4_accuracy_stl,
atm4_accuracy_ets,
atm4_accuracy_arima,
atm4_diff_stl_accuracy,
atm4_diff_ets_accuracy
)
# Print results
print(atm2_all_accuracy)
## # A tibble: 5 × 10
## .model .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ATM2_STL Test 18.3 18.3 18.3 18.5 18.5 0.854 0.592 NA
## 2 ATM2_ETS Test 18.8 18.8 18.8 19.0 19.0 0.877 0.608 NA
## 3 ATM2_ARIMA Test 21.9 21.9 21.9 22.2 22.2 1.02 0.709 NA
## 4 ATM2_STL_SDIFF Test 0.602 0.602 0.602 2.86 2.86 0.0157 0.0112 NA
## 5 ATM2_ETS_SDIFF Test 21.3 21.3 21.3 102. 102. 0.558 0.398 NA
## # A tibble: 5 × 10
## .model .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ATM4_STL Test -96.4 96.4 96.4 -23.8 23.8 0.273 0.209 NA
## 2 ATM4_ETS Test 241. 241. 241. 59.6 59.6 0.684 0.524 NA
## 3 ATM4_ARIMA Test -73.2 73.2 73.2 -18.1 18.1 0.207 0.159 NA
## 4 ATM4_STL_SDIFF Test -40.8 40.8 40.8 10.9 10.9 0.0663 0.0513 NA
## 5 ATM4_ETS_SDIFF Test -374. 374. 374. 100. 100. 0.608 0.471 NA
Part B consists of a simple dataset of residential power usage for January 1998 until December 2013. Your assignment is to model these data and a monthly forecast for 2014. The data is given in a single file. The variable ‘KWH’ is power consumption in Kilowatt hours, the rest is straight forward. Add this to your existing files above.
For the power usage data, initial preprocessing involved converting the time series to proper format and handling missing values. The data was structured as a monthly time series from January 1998 to December 2013. Visual inspection showed clear seasonal patterns in power consumption with an overall increasing trend.
How were outliers & missing values handled?
For electrical usage data, the missing values were filled using the last values filled foward/down. This method is based on the assumption that electrical usage data is correlated with weather (temperature). Since weather extremes are not typical and likely not causing major swings in electrical usage, the last value KWH is likely the closest estimation of the next value.
# Read and process the data
elec_data <- read.csv("https://raw.githubusercontent.com/johnnydrodriguez/Data624/refs/heads/main/ResidentialCustomerForecastLoad-624.csv",
header = TRUE, sep = ',', na.strings="", fill = TRUE)
# Convert to tsibble with yearmonth index
elec_tsbl <- elec_data |>
mutate(
YYYY.MMM = yearmonth(parse_date_time(YYYY.MMM, "Y-b"))) |>
as_tsibble(index = YYYY.MMM)
# Create the plot
ggplot(elec_tsbl, aes(x = YYYY.MMM, y = KWH)) +
geom_line() +
scale_x_yearmonth(date_breaks = "6 months", date_labels = "%Y %b") +
labs(title = "Monthly Electricity Usage Over Time",
x = "Year-Month",
y = "KWH") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 90, hjust = 1))
## <interval[1]>
## [1] 1M
## # A tsibble: 6 x 3 [1M]
## CaseSequence YYYY.MMM KWH
## <int> <mth> <int>
## 1 733 1998 Jan 6862583
## 2 734 1998 Feb 5838198
## 3 735 1998 Mar 5420658
## 4 736 1998 Apr 5010364
## 5 737 1998 May 4665377
## 6 738 1998 Jun 6467147
STL decomposition with seasonal adjustment was applied to the power usage data, capturing both seasonal patterns and trend components. The model was validated using 2013 data and then used to forecast 2014 values.
For the 2013 STL validation forecast using the train/test split, the red line with blue shaded confidence intervals shows the model’s forecast roughly capture the seasonal pattern though there are some discrepancies with the actual test values. The 2014 future forecast showed reasonable continuation of both seasonal patterns and trend, with appropriate prediction intervals reflecting forecast uncertainty.
# Handle missing values and create tsibble
elec_tsbl_complete <- elec_tsbl |>
mutate(KWH = as.numeric(KWH)) |>
fill_gaps() |>
tidyr::fill(KWH, .direction = "down")
# Create training and test sets for all models
elec_train <- elec_tsbl_complete |>
filter_index(. ~ "2012-12")
elec_test <- elec_tsbl_complete |>
filter_index("2013-01" ~ .)
# Fit STL model with seasonal adjustment on training data
elec_fit_stl <- elec_train |>
model(
stl = decomposition_model(
STL(KWH ~ season(period = 12)),
NAIVE(season_adjust)))
# Generate validation forecasts for 2013
elec_fc_validation <- elec_fit_stl |>
forecast(h = 12, level = c(80, 95))
# Fit model on full data for future forecasts
elec_fit_full <- elec_tsbl_complete |>
model(
stl = decomposition_model(
STL(KWH ~ season(period = 12)),
NAIVE(season_adjust)))
# Generate future forecasts for 2014
elec_fc_future <- elec_fit_full |>
forecast(h = 12, level = c(80, 95))
# Plot Validation Forecast (2013)
elec_fc_validation |>
autoplot(elec_tsbl_complete, level = c(80, 95)) +
autolayer(elec_fc_validation, level = NULL, color = "red") +
geom_line(data = elec_train, aes(x = YYYY.MMM, y = KWH), color = "black") +
geom_line(data = elec_test, aes(x = YYYY.MMM, y = KWH), color = "blue") +
labs(title = "Electricity Usage - STL Validation Forecast (2013)",
subtitle = "Black = Training, Blue = Test, Red = Forecast",
y = "KWH",
x = "Year-Month") +
theme_minimal()
# Plot Future Forecast (2014)
elec_fc_future |>
autoplot(elec_tsbl_complete, level = c(80, 95)) +
autolayer(elec_fc_future, level = NULL, color = "red") +
labs(title = "Electricity Usage - STL Future Forecast (2014)",
y = "KWH",
x = "Year-Month") +
theme_minimal()
# Calculate and save accuracy metrics
stl_accuracy <- accuracy(elec_fc_validation, elec_test) |>
mutate(.model = "STL")
# Save the forecasts for 2014
forecasts_2014 <- elec_fc_future |>
hilo(level = c(80, 95)) |>
as_tibble() |>
select(YYYY.MMM, .mean, `80%`, `95%`)
print(forecasts_2014)
## # A tibble: 12 × 4
## YYYY.MMM .mean `80%` `95%`
## <mth> <dbl> <hilo> <hilo>
## 1 2014 Jan 11413446. [10250265, 12576626]80 [9634514, 13192377]95
## 2 2014 Feb 10381847. [ 8738500, 12025195]80 [7868564, 12895131]95
## 3 2014 Mar 8915296. [ 6903172, 10927420]80 [5838017, 11992574]95
## 4 2014 Apr 8121685. [ 5798555, 10444814]80 [4568765, 11674605]95
## 5 2014 May 7801538. [ 5204354, 10398722]80 [3829487, 11773589]95
## 6 2014 Jun 9517084. [ 6672104, 12362063]80 [5166063, 13868105]95
## 7 2014 Jul 9742089. [ 6669219, 12814959]80 [5042539, 14441639]95
## 8 2014 Aug 11230403. [ 7945404, 14515402]80 [6206431, 16254376]95
## 9 2014 Sep 10339023. [ 6854779, 13823267]80 [5010332, 15667715]95
## 10 2014 Oct 8019603. [ 4346901, 11692305]80 [2402690, 13636516]95
## 11 2014 Nov 7719316. [ 3867361, 11571271]80 [1828259, 13610373]95
## 12 2014 Dec 9606304 [ 5583070, 13629538]80 [3453299, 15759309]95
Fable selected an optimal multiplicative ETS(M,N,M) model fitted to the power usage data, automatically selecting the components for trend, seasonality, and error type.
For the 2013 validation forecast, the red forecast line with blue confidence intervals seems to underestimate the peaks in electricity usage during 2013, suggesting the ETS model may not be fully capturing the magnitude of seasonal variations in the data.The model showed good fit to historical patterns and provided reasonable forecasts for 2014.
The residuals shows relatively stable innovation residuals over time with one notable outlier suggesting generally consistent model performance. The ACF plot indicates minimal autocorrelation. The histogram of residuals appears approximately normally distributed and centered near zero with some slight right skewness. Overall, residuals are well-behaved showing no significant patterns.
# Fit ETS model on training data
elec_fit_ets <- elec_train |>
model(
ets = ETS(KWH))
# Print the selected model
report(elec_fit_ets)
## Series: KWH
## Model: ETS(M,N,M)
## Smoothing parameters:
## alpha = 0.1527175
## gamma = 0.00010014
##
## Initial states:
## l[0] s[0] s[-1] s[-2] s[-3] s[-4] s[-5] s[-6]
## 6139983 0.9384328 0.7502792 0.8755048 1.194094 1.266397 1.204814 0.9969779
## s[-7] s[-8] s[-9] s[-10] s[-11]
## 0.7699923 0.8075335 0.91558 1.073069 1.207325
##
## sigma^2: 0.0137
##
## AIC AICc BIC
## 5813.955 5816.882 5861.849
# Generate validation forecasts for 2013
elec_fc_validation <- elec_fit_ets |>
forecast(h = 12, level = c(80, 95))
# Fit model on full data for future forecasts
elec_fit_full_ets <- elec_tsbl_complete |>
model(
ets = ETS(KWH))
# Generate future forecasts for 2014
elec_fc_future <- elec_fit_full_ets |>
forecast(h = 12, level = c(80, 95))
# Plot Validation Forecast (2013)
autoplot(elec_tsbl_complete, KWH) +
autolayer(elec_fc_validation, level = c(80, 95)) +
autolayer(elec_fc_validation, level = NULL, color = "red") +
geom_line(data = elec_train, aes(y = KWH), color = "black") +
geom_line(data = elec_test, aes(y = KWH), color = "blue") +
labs(title = "Electricity Usage - ETS Validation Forecast (2013)",
subtitle = "Black = Training, Blue = Test, Red = Forecast",
y = "KWH",
x = "Year-Month") +
theme_minimal()
# Plot Future Forecast (2014)
autoplot(elec_tsbl_complete, KWH) +
autolayer(elec_fc_future, level = c(80, 95)) +
autolayer(elec_fc_future, level = NULL, color = "red") +
labs(title = "Electricity Usage - ETS Future Forecast (2014)",
y = "KWH",
x = "Year-Month") +
theme_minimal()
# Calculate and save accuracy metrics for ETS
ets_accuracy <- accuracy(elec_fc_validation, elec_test) |>
mutate(.model = "ETS")
# Save the forecasts for 2014
forecasts_2014 <- elec_fc_future |>
hilo(level = c(80, 95)) |>
as_tibble() |>
select(YYYY.MMM, .mean, `80%`, `95%`)
print(forecasts_2014)
## # A tibble: 12 × 4
## YYYY.MMM .mean `80%` `95%`
## <mth> <dbl> <hilo> <hilo>
## 1 2014 Jan 9301055. [7880379, 10721730]80 [7128319, 11473791]95
## 2 2014 Feb 8090307. [6845956, 9334657]80 [6187237, 9993376]95
## 3 2014 Mar 6913741. [5843048, 7984434]80 [5276257, 8551225]95
## 4 2014 Apr 6180135. [5216563, 7143708]80 [4706478, 7653792]95
## 5 2014 May 5860650. [4940776, 6780524]80 [4453824, 7267476]95
## 6 2014 Jun 7579650. [6382110, 8777190]80 [5748170, 9411130]95
## 7 2014 Jul 9083334. [7638866, 10527803]80 [6874210, 11292459]95
## 8 2014 Aug 9551776. [8023038, 11080515]80 [7213772, 11889780]95
## 9 2014 Sep 9084586. [7621379, 10547793]80 [6846805, 11322367]95
## 10 2014 Oct 6636296. [5560713, 7711878]80 [4991334, 8281257]95
## 11 2014 Nov 5699778. [4770254, 6629302]80 [4278194, 7121362]95
## 12 2014 Dec 7291321. [6094964, 8487679]80 [5461650, 9120992]95
The ARIMA model was applied to the power usage data, with automatic model selection determining the optimal orders. Fable automatically selected a Seasonal ARIMA ARIMA(1,0,1)(2,1,0)[12] w/ drift.
For the 2013 validation forecast, the ARIMA model’s red forecast line with blue confidence intervals appears to underestimate the actual usage peaks shown by the blue test line. The prediction intervals for ARIMA appear slightly wider than those of the ETS model, suggesting more uncertainty in its forecasts.
The future forecasts captured both trend and seasonal component patterns and appear visually reasonable.
The residuals are stable except for a large negative spike. The ACF plot indicates that there is no autocorrelation as none of lags exceed the critical bounds. The residuals appears roughly normally distributed.
# Fit ARIMA model on training data
elec_fit_arima <- elec_train |>
model(
arima = ARIMA(KWH))
# Print the selected model
report(elec_fit_arima)
## Series: KWH
## Model: ARIMA(0,0,1)(2,1,0)[12] w/ drift
##
## Coefficients:
## ma1 sar1 sar2 constant
## 0.1667 -0.8486 -0.6244 211502.33
## s.e. 0.0741 0.0678 0.0774 78431.95
##
## sigma^2 estimated as 6.827e+11: log likelihood=-2533.15
## AIC=5076.3 AICc=5076.67 BIC=5091.92
# Generate validation forecasts for 2013
elec_fc_validation <- elec_fit_arima |>
forecast(h = 12, level = c(80, 95))
# Fit model on full data for future forecasts
elec_fit_full_arima <- elec_tsbl_complete |>
model(
arima = ARIMA(KWH))
# Generate future forecasts for 2014
elec_fc_future <- elec_fit_full_arima |>
forecast(h = 12, level = c(80, 95))
# Plot Validation Forecast (2013)
autoplot(elec_tsbl_complete, KWH) +
autolayer(elec_fc_validation, level = c(80, 95)) +
autolayer(elec_fc_validation, level = NULL, color = "red") +
geom_line(data = elec_train, aes(y = KWH), color = "black") +
geom_line(data = elec_test, aes(y = KWH), color = "blue") +
labs(title = "Electricity Usage - ARIMA Validation Forecast (2013)",
subtitle = "Black = Training, Blue = Test, Red = Forecast",
y = "KWH",
x = "Year-Month") +
theme_minimal()
# Plot Future Forecast (2014)
autoplot(elec_tsbl_complete, KWH) +
autolayer(elec_fc_future, level = c(80, 95)) +
autolayer(elec_fc_future, level = NULL, color = "red") +
labs(title = "Electricity Usage - ARIMA Future Forecast (2014)",
y = "KWH",
x = "Year-Month") +
theme_minimal()
# Calculate and save accuracy metrics for ARIMA
arima_accuracy <- accuracy(elec_fc_validation, elec_test) |>
mutate(.model = "ARIMA")
# Save the forecasts for 2014 to a data frame
forecasts_2014 <- elec_fc_future |>
hilo(level = c(80, 95)) |>
as_tibble() |>
select(YYYY.MMM, .mean, `80%`, `95%`)
print(forecasts_2014)
## # A tibble: 12 × 4
## YYYY.MMM .mean `80%` `95%`
## <mth> <dbl> <hilo> <hilo>
## 1 2014 Jan 9679752. [8575834, 10783670]80 [7991455, 11368049]95
## 2 2014 Feb 8171758. [7038791, 9304725]80 [6439035, 9904481]95
## 3 2014 Mar 6740326. [5607359, 7873293]80 [5007603, 8473050]95
## 4 2014 Apr 5960961. [4827994, 7093928]80 [4228238, 7693685]95
## 5 2014 May 5728348. [4595382, 6861315]80 [3995625, 7461072]95
## 6 2014 Jun 7523647. [6390680, 8656614]80 [5790923, 9256370]95
## 7 2014 Jul 7918371. [6785404, 9051338]80 [6185648, 9651094]95
## 8 2014 Aug 9282293. [8149326, 10415260]80 [7549570, 11015016]95
## 9 2014 Sep 8287817. [7154850, 9420784]80 [6555094, 10020541]95
## 10 2014 Oct 6026133. [4893166, 7159100]80 [4293410, 7758856]95
## 11 2014 Nov 5766302. [4633335, 6899269]80 [4033579, 7499026]95
## 12 2014 Dec 7476683. [6343716, 8609649]80 [5743960, 9209406]95
# Residuals
elec_fit_full_arima |>
gg_tsresiduals() +
labs(title = "Residual Analysis for ARIMA Model")
In comparison the accuracy metrics for the STL, ETS and ARIMA modesl, the ETS model shows the best performance with the lowest RMSE and MAE, indicating it has the smallest average prediction errors among all three models. The STL model comes in second place in these metrics while the ARIMA model performs worst with the highest RMSE and MAE. Considering the MAPE percentage errors, ETS performs best with 8.13%, compared to STL’s 11.79% and ARIMA’s 12.41%.
The ETS model is the best choice for forecasting this electricity usage data.
# Combine the accuracy metrics into a single table
combined_accuracy <- bind_rows(
stl_accuracy,
ets_accuracy,
arima_accuracy) |>
arrange(RMSE)
# Print the comparison table
print(combined_accuracy)
## # A tibble: 3 × 10
## .model .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 ETS Test 411327. 1064622. 683843. 4.30 8.16 NaN NaN 0.0933
## 2 STL Test 960749. 1288258. 961373. 11.8 11.8 NaN NaN 0.000976
## 3 ARIMA Test 653023. 1542167. 1008787. 7.18 12.2 NaN NaN 0.0116