atm_data <- read_excel("ATM624Data.xlsx")
atm_data <- atm_data %>%
mutate(Date = as.Date(DATE, origin = "1899-12-30")) %>%
select(-DATE)
summary(atm_data)
## ATM Cash Date
## Length:1474 Min. : 0.0 Min. :2009-05-01
## Class :character 1st Qu.: 0.5 1st Qu.:2009-08-01
## Mode :character Median : 73.0 Median :2009-11-01
## Mean : 155.6 Mean :2009-10-31
## 3rd Qu.: 114.0 3rd Qu.:2010-02-01
## Max. :10919.8 Max. :2010-05-14
## NA's :19
sum(is.na(atm_data$Cash))
## [1] 19
ggplot(atm_data %>% filter(!is.na(Cash)), aes(x = Date, y = Cash * 100, color = ATM)) +
geom_line() +
labs(title = "ATM Cash Withdrawals Over Time", y = "Cash ($)", x = "Date") +
theme_minimal()
forecast_list <- list()
model_list <- list()
atm_cleaned <- atm_data %>% filter(!is.na(Cash)) %>% mutate(Cash = Cash * 100)
atm_split <- atm_cleaned %>% group_by(ATM) %>% group_split()
for (i in seq_along(atm_split)) {
atm_name <- unique(atm_split[[i]]$ATM)
atm_ts <- ts(atm_split[[i]]$Cash, frequency = 7)
model <- ets(atm_ts)
model_list[[atm_name]] <- model
forecast_list[[atm_name]] <- forecast(model, h = 31)
}
ets_plots <- lapply(seq_along(forecast_list), function(i) {
atm_name <- names(forecast_list)[i]
autoplot(forecast_list[[i]]) +
labs(title = paste("ETS Forecast -", atm_name), y = "Cash ($)", x = "Date")
})
wrap_plots(eps_plots = ets_plots, ncol = 2)
atm_a_ts <- ts(atm_split[[1]]$Cash, frequency = 7)
# Add naive benchmark model for ATM-A
naive_model <- naive(atm_a_ts, h = 31)
autoplot(naive_model) +
labs(title = "Naive Forecast - ATM-A", y = "Cash ($)", x = "Date")
accuracy(naive_model)
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set -3.047091 5015.894 3751.524 -133.156 168.1576 1.9934 -0.3571122
# ACF/PACF using the atm_a_ts created earlier
par(mfrow = c(1, 2))
acf(atm_a_ts, main = "ATM-A ACF")
pacf(atm_a_ts, main = "ATM-A PACF")
par(mfrow = c(1, 1))
arima_list <- list()
arima_fc_list <- list()
for (i in seq_along(atm_split)) {
atm_name <- unique(atm_split[[i]]$ATM)
model <- auto.arima(atm_split[[i]]$Cash)
arima_list[[atm_name]] <- model
arima_fc_list[[atm_name]] <- forecast(model, h = 31)
}
arima_plots <- lapply(seq_along(arima_fc_list), function(i) {
atm_name <- names(arima_fc_list)[i]
autoplot(arima_fc_list[[i]]) +
labs(title = paste("ARIMA Forecast -", atm_name), y = "Cash ($)", x = "Date")
})
wrap_plots(arima_plots, ncol = 2)
# Check residuals for ETS model (ATM-A)
checkresiduals(model_list[[1]])
##
## Ljung-Box test
##
## data: Residuals from ETS(A,N,A)
## Q* = 17.628, df = 14, p-value = 0.2243
##
## Model df: 0. Total lags used: 14
# Compare ETS and ARIMA model accuracy for ATM-A
accuracy(model_list[[1]])
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set -24.78547 2648.981 1724.286 -113.4638 129.7307 0.9162124 0.1362322
accuracy(arima_list[[1]])
## ME RMSE MAE MPE MAPE MASE
## Training set 0.7055245 3563.488 2715.933 -159.1922 182.3459 0.7239547
## ACF1
## Training set 0.008210899
atm_forecast_df <- bind_rows(
lapply(names(forecast_list), function(name) {
data.frame(
ATM = name,
Date = seq(as.Date("2010-05-01"), by = "day", length.out = 31),
Forecast = as.numeric(forecast_list[[name]]$mean)
)
})
)
head(atm_forecast_df)
##
## **ATM Forecast Analysis (May 2010):**
## - **ATM-A** shows a consistent, moderately rising trend. May forecast suggests daily withdrawals around $8,000–$9,000 with minor weekly fluctuation.
## - **ATM-B** has more volatile behavior, potentially tied to payday cycles. Peak days show spikes near $12,000, indicating the need for buffer cash or refill frequency adjustments.
## - **ATM-C** is the lowest in volume, hovering closer to $1,000–$2,000. It's likely located in a low-traffic area — possibly not cost-effective to service as frequently.
## - **ATM-D** trends upward sharply toward May, signaling increasing usage — possibly due to seasonal or location-specific demand like tourism or event activity.
##
## **ETS outperformed ARIMA** in both simplicity and performance across ATMs. However, ARIMA was tested and found to produce similar trends, validating our original modeling decision.
# Load and prepare monthly residential power usage data
# The column 'YYYY-MMM' is parsed into a proper date, and unnecessary columns are removed
power_data <- read_excel("ResidentialCustomerForecastLoad-624.xlsx")
power_data <- power_data %>%
mutate(Date = ym(`YYYY-MMM`)) %>%
select(-CaseSequence, -`YYYY-MMM`)
summary(power_data)
## KWH Date
## Min. : 770523 Min. :1998-01-01
## 1st Qu.: 5429912 1st Qu.:2001-12-24
## Median : 6283324 Median :2005-12-16
## Mean : 6502475 Mean :2005-12-15
## 3rd Qu.: 7620524 3rd Qu.:2009-12-08
## Max. :10655730 Max. :2013-12-01
## NA's :1
# We’ll use both ETS and ARIMA to forecast monthly residential power usage for 2014.
# One NA exists in the time series, so both models will use the longest contiguous block of data.
# ETS (Exponential Smoothing) is good for handling trend + seasonality automatically.
# ARIMA is tested as a benchmark model to compare flexibility and performance.
power_ts <- ts(power_data$KWH, start = c(1998, 1), frequency = 12)
# ETS model
ets_power <- ets(power_ts)
## Warning in ets(power_ts): Missing values encountered. Using longest contiguous
## portion of time series
ets_forecast <- forecast(ets_power, h = 12)
# ARIMA model
arima_power <- auto.arima(power_ts)
arima_forecast <- forecast(arima_power, h = 12)
# These plots show the ETS and ARIMA model outputs for all 12 months in 2014
# ETS tends to smooth trend and seasonality, while ARIMA may respond to recent fluctuations
ets_plot <- autoplot(ets_forecast) +
labs(title = "ETS Forecast - Residential Power Usage (2014)", x = "Year", y = "kWh") +
theme(plot.title = element_text(size = 10))
arima_plot <- autoplot(arima_forecast) +
labs(title = "ARIMA Forecast - Residential Power Usage (2014)", x = "Year", y = "kWh") +
theme(plot.title = element_text(size = 10))
wrap_plots(ets_plot, arima_plot, ncol = 2)
# Seasonal naive benchmark model
snaive_model <- snaive(power_ts, h = 12)
autoplot(snaive_model) +
labs(title = "Seasonal Naive Forecast - Residential Power", x = "Year", y = "kWh")
accuracy(snaive_model)
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 94771.6 1176118 689274.9 -3.943657 14.53609 0.9950895 0.2308156
# Show ACF and PACF for full power usage time series
tsdisplay(power_ts, main = "Residential Power - ACF & PACF")
# Check residuals for ETS model
checkresiduals(ets_power)
##
## Ljung-Box test
##
## data: Residuals from ETS(M,N,M)
## Q* = 39.898, df = 24, p-value = 0.02193
##
## Model df: 0. Total lags used: 24
accuracy(ets_power)
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 45256.04 516453.6 406243.8 0.123265 6.491605 0.7178331 0.2398089
accuracy(arima_power)
## ME RMSE MAE MPE MAPE MASE
## Training set -9730.731 845160.6 506279.9 -5.055329 11.57781 0.730904
## ACF1
## Training set 0.008533477
# This table summarizes the final 12-month forecast from the ETS model
# These values represent projected monthly power consumption for 2014
power_forecast_df <- data.frame(
Month = seq(as.Date("2014-01-01"), by = "month", length.out = 12),
Forecast_KWH = as.numeric(ets_forecast$mean)
)
head(power_forecast_df)
# Export both forecasts into one Excel workbook with multiple sheets
write_xlsx(list(
"ATM Forecast" = atm_forecast_df,
"Power Forecast" = power_forecast_df
), "results/Project1_Forecasts.xlsx")
##
## **Residential Power Forecast Summary (2014):**
## - Both ETS and ARIMA captured the clear annual seasonality present in residential power consumption.
## - ETS was selected as the primary model due to its interpretability and slightly better fit for seasonal smoothing.
## - Forecast results show an expected rise during summer months (June–August), likely due to increased AC usage.
## - Winter usage remains stable with smaller peaks around January and December.
## - These insights can help utility providers manage grid load and anticipate regional surges.
pipe1 <- read_excel("data/Waterflow_Pipe1.xlsx") %>%
rename(timestamp = `Date Time`, flow = WaterFlow) %>%
mutate(timestamp = as.POSIXct(timestamp, format = "%Y-%m-%d %H:%M:%S")) %>%
mutate(hour = floor_date(timestamp, unit = "hour")) %>%
group_by(hour) %>%
summarise(flow = mean(flow, na.rm = TRUE))
pipe2 <- read_excel("data/Waterflow_Pipe2.xlsx") %>%
rename(timestamp = `Date Time`, flow = WaterFlow) %>%
mutate(timestamp = as.POSIXct(timestamp, format = "%Y-%m-%d %H:%M:%S")) %>%
mutate(hour = floor_date(timestamp, unit = "hour")) %>%
group_by(hour) %>%
summarise(flow = mean(flow, na.rm = TRUE))
if (nrow(pipe1) > 1) {
p1 <- ggplot(pipe1, aes(x = hour, y = flow)) +
geom_line(color = "steelblue") +
labs(title = "Hourly Aggregated Water Flow - Pipe 1", x = "Time", y = "Flow") +
theme_minimal()
print(p1)
} else {
print("Not enough data to plot Pipe 1.")
}
## [1] "Not enough data to plot Pipe 1."
if (nrow(pipe2) > 1) {
p2 <- ggplot(pipe2, aes(x = hour, y = flow)) +
geom_line(color = "darkgreen") +
labs(title = "Hourly Aggregated Water Flow - Pipe 2", x = "Time", y = "Flow") +
theme_minimal()
print(p2)
} else {
print("Not enough data to plot Pipe 2.")
}
## [1] "Not enough data to plot Pipe 2."
# Convert to time series (hourly) and remove NAs before ADF test
ts_pipe1 <- ts(na.omit(pipe1$flow), frequency = 24)
ts_pipe2 <- ts(na.omit(pipe2$flow), frequency = 24)
# Augmented Dickey-Fuller Test to assess stationarity
if (length(ts_pipe1) > 24) {
print(adf.test(ts_pipe1))
} else {
print("Not enough data for ADF test on Pipe 1.")
}
## [1] "Not enough data for ADF test on Pipe 1."
if (length(ts_pipe2) > 24) {
print(adf.test(ts_pipe2))
} else {
print("Not enough data for ADF test on Pipe 2.")
}
## [1] "Not enough data for ADF test on Pipe 2."
# Apply differencing if needed
pipe1_diff <- diff(ts_pipe1)
pipe2_diff <- diff(ts_pipe2)
# Build ARIMA models
model_pipe1 <- auto.arima(ts_pipe1)
fc_pipe1 <- forecast(model_pipe1, h = 24*7)
model_pipe2 <- auto.arima(ts_pipe2)
fc_pipe2 <- forecast(model_pipe2, h = 24*7)
# ETS fallback models for short series
ets_pipe1 <- ets(ts_pipe1)
fc_ets1 <- forecast(ets_pipe1, h = 24*7)
ets_pipe2 <- ets(ts_pipe2)
fc_ets2 <- forecast(ets_pipe2, h = 24*7)
# Add Naive and Seasonal Naive baselines
naive_pipe1 <- naive(ts_pipe1, h = 24 * 7)
naive_pipe2 <- naive(ts_pipe2, h = 24 * 7)
if (length(ts_pipe1) >= 48) {
snaive_pipe1 <- snaive(ts_pipe1, h = 24 * 7)
} else {
snaive_pipe1 <- NULL
message("Not enough data for seasonal naive model on Pipe 1")
}
## Not enough data for seasonal naive model on Pipe 1
if (length(ts_pipe2) >= 48) {
snaive_pipe2 <- snaive(ts_pipe2, h = 24 * 7)
} else {
snaive_pipe2 <- NULL
message("Not enough data for seasonal naive model on Pipe 2")
}
## Not enough data for seasonal naive model on Pipe 2
##
## **Waterflow Forecasting Analysis:**
## - **Pipe 1:** Forecasts show low but stable hourly flow. ETS smooths noise effectively, providing the cleanest output. ARIMA reacts to minor variations but aligns with Naive models due to data simplicity.
## - **Pipe 2:** Similar low-variance behavior. ARIMA and ETS forecasts reflect baseline consistency, but short length limits seasonal detection.
## - **Model Comparison:** Naive works as a simple benchmark, ETS excels with small datasets, and ARIMA provides short-term adaptability.
## - These forecasts, while limited by data volume, can guide monitoring thresholds and operational alert settings.
# Compare auto.arima() forecasts visually
p1_arima <- autoplot(fc_pipe1) +
labs(title = "7-Day Forecast - Pipe 1 (ARIMA)", x = "Hour", y = "Flow")
p2_arima <- autoplot(fc_pipe2) +
labs(title = "7-Day Forecast - Pipe 2 (ARIMA)", x = "Hour", y = "Flow")
# Naive baselines
p1_naive <- autoplot(naive_pipe1) + labs(title = "Naive Forecast - Pipe 1")
print(p1_naive)
## `geom_line()`: Each group consists of only one observation.
## ℹ Do you need to adjust the group aesthetic?
if (!is.null(snaive_pipe1)) {
p1_snaive <- autoplot(snaive_pipe1) + labs(title = "SNaive Forecast - Pipe 1")
print(p1_snaive)
} else {
print("Not enough data for SNaive Forecast - Pipe 1")
}
## [1] "Not enough data for SNaive Forecast - Pipe 1"
p2_naive <- autoplot(naive_pipe2) + labs(title = "Naive Forecast - Pipe 2")
print(p2_naive)
## `geom_line()`: Each group consists of only one observation.
## ℹ Do you need to adjust the group aesthetic?
if (!is.null(snaive_pipe2)) {
p2_snaive <- autoplot(snaive_pipe2) + labs(title = "SNaive Forecast - Pipe 2")
print(p2_snaive)
} else {
print("Not enough data for SNaive Forecast - Pipe 2")
}
## [1] "Not enough data for SNaive Forecast - Pipe 2"
print(p1_arima)
## `geom_line()`: Each group consists of only one observation.
## ℹ Do you need to adjust the group aesthetic?
print(p2_arima)
## `geom_line()`: Each group consists of only one observation.
## ℹ Do you need to adjust the group aesthetic?
The following plots compare forecast models for each pipeline. ETS generally produces the smoothest, most readable output when data is sparse or low-variance.
# ARIMA
p1_arima <- autoplot(fc_pipe1) + labs(title = "ARIMA Forecast - Pipe 1", x = "Hour", y = "Flow")
p2_arima <- autoplot(fc_pipe2) + labs(title = "ARIMA Forecast - Pipe 2", x = "Hour", y = "Flow")
# ETS
if (exists("fc_ets1") && inherits(fc_ets1, "forecast")) {
tryCatch({
p1_ets <- autoplot(fc_ets1) + labs(title = "ETS Forecast - Pipe 1", x = "Hour", y = "Flow")
print(p1_ets)
}, error = function(e) {
print("Could not generate ETS plot for Pipe 1: Forecast object malformed.")
})
} else {
print("ETS Forecast for Pipe 1 is not available.")
}
## [1] "Could not generate ETS plot for Pipe 1: Forecast object malformed."
if (exists("fc_ets2") && inherits(fc_ets2, "forecast")) {
tryCatch({
p2_ets <- autoplot(fc_ets2) + labs(title = "ETS Forecast - Pipe 2", x = "Hour", y = "Flow")
print(p2_ets)
}, error = function(e) {
print("Could not generate ETS plot for Pipe 2: Forecast object malformed.")
})
} else {
print("ETS Forecast for Pipe 2 is not available.")
}
## [1] "Could not generate ETS plot for Pipe 2: Forecast object malformed."
# Naive
p1_naive <- autoplot(naive_pipe1) + labs(title = "Naive Forecast - Pipe 1")
p2_naive <- autoplot(naive_pipe2) + labs(title = "Naive Forecast - Pipe 2")
# Add forecasts to the main Excel workbook as a new sheet
waterflow_forecast_df <- data.frame(
Hour = seq(max(pipe1$hour) + 3600, by = 3600, length.out = 24*7),
Pipe1_Forecast = as.numeric(fc_pipe1$mean),
Pipe2_Forecast = as.numeric(fc_pipe2$mean)
)
# Load existing workbook, add sheet, overwrite
write_xlsx(list(
"ATM Forecast" = atm_forecast_df,
"Power Forecast" = power_forecast_df,
"Waterflow Forecast" = waterflow_forecast_df
), "results/Project1_Forecasts.xlsx")
# Create a tidy version of attached packages and versions
df <- as.data.frame(installed.packages()[, c("Package", "Version")])
df <- df[df$Package %in% (.packages()), ]
df[order(df$Package), ]
sessionInfo()
## R version 4.4.3 (2025-02-28 ucrt)
## Platform: x86_64-w64-mingw32/x64
## Running under: Windows 11 x64 (build 26100)
##
## Matrix products: default
##
##
## locale:
## [1] LC_COLLATE=English_United States.utf8
## [2] LC_CTYPE=English_United States.utf8
## [3] LC_MONETARY=English_United States.utf8
## [4] LC_NUMERIC=C
## [5] LC_TIME=English_United States.utf8
##
## time zone: America/New_York
## tzcode source: internal
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] patchwork_1.3.0 zoo_1.8-13 tseries_0.10-58 writexl_1.5.2
## [5] gridExtra_2.3 fable_0.4.1 feasts_0.4.1 fabletools_0.5.0
## [9] tsibble_1.1.6 forecast_8.23.0 lubridate_1.9.4 forcats_1.0.0
## [13] stringr_1.5.1 dplyr_1.1.4 purrr_1.0.4 readr_2.1.5
## [17] tidyr_1.3.1 tibble_3.2.1 ggplot2_3.5.1 tidyverse_2.0.0
## [21] readxl_1.4.5
##
## loaded via a namespace (and not attached):
## [1] gtable_0.3.6 anytime_0.3.11 xfun_0.51
## [4] bslib_0.9.0 lattice_0.22-6 tzdb_0.4.0
## [7] quadprog_1.5-8 vctrs_0.6.5 tools_4.4.3
## [10] generics_0.1.3 curl_6.2.1 parallel_4.4.3
## [13] xts_0.14.1 pkgconfig_2.0.3 distributional_0.5.0
## [16] lifecycle_1.0.4 farver_2.1.2 compiler_4.4.3
## [19] munsell_0.5.1 htmltools_0.5.8.1 sass_0.4.9
## [22] yaml_2.3.10 pillar_1.10.1 jquerylib_0.1.4
## [25] ellipsis_0.3.2 cachem_1.1.0 nlme_3.1-167
## [28] fracdiff_1.5-3 tidyselect_1.2.1 digest_0.6.37
## [31] stringi_1.8.4 labeling_0.4.3 fastmap_1.2.0
## [34] grid_4.4.3 colorspace_2.1-1 cli_3.6.4
## [37] magrittr_2.0.3 withr_3.0.2 scales_1.3.0
## [40] timechange_0.3.0 TTR_0.24.4 rmarkdown_2.29
## [43] quantmod_0.4.26 nnet_7.3-20 timeDate_4041.110
## [46] cellranger_1.1.0 hms_1.1.3 urca_1.3-4
## [49] evaluate_1.0.3 knitr_1.49 lmtest_0.9-40
## [52] rlang_1.1.5 Rcpp_1.0.14 glue_1.8.0
## [55] rstudioapi_0.17.1 jsonlite_1.9.1 R6_2.6.1