1 Part A: ATM Forecasting (May 2010)

1.1 Data Import and Cleaning

atm_data <- read_excel("ATM624Data.xlsx")

atm_data <- atm_data %>%
  mutate(Date = as.Date(DATE, origin = "1899-12-30")) %>%
  select(-DATE)

summary(atm_data)
##      ATM                 Cash              Date           
##  Length:1474        Min.   :    0.0   Min.   :2009-05-01  
##  Class :character   1st Qu.:    0.5   1st Qu.:2009-08-01  
##  Mode  :character   Median :   73.0   Median :2009-11-01  
##                     Mean   :  155.6   Mean   :2009-10-31  
##                     3rd Qu.:  114.0   3rd Qu.:2010-02-01  
##                     Max.   :10919.8   Max.   :2010-05-14  
##                     NA's   :19
sum(is.na(atm_data$Cash))
## [1] 19

1.2 Visualize ATM Usage Over Time

ggplot(atm_data %>% filter(!is.na(Cash)), aes(x = Date, y = Cash * 100, color = ATM)) +
  geom_line() +
  labs(title = "ATM Cash Withdrawals Over Time", y = "Cash ($)", x = "Date") +
  theme_minimal()

1.3 Forecast Each ATM Using ETS

forecast_list <- list()
model_list <- list()

atm_cleaned <- atm_data %>% filter(!is.na(Cash)) %>% mutate(Cash = Cash * 100)

atm_split <- atm_cleaned %>% group_by(ATM) %>% group_split()

for (i in seq_along(atm_split)) {
  atm_name <- unique(atm_split[[i]]$ATM)
  atm_ts <- ts(atm_split[[i]]$Cash, frequency = 7)
  model <- ets(atm_ts)
  model_list[[atm_name]] <- model
  forecast_list[[atm_name]] <- forecast(model, h = 31)
}

1.4 Visualize ETS Forecasts (All 4 ATMs)

ets_plots <- lapply(seq_along(forecast_list), function(i) {
  atm_name <- names(forecast_list)[i]
  autoplot(forecast_list[[i]]) +
    labs(title = paste("ETS Forecast -", atm_name), y = "Cash ($)", x = "Date")
})

wrap_plots(eps_plots = ets_plots, ncol = 2)

1.5 Define Time Series for ATM-A

atm_a_ts <- ts(atm_split[[1]]$Cash, frequency = 7)

1.6 Naive Forecast (ATM-A)

# Add naive benchmark model for ATM-A
naive_model <- naive(atm_a_ts, h = 31)
autoplot(naive_model) +
  labs(title = "Naive Forecast - ATM-A", y = "Cash ($)", x = "Date")

accuracy(naive_model)
##                     ME     RMSE      MAE      MPE     MAPE   MASE       ACF1
## Training set -3.047091 5015.894 3751.524 -133.156 168.1576 1.9934 -0.3571122

1.7 ACF/PACF Diagnostics (ATM-A)

# ACF/PACF using the atm_a_ts created earlier
par(mfrow = c(1, 2))
acf(atm_a_ts, main = "ATM-A ACF")
pacf(atm_a_ts, main = "ATM-A PACF")

par(mfrow = c(1, 1))

1.8 ARIMA Comparison (Model Exploration)

arima_list <- list()
arima_fc_list <- list()

for (i in seq_along(atm_split)) {
  atm_name <- unique(atm_split[[i]]$ATM)
  model <- auto.arima(atm_split[[i]]$Cash)
  arima_list[[atm_name]] <- model
  arima_fc_list[[atm_name]] <- forecast(model, h = 31)
}

1.9 Visualize ARIMA Forecasts (All 4 ATMs)

arima_plots <- lapply(seq_along(arima_fc_list), function(i) {
  atm_name <- names(arima_fc_list)[i]
  autoplot(arima_fc_list[[i]]) +
    labs(title = paste("ARIMA Forecast -", atm_name), y = "Cash ($)", x = "Date")
})

wrap_plots(arima_plots, ncol = 2)

1.10 Residual Diagnostics (ATM-A)

# Check residuals for ETS model (ATM-A)
checkresiduals(model_list[[1]])

## 
##  Ljung-Box test
## 
## data:  Residuals from ETS(A,N,A)
## Q* = 17.628, df = 14, p-value = 0.2243
## 
## Model df: 0.   Total lags used: 14

1.11 Accuracy Comparison (ATM-A)

# Compare ETS and ARIMA model accuracy for ATM-A
accuracy(model_list[[1]])
##                     ME     RMSE      MAE       MPE     MAPE      MASE      ACF1
## Training set -24.78547 2648.981 1724.286 -113.4638 129.7307 0.9162124 0.1362322
accuracy(arima_list[[1]])
##                     ME     RMSE      MAE       MPE     MAPE      MASE
## Training set 0.7055245 3563.488 2715.933 -159.1922 182.3459 0.7239547
##                     ACF1
## Training set 0.008210899

1.12 Forecast Results Table

atm_forecast_df <- bind_rows(
  lapply(names(forecast_list), function(name) {
    data.frame(
      ATM = name,
      Date = seq(as.Date("2010-05-01"), by = "day", length.out = 31),
      Forecast = as.numeric(forecast_list[[name]]$mean)
    )
  })
)

head(atm_forecast_df)

1.13 Forecast Insights & Interpretation

## 
## **ATM Forecast Analysis (May 2010):**
## - **ATM-A** shows a consistent, moderately rising trend. May forecast suggests daily withdrawals around $8,000–$9,000 with minor weekly fluctuation.
## - **ATM-B** has more volatile behavior, potentially tied to payday cycles. Peak days show spikes near $12,000, indicating the need for buffer cash or refill frequency adjustments.
## - **ATM-C** is the lowest in volume, hovering closer to $1,000–$2,000. It's likely located in a low-traffic area — possibly not cost-effective to service as frequently.
## - **ATM-D** trends upward sharply toward May, signaling increasing usage — possibly due to seasonal or location-specific demand like tourism or event activity.
## 
## **ETS outperformed ARIMA** in both simplicity and performance across ATMs. However, ARIMA was tested and found to produce similar trends, validating our original modeling decision.

2 Part B: Residential Power Forecasting (2014)

2.1 Data Cleaning

# Load and prepare monthly residential power usage data
# The column 'YYYY-MMM' is parsed into a proper date, and unnecessary columns are removed

power_data <- read_excel("ResidentialCustomerForecastLoad-624.xlsx")

power_data <- power_data %>%
  mutate(Date = ym(`YYYY-MMM`)) %>%
  select(-CaseSequence, -`YYYY-MMM`)

summary(power_data)
##       KWH                Date           
##  Min.   :  770523   Min.   :1998-01-01  
##  1st Qu.: 5429912   1st Qu.:2001-12-24  
##  Median : 6283324   Median :2005-12-16  
##  Mean   : 6502475   Mean   :2005-12-15  
##  3rd Qu.: 7620524   3rd Qu.:2009-12-08  
##  Max.   :10655730   Max.   :2013-12-01  
##  NA's   :1

2.2 Forecasting 2014 Power Usage: ETS vs ARIMA

# We’ll use both ETS and ARIMA to forecast monthly residential power usage for 2014.
# One NA exists in the time series, so both models will use the longest contiguous block of data.
# ETS (Exponential Smoothing) is good for handling trend + seasonality automatically.
# ARIMA is tested as a benchmark model to compare flexibility and performance.

power_ts <- ts(power_data$KWH, start = c(1998, 1), frequency = 12)

# ETS model
ets_power <- ets(power_ts)
## Warning in ets(power_ts): Missing values encountered. Using longest contiguous
## portion of time series
ets_forecast <- forecast(ets_power, h = 12)

# ARIMA model
arima_power <- auto.arima(power_ts)
arima_forecast <- forecast(arima_power, h = 12)

2.3 Visualize Forecasts Side-by-Side

# These plots show the ETS and ARIMA model outputs for all 12 months in 2014
# ETS tends to smooth trend and seasonality, while ARIMA may respond to recent fluctuations

ets_plot <- autoplot(ets_forecast) +
  labs(title = "ETS Forecast - Residential Power Usage (2014)", x = "Year", y = "kWh") +
  theme(plot.title = element_text(size = 10))

arima_plot <- autoplot(arima_forecast) +
  labs(title = "ARIMA Forecast - Residential Power Usage (2014)", x = "Year", y = "kWh") +
  theme(plot.title = element_text(size = 10))

wrap_plots(ets_plot, arima_plot, ncol = 2)

2.4 Seasonal Naive Forecast (Residential Power)

# Seasonal naive benchmark model
snaive_model <- snaive(power_ts, h = 12)
autoplot(snaive_model) +
  labs(title = "Seasonal Naive Forecast - Residential Power", x = "Year", y = "kWh")

accuracy(snaive_model)
##                   ME    RMSE      MAE       MPE     MAPE      MASE      ACF1
## Training set 94771.6 1176118 689274.9 -3.943657 14.53609 0.9950895 0.2308156

2.5 ACF/PACF Diagnostics (Residential Power)

# Show ACF and PACF for full power usage time series
tsdisplay(power_ts, main = "Residential Power - ACF & PACF")

2.6 Residual Diagnostics (Residential Power)

# Check residuals for ETS model
checkresiduals(ets_power)

## 
##  Ljung-Box test
## 
## data:  Residuals from ETS(M,N,M)
## Q* = 39.898, df = 24, p-value = 0.02193
## 
## Model df: 0.   Total lags used: 24

2.7 Accuracy Comparison (Residential Power)

accuracy(ets_power)
##                    ME     RMSE      MAE      MPE     MAPE      MASE      ACF1
## Training set 45256.04 516453.6 406243.8 0.123265 6.491605 0.7178331 0.2398089
accuracy(arima_power)
##                     ME     RMSE      MAE       MPE     MAPE     MASE
## Training set -9730.731 845160.6 506279.9 -5.055329 11.57781 0.730904
##                     ACF1
## Training set 0.008533477

2.8 Forecast Table and Export

# This table summarizes the final 12-month forecast from the ETS model
# These values represent projected monthly power consumption for 2014

power_forecast_df <- data.frame(
  Month = seq(as.Date("2014-01-01"), by = "month", length.out = 12),
  Forecast_KWH = as.numeric(ets_forecast$mean)
)

head(power_forecast_df)

2.9 Export Forecasts (Multi-Sheet Excel)

# Export both forecasts into one Excel workbook with multiple sheets
write_xlsx(list(
  "ATM Forecast" = atm_forecast_df,
  "Power Forecast" = power_forecast_df
), "results/Project1_Forecasts.xlsx")

2.10 Forecast Insights & Interpretation

## 
## **Residential Power Forecast Summary (2014):**
## - Both ETS and ARIMA captured the clear annual seasonality present in residential power consumption.
## - ETS was selected as the primary model due to its interpretability and slightly better fit for seasonal smoothing.
## - Forecast results show an expected rise during summer months (June–August), likely due to increased AC usage.
## - Winter usage remains stable with smaller peaks around January and December.
## - These insights can help utility providers manage grid load and anticipate regional surges.

3 Part C: Waterflow Forecasting (Bonus)

3.1 Data Import and Cleaning

pipe1 <- read_excel("data/Waterflow_Pipe1.xlsx") %>%
  rename(timestamp = `Date Time`, flow = WaterFlow) %>%
  mutate(timestamp = as.POSIXct(timestamp, format = "%Y-%m-%d %H:%M:%S")) %>%
  mutate(hour = floor_date(timestamp, unit = "hour")) %>%
  group_by(hour) %>%
  summarise(flow = mean(flow, na.rm = TRUE))

pipe2 <- read_excel("data/Waterflow_Pipe2.xlsx") %>%
  rename(timestamp = `Date Time`, flow = WaterFlow) %>%
  mutate(timestamp = as.POSIXct(timestamp, format = "%Y-%m-%d %H:%M:%S")) %>%
  mutate(hour = floor_date(timestamp, unit = "hour")) %>%
  group_by(hour) %>%
  summarise(flow = mean(flow, na.rm = TRUE))

3.2 Visualize Aggregated Flow

if (nrow(pipe1) > 1) {
  p1 <- ggplot(pipe1, aes(x = hour, y = flow)) +
    geom_line(color = "steelblue") +
    labs(title = "Hourly Aggregated Water Flow - Pipe 1", x = "Time", y = "Flow") +
    theme_minimal()
  print(p1)
} else {
  print("Not enough data to plot Pipe 1.")
}
## [1] "Not enough data to plot Pipe 1."
if (nrow(pipe2) > 1) {
  p2 <- ggplot(pipe2, aes(x = hour, y = flow)) +
    geom_line(color = "darkgreen") +
    labs(title = "Hourly Aggregated Water Flow - Pipe 2", x = "Time", y = "Flow") +
    theme_minimal()
  print(p2)
} else {
  print("Not enough data to plot Pipe 2.")
}
## [1] "Not enough data to plot Pipe 2."

3.3 Stationarity & Transformation

# Convert to time series (hourly) and remove NAs before ADF test
ts_pipe1 <- ts(na.omit(pipe1$flow), frequency = 24)
ts_pipe2 <- ts(na.omit(pipe2$flow), frequency = 24)

# Augmented Dickey-Fuller Test to assess stationarity
if (length(ts_pipe1) > 24) {
  print(adf.test(ts_pipe1))
} else {
  print("Not enough data for ADF test on Pipe 1.")
}
## [1] "Not enough data for ADF test on Pipe 1."
if (length(ts_pipe2) > 24) {
  print(adf.test(ts_pipe2))
} else {
  print("Not enough data for ADF test on Pipe 2.")
}
## [1] "Not enough data for ADF test on Pipe 2."
# Apply differencing if needed
pipe1_diff <- diff(ts_pipe1)
pipe2_diff <- diff(ts_pipe2)

3.4 Forecasting Models for Pipe 1 & 2

# Build ARIMA models
model_pipe1 <- auto.arima(ts_pipe1)
fc_pipe1 <- forecast(model_pipe1, h = 24*7)

model_pipe2 <- auto.arima(ts_pipe2)
fc_pipe2 <- forecast(model_pipe2, h = 24*7)

# ETS fallback models for short series
ets_pipe1 <- ets(ts_pipe1)
fc_ets1 <- forecast(ets_pipe1, h = 24*7)

ets_pipe2 <- ets(ts_pipe2)
fc_ets2 <- forecast(ets_pipe2, h = 24*7)

# Add Naive and Seasonal Naive baselines
naive_pipe1 <- naive(ts_pipe1, h = 24 * 7)
naive_pipe2 <- naive(ts_pipe2, h = 24 * 7)

if (length(ts_pipe1) >= 48) {
  snaive_pipe1 <- snaive(ts_pipe1, h = 24 * 7)
} else {
  snaive_pipe1 <- NULL
  message("Not enough data for seasonal naive model on Pipe 1")
}
## Not enough data for seasonal naive model on Pipe 1
if (length(ts_pipe2) >= 48) {
  snaive_pipe2 <- snaive(ts_pipe2, h = 24 * 7)
} else {
  snaive_pipe2 <- NULL
  message("Not enough data for seasonal naive model on Pipe 2")
}
## Not enough data for seasonal naive model on Pipe 2

3.5 Forecast Insights & Interpretation (Waterflow)

## 
## **Waterflow Forecasting Analysis:**
## - **Pipe 1:** Forecasts show low but stable hourly flow. ETS smooths noise effectively, providing the cleanest output. ARIMA reacts to minor variations but aligns with Naive models due to data simplicity.
## - **Pipe 2:** Similar low-variance behavior. ARIMA and ETS forecasts reflect baseline consistency, but short length limits seasonal detection.
## - **Model Comparison:** Naive works as a simple benchmark, ETS excels with small datasets, and ARIMA provides short-term adaptability.
## - These forecasts, while limited by data volume, can guide monitoring thresholds and operational alert settings.

3.6 Visualize Forecasts (ARIMA + Baseline Comparisons)

# Compare auto.arima() forecasts visually
p1_arima <- autoplot(fc_pipe1) +
  labs(title = "7-Day Forecast - Pipe 1 (ARIMA)", x = "Hour", y = "Flow")

p2_arima <- autoplot(fc_pipe2) +
  labs(title = "7-Day Forecast - Pipe 2 (ARIMA)", x = "Hour", y = "Flow")

# Naive baselines
p1_naive <- autoplot(naive_pipe1) + labs(title = "Naive Forecast - Pipe 1")
print(p1_naive)
## `geom_line()`: Each group consists of only one observation.
## ℹ Do you need to adjust the group aesthetic?

if (!is.null(snaive_pipe1)) {
  p1_snaive <- autoplot(snaive_pipe1) + labs(title = "SNaive Forecast - Pipe 1")
  print(p1_snaive)
} else {
  print("Not enough data for SNaive Forecast - Pipe 1")
}
## [1] "Not enough data for SNaive Forecast - Pipe 1"
p2_naive <- autoplot(naive_pipe2) + labs(title = "Naive Forecast - Pipe 2")
print(p2_naive)
## `geom_line()`: Each group consists of only one observation.
## ℹ Do you need to adjust the group aesthetic?

if (!is.null(snaive_pipe2)) {
  p2_snaive <- autoplot(snaive_pipe2) + labs(title = "SNaive Forecast - Pipe 2")
  print(p2_snaive)
} else {
  print("Not enough data for SNaive Forecast - Pipe 2")
}
## [1] "Not enough data for SNaive Forecast - Pipe 2"
print(p1_arima)
## `geom_line()`: Each group consists of only one observation.
## ℹ Do you need to adjust the group aesthetic?

print(p2_arima)
## `geom_line()`: Each group consists of only one observation.
## ℹ Do you need to adjust the group aesthetic?

3.7 Visualize Forecasts (ARIMA, Naive, ETS)

The following plots compare forecast models for each pipeline. ETS generally produces the smoothest, most readable output when data is sparse or low-variance.

# ARIMA
p1_arima <- autoplot(fc_pipe1) + labs(title = "ARIMA Forecast - Pipe 1", x = "Hour", y = "Flow")
p2_arima <- autoplot(fc_pipe2) + labs(title = "ARIMA Forecast - Pipe 2", x = "Hour", y = "Flow")

# ETS
if (exists("fc_ets1") && inherits(fc_ets1, "forecast")) {
  tryCatch({
    p1_ets <- autoplot(fc_ets1) + labs(title = "ETS Forecast - Pipe 1", x = "Hour", y = "Flow")
    print(p1_ets)
  }, error = function(e) {
    print("Could not generate ETS plot for Pipe 1: Forecast object malformed.")
  })
} else {
  print("ETS Forecast for Pipe 1 is not available.")
}
## [1] "Could not generate ETS plot for Pipe 1: Forecast object malformed."
if (exists("fc_ets2") && inherits(fc_ets2, "forecast")) {
  tryCatch({
    p2_ets <- autoplot(fc_ets2) + labs(title = "ETS Forecast - Pipe 2", x = "Hour", y = "Flow")
    print(p2_ets)
  }, error = function(e) {
    print("Could not generate ETS plot for Pipe 2: Forecast object malformed.")
  })
} else {
  print("ETS Forecast for Pipe 2 is not available.")
}
## [1] "Could not generate ETS plot for Pipe 2: Forecast object malformed."
# Naive
p1_naive <- autoplot(naive_pipe1) + labs(title = "Naive Forecast - Pipe 1")
p2_naive <- autoplot(naive_pipe2) + labs(title = "Naive Forecast - Pipe 2")

3.8 Export Waterflow Forecasts

# Add forecasts to the main Excel workbook as a new sheet
waterflow_forecast_df <- data.frame(
  Hour = seq(max(pipe1$hour) + 3600, by = 3600, length.out = 24*7),
  Pipe1_Forecast = as.numeric(fc_pipe1$mean),
  Pipe2_Forecast = as.numeric(fc_pipe2$mean)
)

# Load existing workbook, add sheet, overwrite
write_xlsx(list(
  "ATM Forecast" = atm_forecast_df,
  "Power Forecast" = power_forecast_df,
  "Waterflow Forecast" = waterflow_forecast_df
), "results/Project1_Forecasts.xlsx")

4 Appendix

4.1 R Session Summary

# Create a tidy version of attached packages and versions
df <- as.data.frame(installed.packages()[, c("Package", "Version")])
df <- df[df$Package %in% (.packages()), ]
df[order(df$Package), ]

4.2 Full Session Info

sessionInfo()
## R version 4.4.3 (2025-02-28 ucrt)
## Platform: x86_64-w64-mingw32/x64
## Running under: Windows 11 x64 (build 26100)
## 
## Matrix products: default
## 
## 
## locale:
## [1] LC_COLLATE=English_United States.utf8 
## [2] LC_CTYPE=English_United States.utf8   
## [3] LC_MONETARY=English_United States.utf8
## [4] LC_NUMERIC=C                          
## [5] LC_TIME=English_United States.utf8    
## 
## time zone: America/New_York
## tzcode source: internal
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] patchwork_1.3.0  zoo_1.8-13       tseries_0.10-58  writexl_1.5.2   
##  [5] gridExtra_2.3    fable_0.4.1      feasts_0.4.1     fabletools_0.5.0
##  [9] tsibble_1.1.6    forecast_8.23.0  lubridate_1.9.4  forcats_1.0.0   
## [13] stringr_1.5.1    dplyr_1.1.4      purrr_1.0.4      readr_2.1.5     
## [17] tidyr_1.3.1      tibble_3.2.1     ggplot2_3.5.1    tidyverse_2.0.0 
## [21] readxl_1.4.5    
## 
## loaded via a namespace (and not attached):
##  [1] gtable_0.3.6         anytime_0.3.11       xfun_0.51           
##  [4] bslib_0.9.0          lattice_0.22-6       tzdb_0.4.0          
##  [7] quadprog_1.5-8       vctrs_0.6.5          tools_4.4.3         
## [10] generics_0.1.3       curl_6.2.1           parallel_4.4.3      
## [13] xts_0.14.1           pkgconfig_2.0.3      distributional_0.5.0
## [16] lifecycle_1.0.4      farver_2.1.2         compiler_4.4.3      
## [19] munsell_0.5.1        htmltools_0.5.8.1    sass_0.4.9          
## [22] yaml_2.3.10          pillar_1.10.1        jquerylib_0.1.4     
## [25] ellipsis_0.3.2       cachem_1.1.0         nlme_3.1-167        
## [28] fracdiff_1.5-3       tidyselect_1.2.1     digest_0.6.37       
## [31] stringi_1.8.4        labeling_0.4.3       fastmap_1.2.0       
## [34] grid_4.4.3           colorspace_2.1-1     cli_3.6.4           
## [37] magrittr_2.0.3       withr_3.0.2          scales_1.3.0        
## [40] timechange_0.3.0     TTR_0.24.4           rmarkdown_2.29      
## [43] quantmod_0.4.26      nnet_7.3-20          timeDate_4041.110   
## [46] cellranger_1.1.0     hms_1.1.3            urca_1.3-4          
## [49] evaluate_1.0.3       knitr_1.49           lmtest_0.9-40       
## [52] rlang_1.1.5          Rcpp_1.0.14          glue_1.8.0          
## [55] rstudioapi_0.17.1    jsonlite_1.9.1       R6_2.6.1