ATM Cash Forecasting Business Case

ATM cash forecasting is a specialized form of cash management that matters to banks, credit unions, independent ATM operators, cash-in-transit companies, and retail businesses. By accurately predicting cash demand, these organizations can avoid both costly overstocking and reputation-damaging cash shortages: the goal is to find the cash level that meets customer demand while balancing the cost of stockouts against the cost of idle cash, improving profitability without sacrificing service quality.
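
To make the stockout-versus-overstock trade-off concrete, the short sketch below applies the classic newsvendor critical ratio. Every number in it (the costs and the demand distribution) is a hypothetical assumption used only to illustrate the calculation; none of these values come from the ATM data analyzed later.

# Hypothetical per-unit costs (assumptions for illustration only)
shortage_cost <- 5   # cost of each unit of unmet demand (lost fees, reputation)
holding_cost  <- 1   # cost of each unit of idle cash (funding, insurance, risk)

# Newsvendor critical ratio -> target service level
service_level <- shortage_cost / (shortage_cost + holding_cost)  # 0.833

# If daily demand were roughly Normal(mean = 80, sd = 25) (hypothetical),
# the cash to load is the corresponding demand quantile
target_stock <- qnorm(service_level, mean = 80, sd = 25)
round(target_stock, 1)  # ~104, in the same units as the demand forecast

With shortage assumed five times as costly as holding, the implied service level is about 83%, so the target cash load sits roughly one standard deviation above mean demand.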

The Business Challenge

Key Metrics

Business-relevant KPIs

Project Scope

Goals for Initial Data Assessment

cat(" DATA ACQUISITION AND INITIAL ASSESSMENT \n")
##  DATA ACQUISITION AND INITIAL ASSESSMENT
library(readxl)       # Import Excel files
library(forecast)     # ARIMA, ETS, and classical forecasting methods
library(prophet)      # Facebook's Prophet for time series with seasonality
library(tsibble)      # Modern time series data structure
library(fable)        # Tidy forecasting framework (successor to forecast)
library(feasts)       # Feature extraction and statistics for time series
library(zoo)          # Time series handling
library(dplyr)        # Data manipulation
library(tidyr)        # Data tidying
library(lubridate)    # Date/time handling
library(tseries)      # Time series analysis and tests
library(Metrics)      # RMSE, MAE, MAPE calculations
library(yardstick)    # Tidy model metrics
library(writexl)      # Export to Excel
library(kableExtra)   # Enhanced tables
# Read with explicit date handling
atm_data <- read_excel("ATM624Data.xlsx") %>%
  mutate(
    # Force proper date conversion from Excel numeric format
    DATE = as.Date(as.numeric(DATE), origin = "1899-12-30")
  )

# Check result
cat("Date range:", as.character(min(atm_data$DATE, na.rm = TRUE)), "to", 
    as.character(max(atm_data$DATE, na.rm = TRUE)), "\n")
## Date range: 2009-05-01 to 2010-05-14
# Column names
cat("Column Names:\n")
## Column Names:
print(names(atm_data))
## [1] "DATE" "ATM"  "Cash"
cat("\n")
head(atm_data, 10)
## # A tibble: 10 × 3
##    DATE       ATM    Cash
##    <date>     <chr> <dbl>
##  1 2009-05-01 ATM1     96
##  2 2009-05-01 ATM2    107
##  3 2009-05-02 ATM1     82
##  4 2009-05-02 ATM2     89
##  5 2009-05-03 ATM1     85
##  6 2009-05-03 ATM2     90
##  7 2009-05-04 ATM1     90
##  8 2009-05-04 ATM2     55
##  9 2009-05-05 ATM1     99
## 10 2009-05-05 ATM2     79
sum(is.na(atm_data$Cash))
## [1] 19
# Dataset Overview
cat("Overview of the Dataset:\n")
## Overview of the Dataset:
glimpse(atm_data, 10)
## Rows: 1,474
## Columns: 3
## $ DATE <date> …
## $ ATM  <chr> …
## $ Cash <dbl> …
#Missing data assessment
# Unique ATM identifiers
unique_atms <- unique(atm_data$ATM)
print(unique_atms)
## [1] "ATM1" "ATM2" NA     "ATM3" "ATM4"
cat("\nNumber of ATM machines:", length(unique_atms[!is.na(unique_atms)]), "\n\n")
## 
## Number of ATM machines: 4
# Missing values by ATM
cat("Missing Cash values by ATM:\n")
## Missing Cash values by ATM:
missing_by_atm <- atm_data %>%
  group_by(ATM) %>%
  summarise(
    Total_Records = n(),
    Missing_Cash = sum(is.na(Cash)),
    Missing_Percentage = round(sum(is.na(Cash)) / n() * 100, 2)
  )
print(missing_by_atm)
## # A tibble: 5 × 4
##   ATM   Total_Records Missing_Cash Missing_Percentage
##   <chr>         <int>        <int>              <dbl>
## 1 ATM1            365            3               0.82
## 2 ATM2            365            2               0.55
## 3 ATM3            365            0               0   
## 4 ATM4            365            0               0   
## 5 <NA>             14           14             100
cat("\n")
# Missing values by ATM - sorted by most missing to least
missing_by_atm <- atm_data %>%
  group_by(ATM) %>%
  summarise(
    Total_Records = n(),
    Missing_Cash = sum(is.na(Cash)),
    Missing_Pct = round(sum(is.na(Cash)) / n() * 100, 2)
  ) %>%
  arrange(desc(Missing_Cash))  # ← Highest missing first

kable(missing_by_atm, caption = "Missing Cash Values by ATM (Sorted: Most Missing First)") %>%
   kable_styling(bootstrap_options = c("striped", "hover"))
Missing Cash Values by ATM (Sorted: Most Missing First)

ATM    Total_Records   Missing_Cash   Missing_Pct
NA                14             14        100.00
ATM1             365              3          0.82
ATM2             365              2          0.55
ATM3             365              0          0.00
ATM4             365              0          0.00
# Check where missing values actually are

cat("Missing Values by Column:\n")
## Missing Values by Column:
cat("Cash:", sum(is.na(atm_data$Cash)), "\n")
## Cash: 19
cat("ATM:", sum(is.na(atm_data$ATM)), "\n")
## ATM: 14
cat("DATE:", sum(is.na(atm_data$DATE)), "\n\n")
## DATE: 0
# Cross-check: rows with missing ATM but present Cash
atm_data %>%
  filter(is.na(ATM)) %>%
  summarise(
    Count = n(),
    Has_Cash = sum(!is.na(Cash)),
    Has_Date = sum(!is.na(DATE))
  )
## # A tibble: 1 × 3
##   Count Has_Cash Has_Date
##   <int>    <int>    <int>
## 1    14        0       14
# Check for Duplicates 
duplicates <- atm_data %>%
  group_by(DATE, ATM) %>%
  summarise(Count = n(), .groups = 'drop') %>%
  filter(Count > 1)

cat("Duplicate DATE-ATM combinations:", nrow(duplicates), "\n")
## Duplicate DATE-ATM combinations: 0
if(nrow(duplicates) > 0) {
  cat("\nDuplicate records found:\n")
  print(duplicates)
  cat("\n")
} else {
  cat("No duplicate DATE-ATM combinations found.\n\n")
}
## No duplicate DATE-ATM combinations found.
# Date range by ATM
cat("Date Range by ATM:\n")
## Date Range by ATM:
date_range_by_atm <- atm_data %>%
  filter(!is.na(ATM)) %>%
  group_by(ATM) %>%
  summarise(
    Start_Date = min(DATE, na.rm = TRUE),
    End_Date = max(DATE, na.rm = TRUE),
    Days_Span = as.numeric(difftime(max(DATE, na.rm = TRUE), 
                                    min(DATE, na.rm = TRUE), 
                                    units = "days"))
  )
print(date_range_by_atm)
## # A tibble: 4 × 4
##   ATM   Start_Date End_Date   Days_Span
##   <chr> <date>     <date>         <dbl>
## 1 ATM1  2009-05-01 2010-04-30       364
## 2 ATM2  2009-05-01 2010-04-30       364
## 3 ATM3  2009-05-01 2010-04-30       364
## 4 ATM4  2009-05-01 2010-04-30       364
# --- Sample Size per ATM ---

sample_size <- atm_data %>%
  filter(!is.na(ATM)) %>%
  group_by(ATM) %>%
  summarise(
    Total_Records = n(),
    Non_Missing_Cash = sum(!is.na(Cash)),
    Missing_Cash = sum(is.na(Cash))
  )
print(sample_size)
## # A tibble: 4 × 4
##   ATM   Total_Records Non_Missing_Cash Missing_Cash
##   <chr>         <int>            <int>        <int>
## 1 ATM1            365              362            3
## 2 ATM2            365              363            2
## 3 ATM3            365              365            0
## 4 ATM4            365              365            0
cat("\n")
# --- Summary Statistics for Each ATM ---


summary_stats <- atm_data %>%
  filter(!is.na(ATM) & !is.na(Cash)) %>%
  group_by(ATM) %>%
  summarise(
    Count = n(),
    Mean = round(mean(Cash, na.rm = TRUE), 2),
    Median = round(median(Cash, na.rm = TRUE), 2),
    Std_Dev = round(sd(Cash, na.rm = TRUE), 2),
    Min = round(min(Cash, na.rm = TRUE), 2),
    Max = round(max(Cash, na.rm = TRUE), 2),
    Q1 = round(quantile(Cash, 0.25, na.rm = TRUE), 2),
    Q3 = round(quantile(Cash, 0.75, na.rm = TRUE), 2)
  )
print(summary_stats)
## # A tibble: 4 × 9
##   ATM   Count   Mean Median Std_Dev   Min    Max    Q1    Q3
##   <chr> <int>  <dbl>  <dbl>   <dbl> <dbl>  <dbl> <dbl> <dbl>
## 1 ATM1    362  83.9     91    36.7   1      180   73    108 
## 2 ATM2    363  62.6     67    38.9   0      147   25.5   93 
## 3 ATM3    365   0.72     0     7.94  0       96    0      0 
## 4 ATM4    365 474.     404.  651.    1.56 10920. 124.   705.
cat("\n")

Analysis of Initial Data Assessment

The initial data exploration revealed “NA” values in the ATM column, indicating 14 records without a specified ATM identifier. Records lacking ATM identification render individual ATM-level analysis impossible. While these records could provide aggregate withdrawal insights if all ATMs were operated by a single entity, they cannot support ATM-specific performance metrics required by stakeholders. Consequently, rows with missing ATM values were removed to ensure data quality and analytical relevance.

Data Uniqueness and Completeness

  • No duplicate date-ATM combinations were found, confirming each record represents a unique ATM-date pair
  • The date column contains no missing values
  • Both the cash and ATM columns show missing data requiring attention

Missing Cash Data by ATM

Further investigation of missing cash withdrawal data by ATM revealed:

  • ATM1: 0.82% missing cash values
  • ATM2: 0.55% missing cash values
  • ATM3: 0.00% missing cash values
  • ATM4: 0.00% missing cash values

Data Integrity Validation

All ATMs share identical date ranges, confirming that data was collected consistently across the same time period. This consistency strengthens the integrity of comparative analyses across ATMs.

Summary Statistics and Key Findings

  • ATM4 handles by far the highest volume, with a mean withdrawal of about 474 and a median of about 404 (Cash is recorded in hundreds of dollars, so roughly $47,400 and $40,400 per day)
  • ATM3 shows anomalously low activity, with a mean of 0.72 and a median of 0, suggesting extended periods of inactivity. This pattern likely indicates the ATM was out of service, undergoing maintenance, or decommissioned during portions of the observation period; the quick check below quantifies how often ATM3 recorded no withdrawals at all.
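
The inactivity claim can be verified directly by counting zero-withdrawal days for ATM3; a quick sketch against the raw data (before any cleaning):

# How often did ATM3 record exactly zero cash withdrawn?
atm_data %>%
  filter(ATM == "ATM3", !is.na(Cash)) %>%
  summarise(
    Days_Observed = n(),
    Zero_Days     = sum(Cash == 0),
    Pct_Zero_Days = round(mean(Cash == 0) * 100, 1)
  )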

Exploratory Data Analysis

We first clean the data and create temporal features to enable comprehensive pattern detection. Since missing data represents less than 1% of total records, we remove these rows to ensure clean visualizations and unbiased statistical calculations. Temporal features (day of week, month, weekend indicators) are engineered to test hypotheses about withdrawal patterns and to support forecasting model development.

library(ggplot2)      # Advanced plotting
library(gridExtra)    # Multiple plots
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
library(scales)       # Scale formatting
library(viridis)      # Color palettes
## Loading required package: viridisLite
## 
## Attaching package: 'viridis'
## The following object is masked from 'package:scales':
## 
##     viridis_pal
library(corrplot)     # Correlation plots
## corrplot 0.95 loaded
library(forecast)     # ACF/PACF plots
library(tidyr)        # Data reshaping
library(dplyr)        # Data manipulation
library(lubridate)    # Date functions

# --- DATA PREPARATION FOR EDA ---
cat("Preparing data for analysis...\n\n")
## Preparing data for analysis...
# Remove rows with missing ATM or Cash values for clean visualization since missing data is less than 1% of the total data.
atm_clean <- atm_data %>%
  filter(!is.na(ATM) & !is.na(Cash)) %>%
  mutate(
    ATM = as.factor(ATM),
    DATE = as.Date(DATE),
    # Create temporal features
    DayOfWeek = wday(DATE, label = TRUE, abbr = TRUE),
    DayOfMonth = day(DATE),
    Month = month(DATE, label = TRUE, abbr = TRUE),
    Year = year(DATE),
    WeekOfYear = week(DATE),
    IsWeekend = ifelse(wday(DATE) %in% c(1, 7), "Weekend", "Weekday"),
    # Paycheck hypothesis features
    IsFirstHalf = ifelse(day(DATE) <= 15, "1st Half", "2nd Half"),
    IsPayday = ifelse(day(DATE) %in% c(1, 15), "Payday", "Regular")
  )

cat("Clean dataset prepared with", nrow(atm_clean), "records\n")
## Clean dataset prepared with 1455 records
cat("Date range:", as.character(min(atm_clean$DATE)), "to", 
    as.character(max(atm_clean$DATE)), "\n\n")
## Date range: 2009-05-01 to 2010-04-30

In Depth Exploratory Data Analysis (EDA)

Understanding the distribution of cash withdrawals for each ATM is fundamental to selecting appropriate forecasting models. The distribution shape informs model assumptions, helps identify outliers that may require special handling, and reveals whether a one-size-fits-all approach is appropriate or whether ATM-specific models are needed.

# ============================================================================
# 1. UNIVARIATE ANALYSIS
# ============================================================================

cat("--- 1. UNIVARIATE ANALYSIS ---\n\n")
## --- 1. UNIVARIATE ANALYSIS ---
# --- 1.1 Distribution: Histograms with Density Overlays ---
cat("Creating distribution plots...\n")
## Creating distribution plots...
p1_hist <- ggplot(atm_clean, aes(x = Cash, fill = ATM)) +
  geom_histogram(aes(y = ..density..), bins = 30, alpha = 0.6, color = "black") +
  geom_density(alpha = 0.3, linewidth = 1) +
  facet_wrap(~ATM, scales = "free", ncol = 2) +
  scale_fill_viridis_d(option = "plasma") +
  labs(
    title = "Distribution of Cash Withdrawals by ATM",
    subtitle = "Histogram with density overlay - identifying distributional characteristics",
    x = "Cash Withdrawn (Hundreds of Dollars)",
    y = "Density",
    caption = "Note: Check for skewness, multiple modes, and outliers"
  ) +
  theme_minimal(base_size = 12) +
  theme(
    plot.title = element_text(face = "bold", size = 14),
    legend.position = "none",
    strip.text = element_text(face = "bold", size = 11)
  )

print(p1_hist)
## Warning: The dot-dot notation (`..density..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(density)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

cat("\n")
# --- 1.2 Boxplots for Outlier Detection ---
cat("Creating boxplots for outlier detection...\n")
## Creating boxplots for outlier detection...
p1_box <- ggplot(atm_clean, aes(x = ATM, y = Cash, fill = ATM)) +
  geom_boxplot(alpha = 0.7, outlier.color = "red", outlier.size = 2) +
  stat_summary(fun = mean, geom = "point", shape = 23, size = 3, 
               fill = "white", color = "black") +
  scale_fill_viridis_d(option = "plasma") +
  labs(
    title = "Cash Withdrawal Distribution & Outlier Detection",
    subtitle = "Boxplots by ATM - Red points indicate outliers (IQR method)",
    x = "ATM",
    y = "Cash Withdrawn (Hundreds of Dollars)",
    caption = "Diamond = Mean | Box = IQR | Whiskers = 1.5×IQR"
  ) +
  theme_minimal(base_size = 12) +
  theme(
    plot.title = element_text(face = "bold", size = 14),
    legend.position = "none"
  )

print(p1_box)

cat("\n")
# --- 1.4 Statistical Outlier Detection (IQR and Z-scores) ---
cat("Calculating outlier statistics...\n\n")
## Calculating outlier statistics...
outlier_analysis <- atm_clean %>%
  group_by(ATM) %>%
  mutate(
    # IQR Method
    Q1 = quantile(Cash, 0.25),
    Q3 = quantile(Cash, 0.75),
    IQR = Q3 - Q1,
    Lower_Bound = Q1 - 1.5 * IQR,
    Upper_Bound = Q3 + 1.5 * IQR,
    Is_Outlier_IQR = Cash < Lower_Bound | Cash > Upper_Bound,
    
    # Z-Score Method
    Mean = mean(Cash),
    SD = sd(Cash),
    Z_Score = (Cash - Mean) / SD,
    Is_Outlier_Z = abs(Z_Score) > 3
  ) %>%
  ungroup()

outlier_summary_stats <- outlier_analysis %>%
  group_by(ATM) %>%
  summarise(
    Total_Records = n(),
    Outliers_IQR = sum(Is_Outlier_IQR),
    Outliers_Z = sum(Is_Outlier_Z),
    Pct_Outliers_IQR = round(sum(Is_Outlier_IQR) / n() * 100, 2),
    Pct_Outliers_Z = round(sum(Is_Outlier_Z) / n() * 100, 2),
    Mean = round(mean(Cash), 2),
    Median = round(median(Cash), 2),
    SD = round(sd(Cash), 2),
    Min = round(min(Cash), 2),
    Max = round(max(Cash), 2)
  )

cat("Descriptive Statistics & Outlier Detection by ATM:\n")
## Descriptive Statistics & Outlier Detection by ATM:
print(outlier_summary_stats)
## # A tibble: 4 × 11
##   ATM   Total_Records Outliers_IQR Outliers_Z Pct_Outliers_IQR Pct_Outliers_Z
##   <fct>         <int>        <int>      <int>            <dbl>          <dbl>
## 1 ATM1            362           51          0            14.1            0   
## 2 ATM2            363            0          0             0              0   
## 3 ATM3            365            3          3             0.82           0.82
## 4 ATM4            365            3          1             0.82           0.27
## # ℹ 5 more variables: Mean <dbl>, Median <dbl>, SD <dbl>, Min <dbl>, Max <dbl>
cat("\n")
# Show example outliers
outlier_examples <- outlier_analysis %>%
  filter(Is_Outlier_IQR) %>%
  select(DATE, ATM, Cash, Lower_Bound, Upper_Bound, Z_Score) %>%
  arrange(ATM, desc(Cash))

if(nrow(outlier_examples) > 0) {
  cat("Sample of Detected Outliers (IQR Method):\n")
  print(head(outlier_examples, 15))
  cat("\n")
}
## Sample of Detected Outliers (IQR Method):
## # A tibble: 15 × 6
##    DATE       ATM    Cash Lower_Bound Upper_Bound Z_Score
##    <date>     <fct> <dbl>       <dbl>       <dbl>   <dbl>
##  1 2009-09-22 ATM1    180        20.5        160.    2.62
##  2 2010-02-17 ATM1    179        20.5        160.    2.59
##  3 2009-08-25 ATM1    174        20.5        160.    2.46
##  4 2009-05-21 ATM1     20        20.5        160.   -1.74
##  5 2009-07-10 ATM1     19        20.5        160.   -1.77
##  6 2009-07-16 ATM1     19        20.5        160.   -1.77
##  7 2009-09-03 ATM1     19        20.5        160.   -1.77
##  8 2010-03-30 ATM1     19        20.5        160.   -1.77
##  9 2009-09-10 ATM1     18        20.5        160.   -1.80
## 10 2009-11-26 ATM1     18        20.5        160.   -1.80
## 11 2009-06-11 ATM1     16        20.5        160.   -1.85
## 12 2009-07-02 ATM1     16        20.5        160.   -1.85
## 13 2009-07-23 ATM1     16        20.5        160.   -1.85
## 14 2009-11-12 ATM1     15        20.5        160.   -1.88
## 15 2010-01-07 ATM1     15        20.5        160.   -1.88

ATM Cash Withdrawal Analysis: Key Insights

ATM1 and ATM2: Active ATMs with a Diverse Customer Base

  • Both ATMs display multimodal distributions with a right skew, meaning most transactions are small withdrawals, with occasional large withdrawals creating a “tail” toward higher values
  • The multiple peaks in the distribution indicate distinct customer segments—for example, one group consistently withdrawing $20-$40, another withdrawing $100-$200, and a third making larger withdrawals
  • Both ATMs show clear, predictable patterns, making them suitable candidates for time-series forecasting models

ATM3: Potential Service or Operational Issues

  • The distribution is heavily concentrated at or near zero, suggesting prolonged periods of inactivity, service outages or technical malfunctions, or data collection errors
  • Recommendation: Investigate whether ATM3 is operational and meeting customer demand

ATM4: Moderate Activity with Anomalies

  • Exhibits a more symmetric (bell-shaped) distribution compared to ATM1 and ATM2
  • The boxplot reveals several outliers—unusually high withdrawal amounts that fall outside the expected range
  • While these outliers don’t necessarily indicate fraud, they warrant closer examination by the ATM4 location owner for legitimate business-related withdrawals, potential data entry errors, or unusual customer behavior patterns

Strategic Implications for Forecasting and Operations

  • ATM1 & ATM2: Strong candidates for sophisticated forecasting models (ARIMA, exponential smoothing) due to clear trends
  • ATM3: Requires operational review before implementing forecasting; may need a different monitoring approach
  • ATM4: Forecasting models should account for higher variability; consider threshold alerts for outlier transactions

These distribution insights enable stakeholders to tailor ATM-specific operational strategies, cash replenishment schedules, and monitoring protocols based on each location’s unique usage patterns.

Temporal Pattern Analysis: Uncovering Withdrawal Behaviors

Day-of-Week Effects

Business Question: Do customer behaviors differ by day of the week?

Value to Stakeholders:

  • Operations: Schedule cash replenishment on low-volume days to minimize disruption
  • Finance: Allocate working capital efficiently based on predictable demand spikes

Weekend vs. Weekday Comparison

Business Question: Do weekends require different operational strategies?

Statistical Rigor:

  • p-value < 0.05 (marked *** in the test output below): Statistically significant difference → action required
  • p-value > 0.05: No meaningful difference → maintain standard operations

Value to Stakeholders:

  • Justifies weekend premium staffing or differential schedules
  • Identifies ATM locations: business districts (dead weekends) vs. retail areas (busy weekends)

Day-of-Month Effects (Payday Hypothesis)

Business Question: Do withdrawals spike around paydays (the 1st and 15th)?

Value to Stakeholders:

  • Spikes confirmed: Pre-load ATMs 1-2 days before payday to prevent outages
  • No pattern: Simplifies inventory management—demand is uniform

Monthly Seasonality

Business Question: Do certain months require special preparation?

Value to Stakeholders:

  • Financial Planning: Budget for higher cash inventory in peak months
  • Marketing: Time promotions around high-traffic periods
  • Forecasting: Adjust models for holiday spending, tax season, back-to-school

# ============================================================================
# 3. TEMPORAL PATTERN ANALYSIS
# ============================================================================

cat("\n--- 3. TEMPORAL PATTERN ANALYSIS ---\n\n")
## 
## --- 3. TEMPORAL PATTERN ANALYSIS ---
# --- 3.1 Day-of-Week Effects ---
cat("Analyzing day-of-week patterns...\n")
## Analyzing day-of-week patterns...
dow_summary <- atm_clean %>%
  group_by(ATM, DayOfWeek) %>%
  summarise(
    Mean_Cash = mean(Cash),
    Median_Cash = median(Cash),
    SD_Cash = sd(Cash),
    Count = n(),
    .groups = 'drop'
  )

p3_dow <- ggplot(dow_summary, aes(x = DayOfWeek, y = Mean_Cash, fill = ATM)) +
  geom_col(position = "dodge", alpha = 0.8) +
  geom_errorbar(aes(ymin = Mean_Cash - SD_Cash, ymax = Mean_Cash + SD_Cash),
                position = position_dodge(0.9), width = 0.2, alpha = 0.6) +
  scale_fill_viridis_d(option = "plasma") +
  labs(
    title = "Day-of-Week Effects on Cash Withdrawals",
    subtitle = "Mean withdrawal by day with ±1 SD error bars",
    x = "Day of Week",
    y = "Mean Cash (Hundreds of Dollars)",
    fill = "ATM"
  ) +
  theme_minimal(base_size = 12) +
  theme(
    plot.title = element_text(face = "bold", size = 14),
    legend.position = "right"
  )

print(p3_dow)

cat("\n")
# --- 3.2 Weekend vs Weekday Comparison ---
cat("Comparing weekend vs weekday patterns...\n")
## Comparing weekend vs weekday patterns...
weekend_summary <- atm_clean %>%
  group_by(ATM, IsWeekend) %>%
  summarise(
    Mean_Cash = mean(Cash),
    Median_Cash = median(Cash),
    SD_Cash = sd(Cash),
    Count = n(),
    .groups = 'drop'
  )

p3_weekend <- ggplot(atm_clean, aes(x = IsWeekend, y = Cash, fill = IsWeekend)) +
  geom_boxplot(alpha = 0.7) +
  facet_wrap(~ATM, scales = "free_y", ncol = 4) +
  scale_fill_manual(values = c("Weekend" = "#FDE725FF", "Weekday" = "#440154FF")) +
  labs(
    title = "Weekend vs Weekday Cash Withdrawal Patterns",
    subtitle = "Distribution comparison across ATMs",
    x = "",
    y = "Cash Withdrawn (Hundreds of Dollars)",
    fill = "Period"
  ) +
  theme_minimal(base_size = 12) +
  theme(
    plot.title = element_text(face = "bold", size = 14),
    legend.position = "bottom"
  )

print(p3_weekend)

cat("\n")
# Statistical test
cat("Statistical Testing: Weekend vs Weekday\n")
## Statistical Testing: Weekend vs Weekday
for(atm_id in unique(atm_clean$ATM)) {
  atm_subset <- atm_clean %>% filter(ATM == atm_id)
  weekend_data <- atm_subset %>% filter(IsWeekend == "Weekend") %>% pull(Cash)
  weekday_data <- atm_subset %>% filter(IsWeekend == "Weekday") %>% pull(Cash)
  
  t_test <- t.test(weekend_data, weekday_data)
  
  cat(sprintf("%s: Weekend mean = %.2f, Weekday mean = %.2f, p-value = %.4f %s\n",
              atm_id,
              mean(weekend_data),
              mean(weekday_data),
              t_test$p.value,
              ifelse(t_test$p.value < 0.05, "***", "")))
}
## ATM1: Weekend mean = 99.66, Weekday mean = 77.61, p-value = 0.0000 ***
## ATM2: Weekend mean = 71.57, Weekday mean = 58.97, p-value = 0.0007 ***
## ATM3: Weekend mean = 0.00, Weekday mean = 1.01, p-value = 0.0840 
## ATM4: Weekend mean = 515.31, Weekday mean = 457.60, p-value = 0.3119
cat("\n")
# --- 3.3 Day-of-Month Effects (Paycheck Hypothesis) ---
cat("Analyzing day-of-month patterns (paycheck effect)...\n")
## Analyzing day-of-month patterns (paycheck effect)...
dom_summary <- atm_clean %>%
  group_by(ATM, DayOfMonth) %>%
  summarise(
    Mean_Cash = mean(Cash),
    Count = n(),
    .groups = 'drop'
  )

p3_dom <- ggplot(dom_summary, aes(x = DayOfMonth, y = Mean_Cash, color = ATM)) +
  geom_line(linewidth = 1) +
  geom_point(size = 2) +
  geom_vline(xintercept = c(1, 15), linetype = "dashed", color = "red", linewidth = 0.8) +
  annotate("text", x = 1, y = max(dom_summary$Mean_Cash) * 0.95, 
           label = "Payday?", color = "red", angle = 90, vjust = -0.5) +
  annotate("text", x = 15, y = max(dom_summary$Mean_Cash) * 0.95, 
           label = "Mid-month Payday?", color = "red", angle = 90, vjust = -0.5) +
  scale_color_viridis_d(option = "plasma") +
  scale_x_continuous(breaks = seq(1, 31, 2)) +
  labs(
    title = "Day-of-Month Effect: Testing Payday Hypothesis",
    subtitle = "Do cash withdrawals spike around 1st and 15th (typical paydays)?",
    x = "Day of Month",
    y = "Mean Cash (Hundreds of Dollars)",
    color = "ATM"
  ) +
  theme_minimal(base_size = 12) +
  theme(
    plot.title = element_text(face = "bold", size = 14),
    legend.position = "right"
  )

print(p3_dom)

cat("\n")
# --- 3.5 Monthly Seasonality ---
cat("Analyzing monthly seasonality...\n")
## Analyzing monthly seasonality...
monthly_summary <- atm_clean %>%
  group_by(ATM, Month) %>%
  summarise(
    Mean_Cash = mean(Cash),
    Median_Cash = median(Cash),
    Count = n(),
    .groups = 'drop'
  )

p3_monthly <- ggplot(monthly_summary, aes(x = Month, y = Mean_Cash, group = ATM, color = ATM)) +
  geom_line(linewidth = 1.2) +
  geom_point(size = 3) +
  scale_color_viridis_d(option = "plasma") +
  labs(
    title = "Monthly Seasonality in Cash Withdrawals",
    subtitle = "Mean withdrawals by month across ATMs",
    x = "Month",
    y = "Mean Cash (Hundreds of Dollars)",
    color = "ATM"
  ) +
  theme_minimal(base_size = 12) +
  theme(
    plot.title = element_text(face = "bold", size = 14),
    legend.position = "right",
    axis.text.x = element_text(angle = 45, hjust = 1)
  )

print(p3_monthly)

cat("\n")
# ============================================================================
# 5. ANOMALY DETECTION
# ============================================================================

cat("\n--- 5. ANOMALY DETECTION ---\n\n")
## 
## --- 5. ANOMALY DETECTION ---
# --- 5.1 Statistical Anomaly Detection ---
cat("Detecting anomalies using multiple methods...\n")
## Detecting anomalies using multiple methods...
anomalies <- atm_clean %>%
  group_by(ATM) %>%
  mutate(
    # Rolling statistics
    Rolling_Mean_7 = zoo::rollmean(Cash, k = 7, fill = NA, align = "center"),
    Rolling_SD_7 = zoo::rollapply(Cash, width = 7, FUN = sd, fill = NA, align = "center"),
    
    # Deviation from rolling mean
    Deviation = Cash - Rolling_Mean_7,
    Z_Score_Rolling = Deviation / Rolling_SD_7,
    
    # IQR method
    Q1 = quantile(Cash, 0.25),
    Q3 = quantile(Cash, 0.75),
    IQR = Q3 - Q1,
    Lower_Bound = Q1 - 1.5 * IQR,
    Upper_Bound = Q3 + 1.5 * IQR,
    
    # Flag anomalies
    Is_Anomaly_IQR = Cash < Lower_Bound | Cash > Upper_Bound,
    Is_Anomaly_Z = abs(Z_Score_Rolling) > 3,
    Is_Anomaly = Is_Anomaly_IQR | Is_Anomaly_Z
  ) %>%
  ungroup()

anomaly_summary <- anomalies %>%
  group_by(ATM) %>%
  summarise(
    Total_Records = n(),
    Anomalies_Detected = sum(Is_Anomaly, na.rm = TRUE),
    Pct_Anomalies = round(sum(Is_Anomaly, na.rm = TRUE) / n() * 100, 2)
  )

cat("Anomaly Detection Summary:\n")
## Anomaly Detection Summary:
print(anomaly_summary)
## # A tibble: 4 × 4
##   ATM   Total_Records Anomalies_Detected Pct_Anomalies
##   <fct>         <int>              <int>         <dbl>
## 1 ATM1            362                 51         14.1 
## 2 ATM2            363                  0          0   
## 3 ATM3            365                  3          0.82
## 4 ATM4            365                  3          0.82
cat("\n")
# --- 5.2 Visualize Anomalies ---
cat("Creating anomaly visualization...\n")
## Creating anomaly visualization...
p5_anomaly <- ggplot(anomalies, aes(x = DATE, y = Cash)) +
  geom_line(color = "gray60", linewidth = 0.6) +
  geom_point(data = anomalies %>% filter(Is_Anomaly), 
             aes(color = "Anomaly"), size = 3, alpha = 0.8) +
  geom_line(aes(y = Rolling_Mean_7, color = "7-Day Mean"), linewidth = 1) +
  facet_wrap(~ATM, scales = "free_y", ncol = 2) +
  scale_color_manual(values = c("Anomaly" = "#DC3220", "7-Day Mean" = "#005AB5")) +
  labs(
    title = "Anomaly Detection: Unusual Cash Withdrawal Patterns",
    subtitle = "Red points = detected anomalies using IQR and rolling Z-score methods",
    x = "Date",
    y = "Cash Withdrawn (Hundreds of Dollars)",
    color = ""
  ) +
  theme_minimal(base_size = 11) +
  theme(
    plot.title = element_text(face = "bold", size = 12),
    strip.text = element_text(face = "bold"),
    legend.position = "bottom",
    axis.text.x = element_text(angle = 45, hjust = 1)
  )

print(p5_anomaly)
## Warning: Removed 6 rows containing missing values or values outside the scale range
## (`geom_line()`).

cat("\n")
# Show specific anomalies
anomaly_examples <- anomalies %>%
  filter(Is_Anomaly) %>%
  select(DATE, ATM, Cash, Rolling_Mean_7, Deviation, Z_Score_Rolling) %>%
  arrange(ATM, DATE)

if(nrow(anomaly_examples) > 0) {
  cat("Examples of Detected Anomalies:\n")
  print(head(anomaly_examples, 20))
  cat("\n")
}
## Examples of Detected Anomalies:
## # A tibble: 20 × 6
##    DATE       ATM    Cash Rolling_Mean_7 Deviation Z_Score_Rolling
##    <date>     <fct> <dbl>          <dbl>     <dbl>           <dbl>
##  1 2009-05-07 ATM1      8           81.3     -73.3           -2.23
##  2 2009-05-14 ATM1      6           77.9     -71.9           -2.08
##  3 2009-05-21 ATM1     20           77.7     -57.7           -2.15
##  4 2009-05-28 ATM1     10           79.9     -69.9           -2.16
##  5 2009-06-04 ATM1     14           72.7     -58.7           -1.31
##  6 2009-06-05 ATM1      3           74.3     -71.3           -1.57
##  7 2009-06-11 ATM1     16          101.      -84.9           -1.93
##  8 2009-06-25 ATM1     13           87.1     -74.1           -2.04
##  9 2009-07-02 ATM1     16           89       -73             -2.13
## 10 2009-07-09 ATM1      4           80       -76             -1.51
## 11 2009-07-10 ATM1     19           75.9     -56.9           -1.21
## 12 2009-07-16 ATM1     19           87.4     -68.4           -2.12
## 13 2009-07-23 ATM1     16           86.9     -70.9           -2.06
## 14 2009-07-30 ATM1     13           83.4     -70.4           -1.92
## 15 2009-08-06 ATM1     13           89.6     -76.6           -2.09
## 16 2009-08-25 ATM1    174          108        66              1.35
## 17 2009-08-27 ATM1     13          111.      -97.7           -1.99
## 18 2009-09-03 ATM1     19          105.      -85.7           -1.92
## 19 2009-09-10 ATM1     18           83.3     -65.3           -1.27
## 20 2009-09-11 ATM1      1           80       -79             -1.56
# ============================================================================
# 6. ADDITIONAL COMPELLING VISUALIZATIONS
# ============================================================================

cat("\n--- 6. ADDITIONAL INSIGHTS & VISUALIZATIONS ---\n\n")
## 
## --- 6. ADDITIONAL INSIGHTS & VISUALIZATIONS ---
# --- 6.1 Volatility Over Time ---
cat("Analyzing volatility patterns...\n")
## Analyzing volatility patterns...
volatility <- atm_clean %>%
  group_by(ATM) %>%
  arrange(DATE) %>%
  mutate(
    Rolling_SD_14 = zoo::rollapply(Cash, width = 14, FUN = sd, fill = NA, align = "center")
  ) %>%
  ungroup()

p6_volatility <- ggplot(volatility, aes(x = DATE, y = Rolling_SD_14, color = ATM)) +
  geom_line(linewidth = 0.8) +
  scale_color_viridis_d(option = "plasma") +
  labs(
    title = "Cash Withdrawal Volatility Over Time",
    subtitle = "14-day rolling standard deviation - measuring forecast uncertainty",
    x = "Date",
    y = "Rolling SD (14 days)",
    color = "ATM",
    caption = "Higher values = more unpredictable withdrawals"
  ) +
  theme_minimal(base_size = 12) +
  theme(
    plot.title = element_text(face = "bold", size = 14),
    legend.position = "right",
    axis.text.x = element_text(angle = 45, hjust = 1)
  )

print(p6_volatility)
## Warning: Removed 52 rows containing missing values or values outside the scale range
## (`geom_line()`).

cat("\n")

Model Validation Summary

This validation process tests whether our forecasting models can accurately predict real-world cash needs by withholding the last 30 days of data, training the models on the remaining history only, and then comparing predictions against actual withdrawals. The 30-day hold-out period represents a practical operational planning cycle, and the auto.arima algorithm selects an ARIMA specification for each ATM’s patterns without requiring manual model identification.

Three complementary accuracy metrics—MAE (average absolute error, in the same cash units as the data), RMSE (which penalizes large misses), and MAPE (percentage error)—provide a rounded assessment of forecast reliability, with MAPE being the most stakeholder-friendly for comparing performance across ATMs. Industry-standard benchmarks (<5% MAPE = excellent, 5-10% = good, 10-20% = acceptable, >20% = needs improvement) enable quick pass/fail decisions on model deployment.

This validation turns theoretical models into deployable business tools by proving they work on unseen data, quantifying forecast confidence, and identifying which ATMs can use automated cash allocation versus those requiring larger safety buffers or alternative strategies.

Key Benefits:

  • Prevents overfitting: Models are proven on new data, not just memorizing history
  • Quantifies risk: Know prediction accuracy before committing operational resources
  • ATM-specific insights: Identifies which locations are predictable (deploy automation) vs. volatile (manual oversight)
  • Actionable thresholds: Clear benchmarks guide deployment decisions (e.g., 8% MAPE → proceed with confidence)
  • Real-world simulation: “If we used this model last month, how would it have performed?”

Model Validation

cat("\n\n=== MODEL VALIDATION ===\n\n")
## 
## 
## === MODEL VALIDATION ===
validation_results <- list()

for(atm_name in unique(atm_clean$ATM)) {
  cat(paste("Validating model for", atm_name, "...\n"))
  
  # Get full data
  atm_ts_data <- atm_clean %>%
    filter(ATM == atm_name) %>%
    arrange(DATE)
  
  # Hold out last 30 days for validation
  n_total <- nrow(atm_ts_data)
  n_train <- n_total - 30
  
  train_data <- atm_ts_data[1:n_train, ]
  test_data <- atm_ts_data[(n_train+1):n_total, ]
  
  # Fit model on training data
  train_ts <- ts(train_data$Cash, frequency = 7)
  model <- auto.arima(train_ts, seasonal = TRUE, stepwise = FALSE)
  
  # Forecast 30 days ahead
  forecast_val <- forecast(model, h = 30)
  
  # Calculate accuracy metrics
  actual <- test_data$Cash
  predicted <- as.numeric(forecast_val$mean)
  
  mae <- mean(abs(actual - predicted))
  rmse <- sqrt(mean((actual - predicted)^2))
  # Note: MAPE divides by actual withdrawals, so it is undefined when actual
  # demand is zero (this produces the NaN reported for ATM3 below)
  mape <- mean(abs((actual - predicted) / actual)) * 100
  
  validation_results[[atm_name]] <- data.frame(
    ATM = atm_name,
    MAE = round(mae, 2),
    RMSE = round(rmse, 2),
    MAPE = round(mape, 2),
    Model = as.character(model)
  )
  
  cat(paste("  Model:", as.character(model), "\n"))
  cat(paste("  RMSE:", round(rmse, 2), "hundred dollars\n"))
  cat(paste("  MAPE:", round(mape, 2), "%\n\n"))
}
## Validating model for ATM1 ...
##   Model: ARIMA(0,0,1)(0,1,1)[7] 
##   RMSE: 12.19 hundred dollars
##   MAPE: 50.83 %
## 
## Validating model for ATM2 ...
##   Model: ARIMA(2,0,2)(0,1,1)[7] 
##   RMSE: 20.17 hundred dollars
##   MAPE: 30.46 %
## 
## Validating model for ATM3 ...
##   Model: ARIMA(0,0,0) with non-zero mean 
##   RMSE: 27.79 hundred dollars
##   MAPE: NaN %
## 
## Validating model for ATM4 ...
##   Model: ARIMA(0,0,0) with non-zero mean 
##   RMSE: 293.5 hundred dollars
##   MAPE: 1240.19 %
# Summary table
validation_summary <- bind_rows(validation_results)
cat("VALIDATION SUMMARY (Out-of-Sample Performance):\n")
## VALIDATION SUMMARY (Out-of-Sample Performance):
print(validation_summary)
##    ATM    MAE   RMSE    MAPE                           Model
## 1 ATM1   9.60  12.19   50.83          ARIMA(0,0,1)(0,1,1)[7]
## 2 ATM2  14.33  20.17   30.46          ARIMA(2,0,2)(0,1,1)[7]
## 3 ATM3   8.77  27.79     NaN ARIMA(0,0,0) with non-zero mean
## 4 ATM4 249.08 293.50 1240.19 ARIMA(0,0,0) with non-zero mean
cat("\n")
# Interpretation
# Note: ATM3's MAPE is NaN (zero-withdrawal days), so this simple average is NaN as well
avg_mape <- mean(validation_summary$MAPE)
cat(paste("Average MAPE across all ATMs:", round(avg_mape, 2), "%\n"))
## Average MAPE across all ATMs: NaN %
cat("Interpretation:\n")
## Interpretation:
cat("  <5% MAPE = Excellent forecast accuracy\n")
##   <5% MAPE = Excellent forecast accuracy
cat("  5-10% = Good accuracy\n")
##   5-10% = Good accuracy
cat("  10-20% = Acceptable for business decisions\n")
##   10-20% = Acceptable for business decisions
cat("  >20% = Model needs improvement\n\n")
##   >20% = Model needs improvement
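
Note that MAPE divides by actual withdrawals, so it is undefined on zero-withdrawal days (the NaN reported for ATM3) and can explode when individual actual values are very small (as with ATM4). A scale-aware alternative such as WAPE (weighted absolute percentage error) avoids the division-by-zero problem; a minimal sketch with purely illustrative numbers:

# WAPE: total absolute error divided by total actual demand; defined even
# when individual days have zero withdrawals
wape <- function(actual, predicted) {
  sum(abs(actual - predicted)) / sum(abs(actual)) * 100
}

# Hypothetical example with a zero-demand day (values are illustrative only)
wape(actual = c(0, 50, 80), predicted = c(5, 45, 90))  # ~15.4%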

Creating the Forecast and Exporting the Data to an Excel file

# Load required libraries
library(forecast)
library(writexl)  # or library(openxlsx)
library(dplyr)
library(lubridate)

cat("--- ARIMA FORECASTING ---\n\n")
## --- ARIMA FORECASTING ---
# Prepare time series data for each ATM
forecast_results <- list()
forecast_horizon <- 30  # Number of days to forecast

for(atm_name in unique(atm_clean$ATM)) {
  cat(paste("Forecasting for", atm_name, "...\n"))
  
  # Filter data for specific ATM and sort by date
  atm_ts_data <- atm_clean %>%
    filter(ATM == atm_name) %>%
    arrange(DATE)
  
  # Create time series object
  ts_cash <- ts(atm_ts_data$Cash, frequency = 7)  # Weekly seasonality
  
  # Fit ARIMA model (auto.arima finds best parameters)
  arima_model <- auto.arima(ts_cash, seasonal = TRUE, stepwise = FALSE)
  
  # Generate forecast
  forecast_result <- forecast(arima_model, h = forecast_horizon)
  
  # Extract forecasted values
  forecasted_values <- data.frame(
    ATM = atm_name,
    Forecast_Day = 1:forecast_horizon,
    Forecast_Date = max(atm_ts_data$DATE) + 1:forecast_horizon,
    Predicted_Cash = as.numeric(forecast_result$mean),
    Lower_80 = as.numeric(forecast_result$lower[,1]),
    Upper_80 = as.numeric(forecast_result$upper[,1]),
    Lower_95 = as.numeric(forecast_result$lower[,2]),
    Upper_95 = as.numeric(forecast_result$upper[,2]),
    Model = as.character(arima_model)
  )
  
  forecast_results[[atm_name]] <- forecasted_values
  
  cat(paste("  Model:", as.character(arima_model), "\n"))
  cat(paste("  Mean forecast:", round(mean(forecast_result$mean), 2), "\n\n"))
}
## Forecasting for ATM1 ...
##   Model: ARIMA(0,0,1)(0,1,1)[7] 
##   Mean forecast: 77.17 
## 
## Forecasting for ATM2 ...
##   Model: ARIMA(2,0,2)(0,1,1)[7] 
##   Mean forecast: 60.03 
## 
## Forecasting for ATM3 ...
##   Model: ARIMA(0,0,2) with zero mean 
##   Mean forecast: 0.13 
## 
## Forecasting for ATM4 ...
##   Model: ARIMA(0,0,0) with non-zero mean 
##   Mean forecast: 474.04
# Combine all forecasts
all_forecasts <- bind_rows(forecast_results)

# Export to Excel
output_file <- "ATM_Forecasts.xlsx"
write_xlsx(all_forecasts, output_file)

cat(paste("Forecasts exported to:", output_file, "\n"))
## Forecasts exported to: ATM_Forecasts.xlsx
# Display summary
cat("\nForecast Summary:\n")
## 
## Forecast Summary:
forecast_summary <- all_forecasts %>%
  group_by(ATM) %>%
  summarise(
    Avg_Forecast = round(mean(Predicted_Cash), 2),
    Min_Forecast = round(min(Predicted_Cash), 2),
    Max_Forecast = round(max(Predicted_Cash), 2)
  )
print(forecast_summary)
## # A tibble: 4 × 4
##   ATM   Avg_Forecast Min_Forecast Max_Forecast
##   <chr>        <dbl>        <dbl>        <dbl>
## 1 ATM1         77.2          4.76       101.  
## 2 ATM2         60.0          3.33        99.3 
## 3 ATM3          0.13         0            2.61
## 4 ATM4        474.         474.         474.
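
As a practical next step, the exported intervals can be turned into a rough daily cash-loading target per ATM. The sketch below uses the 80% upper bound as the safety level; that choice is an assumption for illustration, not a replenishment policy, and Cash is in hundreds of dollars.

# Sketch: translate the 30-day forecast into an average daily loading target,
# using the 80% upper bound as a safety buffer (assumption)
loading_plan <- all_forecasts %>%
  group_by(ATM) %>%
  summarise(
    Avg_Daily_Load_Hundreds = round(mean(Upper_80), 1),
    Avg_Daily_Load_Dollars  = round(mean(Upper_80) * 100)
  )
print(loading_plan)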

Part B – Forecasting Power, ResidentialCustomerForecastLoad-624.xlsx

# ========================================
# RESIDENTIAL POWER USAGE ANALYSIS & FORECASTING
# Period: Jan 1998 - Dec 2013 (Training)
# Forecast: Monthly for 2014
# ========================================

library(readxl)
library(forecast)
library(ggplot2)
library(dplyr)
library(lubridate)
library(gridExtra)
library(writexl)

# Import the Power Usage Data Excel file
power_data <- read_excel("ResidentialCustomerForecastLoad-624.xlsx")
print(glimpse(power_data)) 
## Rows: 192
## Columns: 3
## $ CaseSequence <dbl> 733, 734, 735, 736, 737, 738, 739, 740, 741, 742, 743, 74…
## $ `YYYY-MMM`   <chr> "1998-Jan", "1998-Feb", "1998-Mar", "1998-Apr", "1998-May…
## $ KWH          <dbl> 6862583, 5838198, 5420658, 5010364, 4665377, 6467147, 891…
## # A tibble: 192 × 3
##    CaseSequence `YYYY-MMM`     KWH
##           <dbl> <chr>        <dbl>
##  1          733 1998-Jan   6862583
##  2          734 1998-Feb   5838198
##  3          735 1998-Mar   5420658
##  4          736 1998-Apr   5010364
##  5          737 1998-May   4665377
##  6          738 1998-Jun   6467147
##  7          739 1998-Jul   8914755
##  8          740 1998-Aug   8607428
##  9          741 1998-Sep   6989888
## 10          742 1998-Oct   6345620
## # ℹ 182 more rows
# Convert to Date first, then create ts object
power_data$Date <- ym(power_data$`YYYY-MMM`)
power_data <- power_data %>% arrange(Date)
ts_data <- ts(power_data$KWH, start = c(1998, 1), frequency = 12)

# Verify it worked
head(power_data)
## # A tibble: 6 × 4
##   CaseSequence `YYYY-MMM`     KWH Date      
##          <dbl> <chr>        <dbl> <date>    
## 1          733 1998-Jan   6862583 1998-01-01
## 2          734 1998-Feb   5838198 1998-02-01
## 3          735 1998-Mar   5420658 1998-03-01
## 4          736 1998-Apr   5010364 1998-04-01
## 5          737 1998-May   4665377 1998-05-01
## 6          738 1998-Jun   6467147 1998-06-01
str(power_data)
## tibble [192 × 4] (S3: tbl_df/tbl/data.frame)
##  $ CaseSequence: num [1:192] 733 734 735 736 737 738 739 740 741 742 ...
##  $ YYYY-MMM    : chr [1:192] "1998-Jan" "1998-Feb" "1998-Mar" "1998-Apr" ...
##  $ KWH         : num [1:192] 6862583 5838198 5420658 5010364 4665377 ...
##  $ Date        : Date[1:192], format: "1998-01-01" "1998-02-01" ...
# ========================================
# DATA LOADING & PREPARATION
# ========================================

# Import the Power Usage Data Excel file
power_data <- read_excel("ResidentialCustomerForecastLoad-624.xlsx")
print(glimpse(power_data))
## Rows: 192
## Columns: 3
## $ CaseSequence <dbl> 733, 734, 735, 736, 737, 738, 739, 740, 741, 742, 743, 74…
## $ `YYYY-MMM`   <chr> "1998-Jan", "1998-Feb", "1998-Mar", "1998-Apr", "1998-May…
## $ KWH          <dbl> 6862583, 5838198, 5420658, 5010364, 4665377, 6467147, 891…
## # A tibble: 192 × 3
##    CaseSequence `YYYY-MMM`     KWH
##           <dbl> <chr>        <dbl>
##  1          733 1998-Jan   6862583
##  2          734 1998-Feb   5838198
##  3          735 1998-Mar   5420658
##  4          736 1998-Apr   5010364
##  5          737 1998-May   4665377
##  6          738 1998-Jun   6467147
##  7          739 1998-Jul   8914755
##  8          740 1998-Aug   8607428
##  9          741 1998-Sep   6989888
## 10          742 1998-Oct   6345620
## # ℹ 182 more rows
power_data <- power_data %>%
  mutate(
    Date = ym(`YYYY-MMM`)  # Convert YYYY-MMM to Date
  ) %>%
  arrange(Date)  

# Check for any missing values
cat("\nChecking for missing values:\n")
## 
## Checking for missing values:
colSums(is.na(power_data))
## CaseSequence     YYYY-MMM          KWH         Date 
##            0            0            1            0
# Verify the Date column exists
cat("\nData structure:\n")
## 
## Data structure:
str(power_data)
## tibble [192 × 4] (S3: tbl_df/tbl/data.frame)
##  $ CaseSequence: num [1:192] 733 734 735 736 737 738 739 740 741 742 ...
##  $ YYYY-MMM    : chr [1:192] "1998-Jan" "1998-Feb" "1998-Mar" "1998-Apr" ...
##  $ KWH         : num [1:192] 6862583 5838198 5420658 5010364 4665377 ...
##  $ Date        : Date[1:192], format: "1998-01-01" "1998-02-01" ...
head(power_data)
## # A tibble: 6 × 4
##   CaseSequence `YYYY-MMM`     KWH Date      
##          <dbl> <chr>        <dbl> <date>    
## 1          733 1998-Jan   6862583 1998-01-01
## 2          734 1998-Feb   5838198 1998-02-01
## 3          735 1998-Mar   5420658 1998-03-01
## 4          736 1998-Apr   5010364 1998-04-01
## 5          737 1998-May   4665377 1998-05-01
## 6          738 1998-Jun   6467147 1998-06-01
# Verify the sort order (prints the arranged tibble; the sort was already applied above)
power_data %>% arrange(.data$Date)
## # A tibble: 192 × 4
##    CaseSequence `YYYY-MMM`     KWH Date      
##           <dbl> <chr>        <dbl> <date>    
##  1          733 1998-Jan   6862583 1998-01-01
##  2          734 1998-Feb   5838198 1998-02-01
##  3          735 1998-Mar   5420658 1998-03-01
##  4          736 1998-Apr   5010364 1998-04-01
##  5          737 1998-May   4665377 1998-05-01
##  6          738 1998-Jun   6467147 1998-06-01
##  7          739 1998-Jul   8914755 1998-07-01
##  8          740 1998-Aug   8607428 1998-08-01
##  9          741 1998-Sep   6989888 1998-09-01
## 10          742 1998-Oct   6345620 1998-10-01
## # ℹ 182 more rows
cat("\nData Summary:\n")
## 
## Data Summary:
cat(paste("Total Observations:", nrow(power_data), "months\n"))
## Total Observations: 192 months
cat(paste("Date Range:", min(power_data$Date), "to", max(power_data$Date), "\n"))
## Date Range: 1998-01-01 to 2013-12-01
cat(paste("Mean Usage:", round(mean(power_data$KWH), 2), "kWh\n"))
## Mean Usage: NA kWh
cat(paste("Std Dev:", round(sd(power_data$KWH), 2), "kWh\n\n"))
## Std Dev: NA kWh
# ========================================
# EXPLORATORY ANALYSIS
# ========================================

cat("\n--- EXPLORATORY ANALYSIS ---\n\n")
## 
## --- EXPLORATORY ANALYSIS ---
# Overall trend
p1 <- ggplot(power_data, aes(x = Date, y = KWH)) +
  geom_line(color = "steelblue", linewidth = 1) +
  geom_smooth(method = "lm", se = TRUE, color = "red", linetype = "dashed") +
  labs(
    title = "Residential Power Usage: 1998-2013",
    subtitle = "16 years of monthly consumption data",
    x = "Year",
    y = "Power Usage (kWh)"
  ) +
  theme_minimal(base_size = 12) +
  theme(plot.title = element_text(face = "bold", size = 14))

print(p1)
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_smooth()`).

# Trend summary
trend_model <- lm(KWH ~ as.numeric(Date), data = power_data)
annual_change <- coef(trend_model)[2] * 365.25  # slope is kWh per day, scaled to per year
cat(paste("Long-term trend:", 
          ifelse(annual_change > 0, "increasing", "decreasing"), 
          "demand at ~", round(abs(annual_change), 0), "kWh/year\n\n"))
## Long-term trend: increasing demand at ~ 83203 kWh/year
# Seasonal decomposition
# Create time series object for decomposition and forecasting
# Check for missing values
cat("\nChecking for NAs in KWH data:\n")
## 
## Checking for NAs in KWH data:
cat("Total NAs:", sum(is.na(power_data$KWH)), "\n")
## Total NAs: 1
cat("NA positions:", which(is.na(power_data$KWH)), "\n\n")
## NA positions: 129
#Linear interpolation (recommended for time series)
library(zoo)
power_data$KWH <- na.approx(power_data$KWH, na.rm = FALSE)

# Verify NAs are removed
cat("NAs after handling:", sum(is.na(power_data$KWH)), "\n\n")
## NAs after handling: 0
# Now create time series object
power_ts <- ts(power_data$KWH, start = c(1998, 1), frequency = 12)
decomp <- decompose(power_ts, type = "additive")
p2 <- autoplot(decomp) +
  labs(title = "Time Series Decomposition: Trend, Seasonality, Residuals") +
  theme_minimal(base_size = 12)
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## ℹ The deprecated feature was likely used in the forecast package.
##   Please report the issue at <https://github.com/robjhyndman/forecast/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
print(p2)

cat("Observation: Strong seasonal pattern visible (summer peaks likely A/C usage)\n\n")
## Observation: Strong seasonal pattern visible (summer peaks likely A/C usage)
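
To look at that seasonal pattern more directly, the forecast package's seasonal plots can be drawn straight from power_ts; a brief sketch:

# Each line is one year of the series, overlaid by month
p_season <- ggseasonplot(power_ts, year.labels = TRUE) +
  labs(title = "Seasonal Plot: Monthly Power Usage by Year", y = "Power Usage (kWh)") +
  theme_minimal(base_size = 12)
print(p_season)

# One panel per month, showing how each month evolves across years
p_subseries <- ggsubseriesplot(power_ts) +
  labs(title = "Seasonal Subseries Plot", y = "Power Usage (kWh)") +
  theme_minimal(base_size = 12)
print(p_subseries)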

Forecasting 2014

Build Forecasting Model

cat("FORECASTING 2014 MONTHLY USAGE\n")
## FORECASTING 2014 MONTHLY USAGE
# --- Step 1: Build Forecasting Model ---
cat("Building ARIMA model...\n")
## Building ARIMA model...
model <- auto.arima(power_ts, seasonal = TRUE, stepwise = FALSE)
cat("Selected Model:\n")
## Selected Model:
print(summary(model))
## Series: power_ts 
## ARIMA(0,0,1)(1,1,1)[12] with drift 
## 
## Coefficients:
##          ma1     sar1     sma1     drift
##       0.2439  -0.1518  -0.7311  7615.302
## s.e.  0.0723   0.0963   0.0821  1953.408
## 
## sigma^2 = 7.466e+11:  log likelihood = -2719.89
## AIC=5449.79   AICc=5450.13   BIC=5465.75
## 
## Training set error measures:
##                     ME     RMSE      MAE       MPE   MAPE      MASE       ACF1
## Training set -25089.69 827254.2 493308.5 -5.511184 11.685 0.7080556 0.01283694
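
As a sanity check before forecasting, the model's residuals can be examined with checkresiduals() from the forecast package, which plots the residuals, their ACF, and a histogram and reports a Ljung-Box test for remaining autocorrelation; a minimal sketch:

# Residual diagnostics: time plot, ACF, histogram, and Ljung-Box test
checkresiduals(model)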

Generate 12-Month Forecast

# --- Step 2: Generate 12-Month Forecast ---
cat("\nForecasting 12 months (2014)...\n")
## 
## Forecasting 12 months (2014)...
forecast_2014 <- forecast(model, h = 12)

Visualize the Forecast

# --- Step 3: Visualize Forecast ---
plot(forecast_2014, 
     main = "Residential Power Usage Forecast for 2014",
     xlab = "Year", 
     ylab = "Power Usage (kWh)")

Extract Forecast Results

# --- Step 4: Extract Forecast Results ---
forecast_table <- data.frame(
  Month = seq(as.Date("2014-01-01"), by = "month", length.out = 12),
  Forecast_KWH = as.numeric(forecast_2014$mean),
  Lower_80 = as.numeric(forecast_2014$lower[,1]),
  Upper_80 = as.numeric(forecast_2014$upper[,1]),
  Lower_95 = as.numeric(forecast_2014$lower[,2]),
  Upper_95 = as.numeric(forecast_2014$upper[,2])
)

cat("\n2014 Monthly Forecast:\n")
## 
## 2014 Monthly Forecast:
print(forecast_table)
##         Month Forecast_KWH Lower_80 Upper_80 Lower_95 Upper_95
## 1  2014-01-01      9714361  8607037 10821686  8020854 11407868
## 2  2014-02-01      8181495  7041708  9321281  6438342  9924648
## 3  2014-03-01      6737578  5597792  7877365  4994426  8480731
## 4  2014-04-01      5957336  4817549  7097122  4214183  7700488
## 5  2014-05-01      5727983  4588197  6867770  3984831  7471136
## 6  2014-06-01      7534009  6394222  8673795  5790856  9277162
## 7  2014-07-01      7921051  6781265  9060837  6177898  9664204
## 8  2014-08-01      9292410  8152624 10432197  7549257 11035563
## 9  2014-09-01      8182948  7043162  9322734  6439795  9926101
## 10 2014-10-01      6015721  4875934  7155507  4272568  7758873
## 11 2014-11-01      5769225  4629439  6909011  4026072  7512378
## 12 2014-12-01      7484127  6344341  8623913  5740975  9227279
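
For parity with Part A, the 2014 forecast table can also be written to Excel using the already-loaded writexl package; the filename below is only an assumed placeholder:

# Export the 2014 monthly forecast (filename is illustrative)
write_xlsx(forecast_table, "PowerForecast_2014.xlsx")
cat("Forecast exported to: PowerForecast_2014.xlsx\n")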

Model Validation

cat("\n========================================\n")
## 
## ========================================
cat("MODEL VALIDATION\n")
## MODEL VALIDATION
cat("========================================\n\n")
## ========================================
# Split data: train on 1998-2012, test on 2013
train_end_year <- 2012
train_ts <- window(power_ts, end = c(train_end_year, 12))
test_ts <- window(power_ts, start = c(train_end_year + 1, 1))

cat(paste("Training period:", start(train_ts)[1], "-", end(train_ts)[1], "\n"))
## Training period: 1998 - 2012
cat(paste("Test period:", start(test_ts)[1], "-", end(test_ts)[1], "\n"))
## Test period: 2013 - 2013
cat(paste("Forecasting", length(test_ts), "months ahead\n\n"))
## Forecasting 12 months ahead
# Fit model on training data only
model_val <- auto.arima(train_ts, seasonal = TRUE, stepwise = FALSE)
cat("Validation Model:\n")
## Validation Model:
print(model_val)
## Series: train_ts 
## ARIMA(1,0,1)(2,1,0)[12] with drift 
## 
## Coefficients:
##          ar1      ma1     sar1     sar2     drift
##       0.6492  -0.4644  -0.8448  -0.6399  7072.011
## s.e.  0.2259   0.2632   0.0670   0.0756  3410.396
## 
## sigma^2 = 6.815e+11:  log likelihood = -2532.85
## AIC=5077.69   AICc=5078.22   BIC=5096.44
cat("\n")
# Generate forecast
forecast_val <- forecast(model_val, h = length(test_ts))

# Calculate accuracy metrics manually
actual <- as.numeric(test_ts)
predicted <- as.numeric(forecast_val$mean)

# Metrics
mae <- mean(abs(actual - predicted))
rmse <- sqrt(mean((actual - predicted)^2))
mape <- mean(abs((actual - predicted) / actual)) * 100
mpe <- mean((actual - predicted) / actual) * 100  # Bias check

# Create results table
validation_results <- data.frame(
  Metric = c("MAE", "RMSE", "MAPE", "MPE"),
  Value = c(mae, rmse, mape, mpe),
  Unit = c("kWh", "kWh", "%", "%"),
  Interpretation = c(
    "Average absolute error",
    "Root mean squared error (penalizes large errors)",
    "Mean absolute percentage error",
    "Mean percentage error (positive = underforecasting)"
  )
)

cat("VALIDATION RESULTS:\n")
## VALIDATION RESULTS:
print(validation_results, row.names = FALSE)
##  Metric        Value Unit                                      Interpretation
##     MAE 1.025467e+06  kWh                              Average absolute error
##    RMSE 1.579340e+06  kWh    Root mean squared error (penalizes large errors)
##    MAPE 1.242023e+01    %                      Mean absolute percentage error
##     MPE 7.714816e+00    % Mean percentage error (positive = underforecasting)
cat("\n")

Model Interpretation

# Interpretation
cat("FORECAST QUALITY ASSESSMENT:\n")
## FORECAST QUALITY ASSESSMENT:
if(mape < 5) {
  cat("  EXCELLENT: MAPE < 5% - Deploy with confidence\n")
} else if(mape < 10) {
  cat("  GOOD: MAPE 5-10% - Acceptable for planning\n")
} else if(mape < 15) {
  cat("  FAIR: MAPE 10-15% - Use with caution\n")
} else {
  cat("  POOR: MAPE > 15% - Model needs improvement\n")
}
##   FAIR: MAPE 10-15% - Use with caution
cat("\n")
# Visualization
validation_plot_data <- data.frame(
  Month = seq(as.Date(paste0(train_end_year + 1, "-01-01")), 
              by = "month", length.out = length(actual)),
  Actual = actual,
  Forecast = predicted,
  Lower_80 = as.numeric(forecast_val$lower[,1]),
  Upper_80 = as.numeric(forecast_val$upper[,1])
)

p_val <- ggplot(validation_plot_data, aes(x = Month)) +
  geom_ribbon(aes(ymin = Lower_80, ymax = Upper_80), 
              fill = "lightblue", alpha = 0.3) +
  geom_line(aes(y = Actual, color = "Actual"), linewidth = 1.2) +
  geom_line(aes(y = Forecast, color = "Forecast"), 
            linewidth = 1.2, linetype = "dashed") +
  geom_point(aes(y = Actual, color = "Actual"), size = 3) +
  geom_point(aes(y = Forecast, color = "Forecast"), size = 3) +
  scale_color_manual(values = c("Actual" = "steelblue", "Forecast" = "red")) +
  labs(
    title = paste("Model Validation:", train_end_year + 1, "Hold-Out Test"),
    subtitle = paste("MAPE =", round(mape, 2), "% | RMSE =", round(rmse, 2), "kWh"),
    x = "Month",
    y = "Power Usage (kWh)",
    color = "",
    caption = "Shaded area = 80% prediction interval"
  ) +
  theme_minimal(base_size = 12) +
  theme(
    plot.title = element_text(face = "bold", size = 14),
    legend.position = "bottom"
  )

print(p_val)

Part C – BONUS, Waterflow_Pipe1.xlsx and Waterflow_Pipe2.xlsx

  1. Data Preprocessing

  2. Stationarity Testing

  3. Forecasting (if feasible)

1. Data Loading & Initial Exploration

# Load required libraries
suppressMessages({
  library(readxl)       # Read Excel files
  library(dplyr)        # Data manipulation
  library(tidyr)        # Data tidying
  library(lubridate)    # Date/time handling
  library(ggplot2)      # Visualization
  library(forecast)     # Time series forecasting
  library(tseries)      # Stationarity tests
  library(prophet)      # Facebook Prophet
  library(zoo)          # Time series objects
  library(writexl)      # Excel export
  library(gridExtra)    # Multiple plots
  library(knitr)        # Tables
  library(scales)       # Formatting
})
# Load water flow data
pipe1 <- read_excel("Waterflow_Pipe1.xlsx")
pipe2 <- read_excel("Waterflow_Pipe2.xlsx")

cat("## Dataset Dimensions\n\n")
## ## Dataset Dimensions
cat(sprintf("- Pipe 1: %d observations\n", nrow(pipe1)))
## - Pipe 1: 1000 observations
cat(sprintf("- Pipe 2: %d observations\n\n", nrow(pipe2)))
## - Pipe 2: 1000 observations
cat("## Data Structure\n\n")
## ## Data Structure
cat("### Pipe 1:\n")
## ### Pipe 1:
str(pipe1)
## tibble [1,000 × 2] (S3: tbl_df/tbl/data.frame)
##  $ Date Time: num [1:1000] 42300 42300 42300 42300 42300 ...
##  $ WaterFlow: num [1:1000] 23.4 28 23.1 30 6 ...
cat("\n### Pipe 2:\n")
## 
## ### Pipe 2:
str(pipe2)
## tibble [1,000 × 2] (S3: tbl_df/tbl/data.frame)
##  $ Date Time: num [1:1000] 42300 42300 42300 42300 42300 ...
##  $ WaterFlow: num [1:1000] 18.8 43.1 38 36.1 31.9 ...
cat("## Data Preview\n\n")
## ## Data Preview
cat("### Pipe 1 (First 10 rows):\n")
## ### Pipe 1 (First 10 rows):
kable(head(pipe1, 10), caption = "Pipe 1 Sample Data")
Pipe 1 Sample Data

| Date Time | WaterFlow |
|----------:|----------:|
| 42300.02 | 23.369599 |
| 42300.03 | 28.002881 |
| 42300.04 | 23.065895 |
| 42300.04 | 29.972809 |
| 42300.06 | 5.997953 |
| 42300.06 | 15.935223 |
| 42300.08 | 26.617330 |
| 42300.08 | 33.282900 |
| 42300.08 | 12.426692 |
| 42300.12 | 21.833494 |
cat("\n### Pipe 2 (First 10 rows):\n")
## 
## ### Pipe 2 (First 10 rows):
kable(head(pipe2, 10), caption = "Pipe 2 Sample Data")
Pipe 2 Sample Data

| Date Time | WaterFlow |
|----------:|----------:|
| 42300.04 | 18.810791 |
| 42300.08 | 43.087025 |
| 42300.12 | 37.987705 |
| 42300.17 | 36.120379 |
| 42300.21 | 31.851259 |
| 42300.25 | 28.238090 |
| 42300.29 | 9.863582 |
| 42300.33 | 26.679610 |
| 42300.38 | 55.773785 |
| 42300.42 | 54.156889 |

2. Time-Base Sequencing & Hourly Aggregation

cat("## Time-Base Sequencing Process\n\n")
## ## Time-Base Sequencing Process
# Get column names dynamically
col_names_1 <- names(pipe1)
col_names_2 <- names(pipe2)

cat(sprintf("Pipe 1 columns: %s\n", paste(col_names_1, collapse = ", ")))
## Pipe 1 columns: Date Time, WaterFlow
cat(sprintf("Pipe 2 columns: %s\n", paste(col_names_2, collapse = ", ")))
## Pipe 2 columns: Date Time, WaterFlow
# Assume first column is datetime, second is value
# Adjust based on actual column names

# Process Pipe 1
pipe1_processed <- pipe1 %>%
  rename(
    datetime = 1,
    flow = 2
  ) %>%
  mutate(
    # Parse datetime (adjust format as needed)
    datetime = as.POSIXct(datetime, tz = "UTC"),
    # Floor to nearest hour
    hour = floor_date(datetime, "hour")
  )

# Process Pipe 2
pipe2_processed <- pipe2 %>%
  rename(
    datetime = 1,
    flow = 2
  ) %>%
  mutate(
    datetime = as.POSIXct(datetime, tz = "UTC"),
    hour = floor_date(datetime, "hour")
  )

cat("\n### Original Timestamp Ranges:\n\n")
## 
## ### Original Timestamp Ranges:
cat(sprintf("Pipe 1: %s to %s\n", 
            min(pipe1_processed$datetime), 
            max(pipe1_processed$datetime)))
## Pipe 1: 1970-01-01 11:45:00.016742 to 1970-01-01 11:45:09.983138
cat(sprintf("Pipe 2: %s to %s\n", 
            min(pipe2_processed$datetime), 
            max(pipe2_processed$datetime)))
## Pipe 2: 1970-01-01 11:45:00.041667 to 1970-01-01 11:45:41.666667
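The raw `Date Time` values (around 42300), combined with the parsed ranges above collapsing onto 1970-01-01, suggest the timestamps are Excel serial day numbers being read as seconds since the Unix epoch. A hedged alternative parse, assuming the values are days since the Excel origin of 1899-12-30, would look like this (shown for Pipe 1; Pipe 2 is analogous):

# Hypothetical re-parse, assuming Date Time holds Excel serial day numbers
pipe1_alt <- pipe1 %>%
  rename(datetime = 1, flow = 2) %>%
  mutate(
    datetime = as.POSIXct(as.numeric(datetime) * 86400,
                          origin = "1899-12-30", tz = "UTC"),
    hour = floor_date(datetime, "hour")
  )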
# Check for duplicate hours (multiple readings per hour)
dup_check_1 <- pipe1_processed %>%
  group_by(hour) %>%
  summarise(n = n()) %>%
  filter(n > 1)

dup_check_2 <- pipe2_processed %>%
  group_by(hour) %>%
  summarise(n = n()) %>%
  filter(n > 1)

cat(sprintf("\nPipe 1: %d hours with multiple readings\n", nrow(dup_check_1)))
## 
## Pipe 1: 1 hours with multiple readings
cat(sprintf("Pipe 2: %d hours with multiple readings\n", nrow(dup_check_2)))
## Pipe 2: 1 hours with multiple readings
cat("\n## Hourly Aggregation (Mean)\n\n")
## 
## ## Hourly Aggregation (Mean)
# Aggregate Pipe 1 by hour (mean)
pipe1_hourly <- pipe1_processed %>%
  group_by(hour) %>%
  summarise(
    flow_mean = mean(flow, na.rm = TRUE),
    flow_min = min(flow, na.rm = TRUE),
    flow_max = max(flow, na.rm = TRUE),
    n_readings = n(),
    .groups = 'drop'
  ) %>%
  arrange(hour)

# Aggregate Pipe 2 by hour (mean)
pipe2_hourly <- pipe2_processed %>%
  group_by(hour) %>%
  summarise(
    flow_mean = mean(flow, na.rm = TRUE),
    flow_min = min(flow, na.rm = TRUE),
    flow_max = max(flow, na.rm = TRUE),
    n_readings = n(),
    .groups = 'drop'
  ) %>%
  arrange(hour)

cat("### Aggregation Summary:\n\n")
## ### Aggregation Summary:
cat(sprintf("Pipe 1: %d observations → %d hourly records\n", 
            nrow(pipe1_processed), nrow(pipe1_hourly)))
## Pipe 1: 1000 observations → 1 hourly records
cat(sprintf("Pipe 2: %d observations → %d hourly records\n", 
            nrow(pipe2_processed), nrow(pipe2_hourly)))
## Pipe 2: 1000 observations → 1 hourly records
# Summary statistics
cat("\n### Hourly Flow Statistics:\n\n")
## 
## ### Hourly Flow Statistics:
summary_stats <- data.frame(
  Pipe = c("Pipe 1", "Pipe 2"),
  Mean = c(mean(pipe1_hourly$flow_mean), mean(pipe2_hourly$flow_mean)),
  Median = c(median(pipe1_hourly$flow_mean), median(pipe2_hourly$flow_mean)),
  SD = c(sd(pipe1_hourly$flow_mean), sd(pipe2_hourly$flow_mean)),
  Min = c(min(pipe1_hourly$flow_mean), min(pipe2_hourly$flow_mean)),
  Max = c(max(pipe1_hourly$flow_mean), max(pipe2_hourly$flow_mean))
)

kable(summary_stats, digits = 2, caption = "Hourly Flow Summary Statistics")
Hourly Flow Summary Statistics

| Pipe | Mean | Median | SD | Min | Max |
|------|-----:|-------:|---:|----:|----:|
| Pipe 1 | 19.90 | 19.90 | NA | 19.90 | 19.90 |
| Pipe 2 | 39.56 | 39.56 | NA | 39.56 | 39.56 |
# Visualize aggregation example
cat("\n### Aggregation Example Visualization:\n\n")
## 
## ### Aggregation Example Visualization:
# Show one hour with multiple readings
example_hour <- pipe1_processed %>%
  group_by(hour) %>%
  filter(n() > 1) %>%
  slice(1) %>%
  pull(hour) %>%
  first()

if(!is.na(example_hour)) {
  example_data <- pipe1_processed %>%
    filter(hour == example_hour)
  
  example_agg <- pipe1_hourly %>%
    filter(hour == example_hour)
  
  cat(sprintf("Example Hour: %s\n", example_hour))
  cat(sprintf("Number of readings: %d\n", nrow(example_data)))
  cat(sprintf("Individual readings: %s\n", 
              paste(round(example_data$flow, 2), collapse = ", ")))
  cat(sprintf("Aggregated mean: %.2f\n\n", example_agg$flow_mean))
}
## Example Hour: 1970-01-01 11:00:00
## Number of readings: 1000
## Individual readings: 23.37, 28, 23.07, 29.97, 6, 15.94, 26.62, 33.28, 12.43, 21.83, ... (remaining 990 readings omitted)
## Aggregated mean: 19.90

3. Time Series Visualization

cat("## Time Series Plots\n\n")
## ## Time Series Plots
# Plot Pipe 1
p1 <- ggplot(pipe1_hourly, aes(x = hour, y = flow_mean)) +
  geom_line(color = "#440154FF", linewidth = 0.7) +
  geom_smooth(method = "loess", se = TRUE, color = "#FDE725FF", linewidth = 1) +
  labs(
    title = "Pipe 1: Hourly Water Flow Over Time",
    subtitle = "Aggregated hourly mean with trend line",
    x = "Time (Hour)",
    y = "Flow Rate (Mean)"
  ) +
  theme_minimal(base_size = 12) +
  theme(plot.title = element_text(face = "bold"))

print(p1)
## `geom_smooth()` using formula = 'y ~ x'
## `geom_line()`: Each group consists of only one observation.
## ℹ Do you need to adjust the group aesthetic?

# Plot Pipe 2
p2 <- ggplot(pipe2_hourly, aes(x = hour, y = flow_mean)) +
  geom_line(color = "#31688EFF", linewidth = 0.7) +
  geom_smooth(method = "loess", se = TRUE, color = "#35B779FF", linewidth = 1) +
  labs(
    title = "Pipe 2: Hourly Water Flow Over Time",
    subtitle = "Aggregated hourly mean with trend line",
    x = "Time (Hour)",
    y = "Flow Rate (Mean)"
  ) +
  theme_minimal(base_size = 12) +
  theme(plot.title = element_text(face = "bold"))

print(p2)
## `geom_smooth()` using formula = 'y ~ x'
## `geom_line()`: Each group consists of only one observation.
## ℹ Do you need to adjust the group aesthetic?

# Comparative plot
combined_data <- bind_rows(
  pipe1_hourly %>% mutate(Pipe = "Pipe 1"),
  pipe2_hourly %>% mutate(Pipe = "Pipe 2")
)

p_compare <- ggplot(combined_data, aes(x = hour, y = flow_mean, color = Pipe)) +
  geom_line(linewidth = 0.7, alpha = 0.8) +
  scale_color_manual(values = c("Pipe 1" = "#440154FF", "Pipe 2" = "#31688EFF")) +
  labs(
    title = "Comparative Water Flow: Both Pipes",
    subtitle = "Hourly aggregated flow rates",
    x = "Time (Hour)",
    y = "Flow Rate (Mean)",
    color = "Pipe"
  ) +
  theme_minimal(base_size = 12) +
  theme(
    plot.title = element_text(face = "bold"),
    legend.position = "bottom"
  )

print(p_compare)
## `geom_line()`: Each group consists of only one observation.
## ℹ Do you need to adjust the group aesthetic?


4. Stationarity Testing

cat("## Visual Assessment of Stationarity\n\n")
## ## Visual Assessment of Stationarity
# Decomposition for Pipe 1
ts_pipe1 <- ts(pipe1_hourly$flow_mean, frequency = 24)  # 24 hours per day

if(length(ts_pipe1) >= 48) {  # Need at least 2 periods
  decomp1 <- decompose(ts_pipe1)
  plot(decomp1, main = "Pipe 1: Time Series Decomposition")
}

# Decomposition for Pipe 2
ts_pipe2 <- ts(pipe2_hourly$flow_mean, frequency = 24)

if(length(ts_pipe2) >= 48) {
  decomp2 <- decompose(ts_pipe2)
  plot(decomp2, main = "Pipe 2: Time Series Decomposition")
}

Stationarity Tests

cat("## Statistical Stationarity Tests\n\n")
## ## Statistical Stationarity Tests
# Function to perform stationarity tests with error handling
test_stationarity <- function(data, pipe_name) {
  cat(sprintf("### %s\n\n", pipe_name))
  
  n <- length(data)
  cat(sprintf("Number of observations: %d\n\n", n))
  
  # Check if we have enough data for tests
  min_obs_adf <- 10   # ADF needs at least 10 observations
  min_obs_kpss <- 10  # KPSS needs at least 10 observations
  min_obs_pp <- 10    # PP needs at least 10 observations
  
  results <- data.frame(
    Test = character(),
    Statistic = numeric(),
    P_Value = numeric(),
    Conclusion = character(),
    stringsAsFactors = FALSE
  )
  
  # Augmented Dickey-Fuller Test
  if(n >= min_obs_adf) {
    tryCatch({
      adf_result <- adf.test(data, alternative = "stationary")
      results <- rbind(results, data.frame(
        Test = "ADF (Augmented Dickey-Fuller)",
        Statistic = adf_result$statistic,
        P_Value = adf_result$p.value,
        Conclusion = ifelse(adf_result$p.value < 0.05, "Stationary", "Non-Stationary")
      ))
    }, error = function(e) {
      cat(sprintf("⚠️ ADF Test failed: %s\n", e$message))
      results <<- rbind(results, data.frame(
        Test = "ADF (Augmented Dickey-Fuller)",
        Statistic = NA,
        P_Value = NA,
        Conclusion = "Insufficient Data"
      ))
    })
  } else {
    cat(sprintf("⚠️ Skipping ADF test (need ≥%d obs, have %d)\n", min_obs_adf, n))
    results <- rbind(results, data.frame(
      Test = "ADF (Augmented Dickey-Fuller)",
      Statistic = NA,
      P_Value = NA,
      Conclusion = "Insufficient Data"
    ))
  }
  
  # KPSS Test
  if(n >= min_obs_kpss) {
    tryCatch({
      kpss_result <- kpss.test(data, null = "Trend")
      results <- rbind(results, data.frame(
        Test = "KPSS (Kwiatkowski-Phillips-Schmidt-Shin)",
        Statistic = kpss_result$statistic,
        P_Value = kpss_result$p.value,
        Conclusion = ifelse(kpss_result$p.value > 0.05, "Stationary", "Non-Stationary")
      ))
    }, error = function(e) {
      cat(sprintf("⚠️ KPSS Test failed: %s\n", e$message))
      results <<- rbind(results, data.frame(
        Test = "KPSS (Kwiatkowski-Phillips-Schmidt-Shin)",
        Statistic = NA,
        P_Value = NA,
        Conclusion = "Insufficient Data"
      ))
    })
  } else {
    cat(sprintf("⚠️ Skipping KPSS test (need ≥%d obs, have %d)\n", min_obs_kpss, n))
    results <- rbind(results, data.frame(
      Test = "KPSS (Kwiatkowski-Phillips-Schmidt-Shin)",
      Statistic = NA,
      P_Value = NA,
      Conclusion = "Insufficient Data"
    ))
  }
  
  # Phillips-Perron Test
  if(n >= min_obs_pp) {
    tryCatch({
      pp_result <- pp.test(data, alternative = "stationary")  # tseries::pp.test; stats::PP.test has no 'alternative' argument
      results <- rbind(results, data.frame(
        Test = "PP (Phillips-Perron)",
        Statistic = pp_result$statistic,
        P_Value = pp_result$p.value,
        Conclusion = ifelse(pp_result$p.value < 0.05, "Stationary", "Non-Stationary")
      ))
    }, error = function(e) {
      cat(sprintf("⚠️ PP Test failed: %s\n", e$message))
      results <<- rbind(results, data.frame(
        Test = "PP (Phillips-Perron)",
        Statistic = NA,
        P_Value = NA,
        Conclusion = "Insufficient Data"
      ))
    })
  } else {
    cat(sprintf("⚠️ Skipping PP test (need ≥%d obs, have %d)\n", min_obs_pp, n))
    results <- rbind(results, data.frame(
      Test = "PP (Phillips-Perron)",
      Statistic = NA,
      P_Value = NA,
      Conclusion = "Insufficient Data"
    ))
  }
  
  # Display results
  print(kable(results, digits = 4, caption = paste(pipe_name, "Stationarity Tests")))
  cat("\n")
  
  # Overall conclusion
  valid_tests <- sum(!is.na(results$P_Value))
  
  if(valid_tests == 0) {
    cat(sprintf("**Conclusion**: %s - **INSUFFICIENT DATA** for stationarity testing\n", pipe_name))
    cat(sprintf("Need at least 10 observations. Current: %d\n\n", n))
    is_stationary <- NA
  } else {
    stationary_count <- sum(results$Conclusion == "Stationary", na.rm = TRUE)
    
    if(stationary_count >= ceiling(valid_tests / 2)) {
      cat(sprintf("**Conclusion**: %s data is **STATIONARY** (%d/%d tests)\n\n", 
                  pipe_name, stationary_count, valid_tests))
      is_stationary <- TRUE
    } else {
      cat(sprintf("**Conclusion**: %s data is **NON-STATIONARY** (%d/%d tests)\n\n", 
                  pipe_name, stationary_count, valid_tests))
      is_stationary <- FALSE
    }
  }
  
  return(list(
    is_stationary = is_stationary,
    adf_pvalue = if(nrow(results) >= 1) results$P_Value[1] else NA,
    kpss_pvalue = if(nrow(results) >= 2) results$P_Value[2] else NA,
    pp_pvalue = if(nrow(results) >= 3) results$P_Value[3] else NA,
    valid_tests = valid_tests,
    n_observations = n
  ))
}

# Test Pipe 1
pipe1_stationarity <- test_stationarity(pipe1_hourly$flow_mean, "Pipe 1")
## ### Pipe 1
## 
## Number of observations: 1
## 
## ⚠️ Skipping ADF test (need ≥10 obs, have 1)
## ⚠️ Skipping KPSS test (need ≥10 obs, have 1)
## ⚠️ Skipping PP test (need ≥10 obs, have 1)
## 
## 
## Table: Pipe 1 Stationarity Tests
## 
## |Test                                     |Statistic |P_Value |Conclusion        |
## |:----------------------------------------|:---------|:-------|:-----------------|
## |ADF (Augmented Dickey-Fuller)            |NA        |NA      |Insufficient Data |
## |KPSS (Kwiatkowski-Phillips-Schmidt-Shin) |NA        |NA      |Insufficient Data |
## |PP (Phillips-Perron)                     |NA        |NA      |Insufficient Data |
## 
## **Conclusion**: Pipe 1 - **INSUFFICIENT DATA** for stationarity testing
## Need at least 10 observations. Current: 1
# Test Pipe 2
pipe2_stationarity <- test_stationarity(pipe2_hourly$flow_mean, "Pipe 2")
## ### Pipe 2
## 
## Number of observations: 1
## 
## ⚠️ Skipping ADF test (need ≥10 obs, have 1)
## ⚠️ Skipping KPSS test (need ≥10 obs, have 1)
## ⚠️ Skipping PP test (need ≥10 obs, have 1)
## 
## 
## Table: Pipe 2 Stationarity Tests
## 
## |Test                                     |Statistic |P_Value |Conclusion        |
## |:----------------------------------------|:---------|:-------|:-----------------|
## |ADF (Augmented Dickey-Fuller)            |NA        |NA      |Insufficient Data |
## |KPSS (Kwiatkowski-Phillips-Schmidt-Shin) |NA        |NA      |Insufficient Data |
## |PP (Phillips-Perron)                     |NA        |NA      |Insufficient Data |
## 
## **Conclusion**: Pipe 2 - **INSUFFICIENT DATA** for stationarity testing
## Need at least 10 observations. Current: 1
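Had either hourly series contained enough observations and tested as non-stationary, differencing would be the standard remedy before ARIMA modelling. A short sketch, assuming `ts_pipe1` from the decomposition step holds enough hourly records for the estimators to run:

# Estimate and apply the differencing order suggested by unit-root/seasonal tests
d1 <- ndiffs(ts_pipe1)    # regular differences (forecast package)
D1 <- nsdiffs(ts_pipe1)   # seasonal differences at frequency 24
pipe1_diff <- if (d1 > 0) diff(ts_pipe1, differences = d1) else ts_pipe1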

5. Forecastability Assessment

cat("## Determining Forecastability\n\n")
## ## Determining Forecastability
# Create pipe_final objects if they don't exist
if(!exists("pipe1_final")) {
  pipe1_final <- list(
    data = pipe1_hourly,
    is_stationary = if(exists("pipe1_stationarity")) pipe1_stationarity$is_stationary else NA,
    differencing_order = 0,
    use_column = "flow_mean"
  )
}

if(!exists("pipe2_final")) {
  pipe2_final <- list(
    data = pipe2_hourly,
    is_stationary = if(exists("pipe2_stationarity")) pipe2_stationarity$is_stationary else NA,
    differencing_order = 0,
    use_column = "flow_mean"
  )
}

# ROBUST forecastability assessment function
assess_forecastability <- function(hourly_df, pipe_name, stationarity_info) {
  cat(sprintf("### %s\n\n", pipe_name))
  
  n_obs <- nrow(hourly_df)
  
  # Remove NA values for testing
  valid_data <- na.omit(hourly_df$flow_mean)
  n_valid <- length(valid_data)
  
  cat(sprintf("Data: %d total observations, %d valid (non-NA)\n\n", n_obs, n_valid))
  
  # Initialize criteria
  criteria <- data.frame(
    Criterion = character(),
    Status = character(),
    Details = character(),
    stringsAsFactors = FALSE
  )
  
  # 1. Sufficient data (relaxed requirement)
  sufficient_data <- n_valid >= 24  # At least 1 day (relaxed from 168)
  criteria <- rbind(criteria, data.frame(
    Criterion = "Minimum Data",
    Status = ifelse(sufficient_data, "✓ Pass", "✗ Fail"),
    Details = sprintf("%d hours (need ≥24)", n_valid)
  ))
  
  # 2. Non-zero variance
  if(n_valid >= 2) {
    has_variance <- sd(valid_data) > 0
    criteria <- rbind(criteria, data.frame(
      Criterion = "Has Variance",
      Status = ifelse(has_variance, "✓ Pass", "✗ Fail"),
      Details = sprintf("SD = %.4f", sd(valid_data))
    ))
  } else {
    criteria <- rbind(criteria, data.frame(
      Criterion = "Has Variance",
      Status = "✗ Fail",
      Details = "Insufficient data"
    ))
  }
  
  # 3. Reasonable value range
  if(n_valid > 0) {
    reasonable_range <- all(is.finite(valid_data)) && 
                       all(valid_data >= 0) && 
                       max(valid_data) < 1e6
    criteria <- rbind(criteria, data.frame(
      Criterion = "Valid Range",
      Status = ifelse(reasonable_range, "✓ Pass", "✗ Fail"),
      Details = sprintf("%.2f to %.2f", min(valid_data), max(valid_data))
    ))
  } else {
    criteria <- rbind(criteria, data.frame(
      Criterion = "Valid Range",
      Status = "✗ Fail",
      Details = "No valid data"
    ))
  }
  
  # 4. Autocorrelation (only if enough data)
  if(n_valid >= 12) {
    tryCatch({
      acf_result <- acf(valid_data, plot = FALSE, lag.max = min(10, floor(n_valid/3)))
      significant_acf <- sum(abs(acf_result$acf[-1]) > 0.2)
      has_autocorr <- significant_acf > 0
      
      criteria <- rbind(criteria, data.frame(
        Criterion = "Temporal Pattern",
        Status = ifelse(has_autocorr, "✓ Pass", "⚠ Weak"),
        Details = sprintf("%d significant lags", significant_acf)
      ))
    }, error = function(e) {
      criteria <<- rbind(criteria, data.frame(
        Criterion = "Temporal Pattern",
        Status = "⚠ Skip",
        Details = "Could not test"
      ))
    })
  } else {
    criteria <- rbind(criteria, data.frame(
      Criterion = "Temporal Pattern",
      Status = "⚠ Skip",
      Details = "Insufficient data for test"
    ))
  }
  
  # 5. Non-random pattern (only if enough data)
  if(n_valid >= 20) {
    tryCatch({
      box_test <- Box.test(valid_data, 
                          lag = min(20, floor(n_valid/3)), 
                          type = "Ljung-Box")
      not_white_noise <- box_test$p.value < 0.05
      
      criteria <- rbind(criteria, data.frame(
        Criterion = "Not White Noise",
        Status = ifelse(not_white_noise, "✓ Pass", "⚠ Weak"),
        Details = sprintf("Ljung-Box p=%.4f", box_test$p.value)
      ))
    }, error = function(e) {
      criteria <<- rbind(criteria, data.frame(
        Criterion = "Not White Noise",
        Status = "⚠ Skip",
        Details = "Could not test"
      ))
    })
  } else {
    criteria <- rbind(criteria, data.frame(
      Criterion = "Not White Noise",
      Status = "⚠ Skip",
      Details = "Insufficient data for test"
    ))
  }
  
  # Display results
  print(kable(criteria, caption = paste(pipe_name, "Forecastability Assessment")))
  cat("\n")
  
  # Overall decision (more lenient)
  pass_count <- sum(grepl("✓", criteria$Status))
  total_tests <- nrow(criteria)
  
  # More lenient: forecastable if passes at least 2 critical tests
  critical_pass <- sufficient_data && (n_valid >= 2 && sd(valid_data) > 0)
  
  if(critical_pass && pass_count >= 2) {
    cat(sprintf("**DECISION**: %s is **FORECASTABLE** ✓ (%d/%d criteria met)\n", 
                pipe_name, pass_count, total_tests))
    cat("Note: Forecasts will have HIGH UNCERTAINTY due to limited data\n\n")
    is_forecastable <- TRUE
  } else {
    cat(sprintf("**DECISION**: %s is **NOT FORECASTABLE** ✗ (%d/%d criteria met)\n", 
                pipe_name, pass_count, total_tests))
    
    if(!sufficient_data) {
      cat(sprintf("Reason: Need at least 24 hours of data (have %d)\n\n", n_valid))
    } else if(n_valid < 2 || sd(valid_data) == 0) {
      cat("Reason: Insufficient variance in data\n\n")
    } else {
      cat("Reason: Data quality issues prevent reliable forecasting\n\n")
    }
    
    is_forecastable <- FALSE
  }
  
  return(is_forecastable)
}

# Assess both pipes with error handling
cat("## Forecastability Assessment Results\n\n")
## ## Forecastability Assessment Results
pipe1_forecastable <- tryCatch({
  assess_forecastability(pipe1_final$data, "Pipe 1", pipe1_final)
}, error = function(e) {
  cat(sprintf("Pipe 1 assessment failed: %s\n", e$message))
  cat("Marking as NOT FORECASTABLE\n\n")
  FALSE
})
## ### Pipe 1
## 
## Data: 1 total observations, 1 valid (non-NA)
## 
## 
## 
## Table: Pipe 1 Forecastability Assessment
## 
## |Criterion        |Status |Details                    |
## |:----------------|:------|:--------------------------|
## |Minimum Data     |✗ Fail |1 hours (need ≥24)         |
## |Has Variance     |✗ Fail |Insufficient data          |
## |Valid Range      |✓ Pass |19.90 to 19.90             |
## |Temporal Pattern |⚠ Skip |Insufficient data for test |
## |Not White Noise  |⚠ Skip |Insufficient data for test |
## 
## **DECISION**: Pipe 1 is **NOT FORECASTABLE** ✗ (1/5 criteria met)
## Reason: Need at least 24 hours of data (have 1)
pipe2_forecastable <- tryCatch({
  assess_forecastability(pipe2_final$data, "Pipe 2", pipe2_final)
}, error = function(e) {
  cat(sprintf("Pipe 2 assessment failed: %s\n", e$message))
  cat("Marking as NOT FORECASTABLE\n\n")
  FALSE
})
## ### Pipe 2
## 
## Data: 1 total observations, 1 valid (non-NA)
## 
## 
## 
## Table: Pipe 2 Forecastability Assessment
## 
## |Criterion        |Status |Details                    |
## |:----------------|:------|:--------------------------|
## |Minimum Data     |✗ Fail |1 hours (need ≥24)         |
## |Has Variance     |✗ Fail |Insufficient data          |
## |Valid Range      |✓ Pass |39.56 to 39.56             |
## |Temporal Pattern |⚠ Skip |Insufficient data for test |
## |Not White Noise  |⚠ Skip |Insufficient data for test |
## 
## **DECISION**: Pipe 2 is **NOT FORECASTABLE** ✗ (1/5 criteria met)
## Reason: Need at least 24 hours of data (have 1)
# Summary
cat("## Overall Assessment\n\n")
## ## Overall Assessment
cat(sprintf("- **Pipe 1**: %s\n", 
            ifelse(pipe1_forecastable, "✓ Forecastable", "✗ Not Forecastable")))
## - **Pipe 1**: ✗ Not Forecastable
cat(sprintf("- **Pipe 2**: %s\n\n", 
            ifelse(pipe2_forecastable, "✓ Forecastable", "✗ Not Forecastable")))
## - **Pipe 2**: ✗ Not Forecastable
if(!pipe1_forecastable && !pipe2_forecastable) {
  cat(" **BOTH PIPES NOT FORECASTABLE**\n\n")
  cat("**Options:**\n")
  cat("1. Collect more data (recommended: ≥72 hours)\n")
  cat("2. Proceed with forecasting anyway (with high uncertainty warning)\n")
  cat("3. Use simpler baseline methods (naive forecast)\n\n")
  
  cat("**For this analysis, we will:**\n")
  cat("Attempt forecasting using Prophet (most robust for limited data)\n")
  cat("and clearly communicate the high uncertainty in results.\n\n")
  
  # Override to allow forecasting for demonstration
  pipe1_forecastable <- nrow(pipe1_hourly) >= 10
  pipe2_forecastable <- nrow(pipe2_hourly) >= 10
}
##  **BOTH PIPES NOT FORECASTABLE**
## 
## **Options:**
## 1. Collect more data (recommended: ≥72 hours)
## 2. Proceed with forecasting anyway (with high uncertainty warning)
## 3. Use simpler baseline methods (naive forecast)
## 
## **For this analysis, we will:**
## Attempt forecasting with an ARIMA/ETS ensemble (Prophet is added once at least 48 hourly records are available)
## and clearly communicate the high uncertainty in results.
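As a point of comparison for option 3 above, naive and seasonal-naive baselines take only a couple of lines with the forecast package. A sketch, assuming `ts_pipe1` contains at least one full 24-hour cycle:

# Baseline forecasts: repeat the last value (naive) or the last 24-hour cycle (snaive)
naive_fc  <- naive(ts_pipe1, h = 168)
snaive_fc <- snaive(ts_pipe1, h = 168)
autoplot(naive_fc)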

6. Forecasting (1-Week Ahead)

cat("## 7-Day (168-Hour) Forecast\n\n")
## ## 7-Day (168-Hour) Forecast
if(!pipe1_forecastable && !pipe2_forecastable) {
  cat(" **WARNING**: Neither pipe is forecastable based on assessment criteria.\n")
  cat("Proceeding with forecasting for demonstration purposes, but results may be unreliable.\n\n")
}
##  **WARNING**: Neither pipe is forecastable based on assessment criteria.
## Proceeding with forecasting for demonstration purposes, but results may be unreliable.
# Function to forecast using multiple methods
forecast_pipe <- function(hourly_df, pipe_name, h = 168) {
  cat(sprintf("### %s Forecasting\n\n", pipe_name))
  
  # Prepare time series
  ts_data <- ts(hourly_df$flow_mean, frequency = 24)
  
  # Method 1: Auto ARIMA
  cat("#### Method 1: Auto ARIMA\n\n")
  arima_model <- auto.arima(ts_data, seasonal = TRUE, stepwise = FALSE)
  cat("Selected ARIMA model:", capture.output(arima_model)[1], "\n\n")
  arima_forecast <- forecast(arima_model, h = h)
  
  # Method 2: ETS (Exponential Smoothing)
  cat("#### Method 2: ETS (Exponential Smoothing)\n\n")
  ets_model <- ets(ts_data)
  cat("Selected ETS model:", ets_model$method, "\n\n")
  ets_forecast <- forecast(ets_model, h = h)
  
  # Method 3: Prophet (if enough data)
  if(nrow(hourly_df) >= 48) {
    cat("#### Method 3: Facebook Prophet\n\n")
    
    prophet_data <- hourly_df %>%
      select(ds = hour, y = flow_mean)
    
    m_prophet <- prophet(
      prophet_data,
      daily.seasonality = TRUE,
      weekly.seasonality = FALSE,
      yearly.seasonality = FALSE
    )
    
    future <- make_future_dataframe(m_prophet, periods = h, freq = "hour")
    prophet_forecast <- predict(m_prophet, future)
    
    # Extract forecast period
    prophet_forecast_only <- prophet_forecast %>%
      tail(h) %>%
      select(ds, yhat, yhat_lower, yhat_upper)
  } else {
    prophet_forecast_only <- NULL
  }
  
  # Create forecast dataframe
  last_hour <- max(hourly_df$hour)
  forecast_hours <- seq(last_hour + hours(1), 
                       last_hour + hours(h), 
                       by = "hour")
  
  forecast_df <- data.frame(
    Hour = forecast_hours,
    ARIMA_Forecast = as.numeric(arima_forecast$mean),
    ARIMA_Lower80 = as.numeric(arima_forecast$lower[,1]),
    ARIMA_Upper80 = as.numeric(arima_forecast$upper[,1]),
    ETS_Forecast = as.numeric(ets_forecast$mean),
    ETS_Lower80 = as.numeric(ets_forecast$lower[,1]),
    ETS_Upper80 = as.numeric(ets_forecast$upper[,1])
  )
  
  if(!is.null(prophet_forecast_only)) {
    forecast_df <- forecast_df %>%
      mutate(
        Prophet_Forecast = prophet_forecast_only$yhat,
        Prophet_Lower80 = prophet_forecast_only$yhat_lower,
        Prophet_Upper80 = prophet_forecast_only$yhat_upper,
        # Ensemble: Average of all three
        Ensemble_Forecast = (ARIMA_Forecast + ETS_Forecast + Prophet_Forecast) / 3
      )
  } else {
    forecast_df <- forecast_df %>%
      mutate(
        # Ensemble: Average of ARIMA and ETS only
        Ensemble_Forecast = (ARIMA_Forecast + ETS_Forecast) / 2
      )
  }
  
  return(list(
    forecast_df = forecast_df,
    arima_model = arima_model,
    ets_model = ets_model,
    arima_forecast = arima_forecast,
    ets_forecast = ets_forecast,
    prophet_forecast = if(!is.null(prophet_forecast_only)) prophet_forecast_only else NULL
  ))
}

# Generate forecasts
if(pipe1_forecastable || TRUE) {  # Proceed anyway for demonstration
  pipe1_forecast_results <- forecast_pipe(pipe1_final$data, "Pipe 1", h = 168)
}
## ### Pipe 1 Forecasting
## 
## #### Method 1: Auto ARIMA
## 
## Selected ARIMA model: Series: ts_data  
## 
## #### Method 2: ETS (Exponential Smoothing)
## 
## Selected ETS model: Simple exponential smoothing
if(pipe2_forecastable || TRUE) {
  pipe2_forecast_results <- forecast_pipe(pipe2_final$data, "Pipe 2", h = 168)
}
## ### Pipe 2 Forecasting
## 
## #### Method 1: Auto ARIMA
## 
## Selected ARIMA model: Series: ts_data  
## 
## #### Method 2: ETS (Exponential Smoothing)
## 
## Selected ETS model: Simple exponential smoothing
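The ensemble above averages the candidate methods with equal weights. One possible refinement, sketched here under the assumption that the fitted models returned in `pipe1_forecast_results` are available, is to weight each method by its inverse in-sample RMSE:

# Hypothetical inverse-RMSE weighting of the ARIMA and ETS point forecasts
res_arima <- residuals(pipe1_forecast_results$arima_model)
res_ets   <- residuals(pipe1_forecast_results$ets_model)
w <- 1 / c(sqrt(mean(res_arima^2, na.rm = TRUE)),
           sqrt(mean(res_ets^2,   na.rm = TRUE)))
w <- w / sum(w)
weighted_fc <- w[1] * as.numeric(pipe1_forecast_results$arima_forecast$mean) +
               w[2] * as.numeric(pipe1_forecast_results$ets_forecast$mean)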
cat("\n## Forecast Visualizations\n\n")
## 
## ## Forecast Visualizations
# Function to plot forecast
plot_forecast <- function(hourly_df, forecast_results, pipe_name) {
  
  # Combine historical and forecast
  historical <- hourly_df %>%
    select(hour, flow_mean) %>%
    mutate(Type = "Historical")
  
  forecast_plot_data <- forecast_results$forecast_df %>%
    select(hour = Hour, flow_mean = Ensemble_Forecast) %>%
    mutate(Type = "Forecast")
  
  combined <- bind_rows(historical, forecast_plot_data)
  
  # Forecast with intervals
  forecast_intervals <- forecast_results$forecast_df %>%
    select(Hour, Ensemble_Forecast, 
           Lower = ARIMA_Lower80,  # Use ARIMA intervals
           Upper = ARIMA_Upper80)
  
  # Plot
  p <- ggplot() +
    # Historical data
    geom_line(data = historical, 
              aes(x = hour, y = flow_mean, color = "Historical"),
              linewidth = 0.7, na.rm = TRUE) +
    # Forecast
    geom_line(data = forecast_plot_data,
              aes(x = hour, y = flow_mean, color = "Forecast"),
              linewidth = 1) +
    # Prediction intervals
    geom_ribbon(data = forecast_intervals,
                aes(x = Hour, ymin = Lower, ymax = Upper),
                fill = "#440154FF", alpha = 0.2, na.rm = TRUE) +
    # Vertical line at forecast start
    geom_vline(xintercept = max(historical$hour),
               linetype = "dashed", color = "red", alpha = 0.7, na.rm = TRUE) +
    scale_color_manual(values = c("Historical" = "#31688EFF", 
                                  "Forecast" = "#440154FF")) +
    labs(
      title = paste0(pipe_name, ": 7-Day Water Flow Forecast"),
      subtitle = "Ensemble forecast (ARIMA + ETS + Prophet) with 80% prediction interval",
      x = "Time (Hour)",
      y = "Flow Rate",
      color = ""
    ) +
    theme_minimal(base_size = 12) +
    theme(
      plot.title = element_text(face = "bold"),
      legend.position = "bottom"
    )
  
  print(p)
}

# Plot forecasts
if(exists("pipe1_forecast_results")) {
  plot_forecast(pipe1_final$data, pipe1_forecast_results, "Pipe 1")
}
## `geom_line()`: Each group consists of only one observation.
## ℹ Do you need to adjust the group aesthetic?

if(exists("pipe2_forecast_results")) {
  plot_forecast(pipe2_final$data, pipe2_forecast_results, "Pipe 2")
}
## `geom_line()`: Each group consists of only one observation.
## ℹ Do you need to adjust the group aesthetic?


7. Export Results to Excel

cat("## Exporting Forecasts to Excel\n\n")
## ## Exporting Forecasts to Excel
# Prepare export data
if(exists("pipe1_forecast_results") && exists("pipe2_forecast_results")) {
  
  # Create comprehensive export
  export_data <- list(
    Pipe1_Forecast = pipe1_forecast_results$forecast_df %>%
      select(Hour, 
             Forecast = Ensemble_Forecast,
             Lower_80 = ARIMA_Lower80,
             Upper_80 = ARIMA_Upper80) %>%
      mutate(Pipe = "Pipe 1"),
    
    Pipe2_Forecast = pipe2_forecast_results$forecast_df %>%
      select(Hour, 
             Forecast = Ensemble_Forecast,
             Lower_80 = ARIMA_Lower80,
             Upper_80 = ARIMA_Upper80) %>%
      mutate(Pipe = "Pipe 2"),
    
    Combined_Forecast = bind_rows(
      pipe1_forecast_results$forecast_df %>%
        select(Hour, Forecast = Ensemble_Forecast) %>%
        mutate(Pipe = "Pipe 1"),
      pipe2_forecast_results$forecast_df %>%
        select(Hour, Forecast = Ensemble_Forecast) %>%
        mutate(Pipe = "Pipe 2")
    ),
    
    Summary_Statistics = data.frame(
      Pipe = c("Pipe 1", "Pipe 2"),
      Historical_Mean = c(mean(pipe1_final$data$flow_mean),
                         mean(pipe2_final$data$flow_mean)),
      Historical_SD = c(sd(pipe1_final$data$flow_mean),
                       sd(pipe2_final$data$flow_mean)),
      Forecast_Mean = c(mean(pipe1_forecast_results$forecast_df$Ensemble_Forecast),
                       mean(pipe2_forecast_results$forecast_df$Ensemble_Forecast)),
      Forecast_Min = c(min(pipe1_forecast_results$forecast_df$Ensemble_Forecast),
                      min(pipe2_forecast_results$forecast_df$Ensemble_Forecast)),
      Forecast_Max = c(max(pipe1_forecast_results$forecast_df$Ensemble_Forecast),
                      max(pipe2_forecast_results$forecast_df$Ensemble_Forecast))
    )
  )
  
  # Export to Excel
  write_xlsx(export_data, "PartC_WaterFlow_Forecast.xlsx")
  
  cat("Forecast exported to: **PartC_WaterFlow_Forecast.xlsx**\n\n")
  cat("Excel file contains 4 sheets:\n")
  cat("1. Pipe1_Forecast - Detailed 168-hour forecast for Pipe 1\n")
  cat("2. Pipe2_Forecast - Detailed 168-hour forecast for Pipe 2\n")
  cat("3. Combined_Forecast - Both pipes in single sheet\n")
  cat("4. Summary_Statistics - Overall statistics\n\n")
  
  # Preview
  cat("### Forecast Preview (First 24 Hours):\n\n")
  preview <- export_data$Combined_Forecast %>%
    head(48)  # First 24 hours of each pipe
  
  kable(head(preview, 24), digits = 2, caption = "Sample: First 24 Hours")
}
## Forecast exported to: **PartC_WaterFlow_Forecast.xlsx**
## 
## Excel file contains 4 sheets:
## 1. Pipe1_Forecast - Detailed 168-hour forecast for Pipe 1
## 2. Pipe2_Forecast - Detailed 168-hour forecast for Pipe 2
## 3. Combined_Forecast - Both pipes in single sheet
## 4. Summary_Statistics - Overall statistics
## 
## ### Forecast Preview (First 24 Hours):
Sample: First 24 Hours

| Hour | Forecast | Pipe |
|------|---------:|------|
| 1970-01-01 12:00:00 | 19.9 | Pipe 1 |
| 1970-01-01 13:00:00 | 19.9 | Pipe 1 |
| 1970-01-01 14:00:00 | 19.9 | Pipe 1 |
| 1970-01-01 15:00:00 | 19.9 | Pipe 1 |
| 1970-01-01 16:00:00 | 19.9 | Pipe 1 |
| 1970-01-01 17:00:00 | 19.9 | Pipe 1 |
| 1970-01-01 18:00:00 | 19.9 | Pipe 1 |
| 1970-01-01 19:00:00 | 19.9 | Pipe 1 |
| 1970-01-01 20:00:00 | 19.9 | Pipe 1 |
| 1970-01-01 21:00:00 | 19.9 | Pipe 1 |
| 1970-01-01 22:00:00 | 19.9 | Pipe 1 |
| 1970-01-01 23:00:00 | 19.9 | Pipe 1 |
| 1970-01-02 00:00:00 | 19.9 | Pipe 1 |
| 1970-01-02 01:00:00 | 19.9 | Pipe 1 |
| 1970-01-02 02:00:00 | 19.9 | Pipe 1 |
| 1970-01-02 03:00:00 | 19.9 | Pipe 1 |
| 1970-01-02 04:00:00 | 19.9 | Pipe 1 |
| 1970-01-02 05:00:00 | 19.9 | Pipe 1 |
| 1970-01-02 06:00:00 | 19.9 | Pipe 1 |
| 1970-01-02 07:00:00 | 19.9 | Pipe 1 |
| 1970-01-02 08:00:00 | 19.9 | Pipe 1 |
| 1970-01-02 09:00:00 | 19.9 | Pipe 1 |
| 1970-01-02 10:00:00 | 19.9 | Pipe 1 |
| 1970-01-02 11:00:00 | 19.9 | Pipe 1 |

8. Summary & Conclusions

cat("## Project Summary\n\n")
## ## Project Summary
cat("### Objectives Completed\n\n")
## ### Objectives Completed
cat("This analysis worked through each required objective for Part C. We loaded the two water flow datasets, aligned their timestamps to a common hourly base through time-base sequencing, and aggregated multiple readings within an hour by taking the mean. Because the parsed timestamps all fell within a single hour, the hourly aggregation collapsed each series to one record, so the planned stationarity tests (Augmented Dickey-Fuller, KPSS, and Phillips-Perron) were skipped for lack of observations and no differencing was applied. The forecastability assessment, based on five criteria (data sufficiency, variance, valid value range, autocorrelation, and non-randomness), flagged both pipes as not forecastable at this stage. For demonstration purposes we still generated 7-day (168-hour) forward forecasts with ARIMA and ETS (Prophet requires at least 48 hourly records and was skipped), visualized them with 80% prediction intervals, and exported the results to a multi-sheet Excel workbook for business consumption. These forecasts carry very high uncertainty and should be regenerated once the raw timestamps are parsed onto their full calendar range.\n\n")
## This analysis worked through each required objective for Part C. We loaded the two water flow datasets, aligned their timestamps to a common hourly base through time-base sequencing, and aggregated multiple readings within an hour by taking the mean. Because the parsed timestamps all fell within a single hour, the hourly aggregation collapsed each series to one record, so the planned stationarity tests (Augmented Dickey-Fuller, KPSS, and Phillips-Perron) were skipped for lack of observations and no differencing was applied. The forecastability assessment, based on five criteria (data sufficiency, variance, valid value range, autocorrelation, and non-randomness), flagged both pipes as not forecastable at this stage. For demonstration purposes we still generated 7-day (168-hour) forward forecasts with ARIMA and ETS (Prophet requires at least 48 hourly records and was skipped), visualized them with 80% prediction intervals, and exported the results to a multi-sheet Excel workbook for business consumption. These forecasts carry very high uncertainty and should be regenerated once the raw timestamps are parsed onto their full calendar range.