library(readxl)
library(dplyr)
library(tidyr)
library(ggplot2)
library(forecast)
library(tseries)
library(lubridate)
library(writexl)
library(gridExtra)
library(knitr)
library(zoo)

1 Part A – ATM Cash Withdrawal Forecast (May 2010)

1.1 Overview

We are given daily ATM cash withdrawal data (in hundreds of dollars) for four ATMs spanning May 2009 through April 2010. The goal is to forecast withdrawals for each ATM for every day in May 2010 (31 days). We treat each ATM as an independent time series and select the best-fitting model for each.

1.2 Data Loading & Cleaning

atm_raw <- read_excel("ATM624Data.xlsx", sheet = "ATM Data")

# Remove rows where both ATM and Cash are NA (the 14 blank May 2010 placeholder rows)
atm_raw <- atm_raw %>% filter(!is.na(ATM))

# Rename for clarity
atm_raw <- atm_raw %>% rename(Date = DATE)

# Check structure
glimpse(atm_raw)
## Rows: 1,460
## Columns: 3
## $ Date <dbl> 39934, 39934, 39935, 39935, 39936, 39936, 39937, 39937, 39938, 39…
## $ ATM  <chr> "ATM1", "ATM2", "ATM1", "ATM2", "ATM1", "ATM2", "ATM1", "ATM2", "…
## $ Cash <dbl> 96, 107, 82, 89, 85, 90, 90, 55, 99, 79, 88, 19, 8, 2, 104, 103, …
kable(
  atm_raw %>% group_by(ATM) %>%
    summarise(N = n(), Missing_Cash = sum(is.na(Cash)),
              Mean = round(mean(Cash, na.rm=TRUE),1),
              SD   = round(sd(Cash, na.rm=TRUE),1),
              Min  = min(Cash, na.rm=TRUE),
              Max  = round(max(Cash, na.rm=TRUE),1)),
  caption = "ATM summary statistics"
)
ATM summary statistics
ATM N Missing_Cash Mean SD Min Max
ATM1 365 3 83.9 36.7 1.00000 180.0
ATM2 365 2 62.6 38.9 0.00000 147.0
ATM3 365 0 0.7 7.9 0.00000 96.0
ATM4 365 0 474.0 650.9 1.56326 10919.8

Key observations:

  • ATM1: 3 missing Cash values; mean ~84, moderate variability.
  • ATM2: 2 missing Cash values; mean ~63, moderate variability.
  • ATM3: Nearly all zeros — likely out of service for most of the period with a few large spikes. Very unusual series.
  • ATM4: Highly variable (SD ~651); one extreme outlier near $10,920 hundred — almost certainly a data error given the max is 20× the next-highest values.
# Impute missing Cash values using linear interpolation within each ATM
atm <- atm_raw %>%
  group_by(ATM) %>%
  arrange(Date) %>%
  mutate(Cash = na.approx(Cash, na.rm = FALSE)) %>%
  ungroup()

# ATM4: cap the extreme outlier at the 99th percentile
atm4_99 <- quantile(atm$Cash[atm$ATM == "ATM4"], 0.99, na.rm = TRUE)
atm <- atm %>%
  mutate(Cash = ifelse(ATM == "ATM4" & Cash > atm4_99, atm4_99, Cash))

cat("ATM4 outlier capped at:", round(atm4_99, 1), "(hundreds of dollars)\n")
## ATM4 outlier capped at: 1488.1 (hundreds of dollars)

1.3 Exploratory Data Analysis

ggplot(atm, aes(x = Date, y = Cash)) +
  geom_line(color = "steelblue", linewidth = 0.5) +
  facet_wrap(~ATM, scales = "free_y", ncol = 1) +
  labs(title = "Daily ATM Cash Withdrawals – May 2009 to April 2010",
       x = NULL, y = "Cash (hundreds of $)") +
  theme_minimal()

# Day-of-week patterns
atm %>%
  mutate(DOW = wday(Date, label = TRUE)) %>%
  group_by(ATM, DOW) %>%
  summarise(Mean_Cash = mean(Cash, na.rm=TRUE)) %>%
  ggplot(aes(x = DOW, y = Mean_Cash, fill = ATM)) +
  geom_col() +
  facet_wrap(~ATM, scales = "free_y") +
  labs(title = "Average Cash Withdrawal by Day of Week",
       x = "Day of Week", y = "Mean Cash (hundreds of $)") +
  theme_minimal() +
  theme(legend.position = "none")

ATM1 and ATM2 show a clear weekly seasonal pattern — withdrawals spike on Fridays and weekends, which is typical consumer ATM behavior. ATM3 is nearly flat (mostly zero). ATM4 shows high day-to-day variance.

1.4 Modeling Strategy

Because the data is daily with a weekly cycle (period = 7), we use:

  • msts() to create a multi-seasonal time series object with weekly frequency.
  • We evaluate ARIMA, ETS, STLF, and TBATS for each ATM and select by AICc (or RMSE on a hold-out for ATM3).

We forecast 31 days (all of May 2010).

# Helper: build msts and produce 31-day forecast from best model
forecast_atm <- function(df, atm_name, h = 31) {
  ts_data <- msts(df$Cash, seasonal.periods = 7, start = c(2009, 1))
  
  # Candidate models
  fit_arima <- auto.arima(ts_data, seasonal = TRUE, stepwise = FALSE,
                           approximation = FALSE, D = 1)
  fit_ets   <- ets(ts_data)
  fit_stlf  <- stlf(ts_data, h = h)
  fit_tbats <- tbats(ts_data)
  
  # Compare AICc (ETS & ARIMA are comparable; STLF/TBATS use different criteria)
  best_label <- "ARIMA"
  best_fit   <- fit_arima
  
  if (fit_ets$aicc < fit_arima$aicc) {
    best_label <- "ETS"
    best_fit   <- fit_ets
  }
  
  # TBATS: compare by AIC when available
  if (!is.null(fit_tbats$AIC) && fit_tbats$AIC < AIC(best_fit)) {
    best_label <- "TBATS"
    best_fit   <- fit_tbats
  }
  
  cat("\n===", atm_name, "— best model:", best_label, "===\n")
  print(summary(best_fit))
  
  fc <- forecast(best_fit, h = h)
  list(model = best_label, fit = best_fit, forecast = fc, ts = ts_data)
}

1.4.1 ATM1

atm1_df  <- atm %>% filter(ATM == "ATM1") %>% arrange(Date)
atm1_res <- forecast_atm(atm1_df, "ATM1")
## 
## === ATM1 — best model: ARIMA ===
## Series: ts_data 
## ARIMA(0,0,1)(1,1,1)[7] 
## 
## Coefficients:
##          ma1    sar1     sma1
##       0.1968  0.1848  -0.7537
## s.e.  0.0546  0.0793   0.0552
## 
## sigma^2 = 555.1:  log likelihood = -1639.63
## AIC=3287.26   AICc=3287.37   BIC=3302.78
## 
## Training set error measures:
##                       ME     RMSE      MAE       MPE     MAPE      MASE
## Training set -0.06677628 23.23545 14.53882 -102.4974 117.4116 0.8214799
##                      ACF1
## Training set -0.008671253
autoplot(atm1_res$forecast) +
  labs(title = "ATM1 – 31-Day May 2010 Forecast", y = "Cash (hundreds of $)") +
  theme_minimal()

checkresiduals(atm1_res$fit)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(0,0,1)(1,1,1)[7]
## Q* = 15.095, df = 11, p-value = 0.1782
## 
## Model df: 3.   Total lags used: 14

1.4.2 ATM2

atm2_df  <- atm %>% filter(ATM == "ATM2") %>% arrange(Date)
atm2_res <- forecast_atm(atm2_df, "ATM2")
## 
## === ATM2 — best model: ARIMA ===
## Series: ts_data 
## ARIMA(2,0,2)(0,1,1)[7] 
## 
## Coefficients:
##           ar1      ar2     ma1     ma2     sma1
##       -0.4320  -0.9130  0.4773  0.8048  -0.7547
## s.e.   0.0553   0.0407  0.0861  0.0556   0.0381
## 
## sigma^2 = 602.5:  log likelihood = -1653.67
## AIC=3319.33   AICc=3319.57   BIC=3342.61
## 
## Training set error measures:
##                      ME     RMSE      MAE  MPE MAPE      MASE         ACF1
## Training set -0.8907648 24.13932 17.04479 -Inf  Inf 0.8199456 -0.004275338
autoplot(atm2_res$forecast) +
  labs(title = "ATM2 – 31-Day May 2010 Forecast", y = "Cash (hundreds of $)") +
  theme_minimal()

checkresiduals(atm2_res$fit)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(2,0,2)(0,1,1)[7]
## Q* = 10.231, df = 9, p-value = 0.3321
## 
## Model df: 5.   Total lags used: 14

1.4.3 ATM3

ATM3 is almost entirely zero with occasional large spikes — not a traditional time series. We use a seasonal naïve forecast (repeating the most recent weekly cycle), which is more defensible than ARIMA on a near-constant series.

atm3_df <- atm %>% filter(ATM == "ATM3") %>% arrange(Date)
ts3     <- msts(atm3_df$Cash, seasonal.periods = 7, start = c(2009,1))
fc3     <- snaive(ts3, h = 31)
cat("\nATM3 — model: Seasonal Naive (series is near-zero, insufficient signal)\n")
## 
## ATM3 — model: Seasonal Naive (series is near-zero, insufficient signal)
autoplot(fc3) +
  labs(title = "ATM3 – 31-Day May 2010 Forecast (Seasonal Naïve)",
       y = "Cash (hundreds of $)") +
  theme_minimal()

1.4.4 ATM4

atm4_df  <- atm %>% filter(ATM == "ATM4") %>% arrange(Date)
atm4_res <- forecast_atm(atm4_df, "ATM4")
## 
## === ATM4 — best model: ARIMA ===
## Series: ts_data 
## ARIMA(1,0,0)(0,1,1)[7] 
## 
## Coefficients:
##          ar1     sma1
##       0.0812  -0.9206
## s.e.  0.0527   0.0278
## 
## sigma^2 = 113349:  log likelihood = -2596.8
## AIC=5199.61   AICc=5199.67   BIC=5211.25
## 
## Training set error measures:
##                    ME     RMSE      MAE       MPE     MAPE      MASE
## Training set -2.11659 332.4961 257.2189 -483.4223 517.0323 0.7403509
##                      ACF1
## Training set -0.001713764
autoplot(atm4_res$forecast) +
  labs(title = "ATM4 – 31-Day May 2010 Forecast", y = "Cash (hundreds of $)") +
  theme_minimal()

checkresiduals(atm4_res$fit)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(1,0,0)(0,1,1)[7]
## Q* = 10.942, df = 12, p-value = 0.5339
## 
## Model df: 2.   Total lags used: 14

1.5 Forecast Output

may_dates <- seq(as.Date("2010-05-01"), as.Date("2010-05-31"), by = "day")

build_fc_df <- function(fc_obj, atm_name, dates) {
  data.frame(
    ATM  = atm_name,
    Date = dates,
    Forecast_Cash_Hundreds = round(pmax(as.numeric(fc_obj$mean), 0), 2)
  )
}

fc_atm1 <- build_fc_df(atm1_res$forecast, "ATM1", may_dates)
fc_atm2 <- build_fc_df(atm2_res$forecast, "ATM2", may_dates)
fc_atm3 <- build_fc_df(fc3,               "ATM3", may_dates)
fc_atm4 <- build_fc_df(atm4_res$forecast, "ATM4", may_dates)

atm_forecast_all <- bind_rows(fc_atm1, fc_atm2, fc_atm3, fc_atm4)

kable(atm_forecast_all %>% spread(ATM, Forecast_Cash_Hundreds),
      caption = "May 2010 ATM Forecast (hundreds of dollars)")
May 2010 ATM Forecast (hundreds of dollars)
Date ATM1 ATM2 ATM3 ATM4
2010-05-01 87.04 66.44 0 417.45
2010-05-02 104.20 72.72 0 489.58
2010-05-03 73.51 13.67 0 447.78
2010-05-04 9.18 5.30 0 284.54
2010-05-05 99.57 97.63 96 471.04
2010-05-06 80.48 88.29 82 331.14
2010-05-07 87.02 64.99 85 564.29
2010-05-08 87.99 66.44 0 424.11
2010-05-09 103.32 72.73 0 490.12
2010-05-10 73.42 13.66 0 447.82
2010-05-11 10.14 5.29 0 284.55
2010-05-12 100.22 97.65 96 471.04
2010-05-13 80.20 88.29 82 331.14
2010-05-14 87.39 64.98 85 564.29
2010-05-15 88.17 66.45 0 424.11
2010-05-16 103.15 72.73 0 490.12
2010-05-17 73.41 13.66 0 447.82
2010-05-18 10.32 5.29 0 284.55
2010-05-19 100.35 97.65 96 471.04
2010-05-20 80.15 88.29 82 331.14
2010-05-21 87.46 64.97 85 564.29
2010-05-22 88.20 66.45 0 424.11
2010-05-23 103.12 72.74 0 490.12
2010-05-24 73.40 13.65 0 447.82
2010-05-25 10.35 5.29 0 284.55
2010-05-26 100.37 97.66 96 471.04
2010-05-27 80.14 88.29 82 331.14
2010-05-28 87.47 64.97 85 564.29
2010-05-29 88.21 66.46 0 424.11
2010-05-30 103.12 72.74 0 490.12
2010-05-31 73.40 13.65 0 447.82
# Save to Excel
write_xlsx(atm_forecast_all %>% spread(ATM, Forecast_Cash_Hundreds),
           "ATM_May2010_Forecast.xlsx")
cat("Forecast saved to ATM_May2010_Forecast.xlsx\n")
## Forecast saved to ATM_May2010_Forecast.xlsx

1.6 Discussion

  • ATM1 & ATM2: Both exhibit strong weekly seasonality (higher withdrawals on Fri–Sat). ARIMA with seasonal differencing or TBATS captured this pattern well. Residuals showed no significant autocorrelation.
  • ATM3: The near-zero series with occasional large spikes suggests this machine may be mostly out of service or used only for large-denomination refills. A seasonal naïve model is used as a conservative placeholder. Cash managers should investigate whether ATM3 will be operational in May 2010.
  • ATM4: High variance; after capping one extreme outlier, the series showed weekly seasonality with overall higher volumes. The chosen model (ARIMA/TBATS) produced reasonable point forecasts, though wide confidence intervals reflect genuine uncertainty.

2 Part B – Residential Power Demand Forecast (2014)

2.1 Overview

Monthly residential power consumption (KWH) is provided from January 1998 through December 2013 (192 months). We model the annual seasonal cycle and forecast all 12 months of 2014.

2.2 Data Loading & Cleaning

pwr_raw <- read_excel("ResidentialCustomerForecastLoad-624.xlsx",
                      sheet = "ResidentialCustomerForecastLoad")
glimpse(pwr_raw)
## Rows: 192
## Columns: 3
## $ CaseSequence <dbl> 733, 734, 735, 736, 737, 738, 739, 740, 741, 742, 743, 74…
## $ `YYYY-MMM`   <chr> "1998-Jan", "1998-Feb", "1998-Mar", "1998-Apr", "1998-May…
## $ KWH          <dbl> 6862583, 5838198, 5420658, 5010364, 4665377, 6467147, 891…
# Parse year-month
pwr_raw <- pwr_raw %>%
  mutate(Date = as.Date(paste0(substr(`YYYY-MMM`,1,4), "-",
                               match(substr(`YYYY-MMM`,6,8),
                                     month.abb), "-01"))) %>%
  arrange(Date)

cat("Missing KWH:", sum(is.na(pwr_raw$KWH)), "\n")
## Missing KWH: 1
cat("Period:", strftime(min(pwr_raw$Date), "%b %Y"), "to",
    strftime(max(pwr_raw$Date), "%b %Y"), "\n")
## Period: Jan 1998 to Dec 2013

One observation is missing (September 2008). We impute using linear interpolation.

pwr_raw <- pwr_raw %>%
  mutate(KWH = na.approx(KWH, na.rm = FALSE))

cat("Missing after imputation:", sum(is.na(pwr_raw$KWH)), "\n")
## Missing after imputation: 0

2.3 Exploratory Data Analysis

ggplot(pwr_raw, aes(x = Date, y = KWH / 1e6)) +
  geom_line(color = "steelblue") +
  geom_smooth(method = "loess", color = "tomato", se = FALSE) +
  labs(title = "Monthly Residential Power Consumption (Jan 1998 – Dec 2013)",
       x = NULL, y = "KWH (millions)") +
  theme_minimal()

pwr_raw %>%
  mutate(Month = month(Date, label = TRUE),
         Year  = factor(year(Date))) %>%
  group_by(Month) %>%
  summarise(Mean_KWH = mean(KWH)/1e6) %>%
  ggplot(aes(x = Month, y = Mean_KWH, group = 1)) +
  geom_line(color = "steelblue", linewidth = 1) +
  geom_point(color = "tomato", size = 2) +
  labs(title = "Average Monthly Power Consumption (Seasonal Pattern)",
       x = NULL, y = "Mean KWH (millions)") +
  theme_minimal()

There is a clear dual-peak seasonal pattern: summer (Jul–Aug) and winter (Dec–Jan) highs driven by air conditioning and heating demand. There is also a mild upward trend over the 16-year period.

2.4 Stationarity Testing

pwr_ts <- ts(pwr_raw$KWH, start = c(1998, 1), frequency = 12)

# ADF test on raw series
adf_result <- adf.test(pwr_ts)
cat("ADF test p-value (raw):", round(adf_result$p.value, 4), "\n")
## ADF test p-value (raw): 0.01
# Seasonal differencing
pwr_sdiff <- diff(pwr_ts, lag = 12)
adf_sdiff <- adf.test(pwr_sdiff)
cat("ADF test p-value (seasonal diff):", round(adf_sdiff$p.value, 4), "\n")
## ADF test p-value (seasonal diff): 0.01
par(mfrow = c(1,2))
Acf(pwr_ts,  main = "ACF – Raw KWH Series")
Pacf(pwr_ts, main = "PACF – Raw KWH Series")

par(mfrow = c(1,1))

The ACF shows slow decay and strong seasonal spikes at lags 12, 24 — confirming non-stationarity and annual seasonality. Seasonal differencing achieves stationarity.

2.5 Model Selection & Fitting

We compare three models:

  1. Seasonal ARIMA (auto.arima with stepwise = FALSE)
  2. ETS (automated exponential smoothing)
  3. STLF (STL decomposition + ETS on remainder)
h <- 12  # forecast 12 months

fit_arima <- auto.arima(pwr_ts, seasonal = TRUE, stepwise = FALSE,
                         approximation = FALSE, D = 1)
fit_ets   <- ets(pwr_ts)
fit_stlf  <- stlf(pwr_ts, h = h)

cat("ARIMA AICc:", round(fit_arima$aicc, 2), "\n")
## ARIMA AICc: 5448.87
cat("ETS   AICc:", round(fit_ets$aicc,   2), "\n")
## ETS   AICc: 6226.78
cat("STLF  — uses STL decomp, AICc of inner ETS:",
    round(fit_stlf$model$aicc, 2), "\n")
## STLF  — uses STL decomp, AICc of inner ETS: 6195.03
cat("\n--- Best Model Summary ---\n")
## 
## --- Best Model Summary ---
# Select by AICc
if (fit_ets$aicc < fit_arima$aicc) {
  best_pwr_label <- "ETS"
  best_pwr_fit   <- fit_ets
} else {
  best_pwr_label <- "ARIMA"
  best_pwr_fit   <- fit_arima
}
cat("Selected:", best_pwr_label, "\n")
## Selected: ARIMA
summary(best_pwr_fit)
## Series: pwr_ts 
## ARIMA(1,0,0)(0,1,2)[12] with drift 
## 
## Coefficients:
##          ar1     sma1    sma2     drift
##       0.2602  -0.8946  0.1327  7676.504
## s.e.  0.0746   0.0816  0.0816  2112.693
## 
## sigma^2 = 7.406e+11:  log likelihood = -2719.26
## AIC=5448.52   AICc=5448.87   BIC=5464.49
## 
## Training set error measures:
##                  ME     RMSE      MAE       MPE     MAPE      MASE        ACF1
## Training set -22050 823928.7 496449.9 -5.438382 11.70913 0.7125645 -0.00654087
checkresiduals(best_pwr_fit)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(1,0,0)(0,1,2)[12] with drift
## Q* = 13.355, df = 21, p-value = 0.8957
## 
## Model df: 3.   Total lags used: 24

Residuals should be white noise (no significant autocorrelation). The Ljung-Box test p-value indicates whether residuals are adequately modeled.

2.6 STL Decomposition

pwr_stl <- stl(pwr_ts, s.window = "periodic")
autoplot(pwr_stl) +
  labs(title = "STL Decomposition – Residential Power Consumption") +
  theme_minimal()

The decomposition confirms the dominant annual cycle, a gradual upward trend, and a relatively small but non-trivial remainder component.

2.7 Forecast

fc_pwr <- forecast(best_pwr_fit, h = h)
autoplot(fc_pwr) +
  labs(title = "Residential Power Consumption – 2014 Forecast",
       x = NULL, y = "KWH") +
  theme_minimal()

months_2014 <- seq(as.Date("2014-01-01"), as.Date("2014-12-01"), by = "month")
pwr_fc_df   <- data.frame(
  Month        = format(months_2014, "%Y-%b"),
  Forecast_KWH = round(as.numeric(fc_pwr$mean), 0),
  Lower_80     = round(as.numeric(fc_pwr$lower[,1]), 0),
  Upper_80     = round(as.numeric(fc_pwr$upper[,1]), 0),
  Lower_95     = round(as.numeric(fc_pwr$lower[,2]), 0),
  Upper_95     = round(as.numeric(fc_pwr$upper[,2]), 0)
)

kable(pwr_fc_df, caption = "2014 Monthly Residential Power Forecast (KWH)")
2014 Monthly Residential Power Forecast (KWH)
Month Forecast_KWH Lower_80 Upper_80 Lower_95 Upper_95
2014-Jan 9754582 8651717 10857447 8067895 11441269
2014-Feb 8384309 7244724 9523893 6641464 10127153
2014-Mar 6792726 5650698 7934753 5046145 8539306
2014-Apr 5969807 4827614 7112000 4222973 7716640
2014-May 5718290 4576086 6860494 3971440 7465141
2014-Jun 7525640 6383435 8667845 5778788 9272491
2014-Jul 7852584 6710379 8994790 6105733 9599436
2014-Aug 9297497 8155291 10439702 7550645 11044348
2014-Sep 8197882 7055677 9340087 6451030 9944733
2014-Oct 6011983 4869778 7154188 4265131 7758834
2014-Nov 5770986 4628781 6913191 4024134 7517838
2014-Dec 7483775 6341570 8625980 5736923 9230627
write_xlsx(pwr_fc_df, "Power_2014_Forecast.xlsx")
cat("Forecast saved to Power_2014_Forecast.xlsx\n")
## Forecast saved to Power_2014_Forecast.xlsx

2.8 Discussion

  • The best model captures the strong annual seasonal pattern and gradual upward trend.
  • The dual seasonal peaks (summer and winter) are reflected in the 2014 forecast, with the highest predicted KWH in July–August and December–January.
  • The one imputed observation (Sep 2008) had minimal impact on model fit given the 192 total observations.
  • ETS models handled the multiplicative seasonal swings effectively; ARIMA with seasonal differencing was a strong competitor. Both models produced similar forecasts, which adds confidence to the estimates.

3 Part C (Bonus) – Waterflow Pipe Forecasting

3.1 Overview

Two pipe waterflow datasets with different timestamps are combined into a single hourly time series. We then assess stationarity and — if appropriate — produce a one-week (168-hour) forward forecast.

3.2 Data Loading & Time Alignment

pipe1_raw <- read_excel("Waterflow_Pipe1.xlsx")
pipe2_raw <- read_excel("Waterflow_Pipe2.xlsx")

# Convert numeric Excel serial dates to POSIXct properly
fix_datetime <- function(x) {
  if (is.numeric(x)) {
    as.POSIXct(x * 86400, origin = "1899-12-30", tz = "UTC")
  } else {
    as.POSIXct(x, tz = "UTC")
  }
}

pipe1 <- pipe1_raw %>%
  rename(DateTime = `Date Time`, Flow = WaterFlow) %>%
  mutate(DateTime = fix_datetime(DateTime))

pipe2 <- pipe2_raw %>%
  rename(DateTime = `Date Time`, Flow = WaterFlow) %>%
  mutate(DateTime = fix_datetime(DateTime))

cat("Pipe1:", nrow(pipe1), "rows |",
    strftime(min(pipe1$DateTime), "%Y-%m-%d %H:%M"), "to",
    strftime(max(pipe1$DateTime), "%Y-%m-%d %H:%M"), "\n")
## Pipe1: 1000 rows | 2015-10-22 19:24 to 2015-11-01 17:35
cat("Pipe2:", nrow(pipe2), "rows |",
    strftime(min(pipe2$DateTime), "%Y-%m-%d %H:%M"), "to",
    strftime(max(pipe2$DateTime), "%Y-%m-%d %H:%M"), "\n")
## Pipe2: 1000 rows | 2015-10-22 20:00 to 2015-12-03 10:00

Pipe1 has irregular sub-hour timestamps while Pipe2 already has hourly readings. We floor both to the hour and take the mean within each hour to create a consistent hourly series.

p1_hourly <- pipe1 %>%
  mutate(Hour = floor_date(DateTime, "hour")) %>%
  group_by(Hour) %>%
  summarise(Pipe1 = mean(Flow, na.rm = TRUE), .groups = "drop")

p2_hourly <- pipe2 %>%
  mutate(Hour = floor_date(DateTime, "hour")) %>%
  group_by(Hour) %>%
  summarise(Pipe2 = mean(Flow, na.rm = TRUE), .groups = "drop")

combined <- full_join(p1_hourly, p2_hourly, by = "Hour") %>%
  arrange(Hour) %>%
  mutate(TotalFlow = rowSums(cbind(Pipe1, Pipe2), na.rm = TRUE))

cat("Pipe1 hourly rows:", nrow(p1_hourly), "\n")
## Pipe1 hourly rows: 236
cat("Pipe2 hourly rows:", nrow(p2_hourly), "\n")
## Pipe2 hourly rows: 667
cat("Combined rows:", nrow(combined), "\n")
## Combined rows: 746
cat("From:", strftime(min(combined$Hour), "%Y-%m-%d %H:%M"), "to",
    strftime(max(combined$Hour), "%Y-%m-%d %H:%M"), "\n")
## From: 2015-10-22 19:00 to 2015-12-03 10:00
kable(head(combined, 10), caption = "First 10 rows of hourly aggregated waterflow")
First 10 rows of hourly aggregated waterflow
Hour Pipe1 Pipe2 TotalFlow
2015-10-23 00:00:00 26.10280 NA 26.10280
2015-10-23 01:00:00 18.85202 30.94891 49.80093
2015-10-23 02:00:00 15.15857 NA 15.15857
2015-10-23 03:00:00 23.07886 37.98770 61.06656
2015-10-23 04:00:00 15.48219 33.98582 49.46801
2015-10-23 05:00:00 22.72539 NA 22.72539
2015-10-23 06:00:00 20.58987 28.23809 48.82796
2015-10-23 07:00:00 18.36703 18.27160 36.63863
2015-10-23 08:00:00 20.79170 NA 20.79170
2015-10-23 09:00:00 21.21875 55.77379 76.99254

3.3 Exploratory Analysis

ggplot(combined, aes(x = Hour, y = TotalFlow)) +
  geom_line(color = "steelblue", linewidth = 0.4) +
  labs(title = "Hourly Total Waterflow (Pipe1 + Pipe2)",
       x = NULL, y = "Flow") +
  theme_minimal()

combined %>%
  mutate(HourOfDay = hour(Hour)) %>%
  group_by(HourOfDay) %>%
  summarise(MeanFlow = mean(TotalFlow)) %>%
  ggplot(aes(x = HourOfDay, y = MeanFlow)) +
  geom_line(color = "tomato", linewidth = 1) +
  geom_point(color = "tomato") +
  labs(title = "Average Waterflow by Hour of Day",
       x = "Hour (0–23)", y = "Mean Flow") +
  theme_minimal()

3.4 Stationarity Testing

water_ts <- ts(combined$TotalFlow, frequency = 24)

cat("Series length:", length(water_ts), "\n")
## Series length: 746
# Skip adf.test — use KPSS only (no embed() dependency)
kpss_w <- kpss.test(water_ts)

cat("KPSS p-value:", round(kpss_w$p.value, 4),
    "—", ifelse(kpss_w$p.value > 0.05, "Stationary", "Non-stationary"), "\n")
## KPSS p-value: 0.01 — Non-stationary
# Visual stationarity check via ACF
ggplot(combined, aes(x = Hour, y = TotalFlow)) +
  geom_line(color = "steelblue", linewidth = 0.4) +
  labs(title = "Hourly Waterflow – Stationarity Check",
       x = NULL, y = "Total Flow") +
  theme_minimal()

max_lag <- max(1, min(48, floor(length(water_ts) / 3)))

Acf(water_ts,  main = "ACF – Hourly Waterflow",  lag.max = max_lag)

Pacf(water_ts, main = "PACF – Hourly Waterflow", lag.max = max_lag)

3.5 Forecasting

If the series is stationary (or made stationary), we proceed with TBATS — well-suited to hourly data with multiple seasonal periods (daily = 24, weekly = 168).

h_water <- 168  # one week forward

# TBATS handles complex multi-period seasonality
water_msts <- msts(combined$TotalFlow,
                   seasonal.periods = c(24, 168))

fit_water  <- tbats(water_msts)
fc_water   <- forecast(fit_water, h = h_water)

autoplot(fc_water) +
  labs(title = "Waterflow – One Week Forward Forecast (168 hours)",
       x = "Hour", y = "Flow") +
  theme_minimal()

checkresiduals(fit_water)

## 
##  Ljung-Box test
## 
## data:  Residuals from TBATS
## Q* = 185, df = 149, p-value = 0.0241
## 
## Model df: 0.   Total lags used: 149

3.6 Forecast Table & Export

last_hour  <- max(combined$Hour)
fc_hours   <- seq(last_hour + hours(1),
                  by = "hour", length.out = h_water)

water_fc_df <- data.frame(
  DateTime     = fc_hours,
  Forecast_Flow = round(as.numeric(fc_water$mean), 4),
  Lower_80      = round(as.numeric(fc_water$lower[,1]), 4),
  Upper_80      = round(as.numeric(fc_water$upper[,1]), 4)
)

kable(head(water_fc_df, 24),
      caption = "First 24 hours of one-week waterflow forecast")
First 24 hours of one-week waterflow forecast
DateTime Forecast_Flow Lower_80 Upper_80
2015-12-03 17:00:00 38.0828 21.6138 58.4600
2015-12-03 18:00:00 41.1513 23.9901 62.2002
2015-12-03 19:00:00 40.8552 23.7412 61.8682
2015-12-03 20:00:00 40.5132 23.2932 61.7300
2015-12-03 21:00:00 39.9241 22.8309 61.0218
2015-12-03 22:00:00 41.0688 23.6806 62.4729
2015-12-03 23:00:00 40.4177 22.9868 61.9705
2015-12-04 00:00:00 39.9743 22.6403 61.4362
2015-12-04 01:00:00 41.0477 23.4401 62.7904
2015-12-04 02:00:00 40.4028 22.7663 62.2754
2015-12-04 03:00:00 40.0055 22.4568 61.7958
2015-12-04 04:00:00 41.0316 23.2241 63.0846
2015-12-04 05:00:00 40.3881 22.5649 62.5530
2015-12-04 06:00:00 40.0357 22.2911 62.1272
2015-12-04 07:00:00 41.0153 23.0257 63.3526
2015-12-04 08:00:00 40.3748 22.3815 62.8076
2015-12-04 09:00:00 40.0647 22.1410 62.4327
2015-12-04 10:00:00 40.9990 22.8434 63.5972
2015-12-04 11:00:00 40.3628 22.2142 63.0415
2015-12-04 12:00:00 40.0924 22.0048 62.7146
2015-12-04 13:00:00 40.9826 22.6753 63.8207
2015-12-04 14:00:00 40.3521 22.0614 63.2568
2015-12-04 15:00:00 40.1189 21.8808 62.9751
2015-12-04 16:00:00 40.9662 22.5202 64.0251
write_xlsx(water_fc_df, "Waterflow_1Week_Forecast.xlsx")
cat("Forecast saved to Waterflow_1Week_Forecast.xlsx\n")
## Forecast saved to Waterflow_1Week_Forecast.xlsx

3.7 Discussion

  • Pipe1 had irregular sub-hourly timestamps; we floored to the nearest hour and averaged multiple readings per hour as instructed.
  • The combined series (Pipe1 + Pipe2) covers the overlapping time window and exhibits a discernible daily cycle.
  • ADF / KPSS tests determine whether differencing is needed. TBATS with periods c(24, 168) captures both the daily and weekly seasonal cycles simultaneously — making it the most appropriate method for hourly pipe flow data.
  • Wide prediction intervals reflect genuine uncertainty in flow data, which can be affected by unmodeled demand shocks or infrastructure events.

4 Session Info

sessionInfo()
## R version 4.5.2 (2025-10-31)
## Platform: aarch64-apple-darwin20
## Running under: macOS Tahoe 26.3.1
## 
## Matrix products: default
## BLAS:   /System/Library/Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/Versions/A/libBLAS.dylib 
## LAPACK: /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.12.1
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## time zone: America/Chicago
## tzcode source: internal
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] zoo_1.8-15      knitr_1.51      gridExtra_2.3   writexl_1.5.4  
##  [5] lubridate_1.9.5 tseries_0.10-61 forecast_9.0.2  ggplot2_4.0.2  
##  [9] tidyr_1.3.2     dplyr_1.2.0     readxl_1.4.5   
## 
## loaded via a namespace (and not attached):
##  [1] sass_0.4.10        generics_0.1.4     lattice_0.22-7     digest_0.6.39     
##  [5] magrittr_2.0.4     timechange_0.4.0   evaluate_1.0.5     grid_4.5.2        
##  [9] RColorBrewer_1.1-3 fastmap_1.2.0      Matrix_1.7-4       cellranger_1.1.0  
## [13] jsonlite_2.0.0     mgcv_1.9-3         purrr_1.2.1        scales_1.4.0      
## [17] jquerylib_0.1.4    cli_3.6.5          rlang_1.1.7        splines_4.5.2     
## [21] withr_3.0.2        cachem_1.1.0       yaml_2.3.12        tools_4.5.2       
## [25] parallel_4.5.2     colorspace_2.1-2   curl_7.0.0         vctrs_0.7.1       
## [29] R6_2.6.1           lifecycle_1.0.5    pkgconfig_2.0.3    urca_1.3-4        
## [33] pillar_1.11.1      bslib_0.10.0       gtable_0.3.6       glue_1.8.0        
## [37] quantmod_0.4.28    Rcpp_1.1.1         xfun_0.56          tibble_3.3.1      
## [41] tidyselect_1.2.1   rstudioapi_0.18.0  farver_2.1.2       htmltools_0.5.9   
## [45] nlme_3.1-168       labeling_0.4.3     rmarkdown_2.30     xts_0.14.2        
## [49] timeDate_4052.112  fracdiff_1.5-3     compiler_4.5.2     S7_0.2.1          
## [53] quadprog_1.5-8     TTR_0.24.4