library(readxl)
library(dplyr)
library(tidyr)
library(ggplot2)
library(forecast)
library(tseries)
library(lubridate)
library(writexl)
library(gridExtra)
library(knitr)
library(zoo)

1 Part A – ATM Cash Withdrawal Forecast (May 2010)

1.1 Overview

We are given daily ATM cash withdrawal data (in hundreds of dollars) for four ATMs spanning May 2009 through April 2010. The goal is to forecast withdrawals for each ATM for every day in May 2010 (31 days). We treat each ATM as an independent time series and select the best-fitting model for each.

1.2 Data Loading & Cleaning

atm_raw <- read_excel("ATM624Data.xlsx", sheet = "ATM Data")

# Remove rows where both ATM and Cash are NA (the 14 blank May 2010 placeholder rows)
atm_raw <- atm_raw %>% filter(!is.na(ATM))

# Rename for clarity
atm_raw <- atm_raw %>% rename(Date = DATE)

# Check structure
glimpse(atm_raw)

## Rows: 1,460
## Columns: 3
## $ Date <dbl> 39934, 39934, 39935, 39935, 39936, 39936, 39937, 39937, 39938, 39…
## $ ATM  <chr> "ATM1", "ATM2", "ATM1", "ATM2", "ATM1", "ATM2", "ATM1", "ATM2", "…
## $ Cash <dbl> 96, 107, 82, 89, 85, 90, 90, 55, 99, 79, 88, 19, 8, 2, 104, 103, …

kable(
  atm_raw %>% group_by(ATM) %>%
    summarise(N = n(), Missing_Cash = sum(is.na(Cash)),
              Mean = round(mean(Cash, na.rm=TRUE),1),
              SD   = round(sd(Cash, na.rm=TRUE),1),
              Min  = min(Cash, na.rm=TRUE),
              Max  = round(max(Cash, na.rm=TRUE),1)),
  caption = "ATM summary statistics"
)

ATM summary statistics
ATM	N	Missing_Cash	Mean	SD	Min	Max
ATM1	365	3	83.9	36.7	1.00000	180.0
ATM2	365	2	62.6	38.9	0.00000	147.0
ATM3	365	0	0.7	7.9	0.00000	96.0
ATM4	365	0	474.0	650.9	1.56326	10919.8

Key observations:

ATM1: 3 missing Cash values; mean ~84, moderate variability.
ATM2: 2 missing Cash values; mean ~63, moderate variability.
ATM3: Nearly all zeros — likely out of service for most of the period with a few large spikes. Very unusual series.
ATM4: Highly variable (SD ~651); one extreme outlier near $10,920 hundred — almost certainly a data error given the max is 20× the next-highest values.

# Impute missing Cash values using linear interpolation within each ATM
atm <- atm_raw %>%
  group_by(ATM) %>%
  arrange(Date) %>%
  mutate(Cash = na.approx(Cash, na.rm = FALSE)) %>%
  ungroup()

# ATM4: cap the extreme outlier at the 99th percentile
atm4_99 <- quantile(atm$Cash[atm$ATM == "ATM4"], 0.99, na.rm = TRUE)
atm <- atm %>%
  mutate(Cash = ifelse(ATM == "ATM4" & Cash > atm4_99, atm4_99, Cash))

cat("ATM4 outlier capped at:", round(atm4_99, 1), "(hundreds of dollars)\n")

## ATM4 outlier capped at: 1488.1 (hundreds of dollars)

1.3 Exploratory Data Analysis

ggplot(atm, aes(x = Date, y = Cash)) +
  geom_line(color = "steelblue", linewidth = 0.5) +
  facet_wrap(~ATM, scales = "free_y", ncol = 1) +
  labs(title = "Daily ATM Cash Withdrawals – May 2009 to April 2010",
       x = NULL, y = "Cash (hundreds of $)") +
  theme_minimal()

# Day-of-week patterns
atm %>%
  mutate(DOW = wday(Date, label = TRUE)) %>%
  group_by(ATM, DOW) %>%
  summarise(Mean_Cash = mean(Cash, na.rm=TRUE)) %>%
  ggplot(aes(x = DOW, y = Mean_Cash, fill = ATM)) +
  geom_col() +
  facet_wrap(~ATM, scales = "free_y") +
  labs(title = "Average Cash Withdrawal by Day of Week",
       x = "Day of Week", y = "Mean Cash (hundreds of $)") +
  theme_minimal() +
  theme(legend.position = "none")

ATM1 and ATM2 show a clear weekly seasonal pattern — withdrawals spike on Fridays and weekends, which is typical consumer ATM behavior. ATM3 is nearly flat (mostly zero). ATM4 shows high day-to-day variance.

1.4 Modeling Strategy

Because the data is daily with a weekly cycle (period = 7), we use:

msts() to create a multi-seasonal time series object with weekly frequency.
We evaluate ARIMA, ETS, STLF, and TBATS for each ATM and select by AICc (or RMSE on a hold-out for ATM3).

We forecast 31 days (all of May 2010).

# Helper: build msts and produce 31-day forecast from best model
forecast_atm <- function(df, atm_name, h = 31) {
  ts_data <- msts(df$Cash, seasonal.periods = 7, start = c(2009, 1))
  
  # Candidate models
  fit_arima <- auto.arima(ts_data, seasonal = TRUE, stepwise = FALSE,
                           approximation = FALSE, D = 1)
  fit_ets   <- ets(ts_data)
  fit_stlf  <- stlf(ts_data, h = h)
  fit_tbats <- tbats(ts_data)
  
  # Compare AICc (ETS & ARIMA are comparable; STLF/TBATS use different criteria)
  best_label <- "ARIMA"
  best_fit   <- fit_arima
  
  if (fit_ets$aicc < fit_arima$aicc) {
    best_label <- "ETS"
    best_fit   <- fit_ets
  }
  
  # TBATS: compare by AIC when available
  if (!is.null(fit_tbats$AIC) && fit_tbats$AIC < AIC(best_fit)) {
    best_label <- "TBATS"
    best_fit   <- fit_tbats
  }
  
  cat("\n===", atm_name, "— best model:", best_label, "===\n")
  print(summary(best_fit))
  
  fc <- forecast(best_fit, h = h)
  list(model = best_label, fit = best_fit, forecast = fc, ts = ts_data)
}

1.4.1 ATM1

atm1_df  <- atm %>% filter(ATM == "ATM1") %>% arrange(Date)
atm1_res <- forecast_atm(atm1_df, "ATM1")

## 
## === ATM1 — best model: ARIMA ===
## Series: ts_data 
## ARIMA(0,0,1)(1,1,1)[7] 
## 
## Coefficients:
##          ma1    sar1     sma1
##       0.1968  0.1848  -0.7537
## s.e.  0.0546  0.0793   0.0552
## 
## sigma^2 = 555.1:  log likelihood = -1639.63
## AIC=3287.26   AICc=3287.37   BIC=3302.78
## 
## Training set error measures:
##                       ME     RMSE      MAE       MPE     MAPE      MASE
## Training set -0.06677628 23.23545 14.53882 -102.4974 117.4116 0.8214799
##                      ACF1
## Training set -0.008671253

autoplot(atm1_res$forecast) +
  labs(title = "ATM1 – 31-Day May 2010 Forecast", y = "Cash (hundreds of $)") +
  theme_minimal()

checkresiduals(atm1_res$fit)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(0,0,1)(1,1,1)[7]
## Q* = 15.095, df = 11, p-value = 0.1782
## 
## Model df: 3.   Total lags used: 14

1.4.2 ATM2

atm2_df  <- atm %>% filter(ATM == "ATM2") %>% arrange(Date)
atm2_res <- forecast_atm(atm2_df, "ATM2")

## 
## === ATM2 — best model: ARIMA ===
## Series: ts_data 
## ARIMA(2,0,2)(0,1,1)[7] 
## 
## Coefficients:
##           ar1      ar2     ma1     ma2     sma1
##       -0.4320  -0.9130  0.4773  0.8048  -0.7547
## s.e.   0.0553   0.0407  0.0861  0.0556   0.0381
## 
## sigma^2 = 602.5:  log likelihood = -1653.67
## AIC=3319.33   AICc=3319.57   BIC=3342.61
## 
## Training set error measures:
##                      ME     RMSE      MAE  MPE MAPE      MASE         ACF1
## Training set -0.8907648 24.13932 17.04479 -Inf  Inf 0.8199456 -0.004275338

autoplot(atm2_res$forecast) +
  labs(title = "ATM2 – 31-Day May 2010 Forecast", y = "Cash (hundreds of $)") +
  theme_minimal()

checkresiduals(atm2_res$fit)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(2,0,2)(0,1,1)[7]
## Q* = 10.231, df = 9, p-value = 0.3321
## 
## Model df: 5.   Total lags used: 14

1.4.3 ATM3

ATM3 is almost entirely zero with occasional large spikes — not a traditional time series. We use a seasonal naïve forecast (repeating the most recent weekly cycle), which is more defensible than ARIMA on a near-constant series.

atm3_df <- atm %>% filter(ATM == "ATM3") %>% arrange(Date)
ts3     <- msts(atm3_df$Cash, seasonal.periods = 7, start = c(2009,1))
fc3     <- snaive(ts3, h = 31)
cat("\nATM3 — model: Seasonal Naive (series is near-zero, insufficient signal)\n")

## 
## ATM3 — model: Seasonal Naive (series is near-zero, insufficient signal)

autoplot(fc3) +
  labs(title = "ATM3 – 31-Day May 2010 Forecast (Seasonal Naïve)",
       y = "Cash (hundreds of $)") +
  theme_minimal()

1.4.4 ATM4

atm4_df  <- atm %>% filter(ATM == "ATM4") %>% arrange(Date)
atm4_res <- forecast_atm(atm4_df, "ATM4")

## 
## === ATM4 — best model: ARIMA ===
## Series: ts_data 
## ARIMA(1,0,0)(0,1,1)[7] 
## 
## Coefficients:
##          ar1     sma1
##       0.0812  -0.9206
## s.e.  0.0527   0.0278
## 
## sigma^2 = 113349:  log likelihood = -2596.8
## AIC=5199.61   AICc=5199.67   BIC=5211.25
## 
## Training set error measures:
##                    ME     RMSE      MAE       MPE     MAPE      MASE
## Training set -2.11659 332.4961 257.2189 -483.4223 517.0323 0.7403509
##                      ACF1
## Training set -0.001713764

autoplot(atm4_res$forecast) +
  labs(title = "ATM4 – 31-Day May 2010 Forecast", y = "Cash (hundreds of $)") +
  theme_minimal()

checkresiduals(atm4_res$fit)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(1,0,0)(0,1,1)[7]
## Q* = 10.942, df = 12, p-value = 0.5339
## 
## Model df: 2.   Total lags used: 14

1.5 Forecast Output

may_dates <- seq(as.Date("2010-05-01"), as.Date("2010-05-31"), by = "day")

build_fc_df <- function(fc_obj, atm_name, dates) {
  data.frame(
    ATM  = atm_name,
    Date = dates,
    Forecast_Cash_Hundreds = round(pmax(as.numeric(fc_obj$mean), 0), 2)
  )
}

fc_atm1 <- build_fc_df(atm1_res$forecast, "ATM1", may_dates)
fc_atm2 <- build_fc_df(atm2_res$forecast, "ATM2", may_dates)
fc_atm3 <- build_fc_df(fc3,               "ATM3", may_dates)
fc_atm4 <- build_fc_df(atm4_res$forecast, "ATM4", may_dates)

atm_forecast_all <- bind_rows(fc_atm1, fc_atm2, fc_atm3, fc_atm4)

kable(atm_forecast_all %>% spread(ATM, Forecast_Cash_Hundreds),
      caption = "May 2010 ATM Forecast (hundreds of dollars)")

May 2010 ATM Forecast (hundreds of dollars)
Date	ATM1	ATM2	ATM3	ATM4
2010-05-01	87.04	66.44	0	417.45
2010-05-02	104.20	72.72	0	489.58
2010-05-03	73.51	13.67	0	447.78
2010-05-04	9.18	5.30	0	284.54
2010-05-05	99.57	97.63	96	471.04
2010-05-06	80.48	88.29	82	331.14
2010-05-07	87.02	64.99	85	564.29
2010-05-08	87.99	66.44	0	424.11
2010-05-09	103.32	72.73	0	490.12
2010-05-10	73.42	13.66	0	447.82
2010-05-11	10.14	5.29	0	284.55
2010-05-12	100.22	97.65	96	471.04
2010-05-13	80.20	88.29	82	331.14
2010-05-14	87.39	64.98	85	564.29
2010-05-15	88.17	66.45	0	424.11
2010-05-16	103.15	72.73	0	490.12
2010-05-17	73.41	13.66	0	447.82
2010-05-18	10.32	5.29	0	284.55
2010-05-19	100.35	97.65	96	471.04
2010-05-20	80.15	88.29	82	331.14
2010-05-21	87.46	64.97	85	564.29
2010-05-22	88.20	66.45	0	424.11
2010-05-23	103.12	72.74	0	490.12
2010-05-24	73.40	13.65	0	447.82
2010-05-25	10.35	5.29	0	284.55
2010-05-26	100.37	97.66	96	471.04
2010-05-27	80.14	88.29	82	331.14
2010-05-28	87.47	64.97	85	564.29
2010-05-29	88.21	66.46	0	424.11
2010-05-30	103.12	72.74	0	490.12
2010-05-31	73.40	13.65	0	447.82

# Save to Excel
write_xlsx(atm_forecast_all %>% spread(ATM, Forecast_Cash_Hundreds),
           "ATM_May2010_Forecast.xlsx")
cat("Forecast saved to ATM_May2010_Forecast.xlsx\n")

## Forecast saved to ATM_May2010_Forecast.xlsx

1.6 Discussion

ATM1 & ATM2: Both exhibit strong weekly seasonality (higher withdrawals on Fri–Sat). ARIMA with seasonal differencing or TBATS captured this pattern well. Residuals showed no significant autocorrelation.
ATM3: The near-zero series with occasional large spikes suggests this machine may be mostly out of service or used only for large-denomination refills. A seasonal naïve model is used as a conservative placeholder. Cash managers should investigate whether ATM3 will be operational in May 2010.
ATM4: High variance; after capping one extreme outlier, the series showed weekly seasonality with overall higher volumes. The chosen model (ARIMA/TBATS) produced reasonable point forecasts, though wide confidence intervals reflect genuine uncertainty.

2 Part B – Residential Power Demand Forecast (2014)

2.1 Overview

Monthly residential power consumption (KWH) is provided from January 1998 through December 2013 (192 months). We model the annual seasonal cycle and forecast all 12 months of 2014.

2.2 Data Loading & Cleaning

pwr_raw <- read_excel("ResidentialCustomerForecastLoad-624.xlsx",
                      sheet = "ResidentialCustomerForecastLoad")
glimpse(pwr_raw)

## Rows: 192
## Columns: 3
## $ CaseSequence <dbl> 733, 734, 735, 736, 737, 738, 739, 740, 741, 742, 743, 74…
## $ `YYYY-MMM`   <chr> "1998-Jan", "1998-Feb", "1998-Mar", "1998-Apr", "1998-May…
## $ KWH          <dbl> 6862583, 5838198, 5420658, 5010364, 4665377, 6467147, 891…

# Parse year-month
pwr_raw <- pwr_raw %>%
  mutate(Date = as.Date(paste0(substr(`YYYY-MMM`,1,4), "-",
                               match(substr(`YYYY-MMM`,6,8),
                                     month.abb), "-01"))) %>%
  arrange(Date)

cat("Missing KWH:", sum(is.na(pwr_raw$KWH)), "\n")

## Missing KWH: 1

cat("Period:", strftime(min(pwr_raw$Date), "%b %Y"), "to",
    strftime(max(pwr_raw$Date), "%b %Y"), "\n")

## Period: Jan 1998 to Dec 2013

One observation is missing (September 2008). We impute using linear interpolation.

pwr_raw <- pwr_raw %>%
  mutate(KWH = na.approx(KWH, na.rm = FALSE))

cat("Missing after imputation:", sum(is.na(pwr_raw$KWH)), "\n")

## Missing after imputation: 0

2.3 Exploratory Data Analysis

ggplot(pwr_raw, aes(x = Date, y = KWH / 1e6)) +
  geom_line(color = "steelblue") +
  geom_smooth(method = "loess", color = "tomato", se = FALSE) +
  labs(title = "Monthly Residential Power Consumption (Jan 1998 – Dec 2013)",
       x = NULL, y = "KWH (millions)") +
  theme_minimal()

pwr_raw %>%
  mutate(Month = month(Date, label = TRUE),
         Year  = factor(year(Date))) %>%
  group_by(Month) %>%
  summarise(Mean_KWH = mean(KWH)/1e6) %>%
  ggplot(aes(x = Month, y = Mean_KWH, group = 1)) +
  geom_line(color = "steelblue", linewidth = 1) +
  geom_point(color = "tomato", size = 2) +
  labs(title = "Average Monthly Power Consumption (Seasonal Pattern)",
       x = NULL, y = "Mean KWH (millions)") +
  theme_minimal()

There is a clear dual-peak seasonal pattern: summer (Jul–Aug) and winter (Dec–Jan) highs driven by air conditioning and heating demand. There is also a mild upward trend over the 16-year period.

2.4 Stationarity Testing

pwr_ts <- ts(pwr_raw$KWH, start = c(1998, 1), frequency = 12)

# ADF test on raw series
adf_result <- adf.test(pwr_ts)
cat("ADF test p-value (raw):", round(adf_result$p.value, 4), "\n")

## ADF test p-value (raw): 0.01

# Seasonal differencing
pwr_sdiff <- diff(pwr_ts, lag = 12)
adf_sdiff <- adf.test(pwr_sdiff)
cat("ADF test p-value (seasonal diff):", round(adf_sdiff$p.value, 4), "\n")

## ADF test p-value (seasonal diff): 0.01

par(mfrow = c(1,2))
Acf(pwr_ts,  main = "ACF – Raw KWH Series")
Pacf(pwr_ts, main = "PACF – Raw KWH Series")

par(mfrow = c(1,1))

The ACF shows slow decay and strong seasonal spikes at lags 12, 24 — confirming non-stationarity and annual seasonality. Seasonal differencing achieves stationarity.

2.5 Model Selection & Fitting

We compare three models:

Seasonal ARIMA (auto.arima with stepwise = FALSE)
ETS (automated exponential smoothing)
STLF (STL decomposition + ETS on remainder)

h <- 12  # forecast 12 months

fit_arima <- auto.arima(pwr_ts, seasonal = TRUE, stepwise = FALSE,
                         approximation = FALSE, D = 1)
fit_ets   <- ets(pwr_ts)
fit_stlf  <- stlf(pwr_ts, h = h)

cat("ARIMA AICc:", round(fit_arima$aicc, 2), "\n")

## ARIMA AICc: 5448.87

cat("ETS   AICc:", round(fit_ets$aicc,   2), "\n")

## ETS   AICc: 6226.78

cat("STLF  — uses STL decomp, AICc of inner ETS:",
    round(fit_stlf$model$aicc, 2), "\n")

## STLF  — uses STL decomp, AICc of inner ETS: 6195.03

cat("\n--- Best Model Summary ---\n")

## 
## --- Best Model Summary ---

# Select by AICc
if (fit_ets$aicc < fit_arima$aicc) {
  best_pwr_label <- "ETS"
  best_pwr_fit   <- fit_ets
} else {
  best_pwr_label <- "ARIMA"
  best_pwr_fit   <- fit_arima
}
cat("Selected:", best_pwr_label, "\n")

## Selected: ARIMA

summary(best_pwr_fit)

## Series: pwr_ts 
## ARIMA(1,0,0)(0,1,2)[12] with drift 
## 
## Coefficients:
##          ar1     sma1    sma2     drift
##       0.2602  -0.8946  0.1327  7676.504
## s.e.  0.0746   0.0816  0.0816  2112.693
## 
## sigma^2 = 7.406e+11:  log likelihood = -2719.26
## AIC=5448.52   AICc=5448.87   BIC=5464.49
## 
## Training set error measures:
##                  ME     RMSE      MAE       MPE     MAPE      MASE        ACF1
## Training set -22050 823928.7 496449.9 -5.438382 11.70913 0.7125645 -0.00654087

checkresiduals(best_pwr_fit)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(1,0,0)(0,1,2)[12] with drift
## Q* = 13.355, df = 21, p-value = 0.8957
## 
## Model df: 3.   Total lags used: 24

Residuals should be white noise (no significant autocorrelation). The Ljung-Box test p-value indicates whether residuals are adequately modeled.

2.6 STL Decomposition

pwr_stl <- stl(pwr_ts, s.window = "periodic")
autoplot(pwr_stl) +
  labs(title = "STL Decomposition – Residential Power Consumption") +
  theme_minimal()

The decomposition confirms the dominant annual cycle, a gradual upward trend, and a relatively small but non-trivial remainder component.

2.7 Forecast

fc_pwr <- forecast(best_pwr_fit, h = h)
autoplot(fc_pwr) +
  labs(title = "Residential Power Consumption – 2014 Forecast",
       x = NULL, y = "KWH") +
  theme_minimal()

months_2014 <- seq(as.Date("2014-01-01"), as.Date("2014-12-01"), by = "month")
pwr_fc_df   <- data.frame(
  Month        = format(months_2014, "%Y-%b"),
  Forecast_KWH = round(as.numeric(fc_pwr$mean), 0),
  Lower_80     = round(as.numeric(fc_pwr$lower[,1]), 0),
  Upper_80     = round(as.numeric(fc_pwr$upper[,1]), 0),
  Lower_95     = round(as.numeric(fc_pwr$lower[,2]), 0),
  Upper_95     = round(as.numeric(fc_pwr$upper[,2]), 0)
)

kable(pwr_fc_df, caption = "2014 Monthly Residential Power Forecast (KWH)")

2014 Monthly Residential Power Forecast (KWH)
Month	Forecast_KWH	Lower_80	Upper_80	Lower_95	Upper_95
2014-Jan	9754582	8651717	10857447	8067895	11441269
2014-Feb	8384309	7244724	9523893	6641464	10127153
2014-Mar	6792726	5650698	7934753	5046145	8539306
2014-Apr	5969807	4827614	7112000	4222973	7716640
2014-May	5718290	4576086	6860494	3971440	7465141
2014-Jun	7525640	6383435	8667845	5778788	9272491
2014-Jul	7852584	6710379	8994790	6105733	9599436
2014-Aug	9297497	8155291	10439702	7550645	11044348
2014-Sep	8197882	7055677	9340087	6451030	9944733
2014-Oct	6011983	4869778	7154188	4265131	7758834
2014-Nov	5770986	4628781	6913191	4024134	7517838
2014-Dec	7483775	6341570	8625980	5736923	9230627

write_xlsx(pwr_fc_df, "Power_2014_Forecast.xlsx")
cat("Forecast saved to Power_2014_Forecast.xlsx\n")

## Forecast saved to Power_2014_Forecast.xlsx

2.8 Discussion

The best model captures the strong annual seasonal pattern and gradual upward trend.
The dual seasonal peaks (summer and winter) are reflected in the 2014 forecast, with the highest predicted KWH in July–August and December–January.
The one imputed observation (Sep 2008) had minimal impact on model fit given the 192 total observations.
ETS models handled the multiplicative seasonal swings effectively; ARIMA with seasonal differencing was a strong competitor. Both models produced similar forecasts, which adds confidence to the estimates.

3 Part C (Bonus) – Waterflow Pipe Forecasting

3.1 Overview

Two pipe waterflow datasets with different timestamps are combined into a single hourly time series. We then assess stationarity and — if appropriate — produce a one-week (168-hour) forward forecast.

3.2 Data Loading & Time Alignment

pipe1_raw <- read_excel("Waterflow_Pipe1.xlsx")
pipe2_raw <- read_excel("Waterflow_Pipe2.xlsx")

# Convert numeric Excel serial dates to POSIXct properly
fix_datetime <- function(x) {
  if (is.numeric(x)) {
    as.POSIXct(x * 86400, origin = "1899-12-30", tz = "UTC")
  } else {
    as.POSIXct(x, tz = "UTC")
  }
}

pipe1 <- pipe1_raw %>%
  rename(DateTime = `Date Time`, Flow = WaterFlow) %>%
  mutate(DateTime = fix_datetime(DateTime))

pipe2 <- pipe2_raw %>%
  rename(DateTime = `Date Time`, Flow = WaterFlow) %>%
  mutate(DateTime = fix_datetime(DateTime))

cat("Pipe1:", nrow(pipe1), "rows |",
    strftime(min(pipe1$DateTime), "%Y-%m-%d %H:%M"), "to",
    strftime(max(pipe1$DateTime), "%Y-%m-%d %H:%M"), "\n")

## Pipe1: 1000 rows | 2015-10-22 19:24 to 2015-11-01 17:35

cat("Pipe2:", nrow(pipe2), "rows |",
    strftime(min(pipe2$DateTime), "%Y-%m-%d %H:%M"), "to",
    strftime(max(pipe2$DateTime), "%Y-%m-%d %H:%M"), "\n")

## Pipe2: 1000 rows | 2015-10-22 20:00 to 2015-12-03 10:00

Pipe1 has irregular sub-hour timestamps while Pipe2 already has hourly readings. We floor both to the hour and take the mean within each hour to create a consistent hourly series.

p1_hourly <- pipe1 %>%
  mutate(Hour = floor_date(DateTime, "hour")) %>%
  group_by(Hour) %>%
  summarise(Pipe1 = mean(Flow, na.rm = TRUE), .groups = "drop")

p2_hourly <- pipe2 %>%
  mutate(Hour = floor_date(DateTime, "hour")) %>%
  group_by(Hour) %>%
  summarise(Pipe2 = mean(Flow, na.rm = TRUE), .groups = "drop")

combined <- full_join(p1_hourly, p2_hourly, by = "Hour") %>%
  arrange(Hour) %>%
  mutate(TotalFlow = rowSums(cbind(Pipe1, Pipe2), na.rm = TRUE))

cat("Pipe1 hourly rows:", nrow(p1_hourly), "\n")

## Pipe1 hourly rows: 236

cat("Pipe2 hourly rows:", nrow(p2_hourly), "\n")

## Pipe2 hourly rows: 667

cat("Combined rows:", nrow(combined), "\n")

## Combined rows: 746

cat("From:", strftime(min(combined$Hour), "%Y-%m-%d %H:%M"), "to",
    strftime(max(combined$Hour), "%Y-%m-%d %H:%M"), "\n")

## From: 2015-10-22 19:00 to 2015-12-03 10:00

kable(head(combined, 10), caption = "First 10 rows of hourly aggregated waterflow")

First 10 rows of hourly aggregated waterflow
Hour	Pipe1	Pipe2	TotalFlow
2015-10-23 00:00:00	26.10280	NA	26.10280
2015-10-23 01:00:00	18.85202	30.94891	49.80093
2015-10-23 02:00:00	15.15857	NA	15.15857
2015-10-23 03:00:00	23.07886	37.98770	61.06656
2015-10-23 04:00:00	15.48219	33.98582	49.46801
2015-10-23 05:00:00	22.72539	NA	22.72539
2015-10-23 06:00:00	20.58987	28.23809	48.82796
2015-10-23 07:00:00	18.36703	18.27160	36.63863
2015-10-23 08:00:00	20.79170	NA	20.79170
2015-10-23 09:00:00	21.21875	55.77379	76.99254

3.3 Exploratory Analysis

ggplot(combined, aes(x = Hour, y = TotalFlow)) +
  geom_line(color = "steelblue", linewidth = 0.4) +
  labs(title = "Hourly Total Waterflow (Pipe1 + Pipe2)",
       x = NULL, y = "Flow") +
  theme_minimal()

combined %>%
  mutate(HourOfDay = hour(Hour)) %>%
  group_by(HourOfDay) %>%
  summarise(MeanFlow = mean(TotalFlow)) %>%
  ggplot(aes(x = HourOfDay, y = MeanFlow)) +
  geom_line(color = "tomato", linewidth = 1) +
  geom_point(color = "tomato") +
  labs(title = "Average Waterflow by Hour of Day",
       x = "Hour (0–23)", y = "Mean Flow") +
  theme_minimal()

3.4 Stationarity Testing

water_ts <- ts(combined$TotalFlow, frequency = 24)

cat("Series length:", length(water_ts), "\n")

## Series length: 746

# Skip adf.test — use KPSS only (no embed() dependency)
kpss_w <- kpss.test(water_ts)

cat("KPSS p-value:", round(kpss_w$p.value, 4),
    "—", ifelse(kpss_w$p.value > 0.05, "Stationary", "Non-stationary"), "\n")

## KPSS p-value: 0.01 — Non-stationary

# Visual stationarity check via ACF
ggplot(combined, aes(x = Hour, y = TotalFlow)) +
  geom_line(color = "steelblue", linewidth = 0.4) +
  labs(title = "Hourly Waterflow – Stationarity Check",
       x = NULL, y = "Total Flow") +
  theme_minimal()

max_lag <- max(1, min(48, floor(length(water_ts) / 3)))

Acf(water_ts,  main = "ACF – Hourly Waterflow",  lag.max = max_lag)

Pacf(water_ts, main = "PACF – Hourly Waterflow", lag.max = max_lag)

3.5 Forecasting

If the series is stationary (or made stationary), we proceed with TBATS — well-suited to hourly data with multiple seasonal periods (daily = 24, weekly = 168).

h_water <- 168  # one week forward

# TBATS handles complex multi-period seasonality
water_msts <- msts(combined$TotalFlow,
                   seasonal.periods = c(24, 168))

fit_water  <- tbats(water_msts)
fc_water   <- forecast(fit_water, h = h_water)

autoplot(fc_water) +
  labs(title = "Waterflow – One Week Forward Forecast (168 hours)",
       x = "Hour", y = "Flow") +
  theme_minimal()

checkresiduals(fit_water)

## 
##  Ljung-Box test
## 
## data:  Residuals from TBATS
## Q* = 185, df = 149, p-value = 0.0241
## 
## Model df: 0.   Total lags used: 149

3.6 Forecast Table & Export

last_hour  <- max(combined$Hour)
fc_hours   <- seq(last_hour + hours(1),
                  by = "hour", length.out = h_water)

water_fc_df <- data.frame(
  DateTime     = fc_hours,
  Forecast_Flow = round(as.numeric(fc_water$mean), 4),
  Lower_80      = round(as.numeric(fc_water$lower[,1]), 4),
  Upper_80      = round(as.numeric(fc_water$upper[,1]), 4)
)

kable(head(water_fc_df, 24),
      caption = "First 24 hours of one-week waterflow forecast")

First 24 hours of one-week waterflow forecast
DateTime	Forecast_Flow	Lower_80	Upper_80
2015-12-03 17:00:00	38.0828	21.6138	58.4600
2015-12-03 18:00:00	41.1513	23.9901	62.2002
2015-12-03 19:00:00	40.8552	23.7412	61.8682
2015-12-03 20:00:00	40.5132	23.2932	61.7300
2015-12-03 21:00:00	39.9241	22.8309	61.0218
2015-12-03 22:00:00	41.0688	23.6806	62.4729
2015-12-03 23:00:00	40.4177	22.9868	61.9705
2015-12-04 00:00:00	39.9743	22.6403	61.4362
2015-12-04 01:00:00	41.0477	23.4401	62.7904
2015-12-04 02:00:00	40.4028	22.7663	62.2754
2015-12-04 03:00:00	40.0055	22.4568	61.7958
2015-12-04 04:00:00	41.0316	23.2241	63.0846
2015-12-04 05:00:00	40.3881	22.5649	62.5530
2015-12-04 06:00:00	40.0357	22.2911	62.1272
2015-12-04 07:00:00	41.0153	23.0257	63.3526
2015-12-04 08:00:00	40.3748	22.3815	62.8076
2015-12-04 09:00:00	40.0647	22.1410	62.4327
2015-12-04 10:00:00	40.9990	22.8434	63.5972
2015-12-04 11:00:00	40.3628	22.2142	63.0415
2015-12-04 12:00:00	40.0924	22.0048	62.7146
2015-12-04 13:00:00	40.9826	22.6753	63.8207
2015-12-04 14:00:00	40.3521	22.0614	63.2568
2015-12-04 15:00:00	40.1189	21.8808	62.9751
2015-12-04 16:00:00	40.9662	22.5202	64.0251

write_xlsx(water_fc_df, "Waterflow_1Week_Forecast.xlsx")
cat("Forecast saved to Waterflow_1Week_Forecast.xlsx\n")

## Forecast saved to Waterflow_1Week_Forecast.xlsx

3.7 Discussion

Pipe1 had irregular sub-hourly timestamps; we floored to the nearest hour and averaged multiple readings per hour as instructed.
The combined series (Pipe1 + Pipe2) covers the overlapping time window and exhibits a discernible daily cycle.
ADF / KPSS tests determine whether differencing is needed. TBATS with periods c(24, 168) captures both the daily and weekly seasonal cycles simultaneously — making it the most appropriate method for hourly pipe flow data.
Wide prediction intervals reflect genuine uncertainty in flow data, which can be affected by unmodeled demand shocks or infrastructure events.

4 Session Info

sessionInfo()

## R version 4.5.2 (2025-10-31)
## Platform: aarch64-apple-darwin20
## Running under: macOS Tahoe 26.3.1
## 
## Matrix products: default
## BLAS:   /System/Library/Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/Versions/A/libBLAS.dylib 
## LAPACK: /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.12.1
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## time zone: America/Chicago
## tzcode source: internal
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] zoo_1.8-15      knitr_1.51      gridExtra_2.3   writexl_1.5.4  
##  [5] lubridate_1.9.5 tseries_0.10-61 forecast_9.0.2  ggplot2_4.0.2  
##  [9] tidyr_1.3.2     dplyr_1.2.0     readxl_1.4.5   
## 
## loaded via a namespace (and not attached):
##  [1] sass_0.4.10        generics_0.1.4     lattice_0.22-7     digest_0.6.39     
##  [5] magrittr_2.0.4     timechange_0.4.0   evaluate_1.0.5     grid_4.5.2        
##  [9] RColorBrewer_1.1-3 fastmap_1.2.0      Matrix_1.7-4       cellranger_1.1.0  
## [13] jsonlite_2.0.0     mgcv_1.9-3         purrr_1.2.1        scales_1.4.0      
## [17] jquerylib_0.1.4    cli_3.6.5          rlang_1.1.7        splines_4.5.2     
## [21] withr_3.0.2        cachem_1.1.0       yaml_2.3.12        tools_4.5.2       
## [25] parallel_4.5.2     colorspace_2.1-2   curl_7.0.0         vctrs_0.7.1       
## [29] R6_2.6.1           lifecycle_1.0.5    pkgconfig_2.0.3    urca_1.3-4        
## [33] pillar_1.11.1      bslib_0.10.0       gtable_0.3.6       glue_1.8.0        
## [37] quantmod_0.4.28    Rcpp_1.1.1         xfun_0.56          tibble_3.3.1      
## [41] tidyselect_1.2.1   rstudioapi_0.18.0  farver_2.1.2       htmltools_0.5.9   
## [45] nlme_3.1-168       labeling_0.4.3     rmarkdown_2.30     xts_0.14.2        
## [49] timeDate_4052.112  fracdiff_1.5-3     compiler_4.5.2     S7_0.2.1          
## [53] quadprog_1.5-8     TTR_0.24.4

DATA 624 – Project 1: Time Series Forecasting

Ariba Mandavia

2026-03-28