library(readxl)
library(dplyr)
library(tidyr)
library(ggplot2)
library(forecast)
library(tseries)
library(lubridate)
library(writexl)
library(gridExtra)
library(knitr)
library(zoo)We are given daily ATM cash withdrawal data (in hundreds of dollars) for four ATMs spanning May 2009 through April 2010. The goal is to forecast withdrawals for each ATM for every day in May 2010 (31 days). We treat each ATM as an independent time series and select the best-fitting model for each.
atm_raw <- read_excel("ATM624Data.xlsx", sheet = "ATM Data")
# Remove rows where both ATM and Cash are NA (the 14 blank May 2010 placeholder rows)
atm_raw <- atm_raw %>% filter(!is.na(ATM))
# Rename for clarity
atm_raw <- atm_raw %>% rename(Date = DATE)
# Check structure
glimpse(atm_raw)## Rows: 1,460
## Columns: 3
## $ Date <dbl> 39934, 39934, 39935, 39935, 39936, 39936, 39937, 39937, 39938, 39…
## $ ATM <chr> "ATM1", "ATM2", "ATM1", "ATM2", "ATM1", "ATM2", "ATM1", "ATM2", "…
## $ Cash <dbl> 96, 107, 82, 89, 85, 90, 90, 55, 99, 79, 88, 19, 8, 2, 104, 103, …
kable(
atm_raw %>% group_by(ATM) %>%
summarise(N = n(), Missing_Cash = sum(is.na(Cash)),
Mean = round(mean(Cash, na.rm=TRUE),1),
SD = round(sd(Cash, na.rm=TRUE),1),
Min = min(Cash, na.rm=TRUE),
Max = round(max(Cash, na.rm=TRUE),1)),
caption = "ATM summary statistics"
)| ATM | N | Missing_Cash | Mean | SD | Min | Max |
|---|---|---|---|---|---|---|
| ATM1 | 365 | 3 | 83.9 | 36.7 | 1.00000 | 180.0 |
| ATM2 | 365 | 2 | 62.6 | 38.9 | 0.00000 | 147.0 |
| ATM3 | 365 | 0 | 0.7 | 7.9 | 0.00000 | 96.0 |
| ATM4 | 365 | 0 | 474.0 | 650.9 | 1.56326 | 10919.8 |
Key observations:
# Impute missing Cash values using linear interpolation within each ATM
atm <- atm_raw %>%
group_by(ATM) %>%
arrange(Date) %>%
mutate(Cash = na.approx(Cash, na.rm = FALSE)) %>%
ungroup()
# ATM4: cap the extreme outlier at the 99th percentile
atm4_99 <- quantile(atm$Cash[atm$ATM == "ATM4"], 0.99, na.rm = TRUE)
atm <- atm %>%
mutate(Cash = ifelse(ATM == "ATM4" & Cash > atm4_99, atm4_99, Cash))
cat("ATM4 outlier capped at:", round(atm4_99, 1), "(hundreds of dollars)\n")## ATM4 outlier capped at: 1488.1 (hundreds of dollars)
ggplot(atm, aes(x = Date, y = Cash)) +
geom_line(color = "steelblue", linewidth = 0.5) +
facet_wrap(~ATM, scales = "free_y", ncol = 1) +
labs(title = "Daily ATM Cash Withdrawals – May 2009 to April 2010",
x = NULL, y = "Cash (hundreds of $)") +
theme_minimal()# Day-of-week patterns
atm %>%
mutate(DOW = wday(Date, label = TRUE)) %>%
group_by(ATM, DOW) %>%
summarise(Mean_Cash = mean(Cash, na.rm=TRUE)) %>%
ggplot(aes(x = DOW, y = Mean_Cash, fill = ATM)) +
geom_col() +
facet_wrap(~ATM, scales = "free_y") +
labs(title = "Average Cash Withdrawal by Day of Week",
x = "Day of Week", y = "Mean Cash (hundreds of $)") +
theme_minimal() +
theme(legend.position = "none")ATM1 and ATM2 show a clear weekly seasonal pattern — withdrawals spike on Fridays and weekends, which is typical consumer ATM behavior. ATM3 is nearly flat (mostly zero). ATM4 shows high day-to-day variance.
Because the data is daily with a weekly cycle (period = 7), we use:
msts() to create a multi-seasonal time series object
with weekly frequency.We forecast 31 days (all of May 2010).
# Helper: build msts and produce 31-day forecast from best model
forecast_atm <- function(df, atm_name, h = 31) {
ts_data <- msts(df$Cash, seasonal.periods = 7, start = c(2009, 1))
# Candidate models
fit_arima <- auto.arima(ts_data, seasonal = TRUE, stepwise = FALSE,
approximation = FALSE, D = 1)
fit_ets <- ets(ts_data)
fit_stlf <- stlf(ts_data, h = h)
fit_tbats <- tbats(ts_data)
# Compare AICc (ETS & ARIMA are comparable; STLF/TBATS use different criteria)
best_label <- "ARIMA"
best_fit <- fit_arima
if (fit_ets$aicc < fit_arima$aicc) {
best_label <- "ETS"
best_fit <- fit_ets
}
# TBATS: compare by AIC when available
if (!is.null(fit_tbats$AIC) && fit_tbats$AIC < AIC(best_fit)) {
best_label <- "TBATS"
best_fit <- fit_tbats
}
cat("\n===", atm_name, "— best model:", best_label, "===\n")
print(summary(best_fit))
fc <- forecast(best_fit, h = h)
list(model = best_label, fit = best_fit, forecast = fc, ts = ts_data)
}atm1_df <- atm %>% filter(ATM == "ATM1") %>% arrange(Date)
atm1_res <- forecast_atm(atm1_df, "ATM1")##
## === ATM1 — best model: ARIMA ===
## Series: ts_data
## ARIMA(0,0,1)(1,1,1)[7]
##
## Coefficients:
## ma1 sar1 sma1
## 0.1968 0.1848 -0.7537
## s.e. 0.0546 0.0793 0.0552
##
## sigma^2 = 555.1: log likelihood = -1639.63
## AIC=3287.26 AICc=3287.37 BIC=3302.78
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set -0.06677628 23.23545 14.53882 -102.4974 117.4116 0.8214799
## ACF1
## Training set -0.008671253
autoplot(atm1_res$forecast) +
labs(title = "ATM1 – 31-Day May 2010 Forecast", y = "Cash (hundreds of $)") +
theme_minimal()##
## Ljung-Box test
##
## data: Residuals from ARIMA(0,0,1)(1,1,1)[7]
## Q* = 15.095, df = 11, p-value = 0.1782
##
## Model df: 3. Total lags used: 14
atm2_df <- atm %>% filter(ATM == "ATM2") %>% arrange(Date)
atm2_res <- forecast_atm(atm2_df, "ATM2")##
## === ATM2 — best model: ARIMA ===
## Series: ts_data
## ARIMA(2,0,2)(0,1,1)[7]
##
## Coefficients:
## ar1 ar2 ma1 ma2 sma1
## -0.4320 -0.9130 0.4773 0.8048 -0.7547
## s.e. 0.0553 0.0407 0.0861 0.0556 0.0381
##
## sigma^2 = 602.5: log likelihood = -1653.67
## AIC=3319.33 AICc=3319.57 BIC=3342.61
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set -0.8907648 24.13932 17.04479 -Inf Inf 0.8199456 -0.004275338
autoplot(atm2_res$forecast) +
labs(title = "ATM2 – 31-Day May 2010 Forecast", y = "Cash (hundreds of $)") +
theme_minimal()##
## Ljung-Box test
##
## data: Residuals from ARIMA(2,0,2)(0,1,1)[7]
## Q* = 10.231, df = 9, p-value = 0.3321
##
## Model df: 5. Total lags used: 14
ATM3 is almost entirely zero with occasional large spikes — not a traditional time series. We use a seasonal naïve forecast (repeating the most recent weekly cycle), which is more defensible than ARIMA on a near-constant series.
atm3_df <- atm %>% filter(ATM == "ATM3") %>% arrange(Date)
ts3 <- msts(atm3_df$Cash, seasonal.periods = 7, start = c(2009,1))
fc3 <- snaive(ts3, h = 31)
cat("\nATM3 — model: Seasonal Naive (series is near-zero, insufficient signal)\n")##
## ATM3 — model: Seasonal Naive (series is near-zero, insufficient signal)
autoplot(fc3) +
labs(title = "ATM3 – 31-Day May 2010 Forecast (Seasonal Naïve)",
y = "Cash (hundreds of $)") +
theme_minimal()atm4_df <- atm %>% filter(ATM == "ATM4") %>% arrange(Date)
atm4_res <- forecast_atm(atm4_df, "ATM4")##
## === ATM4 — best model: ARIMA ===
## Series: ts_data
## ARIMA(1,0,0)(0,1,1)[7]
##
## Coefficients:
## ar1 sma1
## 0.0812 -0.9206
## s.e. 0.0527 0.0278
##
## sigma^2 = 113349: log likelihood = -2596.8
## AIC=5199.61 AICc=5199.67 BIC=5211.25
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set -2.11659 332.4961 257.2189 -483.4223 517.0323 0.7403509
## ACF1
## Training set -0.001713764
autoplot(atm4_res$forecast) +
labs(title = "ATM4 – 31-Day May 2010 Forecast", y = "Cash (hundreds of $)") +
theme_minimal()##
## Ljung-Box test
##
## data: Residuals from ARIMA(1,0,0)(0,1,1)[7]
## Q* = 10.942, df = 12, p-value = 0.5339
##
## Model df: 2. Total lags used: 14
may_dates <- seq(as.Date("2010-05-01"), as.Date("2010-05-31"), by = "day")
build_fc_df <- function(fc_obj, atm_name, dates) {
data.frame(
ATM = atm_name,
Date = dates,
Forecast_Cash_Hundreds = round(pmax(as.numeric(fc_obj$mean), 0), 2)
)
}
fc_atm1 <- build_fc_df(atm1_res$forecast, "ATM1", may_dates)
fc_atm2 <- build_fc_df(atm2_res$forecast, "ATM2", may_dates)
fc_atm3 <- build_fc_df(fc3, "ATM3", may_dates)
fc_atm4 <- build_fc_df(atm4_res$forecast, "ATM4", may_dates)
atm_forecast_all <- bind_rows(fc_atm1, fc_atm2, fc_atm3, fc_atm4)
kable(atm_forecast_all %>% spread(ATM, Forecast_Cash_Hundreds),
caption = "May 2010 ATM Forecast (hundreds of dollars)")| Date | ATM1 | ATM2 | ATM3 | ATM4 |
|---|---|---|---|---|
| 2010-05-01 | 87.04 | 66.44 | 0 | 417.45 |
| 2010-05-02 | 104.20 | 72.72 | 0 | 489.58 |
| 2010-05-03 | 73.51 | 13.67 | 0 | 447.78 |
| 2010-05-04 | 9.18 | 5.30 | 0 | 284.54 |
| 2010-05-05 | 99.57 | 97.63 | 96 | 471.04 |
| 2010-05-06 | 80.48 | 88.29 | 82 | 331.14 |
| 2010-05-07 | 87.02 | 64.99 | 85 | 564.29 |
| 2010-05-08 | 87.99 | 66.44 | 0 | 424.11 |
| 2010-05-09 | 103.32 | 72.73 | 0 | 490.12 |
| 2010-05-10 | 73.42 | 13.66 | 0 | 447.82 |
| 2010-05-11 | 10.14 | 5.29 | 0 | 284.55 |
| 2010-05-12 | 100.22 | 97.65 | 96 | 471.04 |
| 2010-05-13 | 80.20 | 88.29 | 82 | 331.14 |
| 2010-05-14 | 87.39 | 64.98 | 85 | 564.29 |
| 2010-05-15 | 88.17 | 66.45 | 0 | 424.11 |
| 2010-05-16 | 103.15 | 72.73 | 0 | 490.12 |
| 2010-05-17 | 73.41 | 13.66 | 0 | 447.82 |
| 2010-05-18 | 10.32 | 5.29 | 0 | 284.55 |
| 2010-05-19 | 100.35 | 97.65 | 96 | 471.04 |
| 2010-05-20 | 80.15 | 88.29 | 82 | 331.14 |
| 2010-05-21 | 87.46 | 64.97 | 85 | 564.29 |
| 2010-05-22 | 88.20 | 66.45 | 0 | 424.11 |
| 2010-05-23 | 103.12 | 72.74 | 0 | 490.12 |
| 2010-05-24 | 73.40 | 13.65 | 0 | 447.82 |
| 2010-05-25 | 10.35 | 5.29 | 0 | 284.55 |
| 2010-05-26 | 100.37 | 97.66 | 96 | 471.04 |
| 2010-05-27 | 80.14 | 88.29 | 82 | 331.14 |
| 2010-05-28 | 87.47 | 64.97 | 85 | 564.29 |
| 2010-05-29 | 88.21 | 66.46 | 0 | 424.11 |
| 2010-05-30 | 103.12 | 72.74 | 0 | 490.12 |
| 2010-05-31 | 73.40 | 13.65 | 0 | 447.82 |
# Save to Excel
write_xlsx(atm_forecast_all %>% spread(ATM, Forecast_Cash_Hundreds),
"ATM_May2010_Forecast.xlsx")
cat("Forecast saved to ATM_May2010_Forecast.xlsx\n")## Forecast saved to ATM_May2010_Forecast.xlsx
Monthly residential power consumption (KWH) is provided from January 1998 through December 2013 (192 months). We model the annual seasonal cycle and forecast all 12 months of 2014.
pwr_raw <- read_excel("ResidentialCustomerForecastLoad-624.xlsx",
sheet = "ResidentialCustomerForecastLoad")
glimpse(pwr_raw)## Rows: 192
## Columns: 3
## $ CaseSequence <dbl> 733, 734, 735, 736, 737, 738, 739, 740, 741, 742, 743, 74…
## $ `YYYY-MMM` <chr> "1998-Jan", "1998-Feb", "1998-Mar", "1998-Apr", "1998-May…
## $ KWH <dbl> 6862583, 5838198, 5420658, 5010364, 4665377, 6467147, 891…
# Parse year-month
pwr_raw <- pwr_raw %>%
mutate(Date = as.Date(paste0(substr(`YYYY-MMM`,1,4), "-",
match(substr(`YYYY-MMM`,6,8),
month.abb), "-01"))) %>%
arrange(Date)
cat("Missing KWH:", sum(is.na(pwr_raw$KWH)), "\n")## Missing KWH: 1
cat("Period:", strftime(min(pwr_raw$Date), "%b %Y"), "to",
strftime(max(pwr_raw$Date), "%b %Y"), "\n")## Period: Jan 1998 to Dec 2013
One observation is missing (September 2008). We impute using linear interpolation.
pwr_raw <- pwr_raw %>%
mutate(KWH = na.approx(KWH, na.rm = FALSE))
cat("Missing after imputation:", sum(is.na(pwr_raw$KWH)), "\n")## Missing after imputation: 0
ggplot(pwr_raw, aes(x = Date, y = KWH / 1e6)) +
geom_line(color = "steelblue") +
geom_smooth(method = "loess", color = "tomato", se = FALSE) +
labs(title = "Monthly Residential Power Consumption (Jan 1998 – Dec 2013)",
x = NULL, y = "KWH (millions)") +
theme_minimal()pwr_raw %>%
mutate(Month = month(Date, label = TRUE),
Year = factor(year(Date))) %>%
group_by(Month) %>%
summarise(Mean_KWH = mean(KWH)/1e6) %>%
ggplot(aes(x = Month, y = Mean_KWH, group = 1)) +
geom_line(color = "steelblue", linewidth = 1) +
geom_point(color = "tomato", size = 2) +
labs(title = "Average Monthly Power Consumption (Seasonal Pattern)",
x = NULL, y = "Mean KWH (millions)") +
theme_minimal()There is a clear dual-peak seasonal pattern: summer (Jul–Aug) and winter (Dec–Jan) highs driven by air conditioning and heating demand. There is also a mild upward trend over the 16-year period.
pwr_ts <- ts(pwr_raw$KWH, start = c(1998, 1), frequency = 12)
# ADF test on raw series
adf_result <- adf.test(pwr_ts)
cat("ADF test p-value (raw):", round(adf_result$p.value, 4), "\n")## ADF test p-value (raw): 0.01
# Seasonal differencing
pwr_sdiff <- diff(pwr_ts, lag = 12)
adf_sdiff <- adf.test(pwr_sdiff)
cat("ADF test p-value (seasonal diff):", round(adf_sdiff$p.value, 4), "\n")## ADF test p-value (seasonal diff): 0.01
par(mfrow = c(1,2))
Acf(pwr_ts, main = "ACF – Raw KWH Series")
Pacf(pwr_ts, main = "PACF – Raw KWH Series")The ACF shows slow decay and strong seasonal spikes at lags 12, 24 — confirming non-stationarity and annual seasonality. Seasonal differencing achieves stationarity.
We compare three models:
h <- 12 # forecast 12 months
fit_arima <- auto.arima(pwr_ts, seasonal = TRUE, stepwise = FALSE,
approximation = FALSE, D = 1)
fit_ets <- ets(pwr_ts)
fit_stlf <- stlf(pwr_ts, h = h)
cat("ARIMA AICc:", round(fit_arima$aicc, 2), "\n")## ARIMA AICc: 5448.87
## ETS AICc: 6226.78
## STLF — uses STL decomp, AICc of inner ETS: 6195.03
##
## --- Best Model Summary ---
# Select by AICc
if (fit_ets$aicc < fit_arima$aicc) {
best_pwr_label <- "ETS"
best_pwr_fit <- fit_ets
} else {
best_pwr_label <- "ARIMA"
best_pwr_fit <- fit_arima
}
cat("Selected:", best_pwr_label, "\n")## Selected: ARIMA
## Series: pwr_ts
## ARIMA(1,0,0)(0,1,2)[12] with drift
##
## Coefficients:
## ar1 sma1 sma2 drift
## 0.2602 -0.8946 0.1327 7676.504
## s.e. 0.0746 0.0816 0.0816 2112.693
##
## sigma^2 = 7.406e+11: log likelihood = -2719.26
## AIC=5448.52 AICc=5448.87 BIC=5464.49
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set -22050 823928.7 496449.9 -5.438382 11.70913 0.7125645 -0.00654087
##
## Ljung-Box test
##
## data: Residuals from ARIMA(1,0,0)(0,1,2)[12] with drift
## Q* = 13.355, df = 21, p-value = 0.8957
##
## Model df: 3. Total lags used: 24
Residuals should be white noise (no significant autocorrelation). The Ljung-Box test p-value indicates whether residuals are adequately modeled.
pwr_stl <- stl(pwr_ts, s.window = "periodic")
autoplot(pwr_stl) +
labs(title = "STL Decomposition – Residential Power Consumption") +
theme_minimal()The decomposition confirms the dominant annual cycle, a gradual upward trend, and a relatively small but non-trivial remainder component.
fc_pwr <- forecast(best_pwr_fit, h = h)
autoplot(fc_pwr) +
labs(title = "Residential Power Consumption – 2014 Forecast",
x = NULL, y = "KWH") +
theme_minimal()months_2014 <- seq(as.Date("2014-01-01"), as.Date("2014-12-01"), by = "month")
pwr_fc_df <- data.frame(
Month = format(months_2014, "%Y-%b"),
Forecast_KWH = round(as.numeric(fc_pwr$mean), 0),
Lower_80 = round(as.numeric(fc_pwr$lower[,1]), 0),
Upper_80 = round(as.numeric(fc_pwr$upper[,1]), 0),
Lower_95 = round(as.numeric(fc_pwr$lower[,2]), 0),
Upper_95 = round(as.numeric(fc_pwr$upper[,2]), 0)
)
kable(pwr_fc_df, caption = "2014 Monthly Residential Power Forecast (KWH)")| Month | Forecast_KWH | Lower_80 | Upper_80 | Lower_95 | Upper_95 |
|---|---|---|---|---|---|
| 2014-Jan | 9754582 | 8651717 | 10857447 | 8067895 | 11441269 |
| 2014-Feb | 8384309 | 7244724 | 9523893 | 6641464 | 10127153 |
| 2014-Mar | 6792726 | 5650698 | 7934753 | 5046145 | 8539306 |
| 2014-Apr | 5969807 | 4827614 | 7112000 | 4222973 | 7716640 |
| 2014-May | 5718290 | 4576086 | 6860494 | 3971440 | 7465141 |
| 2014-Jun | 7525640 | 6383435 | 8667845 | 5778788 | 9272491 |
| 2014-Jul | 7852584 | 6710379 | 8994790 | 6105733 | 9599436 |
| 2014-Aug | 9297497 | 8155291 | 10439702 | 7550645 | 11044348 |
| 2014-Sep | 8197882 | 7055677 | 9340087 | 6451030 | 9944733 |
| 2014-Oct | 6011983 | 4869778 | 7154188 | 4265131 | 7758834 |
| 2014-Nov | 5770986 | 4628781 | 6913191 | 4024134 | 7517838 |
| 2014-Dec | 7483775 | 6341570 | 8625980 | 5736923 | 9230627 |
write_xlsx(pwr_fc_df, "Power_2014_Forecast.xlsx")
cat("Forecast saved to Power_2014_Forecast.xlsx\n")## Forecast saved to Power_2014_Forecast.xlsx
Two pipe waterflow datasets with different timestamps are combined into a single hourly time series. We then assess stationarity and — if appropriate — produce a one-week (168-hour) forward forecast.
pipe1_raw <- read_excel("Waterflow_Pipe1.xlsx")
pipe2_raw <- read_excel("Waterflow_Pipe2.xlsx")
# Convert numeric Excel serial dates to POSIXct properly
fix_datetime <- function(x) {
if (is.numeric(x)) {
as.POSIXct(x * 86400, origin = "1899-12-30", tz = "UTC")
} else {
as.POSIXct(x, tz = "UTC")
}
}
pipe1 <- pipe1_raw %>%
rename(DateTime = `Date Time`, Flow = WaterFlow) %>%
mutate(DateTime = fix_datetime(DateTime))
pipe2 <- pipe2_raw %>%
rename(DateTime = `Date Time`, Flow = WaterFlow) %>%
mutate(DateTime = fix_datetime(DateTime))
cat("Pipe1:", nrow(pipe1), "rows |",
strftime(min(pipe1$DateTime), "%Y-%m-%d %H:%M"), "to",
strftime(max(pipe1$DateTime), "%Y-%m-%d %H:%M"), "\n")## Pipe1: 1000 rows | 2015-10-22 19:24 to 2015-11-01 17:35
cat("Pipe2:", nrow(pipe2), "rows |",
strftime(min(pipe2$DateTime), "%Y-%m-%d %H:%M"), "to",
strftime(max(pipe2$DateTime), "%Y-%m-%d %H:%M"), "\n")## Pipe2: 1000 rows | 2015-10-22 20:00 to 2015-12-03 10:00
Pipe1 has irregular sub-hour timestamps while Pipe2 already has hourly readings. We floor both to the hour and take the mean within each hour to create a consistent hourly series.
p1_hourly <- pipe1 %>%
mutate(Hour = floor_date(DateTime, "hour")) %>%
group_by(Hour) %>%
summarise(Pipe1 = mean(Flow, na.rm = TRUE), .groups = "drop")
p2_hourly <- pipe2 %>%
mutate(Hour = floor_date(DateTime, "hour")) %>%
group_by(Hour) %>%
summarise(Pipe2 = mean(Flow, na.rm = TRUE), .groups = "drop")
combined <- full_join(p1_hourly, p2_hourly, by = "Hour") %>%
arrange(Hour) %>%
mutate(TotalFlow = rowSums(cbind(Pipe1, Pipe2), na.rm = TRUE))
cat("Pipe1 hourly rows:", nrow(p1_hourly), "\n")## Pipe1 hourly rows: 236
## Pipe2 hourly rows: 667
## Combined rows: 746
cat("From:", strftime(min(combined$Hour), "%Y-%m-%d %H:%M"), "to",
strftime(max(combined$Hour), "%Y-%m-%d %H:%M"), "\n")## From: 2015-10-22 19:00 to 2015-12-03 10:00
| Hour | Pipe1 | Pipe2 | TotalFlow |
|---|---|---|---|
| 2015-10-23 00:00:00 | 26.10280 | NA | 26.10280 |
| 2015-10-23 01:00:00 | 18.85202 | 30.94891 | 49.80093 |
| 2015-10-23 02:00:00 | 15.15857 | NA | 15.15857 |
| 2015-10-23 03:00:00 | 23.07886 | 37.98770 | 61.06656 |
| 2015-10-23 04:00:00 | 15.48219 | 33.98582 | 49.46801 |
| 2015-10-23 05:00:00 | 22.72539 | NA | 22.72539 |
| 2015-10-23 06:00:00 | 20.58987 | 28.23809 | 48.82796 |
| 2015-10-23 07:00:00 | 18.36703 | 18.27160 | 36.63863 |
| 2015-10-23 08:00:00 | 20.79170 | NA | 20.79170 |
| 2015-10-23 09:00:00 | 21.21875 | 55.77379 | 76.99254 |
ggplot(combined, aes(x = Hour, y = TotalFlow)) +
geom_line(color = "steelblue", linewidth = 0.4) +
labs(title = "Hourly Total Waterflow (Pipe1 + Pipe2)",
x = NULL, y = "Flow") +
theme_minimal()combined %>%
mutate(HourOfDay = hour(Hour)) %>%
group_by(HourOfDay) %>%
summarise(MeanFlow = mean(TotalFlow)) %>%
ggplot(aes(x = HourOfDay, y = MeanFlow)) +
geom_line(color = "tomato", linewidth = 1) +
geom_point(color = "tomato") +
labs(title = "Average Waterflow by Hour of Day",
x = "Hour (0–23)", y = "Mean Flow") +
theme_minimal()## Series length: 746
# Skip adf.test — use KPSS only (no embed() dependency)
kpss_w <- kpss.test(water_ts)
cat("KPSS p-value:", round(kpss_w$p.value, 4),
"—", ifelse(kpss_w$p.value > 0.05, "Stationary", "Non-stationary"), "\n")## KPSS p-value: 0.01 — Non-stationary
# Visual stationarity check via ACF
ggplot(combined, aes(x = Hour, y = TotalFlow)) +
geom_line(color = "steelblue", linewidth = 0.4) +
labs(title = "Hourly Waterflow – Stationarity Check",
x = NULL, y = "Total Flow") +
theme_minimal()max_lag <- max(1, min(48, floor(length(water_ts) / 3)))
Acf(water_ts, main = "ACF – Hourly Waterflow", lag.max = max_lag)If the series is stationary (or made stationary), we proceed with TBATS — well-suited to hourly data with multiple seasonal periods (daily = 24, weekly = 168).
h_water <- 168 # one week forward
# TBATS handles complex multi-period seasonality
water_msts <- msts(combined$TotalFlow,
seasonal.periods = c(24, 168))
fit_water <- tbats(water_msts)
fc_water <- forecast(fit_water, h = h_water)
autoplot(fc_water) +
labs(title = "Waterflow – One Week Forward Forecast (168 hours)",
x = "Hour", y = "Flow") +
theme_minimal()##
## Ljung-Box test
##
## data: Residuals from TBATS
## Q* = 185, df = 149, p-value = 0.0241
##
## Model df: 0. Total lags used: 149
last_hour <- max(combined$Hour)
fc_hours <- seq(last_hour + hours(1),
by = "hour", length.out = h_water)
water_fc_df <- data.frame(
DateTime = fc_hours,
Forecast_Flow = round(as.numeric(fc_water$mean), 4),
Lower_80 = round(as.numeric(fc_water$lower[,1]), 4),
Upper_80 = round(as.numeric(fc_water$upper[,1]), 4)
)
kable(head(water_fc_df, 24),
caption = "First 24 hours of one-week waterflow forecast")| DateTime | Forecast_Flow | Lower_80 | Upper_80 |
|---|---|---|---|
| 2015-12-03 17:00:00 | 38.0828 | 21.6138 | 58.4600 |
| 2015-12-03 18:00:00 | 41.1513 | 23.9901 | 62.2002 |
| 2015-12-03 19:00:00 | 40.8552 | 23.7412 | 61.8682 |
| 2015-12-03 20:00:00 | 40.5132 | 23.2932 | 61.7300 |
| 2015-12-03 21:00:00 | 39.9241 | 22.8309 | 61.0218 |
| 2015-12-03 22:00:00 | 41.0688 | 23.6806 | 62.4729 |
| 2015-12-03 23:00:00 | 40.4177 | 22.9868 | 61.9705 |
| 2015-12-04 00:00:00 | 39.9743 | 22.6403 | 61.4362 |
| 2015-12-04 01:00:00 | 41.0477 | 23.4401 | 62.7904 |
| 2015-12-04 02:00:00 | 40.4028 | 22.7663 | 62.2754 |
| 2015-12-04 03:00:00 | 40.0055 | 22.4568 | 61.7958 |
| 2015-12-04 04:00:00 | 41.0316 | 23.2241 | 63.0846 |
| 2015-12-04 05:00:00 | 40.3881 | 22.5649 | 62.5530 |
| 2015-12-04 06:00:00 | 40.0357 | 22.2911 | 62.1272 |
| 2015-12-04 07:00:00 | 41.0153 | 23.0257 | 63.3526 |
| 2015-12-04 08:00:00 | 40.3748 | 22.3815 | 62.8076 |
| 2015-12-04 09:00:00 | 40.0647 | 22.1410 | 62.4327 |
| 2015-12-04 10:00:00 | 40.9990 | 22.8434 | 63.5972 |
| 2015-12-04 11:00:00 | 40.3628 | 22.2142 | 63.0415 |
| 2015-12-04 12:00:00 | 40.0924 | 22.0048 | 62.7146 |
| 2015-12-04 13:00:00 | 40.9826 | 22.6753 | 63.8207 |
| 2015-12-04 14:00:00 | 40.3521 | 22.0614 | 63.2568 |
| 2015-12-04 15:00:00 | 40.1189 | 21.8808 | 62.9751 |
| 2015-12-04 16:00:00 | 40.9662 | 22.5202 | 64.0251 |
write_xlsx(water_fc_df, "Waterflow_1Week_Forecast.xlsx")
cat("Forecast saved to Waterflow_1Week_Forecast.xlsx\n")## Forecast saved to Waterflow_1Week_Forecast.xlsx
c(24, 168) captures both the
daily and weekly seasonal cycles
simultaneously — making it the most appropriate method for hourly pipe
flow data.## R version 4.5.2 (2025-10-31)
## Platform: aarch64-apple-darwin20
## Running under: macOS Tahoe 26.3.1
##
## Matrix products: default
## BLAS: /System/Library/Frameworks/Accelerate.framework/Versions/A/Frameworks/vecLib.framework/Versions/A/libBLAS.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/lib/libRlapack.dylib; LAPACK version 3.12.1
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## time zone: America/Chicago
## tzcode source: internal
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] zoo_1.8-15 knitr_1.51 gridExtra_2.3 writexl_1.5.4
## [5] lubridate_1.9.5 tseries_0.10-61 forecast_9.0.2 ggplot2_4.0.2
## [9] tidyr_1.3.2 dplyr_1.2.0 readxl_1.4.5
##
## loaded via a namespace (and not attached):
## [1] sass_0.4.10 generics_0.1.4 lattice_0.22-7 digest_0.6.39
## [5] magrittr_2.0.4 timechange_0.4.0 evaluate_1.0.5 grid_4.5.2
## [9] RColorBrewer_1.1-3 fastmap_1.2.0 Matrix_1.7-4 cellranger_1.1.0
## [13] jsonlite_2.0.0 mgcv_1.9-3 purrr_1.2.1 scales_1.4.0
## [17] jquerylib_0.1.4 cli_3.6.5 rlang_1.1.7 splines_4.5.2
## [21] withr_3.0.2 cachem_1.1.0 yaml_2.3.12 tools_4.5.2
## [25] parallel_4.5.2 colorspace_2.1-2 curl_7.0.0 vctrs_0.7.1
## [29] R6_2.6.1 lifecycle_1.0.5 pkgconfig_2.0.3 urca_1.3-4
## [33] pillar_1.11.1 bslib_0.10.0 gtable_0.3.6 glue_1.8.0
## [37] quantmod_0.4.28 Rcpp_1.1.1 xfun_0.56 tibble_3.3.1
## [41] tidyselect_1.2.1 rstudioapi_0.18.0 farver_2.1.2 htmltools_0.5.9
## [45] nlme_3.1-168 labeling_0.4.3 rmarkdown_2.30 xts_0.14.2
## [49] timeDate_4052.112 fracdiff_1.5-3 compiler_4.5.2 S7_0.2.1
## [53] quadprog_1.5-8 TTR_0.24.4