library(readxl)
library(dplyr)
library(tidyr)
library(ggplot2)
library(forecast)
library(lubridate)
library(tseries)
library(zoo)
library(knitr)
theme_set(theme_minimal())
options(digits = 4)
get_file <- function(pattern) {
f <- list.files(pattern = pattern, ignore.case = TRUE)
if(length(f) == 0) stop(paste("Could not find file for pattern:", pattern))
f[1]
}
This project had two required parts and one bonus part. I tried to keep the report simple and just work through each dataset based on what the data looked like. I did not want to force the same exact model on everything because the series were pretty different from each other.
A few things stood out right away:
atm_file <- get_file("ATM624Data.*\\.xlsx$")
atm_raw <- read_excel(atm_file)
atm <- atm_raw %>%
filter(!is.na(ATM)) %>%
mutate(DATE = as.Date(DATE))
atm_wide <- atm %>%
select(DATE, ATM, Cash) %>%
pivot_wider(names_from = ATM, values_from = Cash) %>%
arrange(DATE)
# fill a few missing daily values using interpolation
atm_wide$ATM1 <- na.interp(ts(atm_wide$ATM1, frequency = 7))
atm_wide$ATM2 <- na.interp(ts(atm_wide$ATM2, frequency = 7))
atm_wide$ATM3 <- ts(atm_wide$ATM3, frequency = 7)
atm_wide$ATM4 <- ts(atm_wide$ATM4, frequency = 7)
glimpse(atm_wide)
## Rows: 365
## Columns: 5
## $ DATE <date> 2079-05-03, 2079-05-04, 2079-05-05, 2079-05-06, 2079-05-07, 2079…
## $ ATM1 <dbl> 96, 82, 85, 90, 99, 88, 8, 104, 87, 93, 86, 111, 75, 6, 102, 73, …
## $ ATM2 <dbl> 107, 89, 90, 55, 79, 19, 2, 103, 107, 118, 75, 111, 25, 16, 137, …
## $ ATM3 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ ATM4 <dbl> 776.99, 524.42, 792.81, 908.24, 52.83, 52.21, 55.47, 558.50, 904.…
atm_long <- atm_wide %>%
mutate(DATE = as.Date(DATE)) %>%
pivot_longer(cols = starts_with("ATM"), names_to = "ATM", values_to = "Cash")
ggplot(atm_long, aes(x = DATE, y = Cash)) +
geom_line(color = "steelblue") +
facet_wrap(~ATM, scales = "free_y", ncol = 2) +
labs(title = "Daily ATM Withdrawals", x = NULL, y = "Cash (hundreds of dollars)")
For this part, I mainly looked at the plots first and then picked models that seemed reasonable.
ATM1 and ATM2 both looked like they had a weekly pattern, so I used
Holt-Winters with weekly seasonality.
ATM4 was a lot more uneven and had some big jumps, so I felt like a
seasonal naive model fit it better than trying to over-smooth it.
ATM3 was the hardest one because it was basically zero for most of the
series and then only had a few nonzero values near the end. Because of
that, I did not think a normal time series model would
be very reliable there, so I kept that forecast conservative and used
zero for May.
atm1_ts <- ts(as.numeric(atm_wide$ATM1), frequency = 7)
atm2_ts <- ts(as.numeric(atm_wide$ATM2), frequency = 7)
atm3_ts <- ts(as.numeric(atm_wide$ATM3), frequency = 7)
atm4_ts <- ts(as.numeric(atm_wide$ATM4), frequency = 7)
fit_atm1 <- hw(atm1_ts, seasonal = "additive", h = 31)
fit_atm2 <- hw(atm2_ts, seasonal = "additive", h = 31)
fit_atm4 <- snaive(atm4_ts, h = 31)
may_dates <- seq(as.Date("2010-05-01"), by = "day", length.out = 31)
atm_forecast <- data.frame(
Date = may_dates,
ATM1 = as.numeric(fit_atm1$mean),
ATM2 = as.numeric(fit_atm2$mean),
ATM3 = rep(0, 31),
ATM4 = as.numeric(fit_atm4$mean)
)
atm_forecast
## Date ATM1 ATM2 ATM3 ATM4
## 1 2010-05-01 86.691 67.5548 0 5.452
## 2 2010-05-02 100.397 73.4032 0 542.281
## 3 2010-05-03 72.762 10.2464 0 403.839
## 4 2010-05-04 5.349 1.2780 0 13.697
## 5 2010-05-05 99.797 100.8661 0 348.201
## 6 2010-05-06 79.090 91.6572 0 44.245
## 7 2010-05-07 85.259 68.3555 0 482.287
## 8 2010-05-08 86.605 67.2992 0 5.452
## 9 2010-05-09 100.311 73.1476 0 542.281
## 10 2010-05-10 72.676 9.9909 0 403.839
## 11 2010-05-11 5.263 1.0224 0 13.697
## 12 2010-05-12 99.711 100.6105 0 348.201
## 13 2010-05-13 79.004 91.4016 0 44.245
## 14 2010-05-14 85.173 68.0999 0 482.287
## 15 2010-05-15 86.519 67.0437 0 5.452
## 16 2010-05-16 100.225 72.8920 0 542.281
## 17 2010-05-17 72.590 9.7353 0 403.839
## 18 2010-05-18 5.177 0.7669 0 13.697
## 19 2010-05-19 99.625 100.3549 0 348.201
## 20 2010-05-20 78.918 91.1460 0 44.245
## 21 2010-05-21 85.087 67.8443 0 482.287
## 22 2010-05-22 86.433 66.7881 0 5.452
## 23 2010-05-23 100.139 72.6364 0 542.281
## 24 2010-05-24 72.504 9.4797 0 403.839
## 25 2010-05-25 5.091 0.5113 0 13.697
## 26 2010-05-26 99.539 100.0994 0 348.201
## 27 2010-05-27 78.832 90.8904 0 44.245
## 28 2010-05-28 85.001 67.5888 0 482.287
## 29 2010-05-29 86.347 66.5325 0 5.452
## 30 2010-05-30 100.053 72.3809 0 542.281
## 31 2010-05-31 72.418 9.2241 0 403.839
Below is the May 2010 forecast for all four ATMs. The unit is still hundreds of dollars.
atm_forecast_display <- atm_forecast
atm_forecast_display[,2:5] <- round(atm_forecast_display[,2:5], 2)
kable(atm_forecast_display, caption = "May 2010 ATM Forecast")
| Date | ATM1 | ATM2 | ATM3 | ATM4 |
|---|---|---|---|---|
| 2010-05-01 | 86.69 | 67.55 | 0 | 5.45 |
| 2010-05-02 | 100.40 | 73.40 | 0 | 542.28 |
| 2010-05-03 | 72.76 | 10.25 | 0 | 403.84 |
| 2010-05-04 | 5.35 | 1.28 | 0 | 13.70 |
| 2010-05-05 | 99.80 | 100.87 | 0 | 348.20 |
| 2010-05-06 | 79.09 | 91.66 | 0 | 44.25 |
| 2010-05-07 | 85.26 | 68.36 | 0 | 482.29 |
| 2010-05-08 | 86.61 | 67.30 | 0 | 5.45 |
| 2010-05-09 | 100.31 | 73.15 | 0 | 542.28 |
| 2010-05-10 | 72.68 | 9.99 | 0 | 403.84 |
| 2010-05-11 | 5.26 | 1.02 | 0 | 13.70 |
| 2010-05-12 | 99.71 | 100.61 | 0 | 348.20 |
| 2010-05-13 | 79.00 | 91.40 | 0 | 44.25 |
| 2010-05-14 | 85.17 | 68.10 | 0 | 482.29 |
| 2010-05-15 | 86.52 | 67.04 | 0 | 5.45 |
| 2010-05-16 | 100.22 | 72.89 | 0 | 542.28 |
| 2010-05-17 | 72.59 | 9.74 | 0 | 403.84 |
| 2010-05-18 | 5.18 | 0.77 | 0 | 13.70 |
| 2010-05-19 | 99.63 | 100.35 | 0 | 348.20 |
| 2010-05-20 | 78.92 | 91.15 | 0 | 44.25 |
| 2010-05-21 | 85.09 | 67.84 | 0 | 482.29 |
| 2010-05-22 | 86.43 | 66.79 | 0 | 5.45 |
| 2010-05-23 | 100.14 | 72.64 | 0 | 542.28 |
| 2010-05-24 | 72.50 | 9.48 | 0 | 403.84 |
| 2010-05-25 | 5.09 | 0.51 | 0 | 13.70 |
| 2010-05-26 | 99.54 | 100.10 | 0 | 348.20 |
| 2010-05-27 | 78.83 | 90.89 | 0 | 44.25 |
| 2010-05-28 | 85.00 | 67.59 | 0 | 482.29 |
| 2010-05-29 | 86.35 | 66.53 | 0 | 5.45 |
| 2010-05-30 | 100.05 | 72.38 | 0 | 542.28 |
| 2010-05-31 | 72.42 | 9.22 | 0 | 403.84 |
The main thing I noticed in Part A was that the four ATMs did not behave the same way, so using one model for all of them would not have made much sense.
Out of the four, ATM4 stood out the most to me because the changes were a lot bigger, which would make cash planning more important there.
Using the forecasts, I got these approximate May 2010 totals:
atm_summary <- data.frame(
ATM = c("ATM1", "ATM2", "ATM3", "ATM4"),
Model = c("Holt-Winters additive weekly seasonality",
"Holt-Winters additive weekly seasonality",
"Conservative manual forecast",
"Seasonal naive weekly"),
May_Total_Hundreds = c(round(sum(atm_forecast$ATM1),2),
round(sum(atm_forecast$ATM2),2),
round(sum(atm_forecast$ATM3),2),
round(sum(atm_forecast$ATM4),2))
)
kable(atm_summary, caption = "ATM May 2010 Forecast Summary")
| ATM | Model | May_Total_Hundreds |
|---|---|---|
| ATM1 | Holt-Winters additive weekly seasonality | 2373 |
| ATM2 | Holt-Winters additive weekly seasonality | 1791 |
| ATM3 | Conservative manual forecast | 0 |
| ATM4 | Seasonal naive weekly | 8312 |
power_file <- get_file("ResidentialCustomerForecastLoad-624.*\\.xlsx$")
power_raw <- read_excel(power_file)
power <- power_raw %>%
mutate(Date = as.Date(paste0(`YYYY-MMM`, "-01"), format = "%Y-%b-%d")) %>%
arrange(Date)
power$KWH <- na.interp(power$KWH)
power_ts <- ts(power$KWH, start = c(1998, 1), frequency = 12)
summary(power_ts)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 770523 5434539 6314472 6502824 7608792 10655730
autoplot(power_ts) +
labs(title = "Residential Monthly Power Usage", x = "Year", y = "KWH")
This series had a pretty clear monthly seasonal pattern, so I used Holt-Winters with additive seasonality. I also considered other options, but this seemed like the most straightforward choice for the data and for the assignment.
fit_power <- hw(power_ts, seasonal = "additive", h = 12)
power_forecast <- data.frame(
Month = seq(as.Date("2014-01-01"), by = "month", length.out = 12),
Forecast_KWH = round(as.numeric(fit_power$mean), 0)
)
power_forecast
## Month Forecast_KWH
## 1 2014-01-01 9148885
## 2 2014-02-01 8237457
## 3 2014-03-01 7245381
## 4 2014-04-01 6579342
## 5 2014-05-01 6346912
## 6 2014-06-01 7831097
## 7 2014-07-01 8739578
## 8 2014-08-01 9639220
## 9 2014-09-01 9018186
## 10 2014-10-01 7077514
## 11 2014-11-01 6344088
## 12 2014-12-01 7638889
kable(power_forecast, caption = "2014 Monthly Power Forecast")
| Month | Forecast_KWH |
|---|---|
| 2014-01-01 | 9148885 |
| 2014-02-01 | 8237457 |
| 2014-03-01 | 7245381 |
| 2014-04-01 | 6579342 |
| 2014-05-01 | 6346912 |
| 2014-06-01 | 7831097 |
| 2014-07-01 | 8739578 |
| 2014-08-01 | 9639220 |
| 2014-09-01 | 9018186 |
| 2014-10-01 | 7077514 |
| 2014-11-01 | 6344088 |
| 2014-12-01 | 7638889 |
The power data was a lot cleaner than the ATM data. The biggest pattern was the repeated seasonal movement across the year, which makes sense for residential electricity use. The forecast just carries that same general pattern into 2014.
Because that seasonal pattern was so visible, I did not think this part needed anything too complicated. A seasonal monthly model worked fine here.
The instructions for Part C said to time-base the data and aggregate by hour, taking the mean if there were multiple readings within the same hour. That is what I did first.
pipe1_file <- get_file("Waterflow_Pipe1.*\\.xlsx$")
pipe2_file <- get_file("Waterflow_Pipe2.*\\.xlsx$")
fix_datetime <- function(x) {
if (inherits(x, c("POSIXct", "POSIXt"))) {
return(x)
}
if (inherits(x, "Date")) {
return(as.POSIXct(x))
}
if (is.numeric(x)) {
return(as.POSIXct(x, origin = "1899-12-30", tz = "UTC"))
}
parse_date_time(as.character(x),
orders = c("ymd HMS", "ymd HM", "ymdHMS", "mdy HMS", "mdy HM"),
tz = "UTC")
}
pipe1_raw <- read_excel(pipe1_file) %>%
mutate(`Date Time` = fix_datetime(`Date Time`))
pipe2_raw <- read_excel(pipe2_file) %>%
mutate(`Date Time` = fix_datetime(`Date Time`))
pipe1_hourly <- pipe1_raw %>%
filter(!is.na(`Date Time`)) %>%
mutate(Hour = floor_date(`Date Time`, unit = "hour")) %>%
group_by(Hour) %>%
summarise(WaterFlow = mean(WaterFlow, na.rm = TRUE), .groups = "drop") %>%
complete(Hour = seq(min(Hour), max(Hour), by = "hour")) %>%
mutate(WaterFlow = na.approx(WaterFlow, na.rm = FALSE)) %>%
mutate(WaterFlow = na.locf(WaterFlow, na.rm = FALSE),
WaterFlow = na.locf(WaterFlow, fromLast = TRUE))
pipe2_hourly <- pipe2_raw %>%
filter(!is.na(`Date Time`)) %>%
mutate(Hour = floor_date(`Date Time`, unit = "hour")) %>%
group_by(Hour) %>%
summarise(WaterFlow = mean(WaterFlow, na.rm = TRUE), .groups = "drop") %>%
complete(Hour = seq(min(Hour), max(Hour), by = "hour")) %>%
mutate(WaterFlow = na.approx(WaterFlow, na.rm = FALSE)) %>%
mutate(WaterFlow = na.locf(WaterFlow, na.rm = FALSE),
WaterFlow = na.locf(WaterFlow, fromLast = TRUE))
safe_adf <- function(x) {
x <- as.numeric(x)
x <- x[is.finite(x)]
if (length(x) < 10) {
return(list(result = "ADF test not run", p.value = NA_real_,
note = "Series too short for ADF test"))
}
if (sd(x, na.rm = TRUE) == 0) {
return(list(result = "ADF test not run", p.value = NA_real_,
note = "Series is constant, so ADF test is not meaningful"))
}
out <- tryCatch(
tseries::adf.test(x),
error = function(e) NULL
)
if (is.null(out)) {
list(result = "ADF test not run", p.value = NA_real_,
note = "ADF test could not be computed for this series")
} else {
list(result = out, p.value = out$p.value,
note = "ADF test ran successfully")
}
}
adf_pipe1 <- safe_adf(pipe1_hourly$WaterFlow)
adf_pipe2 <- safe_adf(pipe2_hourly$WaterFlow)
adf_summary <- tibble(
Series = c("Pipe1", "Pipe2"),
ADF_p_value = c(adf_pipe1$p.value, adf_pipe2$p.value),
Note = c(adf_pipe1$note, adf_pipe2$note)
)
knitr::kable(adf_summary, digits = 4,
caption = "ADF test results for hourly waterflow series")
| Series | ADF_p_value | Note |
|---|---|---|
| Pipe1 | NA | Series too short for ADF test |
| Pipe2 | NA | Series too short for ADF test |
I used the ADF check as a rough stationarity check. If the test could not be computed cleanly, I treated that as a limitation of the data rather than forcing a result. Since this is just the bonus section, I kept the forecasting step short-horizon and simple.
I kept the models simple here since the bonus part was more about the sequence alignment, stationarity check, and whether forecasting was reasonable at all.
safe_pipe_forecast <- function(df, arima_order = c(1,0,0), h = 168) {
x <- as.numeric(df$WaterFlow)
x <- x[is.finite(x)]
last_time <- max(df$Hour, na.rm = TRUE)
future_index <- seq(last_time + hours(1), by = "hour", length.out = h)
if (length(x) < 10) {
avg_val <- ifelse(length(x) == 0, 0, mean(x, na.rm = TRUE))
fc_vals <- rep(avg_val, h)
method_note <- "Fallback mean forecast used because the series was too short for ARIMA."
return(list(
model = NULL,
forecast = data.frame(DateTime = future_index, Forecast_WaterFlow = fc_vals),
method = method_note
))
}
fit <- tryCatch(
forecast::Arima(x, order = arima_order),
error = function(e) NULL
)
if (is.null(fit)) {
avg_val <- mean(x, na.rm = TRUE)
fc_vals <- rep(avg_val, h)
method_note <- "Fallback mean forecast used because ARIMA would not converge on this series."
return(list(
model = NULL,
forecast = data.frame(DateTime = future_index, Forecast_WaterFlow = fc_vals),
method = method_note
))
}
fc <- forecast::forecast(fit, h = h)
list(
model = fit,
forecast = data.frame(DateTime = future_index,
Forecast_WaterFlow = as.numeric(fc$mean)),
method = "ARIMA forecast ran successfully."
)
}
pipe1_result <- safe_pipe_forecast(pipe1_hourly, arima_order = c(1,0,0), h = 168)
pipe2_result <- safe_pipe_forecast(pipe2_hourly, arima_order = c(3,0,0), h = 168)
pipe1_forecast <- pipe1_result$forecast
pipe2_forecast <- pipe2_result$forecast
pipe_method_summary <- tibble(
Series = c("Pipe1", "Pipe2"),
Method_Used = c(pipe1_result$method, pipe2_result$method)
)
knitr::kable(pipe_method_summary, caption = "Forecasting method used for each pipe series")
| Series | Method_Used |
|---|---|
| Pipe1 | Fallback mean forecast used because the series was too short for ARIMA. |
| Pipe2 | Fallback mean forecast used because the series was too short for ARIMA. |
ggplot() +
geom_line(data = tail(pipe1_hourly, 200), aes(x = Hour, y = WaterFlow), alpha = 0.7) +
geom_line(data = pipe1_forecast, aes(x = DateTime, y = Forecast_WaterFlow), linetype = "dashed") +
labs(title = "Pipe 1 - One Week Forecast", x = NULL, y = "WaterFlow")
## `geom_line()`: Each group consists of only one observation.
## ℹ Do you need to adjust the group aesthetic?
ggplot() +
geom_line(data = tail(pipe2_hourly, 200), aes(x = Hour, y = WaterFlow), alpha = 0.7) +
geom_line(data = pipe2_forecast, aes(x = DateTime, y = Forecast_WaterFlow), linetype = "dashed") +
labs(title = "Pipe 2 - One Week Forecast", x = NULL, y = "WaterFlow")
## `geom_line()`: Each group consists of only one observation.
## ℹ Do you need to adjust the group aesthetic?
Most of the work in this section was in the preprocessing step. Since the two files had different timestamps, I first had to align both datasets to the same hourly time base. When there was more than one reading in the same hour, I took the mean, which is what the project instructions asked for. After that, both pipe datasets were in a cleaner hourly format and easier to compare.
For stationarity, I attempted to use the ADF test as a quick check. In this case, the hourly series was too short for the test to run reliably, so I could not make a strong claim about stationarity from that result alone. Because of that, I treated the bonus forecasting part more as a simple short-term estimate instead of a strong final model.
I still think a one-week forecast is reasonable to show for the bonus since the assignment asked whether the data could be forecast, but I would be more cautious interpreting these results than I would be for Parts A and B. The main point here was showing the time alignment, hourly aggregation, and a reasonable forecasting attempt based on the limited data.
Overall, I do not think this project was something where one forecasting method should be used on every dataset.
If I were continuing this project, I would probably compare models more formally with a rolling forecast setup. But for this assignment, I think the methods I used were reasonable and matched the patterns in the data.
For the bonus part, I used the stationarity check more as a rough reference point, but since the test could not be run reliably on these short hourly series, I treated the forecast as a simple short-run estimate. # Export forecast files
write.csv(atm_forecast, "ATM_May2010_Forecast.csv", row.names = FALSE)
write.csv(power_forecast, "Power_2014_Forecast.csv", row.names = FALSE)
write.csv(pipe1_forecast, "Pipe1_1Week_Forecast.csv", row.names = FALSE)
write.csv(pipe2_forecast, "Pipe2_1Week_Forecast.csv", row.names = FALSE)