library(readxl)
library(dplyr)
library(tidyr)
library(ggplot2)
library(forecast)
library(lubridate)
library(tseries)
library(zoo)
library(knitr)
theme_set(theme_minimal())
options(digits = 4)
get_file <- function(pattern) {
f <- list.files(pattern = pattern, ignore.case = TRUE)
if(length(f) == 0) stop(paste("Could not find file for pattern:", pattern))
f[1]
}
This project had two required parts and one bonus part. I tried to keep the report simple and just work through each dataset based on what the data looked like. I did not want to force the same exact model on everything because the series were pretty different from each other.
A few things stood out right away:
atm_file <- get_file("ATM624Data.*\\.xlsx$")
atm_raw <- read_excel(atm_file)
atm <- atm_raw %>%
filter(!is.na(ATM)) %>%
mutate(DATE = as.Date(DATE))
atm_wide <- atm %>%
select(DATE, ATM, Cash) %>%
pivot_wider(names_from = ATM, values_from = Cash) %>%
arrange(DATE)
# fill a few missing daily values using interpolation
atm_wide$ATM1 <- na.interp(ts(atm_wide$ATM1, frequency = 7))
atm_wide$ATM2 <- na.interp(ts(atm_wide$ATM2, frequency = 7))
atm_wide$ATM3 <- ts(atm_wide$ATM3, frequency = 7)
atm_wide$ATM4 <- ts(atm_wide$ATM4, frequency = 7)
glimpse(atm_wide)
## Rows: 365
## Columns: 5
## $ DATE <date> 2079-05-03, 2079-05-04, 2079-05-05, 2079-05-06, 2079-05-07, 2079…
## $ ATM1 <dbl> 96, 82, 85, 90, 99, 88, 8, 104, 87, 93, 86, 111, 75, 6, 102, 73, …
## $ ATM2 <dbl> 107, 89, 90, 55, 79, 19, 2, 103, 107, 118, 75, 111, 25, 16, 137, …
## $ ATM3 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ ATM4 <dbl> 776.99, 524.42, 792.81, 908.24, 52.83, 52.21, 55.47, 558.50, 904.…
atm_long <- atm_wide %>%
mutate(DATE = as.Date(DATE)) %>%
pivot_longer(cols = starts_with("ATM"), names_to = "ATM", values_to = "Cash")
ggplot(atm_long, aes(x = DATE, y = Cash)) +
geom_line(color = "steelblue") +
facet_wrap(~ATM, scales = "free_y", ncol = 2) +
labs(title = "Daily ATM Withdrawals", x = NULL, y = "Cash (hundreds of dollars)")
For this part, I mainly looked at the plots first and then picked models that seemed reasonable.
ATM1 and ATM2 both looked like they had a weekly pattern, so I used
Holt-Winters with weekly seasonality.
ATM4 was a lot more uneven and had some big jumps, so I felt like a
seasonal naive model fit it better than trying to over-smooth it.
ATM3 was the hardest one because it was basically zero for most of the
series and then only had a few nonzero values near the end. Because of
that, I did not think a normal time series model would
be very reliable there, so I kept that forecast conservative and used
zero for May.
atm1_ts <- ts(as.numeric(atm_wide$ATM1), frequency = 7)
atm2_ts <- ts(as.numeric(atm_wide$ATM2), frequency = 7)
atm3_ts <- ts(as.numeric(atm_wide$ATM3), frequency = 7)
atm4_ts <- ts(as.numeric(atm_wide$ATM4), frequency = 7)
fit_atm1 <- hw(atm1_ts, seasonal = "additive", h = 31)
fit_atm2 <- hw(atm2_ts, seasonal = "additive", h = 31)
fit_atm4 <- snaive(atm4_ts, h = 31)
may_dates <- seq(as.Date("2010-05-01"), by = "day", length.out = 31)
atm_forecast <- data.frame(
Date = may_dates,
ATM1 = as.numeric(fit_atm1$mean),
ATM2 = as.numeric(fit_atm2$mean),
ATM3 = rep(0, 31),
ATM4 = as.numeric(fit_atm4$mean)
)
atm_forecast
## Date ATM1 ATM2 ATM3 ATM4
## 1 2010-05-01 86.691 67.5548 0 5.452
## 2 2010-05-02 100.397 73.4032 0 542.281
## 3 2010-05-03 72.762 10.2464 0 403.839
## 4 2010-05-04 5.349 1.2780 0 13.697
## 5 2010-05-05 99.797 100.8661 0 348.201
## 6 2010-05-06 79.090 91.6572 0 44.245
## 7 2010-05-07 85.259 68.3555 0 482.287
## 8 2010-05-08 86.605 67.2992 0 5.452
## 9 2010-05-09 100.311 73.1476 0 542.281
## 10 2010-05-10 72.676 9.9909 0 403.839
## 11 2010-05-11 5.263 1.0224 0 13.697
## 12 2010-05-12 99.711 100.6105 0 348.201
## 13 2010-05-13 79.004 91.4016 0 44.245
## 14 2010-05-14 85.173 68.0999 0 482.287
## 15 2010-05-15 86.519 67.0437 0 5.452
## 16 2010-05-16 100.225 72.8920 0 542.281
## 17 2010-05-17 72.590 9.7353 0 403.839
## 18 2010-05-18 5.177 0.7669 0 13.697
## 19 2010-05-19 99.625 100.3549 0 348.201
## 20 2010-05-20 78.918 91.1460 0 44.245
## 21 2010-05-21 85.087 67.8443 0 482.287
## 22 2010-05-22 86.433 66.7881 0 5.452
## 23 2010-05-23 100.139 72.6364 0 542.281
## 24 2010-05-24 72.504 9.4797 0 403.839
## 25 2010-05-25 5.091 0.5113 0 13.697
## 26 2010-05-26 99.539 100.0994 0 348.201
## 27 2010-05-27 78.832 90.8904 0 44.245
## 28 2010-05-28 85.001 67.5888 0 482.287
## 29 2010-05-29 86.347 66.5325 0 5.452
## 30 2010-05-30 100.053 72.3809 0 542.281
## 31 2010-05-31 72.418 9.2241 0 403.839
Below is the May 2010 forecast for all four ATMs. The unit is still hundreds of dollars.
atm_forecast_display <- atm_forecast
atm_forecast_display[,2:5] <- round(atm_forecast_display[,2:5], 2)
kable(atm_forecast_display, caption = "May 2010 ATM Forecast")
| Date | ATM1 | ATM2 | ATM3 | ATM4 |
|---|---|---|---|---|
| 2010-05-01 | 86.69 | 67.55 | 0 | 5.45 |
| 2010-05-02 | 100.40 | 73.40 | 0 | 542.28 |
| 2010-05-03 | 72.76 | 10.25 | 0 | 403.84 |
| 2010-05-04 | 5.35 | 1.28 | 0 | 13.70 |
| 2010-05-05 | 99.80 | 100.87 | 0 | 348.20 |
| 2010-05-06 | 79.09 | 91.66 | 0 | 44.25 |
| 2010-05-07 | 85.26 | 68.36 | 0 | 482.29 |
| 2010-05-08 | 86.61 | 67.30 | 0 | 5.45 |
| 2010-05-09 | 100.31 | 73.15 | 0 | 542.28 |
| 2010-05-10 | 72.68 | 9.99 | 0 | 403.84 |
| 2010-05-11 | 5.26 | 1.02 | 0 | 13.70 |
| 2010-05-12 | 99.71 | 100.61 | 0 | 348.20 |
| 2010-05-13 | 79.00 | 91.40 | 0 | 44.25 |
| 2010-05-14 | 85.17 | 68.10 | 0 | 482.29 |
| 2010-05-15 | 86.52 | 67.04 | 0 | 5.45 |
| 2010-05-16 | 100.22 | 72.89 | 0 | 542.28 |
| 2010-05-17 | 72.59 | 9.74 | 0 | 403.84 |
| 2010-05-18 | 5.18 | 0.77 | 0 | 13.70 |
| 2010-05-19 | 99.63 | 100.35 | 0 | 348.20 |
| 2010-05-20 | 78.92 | 91.15 | 0 | 44.25 |
| 2010-05-21 | 85.09 | 67.84 | 0 | 482.29 |
| 2010-05-22 | 86.43 | 66.79 | 0 | 5.45 |
| 2010-05-23 | 100.14 | 72.64 | 0 | 542.28 |
| 2010-05-24 | 72.50 | 9.48 | 0 | 403.84 |
| 2010-05-25 | 5.09 | 0.51 | 0 | 13.70 |
| 2010-05-26 | 99.54 | 100.10 | 0 | 348.20 |
| 2010-05-27 | 78.83 | 90.89 | 0 | 44.25 |
| 2010-05-28 | 85.00 | 67.59 | 0 | 482.29 |
| 2010-05-29 | 86.35 | 66.53 | 0 | 5.45 |
| 2010-05-30 | 100.05 | 72.38 | 0 | 542.28 |
| 2010-05-31 | 72.42 | 9.22 | 0 | 403.84 |
The main thing I noticed in Part A was that the four ATMs did not behave the same way, so using one model for all of them would not have made much sense.
Out of the four, ATM4 stood out the most to me because the changes were a lot bigger, which would make cash planning more important there.
Using the forecasts, I got these approximate May 2010 totals:
atm_summary <- data.frame(
ATM = c("ATM1", "ATM2", "ATM3", "ATM4"),
Model = c("Holt-Winters additive weekly seasonality",
"Holt-Winters additive weekly seasonality",
"Conservative manual forecast",
"Seasonal naive weekly"),
May_Total_Hundreds = c(round(sum(atm_forecast$ATM1),2),
round(sum(atm_forecast$ATM2),2),
round(sum(atm_forecast$ATM3),2),
round(sum(atm_forecast$ATM4),2))
)
kable(atm_summary, caption = "ATM May 2010 Forecast Summary")
| ATM | Model | May_Total_Hundreds |
|---|---|---|
| ATM1 | Holt-Winters additive weekly seasonality | 2373 |
| ATM2 | Holt-Winters additive weekly seasonality | 1791 |
| ATM3 | Conservative manual forecast | 0 |
| ATM4 | Seasonal naive weekly | 8312 |
power_file <- get_file("ResidentialCustomerForecastLoad-624.*\\.xlsx$")
power_raw <- read_excel(power_file)
power <- power_raw %>%
mutate(Date = as.Date(paste0(`YYYY-MMM`, "-01"), format = "%Y-%b-%d")) %>%
arrange(Date)
power$KWH <- na.interp(power$KWH)
power_ts <- ts(power$KWH, start = c(1998, 1), frequency = 12)
summary(power_ts)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 770523 5434539 6314472 6502824 7608792 10655730
autoplot(power_ts) +
labs(title = "Residential Monthly Power Usage", x = "Year", y = "KWH")
This series had a pretty clear monthly seasonal pattern, so I used Holt-Winters with additive seasonality. I also considered other options, but this seemed like the most straightforward choice for the data and for the assignment.
fit_power <- hw(power_ts, seasonal = "additive", h = 12)
power_forecast <- data.frame(
Month = seq(as.Date("2014-01-01"), by = "month", length.out = 12),
Forecast_KWH = round(as.numeric(fit_power$mean), 0)
)
power_forecast
## Month Forecast_KWH
## 1 2014-01-01 9148885
## 2 2014-02-01 8237457
## 3 2014-03-01 7245381
## 4 2014-04-01 6579342
## 5 2014-05-01 6346912
## 6 2014-06-01 7831097
## 7 2014-07-01 8739578
## 8 2014-08-01 9639220
## 9 2014-09-01 9018186
## 10 2014-10-01 7077514
## 11 2014-11-01 6344088
## 12 2014-12-01 7638889
kable(power_forecast, caption = "2014 Monthly Power Forecast")
| Month | Forecast_KWH |
|---|---|
| 2014-01-01 | 9148885 |
| 2014-02-01 | 8237457 |
| 2014-03-01 | 7245381 |
| 2014-04-01 | 6579342 |
| 2014-05-01 | 6346912 |
| 2014-06-01 | 7831097 |
| 2014-07-01 | 8739578 |
| 2014-08-01 | 9639220 |
| 2014-09-01 | 9018186 |
| 2014-10-01 | 7077514 |
| 2014-11-01 | 6344088 |
| 2014-12-01 | 7638889 |
The power data was a lot cleaner than the ATM data. The biggest pattern was the repeated seasonal movement across the year, which makes sense for residential electricity use. The forecast just carries that same general pattern into 2014.
Because that seasonal pattern was so visible, I did not think this part needed anything too complicated. A seasonal monthly model worked fine here.
The instructions for Part C said to time-base the data and aggregate by hour, taking the mean if there were multiple readings within the same hour. That is what I did first.
pipe1_file <- get_file("Waterflow_Pipe1.*\\.xlsx$")
pipe2_file <- get_file("Waterflow_Pipe2.*\\.xlsx$")
fix_datetime <- function(x) {
if (inherits(x, c("POSIXct", "POSIXt"))) {
return(x)
}
if (inherits(x, "Date")) {
return(as.POSIXct(x))
}
if (is.numeric(x)) {
return(as.POSIXct(x, origin = "1899-12-30", tz = "UTC"))
}
parse_date_time(as.character(x),
orders = c("ymd HMS", "ymd HM", "ymdHMS", "mdy HMS", "mdy HM"),
tz = "UTC")
}
pipe1_raw <- read_excel(pipe1_file) %>%
mutate(`Date Time` = fix_datetime(`Date Time`))
pipe2_raw <- read_excel(pipe2_file) %>%
mutate(`Date Time` = fix_datetime(`Date Time`))
pipe1_hourly <- pipe1_raw %>%
filter(!is.na(`Date Time`)) %>%
mutate(Hour = floor_date(`Date Time`, unit = "hour")) %>%
group_by(Hour) %>%
summarise(WaterFlow = mean(WaterFlow, na.rm = TRUE), .groups = "drop") %>%
complete(Hour = seq(min(Hour), max(Hour), by = "hour")) %>%
mutate(WaterFlow = na.approx(WaterFlow, na.rm = FALSE)) %>%
mutate(WaterFlow = na.locf(WaterFlow, na.rm = FALSE),
WaterFlow = na.locf(WaterFlow, fromLast = TRUE))
pipe2_hourly <- pipe2_raw %>%
filter(!is.na(`Date Time`)) %>%
mutate(Hour = floor_date(`Date Time`, unit = "hour")) %>%
group_by(Hour) %>%
summarise(WaterFlow = mean(WaterFlow, na.rm = TRUE), .groups = "drop") %>%
complete(Hour = seq(min(Hour), max(Hour), by = "hour")) %>%
mutate(WaterFlow = na.approx(WaterFlow, na.rm = FALSE)) %>%
mutate(WaterFlow = na.locf(WaterFlow, na.rm = FALSE),
WaterFlow = na.locf(WaterFlow, fromLast = TRUE))
safe_adf <- function(x) {
x <- as.numeric(x)
x <- x[is.finite(x)]
if (length(x) < 10) {
return(list(result = "ADF test not run", p.value = NA_real_,
note = "Series too short for ADF test"))
}
if (sd(x, na.rm = TRUE) == 0) {
return(list(result = "ADF test not run", p.value = NA_real_,
note = "Series is constant, so ADF test is not meaningful"))
}
out <- tryCatch(
tseries::adf.test(x),
error = function(e) NULL
)
if (is.null(out)) {
list(result = "ADF test not run", p.value = NA_real_,
note = "ADF test could not be computed for this series")
} else {
list(result = out, p.value = out$p.value,
note = "ADF test ran successfully")
}
}
adf_pipe1 <- safe_adf(pipe1_hourly$WaterFlow)
adf_pipe2 <- safe_adf(pipe2_hourly$WaterFlow)
adf_summary <- tibble(
Series = c("Pipe1", "Pipe2"),
ADF_p_value = c(adf_pipe1$p.value, adf_pipe2$p.value),
Note = c(adf_pipe1$note, adf_pipe2$note)
)
knitr::kable(adf_summary, digits = 4,
caption = "ADF test results for hourly waterflow series")
| Series | ADF_p_value | Note |
|---|---|---|
| Pipe1 | NA | Series too short for ADF test |
| Pipe2 | NA | Series too short for ADF test |
I used the ADF check as a rough stationarity check. If the test could not be computed cleanly, I treated that as a limitation of the data rather than forcing a result. Since this is just the bonus section, I kept the forecasting step short-horizon and simple.
I kept the models simple here since the bonus part was more about the sequence alignment, stationarity check, and whether forecasting was reasonable at all.
safe_pipe_forecast <- function(df, arima_order = c(1,0,0), h = 168) {
x <- as.numeric(df$WaterFlow)
x <- x[is.finite(x)]
last_time <- max(df$Hour, na.rm = TRUE)
future_index <- seq(last_time + hours(1), by = "hour", length.out = h)
if (length(x) < 10) {
avg_val <- ifelse(length(x) == 0, 0, mean(x, na.rm = TRUE))
fc_vals <- rep(avg_val, h)
method_note <- "Fallback mean forecast used because the series was too short for ARIMA."
return(list(
model = NULL,
forecast = data.frame(DateTime = future_index, Forecast_WaterFlow = fc_vals),
method = method_note
))
}
fit <- tryCatch(
forecast::Arima(x, order = arima_order),
error = function(e) NULL
)
if (is.null(fit)) {
avg_val <- mean(x, na.rm = TRUE)
fc_vals <- rep(avg_val, h)
method_note <- "Fallback mean forecast used because ARIMA would not converge on this series."
return(list(
model = NULL,
forecast = data.frame(DateTime = future_index, Forecast_WaterFlow = fc_vals),
method = method_note
))
}
fc <- forecast::forecast(fit, h = h)
list(
model = fit,
forecast = data.frame(DateTime = future_index,
Forecast_WaterFlow = as.numeric(fc$mean)),
method = "ARIMA forecast ran successfully."
)
}
pipe1_result <- safe_pipe_forecast(pipe1_hourly, arima_order = c(1,0,0), h = 168)
pipe2_result <- safe_pipe_forecast(pipe2_hourly, arima_order = c(3,0,0), h = 168)
pipe1_forecast <- pipe1_result$forecast
pipe2_forecast <- pipe2_result$forecast
pipe_method_summary <- tibble(
Series = c("Pipe1", "Pipe2"),
Method_Used = c(pipe1_result$method, pipe2_result$method)
)
knitr::kable(pipe_method_summary, caption = "Forecasting method used for each pipe series")
| Series | Method_Used |
|---|---|
| Pipe1 | Fallback mean forecast used because the series was too short for ARIMA. |
| Pipe2 | Fallback mean forecast used because the series was too short for ARIMA. |
ggplot() +
geom_line(data = tail(pipe1_hourly, 200), aes(x = Hour, y = WaterFlow), alpha = 0.7) +
geom_line(data = pipe1_forecast, aes(x = DateTime, y = Forecast_WaterFlow), linetype = "dashed") +
labs(title = "Pipe 1 - One Week Forecast", x = NULL, y = "WaterFlow")
## `geom_line()`: Each group consists of only one observation.
## ℹ Do you need to adjust the group aesthetic?
ggplot() +
geom_line(data = tail(pipe2_hourly, 200), aes(x = Hour, y = WaterFlow), alpha = 0.7) +
geom_line(data = pipe2_forecast, aes(x = DateTime, y = Forecast_WaterFlow), linetype = "dashed") +
labs(title = "Pipe 2 - One Week Forecast", x = NULL, y = "WaterFlow")
## `geom_line()`: Each group consists of only one observation.
## ℹ Do you need to adjust the group aesthetic?
Most of the work in this section was really in the preprocessing. The timestamps were not on the same hourly base at first, so I had to aggregate them by hour and take the mean when there was more than one reading in the same hour. After that, both datasets were in a cleaner hourly format.
Based on the stationarity results, I think short-run forecasting is reasonable here. I still would not make very strong claims from a limited amount of data, but a one-week forecast seemed fair for this bonus part.
Overall, I do not think this project was something where one forecasting method should be used on every dataset.
If I were continuing this project, I would probably compare models more formally with a rolling forecast setup. But for this assignment, I think the methods I used were reasonable and matched the patterns in the data.
write.csv(atm_forecast, "ATM_May2010_Forecast.csv", row.names = FALSE)
write.csv(power_forecast, "Power_2014_Forecast.csv", row.names = FALSE)
write.csv(pipe1_forecast, "Pipe1_1Week_Forecast.csv", row.names = FALSE)
write.csv(pipe2_forecast, "Pipe2_1Week_Forecast.csv", row.names = FALSE)