library(readxl)
library(dplyr)
library(tidyr)
library(ggplot2)
library(forecast)
library(lubridate)
library(tseries)
library(zoo)
library(knitr)
theme_set(theme_minimal())
options(digits = 4)

get_file <- function(pattern) {
  f <- list.files(pattern = pattern, ignore.case = TRUE)
  if(length(f) == 0) stop(paste("Could not find file for pattern:", pattern))
  f[1]
}

Project Overview

This project had two required parts and one bonus part. I tried to keep the report simple and just work through each dataset based on what the data looked like. I did not want to force the same exact model on everything because the series were pretty different from each other.

A few things stood out right away:

The ATM data only covered about a year, so I wanted to keep the models fairly simple.
The residential power data was monthly and had a clear seasonal pattern.
The waterflow data first had to be lined up by hour before I could check stationarity or do any forecasting.

Part A - ATM Forecast

Read and prepare the data

atm_file <- get_file("ATM624Data.*\\.xlsx$")
atm_raw <- read_excel(atm_file)

atm <- atm_raw %>%
  filter(!is.na(ATM)) %>%
  mutate(DATE = as.Date(DATE))

atm_wide <- atm %>%
  select(DATE, ATM, Cash) %>%
  pivot_wider(names_from = ATM, values_from = Cash) %>%
  arrange(DATE)

# fill a few missing daily values using interpolation
atm_wide$ATM1 <- na.interp(ts(atm_wide$ATM1, frequency = 7))
atm_wide$ATM2 <- na.interp(ts(atm_wide$ATM2, frequency = 7))
atm_wide$ATM3 <- ts(atm_wide$ATM3, frequency = 7)
atm_wide$ATM4 <- ts(atm_wide$ATM4, frequency = 7)

glimpse(atm_wide)

## Rows: 365
## Columns: 5
## $ DATE <date> 2079-05-03, 2079-05-04, 2079-05-05, 2079-05-06, 2079-05-07, 2079…
## $ ATM1 <dbl> 96, 82, 85, 90, 99, 88, 8, 104, 87, 93, 86, 111, 75, 6, 102, 73, …
## $ ATM2 <dbl> 107, 89, 90, 55, 79, 19, 2, 103, 107, 118, 75, 111, 25, 16, 137, …
## $ ATM3 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ ATM4 <dbl> 776.99, 524.42, 792.81, 908.24, 52.83, 52.21, 55.47, 558.50, 904.…

Quick look at the series

atm_long <- atm_wide %>%
  mutate(DATE = as.Date(DATE)) %>%
  pivot_longer(cols = starts_with("ATM"), names_to = "ATM", values_to = "Cash")

ggplot(atm_long, aes(x = DATE, y = Cash)) +
  geom_line(color = "steelblue") +
  facet_wrap(~ATM, scales = "free_y", ncol = 2) +
  labs(title = "Daily ATM Withdrawals", x = NULL, y = "Cash (hundreds of dollars)")

Model choices

For this part, I mainly looked at the plots first and then picked models that seemed reasonable.

ATM1 and ATM2 both looked like they had a weekly pattern, so I used Holt-Winters with weekly seasonality.
ATM4 was a lot more uneven and had some big jumps, so I felt like a seasonal naive model fit it better than trying to over-smooth it.
ATM3 was the hardest one because it was basically zero for most of the series and then only had a few nonzero values near the end. Because of that, I did not think a normal time series model would be very reliable there, so I kept that forecast conservative and used zero for May.

atm1_ts <- ts(as.numeric(atm_wide$ATM1), frequency = 7)
atm2_ts <- ts(as.numeric(atm_wide$ATM2), frequency = 7)
atm3_ts <- ts(as.numeric(atm_wide$ATM3), frequency = 7)
atm4_ts <- ts(as.numeric(atm_wide$ATM4), frequency = 7)

fit_atm1 <- hw(atm1_ts, seasonal = "additive", h = 31)
fit_atm2 <- hw(atm2_ts, seasonal = "additive", h = 31)
fit_atm4 <- snaive(atm4_ts, h = 31)

may_dates <- seq(as.Date("2010-05-01"), by = "day", length.out = 31)

atm_forecast <- data.frame(
  Date = may_dates,
  ATM1 = as.numeric(fit_atm1$mean),
  ATM2 = as.numeric(fit_atm2$mean),
  ATM3 = rep(0, 31),
  ATM4 = as.numeric(fit_atm4$mean)
)

atm_forecast

##          Date    ATM1     ATM2 ATM3    ATM4
## 1  2010-05-01  86.691  67.5548    0   5.452
## 2  2010-05-02 100.397  73.4032    0 542.281
## 3  2010-05-03  72.762  10.2464    0 403.839
## 4  2010-05-04   5.349   1.2780    0  13.697
## 5  2010-05-05  99.797 100.8661    0 348.201
## 6  2010-05-06  79.090  91.6572    0  44.245
## 7  2010-05-07  85.259  68.3555    0 482.287
## 8  2010-05-08  86.605  67.2992    0   5.452
## 9  2010-05-09 100.311  73.1476    0 542.281
## 10 2010-05-10  72.676   9.9909    0 403.839
## 11 2010-05-11   5.263   1.0224    0  13.697
## 12 2010-05-12  99.711 100.6105    0 348.201
## 13 2010-05-13  79.004  91.4016    0  44.245
## 14 2010-05-14  85.173  68.0999    0 482.287
## 15 2010-05-15  86.519  67.0437    0   5.452
## 16 2010-05-16 100.225  72.8920    0 542.281
## 17 2010-05-17  72.590   9.7353    0 403.839
## 18 2010-05-18   5.177   0.7669    0  13.697
## 19 2010-05-19  99.625 100.3549    0 348.201
## 20 2010-05-20  78.918  91.1460    0  44.245
## 21 2010-05-21  85.087  67.8443    0 482.287
## 22 2010-05-22  86.433  66.7881    0   5.452
## 23 2010-05-23 100.139  72.6364    0 542.281
## 24 2010-05-24  72.504   9.4797    0 403.839
## 25 2010-05-25   5.091   0.5113    0  13.697
## 26 2010-05-26  99.539 100.0994    0 348.201
## 27 2010-05-27  78.832  90.8904    0  44.245
## 28 2010-05-28  85.001  67.5888    0 482.287
## 29 2010-05-29  86.347  66.5325    0   5.452
## 30 2010-05-30 100.053  72.3809    0 542.281
## 31 2010-05-31  72.418   9.2241    0 403.839

Part A forecast

Below is the May 2010 forecast for all four ATMs. The unit is still hundreds of dollars.

atm_forecast_display <- atm_forecast
atm_forecast_display[,2:5] <- round(atm_forecast_display[,2:5], 2)
kable(atm_forecast_display, caption = "May 2010 ATM Forecast")

May 2010 ATM Forecast
Date	ATM1	ATM2	ATM4
2010-05-01	86.69	67.55	5.45
2010-05-02	100.40	73.40	542.28
2010-05-03	72.76	10.25	403.84
2010-05-04	5.35	1.28	13.70
2010-05-05	99.80	100.87	348.20
2010-05-06	79.09	91.66	44.25
2010-05-07	85.26	68.36	482.29
2010-05-08	86.61	67.30	5.45
2010-05-09	100.31	73.15	542.28
2010-05-10	72.68	9.99	403.84
2010-05-11	5.26	1.02	13.70
2010-05-12	99.71	100.61	348.20
2010-05-13	79.00	91.40	44.25
2010-05-14	85.17	68.10	482.29
2010-05-15	86.52	67.04	5.45
2010-05-16	100.22	72.89	542.28
2010-05-17	72.59	9.74	403.84
2010-05-18	5.18	0.77	13.70
2010-05-19	99.63	100.35	348.20
2010-05-20	78.92	91.15	44.25
2010-05-21	85.09	67.84	482.29
2010-05-22	86.43	66.79	5.45
2010-05-23	100.14	72.64	542.28
2010-05-24	72.50	9.48	403.84
2010-05-25	5.09	0.51	13.70
2010-05-26	99.54	100.10	348.20
2010-05-27	78.83	90.89	44.25
2010-05-28	85.00	67.59	482.29
2010-05-29	86.35	66.53	5.45
2010-05-30	100.05	72.38	542.28
2010-05-31	72.42	9.22	403.84

Part A discussion

The main thing I noticed in Part A was that the four ATMs did not behave the same way, so using one model for all of them would not have made much sense.

ATM1 looked fairly steady and had a weekly pattern.
ATM2 also showed weekly seasonality, but it moved around a little more.
ATM3 really did not have enough usable history to model in a normal way since it stayed at zero for most of the time.
ATM4 had much larger values and was more volatile than the others.

Out of the four, ATM4 stood out the most to me because the changes were a lot bigger, which would make cash planning more important there.

Using the forecasts, I got these approximate May 2010 totals:

atm_summary <- data.frame(
  ATM = c("ATM1", "ATM2", "ATM3", "ATM4"),
  Model = c("Holt-Winters additive weekly seasonality",
            "Holt-Winters additive weekly seasonality",
            "Conservative manual forecast",
            "Seasonal naive weekly"),
  May_Total_Hundreds = c(round(sum(atm_forecast$ATM1),2),
                         round(sum(atm_forecast$ATM2),2),
                         round(sum(atm_forecast$ATM3),2),
                         round(sum(atm_forecast$ATM4),2))
)
kable(atm_summary, caption = "ATM May 2010 Forecast Summary")

ATM May 2010 Forecast Summary
ATM	Model	May_Total_Hundreds
ATM1	Holt-Winters additive weekly seasonality	2373
ATM2	Holt-Winters additive weekly seasonality	1791
ATM3	Conservative manual forecast	0
ATM4	Seasonal naive weekly	8312

Part B - Residential Power Forecast

Read and inspect the data

power_file <- get_file("ResidentialCustomerForecastLoad-624.*\\.xlsx$")
power_raw <- read_excel(power_file)

power <- power_raw %>%
  mutate(Date = as.Date(paste0(`YYYY-MMM`, "-01"), format = "%Y-%b-%d")) %>%
  arrange(Date)

power$KWH <- na.interp(power$KWH)

power_ts <- ts(power$KWH, start = c(1998, 1), frequency = 12)

summary(power_ts)

##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##   770523  5434539  6314472  6502824  7608792 10655730

autoplot(power_ts) +
  labs(title = "Residential Monthly Power Usage", x = "Year", y = "KWH")

Model choice

This series had a pretty clear monthly seasonal pattern, so I used Holt-Winters with additive seasonality. I also considered other options, but this seemed like the most straightforward choice for the data and for the assignment.

fit_power <- hw(power_ts, seasonal = "additive", h = 12)

power_forecast <- data.frame(
  Month = seq(as.Date("2014-01-01"), by = "month", length.out = 12),
  Forecast_KWH = round(as.numeric(fit_power$mean), 0)
)

power_forecast

##         Month Forecast_KWH
## 1  2014-01-01      9148885
## 2  2014-02-01      8237457
## 3  2014-03-01      7245381
## 4  2014-04-01      6579342
## 5  2014-05-01      6346912
## 6  2014-06-01      7831097
## 7  2014-07-01      8739578
## 8  2014-08-01      9639220
## 9  2014-09-01      9018186
## 10 2014-10-01      7077514
## 11 2014-11-01      6344088
## 12 2014-12-01      7638889

Part B forecast for 2014

kable(power_forecast, caption = "2014 Monthly Power Forecast")

2014 Monthly Power Forecast
Month	Forecast_KWH
2014-01-01	9148885
2014-02-01	8237457
2014-03-01	7245381
2014-04-01	6579342
2014-05-01	6346912
2014-06-01	7831097
2014-07-01	8739578
2014-08-01	9639220
2014-09-01	9018186
2014-10-01	7077514
2014-11-01	6344088
2014-12-01	7638889

Part B discussion

The power data was a lot cleaner than the ATM data. The biggest pattern was the repeated seasonal movement across the year, which makes sense for residential electricity use. The forecast just carries that same general pattern into 2014.

Because that seasonal pattern was so visible, I did not think this part needed anything too complicated. A seasonal monthly model worked fine here.

Part C - Bonus: Waterflow

Read, align to hourly time base, and aggregate

The instructions for Part C said to time-base the data and aggregate by hour, taking the mean if there were multiple readings within the same hour. That is what I did first.

pipe1_file <- get_file("Waterflow_Pipe1.*\\.xlsx$")
pipe2_file <- get_file("Waterflow_Pipe2.*\\.xlsx$")

fix_datetime <- function(x) {
  if (inherits(x, c("POSIXct", "POSIXt"))) {
    return(x)
  }
  if (inherits(x, "Date")) {
    return(as.POSIXct(x))
  }
  if (is.numeric(x)) {
    return(as.POSIXct(x, origin = "1899-12-30", tz = "UTC"))
  }
  parse_date_time(as.character(x),
                  orders = c("ymd HMS", "ymd HM", "ymdHMS", "mdy HMS", "mdy HM"),
                  tz = "UTC")
}

pipe1_raw <- read_excel(pipe1_file) %>%
  mutate(`Date Time` = fix_datetime(`Date Time`))

pipe2_raw <- read_excel(pipe2_file) %>%
  mutate(`Date Time` = fix_datetime(`Date Time`))

pipe1_hourly <- pipe1_raw %>%
  filter(!is.na(`Date Time`)) %>%
  mutate(Hour = floor_date(`Date Time`, unit = "hour")) %>%
  group_by(Hour) %>%
  summarise(WaterFlow = mean(WaterFlow, na.rm = TRUE), .groups = "drop") %>%
  complete(Hour = seq(min(Hour), max(Hour), by = "hour")) %>%
  mutate(WaterFlow = na.approx(WaterFlow, na.rm = FALSE)) %>%
  mutate(WaterFlow = na.locf(WaterFlow, na.rm = FALSE),
         WaterFlow = na.locf(WaterFlow, fromLast = TRUE))

pipe2_hourly <- pipe2_raw %>%
  filter(!is.na(`Date Time`)) %>%
  mutate(Hour = floor_date(`Date Time`, unit = "hour")) %>%
  group_by(Hour) %>%
  summarise(WaterFlow = mean(WaterFlow, na.rm = TRUE), .groups = "drop") %>%
  complete(Hour = seq(min(Hour), max(Hour), by = "hour")) %>%
  mutate(WaterFlow = na.approx(WaterFlow, na.rm = FALSE)) %>%
  mutate(WaterFlow = na.locf(WaterFlow, na.rm = FALSE),
         WaterFlow = na.locf(WaterFlow, fromLast = TRUE))

Stationarity check

safe_adf <- function(x) {
  x <- as.numeric(x)
  x <- x[is.finite(x)]

  if (length(x) < 10) {
    return(list(result = "ADF test not run", p.value = NA_real_,
                note = "Series too short for ADF test"))
  }

  if (sd(x, na.rm = TRUE) == 0) {
    return(list(result = "ADF test not run", p.value = NA_real_,
                note = "Series is constant, so ADF test is not meaningful"))
  }

  out <- tryCatch(
    tseries::adf.test(x),
    error = function(e) NULL
  )

  if (is.null(out)) {
    list(result = "ADF test not run", p.value = NA_real_,
         note = "ADF test could not be computed for this series")
  } else {
    list(result = out, p.value = out$p.value,
         note = "ADF test ran successfully")
  }
}

adf_pipe1 <- safe_adf(pipe1_hourly$WaterFlow)
adf_pipe2 <- safe_adf(pipe2_hourly$WaterFlow)

adf_summary <- tibble(
  Series = c("Pipe1", "Pipe2"),
  ADF_p_value = c(adf_pipe1$p.value, adf_pipe2$p.value),
  Note = c(adf_pipe1$note, adf_pipe2$note)
)

knitr::kable(adf_summary, digits = 4,
             caption = "ADF test results for hourly waterflow series")

ADF test results for hourly waterflow series
Series	ADF_p_value	Note
Pipe1	NA	Series too short for ADF test
Pipe2	NA	Series too short for ADF test

I used the ADF check as a rough stationarity check. If the test could not be computed cleanly, I treated that as a limitation of the data rather than forcing a result. Since this is just the bonus section, I kept the forecasting step short-horizon and simple.

Forecasting one week ahead

I kept the models simple here since the bonus part was more about the sequence alignment, stationarity check, and whether forecasting was reasonable at all.

safe_pipe_forecast <- function(df, arima_order = c(1,0,0), h = 168) {
  x <- as.numeric(df$WaterFlow)
  x <- x[is.finite(x)]

  last_time <- max(df$Hour, na.rm = TRUE)
  future_index <- seq(last_time + hours(1), by = "hour", length.out = h)

  if (length(x) < 10) {
    avg_val <- ifelse(length(x) == 0, 0, mean(x, na.rm = TRUE))
    fc_vals <- rep(avg_val, h)
    method_note <- "Fallback mean forecast used because the series was too short for ARIMA."

    return(list(
      model = NULL,
      forecast = data.frame(DateTime = future_index, Forecast_WaterFlow = fc_vals),
      method = method_note
    ))
  }

  fit <- tryCatch(
    forecast::Arima(x, order = arima_order),
    error = function(e) NULL
  )

  if (is.null(fit)) {
    avg_val <- mean(x, na.rm = TRUE)
    fc_vals <- rep(avg_val, h)
    method_note <- "Fallback mean forecast used because ARIMA would not converge on this series."

    return(list(
      model = NULL,
      forecast = data.frame(DateTime = future_index, Forecast_WaterFlow = fc_vals),
      method = method_note
    ))
  }

  fc <- forecast::forecast(fit, h = h)

  list(
    model = fit,
    forecast = data.frame(DateTime = future_index,
                          Forecast_WaterFlow = as.numeric(fc$mean)),
    method = "ARIMA forecast ran successfully."
  )
}

pipe1_result <- safe_pipe_forecast(pipe1_hourly, arima_order = c(1,0,0), h = 168)
pipe2_result <- safe_pipe_forecast(pipe2_hourly, arima_order = c(3,0,0), h = 168)

pipe1_forecast <- pipe1_result$forecast
pipe2_forecast <- pipe2_result$forecast

pipe_method_summary <- tibble(
  Series = c("Pipe1", "Pipe2"),
  Method_Used = c(pipe1_result$method, pipe2_result$method)
)

knitr::kable(pipe_method_summary, caption = "Forecasting method used for each pipe series")

Forecasting method used for each pipe series
Series	Method_Used
Pipe1	Fallback mean forecast used because the series was too short for ARIMA.
Pipe2	Fallback mean forecast used because the series was too short for ARIMA.

ggplot() +
  geom_line(data = tail(pipe1_hourly, 200), aes(x = Hour, y = WaterFlow), alpha = 0.7) +
  geom_line(data = pipe1_forecast, aes(x = DateTime, y = Forecast_WaterFlow), linetype = "dashed") +
  labs(title = "Pipe 1 - One Week Forecast", x = NULL, y = "WaterFlow")

## `geom_line()`: Each group consists of only one observation.
## ℹ Do you need to adjust the group aesthetic?

ggplot() +
  geom_line(data = tail(pipe2_hourly, 200), aes(x = Hour, y = WaterFlow), alpha = 0.7) +
  geom_line(data = pipe2_forecast, aes(x = DateTime, y = Forecast_WaterFlow), linetype = "dashed") +
  labs(title = "Pipe 2 - One Week Forecast", x = NULL, y = "WaterFlow")

## `geom_line()`: Each group consists of only one observation.
## ℹ Do you need to adjust the group aesthetic?

Part C discussion

Most of the work in this section was really in the preprocessing. The timestamps were not on the same hourly base at first, so I had to aggregate them by hour and take the mean when there was more than one reading in the same hour. After that, both datasets were in a cleaner hourly format.

Based on the stationarity results, I think short-run forecasting is reasonable here. I still would not make very strong claims from a limited amount of data, but a one-week forecast seemed fair for this bonus part.

Final Thoughts

Overall, I do not think this project was something where one forecasting method should be used on every dataset.

In Part A, the ATM series looked different from each other, so I used different approaches depending on the machine.
In Part B, the seasonal pattern was clear, so a seasonal monthly model made sense.
In Part C, the biggest step was getting the waterflow data onto the right hourly time base before checking stationarity and forecasting.

If I were continuing this project, I would probably compare models more formally with a rolling forecast setup. But for this assignment, I think the methods I used were reasonable and matched the patterns in the data.

Export forecast files

write.csv(atm_forecast, "ATM_May2010_Forecast.csv", row.names = FALSE)
write.csv(power_forecast, "Power_2014_Forecast.csv", row.names = FALSE)
write.csv(pipe1_forecast, "Pipe1_1Week_Forecast.csv", row.names = FALSE)
write.csv(pipe2_forecast, "Pipe2_1Week_Forecast.csv", row.names = FALSE)

Project 1 - Forecasting

Sachi Kapoor