1 Overview
2 Load Packages
3 Load Data
4 Part A
5 Part B
- 5.1 Compare Forecasting Models for Power Usage
- 5.2 Final Power Forecast for 2014
6 Part C
7 Export The Final Forecasts To Excel

1 Overview

This project has three parts. Part A forecasts daily ATM cash withdrawals for May 2010. Part B forecasts monthly residential electricity usage for 2014. Part C is the bonus section and forecasts one week of hourly waterflow.

2 Load Packages

library(readxl)
library(dplyr)
library(tidyr)
library(lubridate)
library(forecast)
library(writexl)
library(zoo)
library(ggplot2)
library(knitr)

3 Load Data

# read the Excel files from the same folder as the R Markdown file.
atm_raw <- read_excel("ATM624Data.xlsx")
power_raw <- read_excel("ResidentialCustomerForecastLoad-624.xlsx")
pipe1_raw <- read_excel("Waterflow_Pipe1.xlsx")
pipe2_raw <- read_excel("Waterflow_Pipe2.xlsx")

# convert Excel serial dates to R dates.
atm_raw <- atm_raw %>%
  mutate(DATE = as.Date(DATE, origin = "1899-12-30"))

# convert Excel date-time serial values to POSIXct date-time values.
pipe1_raw <- pipe1_raw %>%
  mutate(`Date Time` = as.POSIXct(`Date Time` * 86400,
                                  origin = "1899-12-30",
                                  tz = "UTC")) %>%
  mutate(`Date Time` = floor_date(`Date Time`, unit = "minute"))

pipe2_raw <- pipe2_raw %>%
  mutate(`Date Time` = as.POSIXct(`Date Time` * 86400,
                                  origin = "1899-12-30",
                                  tz = "UTC")) %>%
  mutate(`Date Time` = floor_date(`Date Time`, unit = "minute"))

# quick previews.
head(atm_raw)

head(power_raw)

head(pipe1_raw)

head(pipe2_raw)

# fit one selected forecasting method and return the forecast
forecast_candidate <- function(y_train, h, model_name) {
  switch(
    model_name,
    "Mean" = meanf(y_train, h = h),
    "Naive" = naive(y_train, h = h),
    "Seasonal Naive" = snaive(y_train, h = h),
    "ETS" = forecast(ets(y_train), h = h),
    "ARIMA" = forecast(auto.arima(y_train), h = h),
    "STL + ETS" = stlf(y_train, h = h, method = "ets"),
    "Holt-Winters Additive" = hw(y_train, h = h, seasonal = "additive", damped = TRUE)
  )
}

# calculate RMSE, MAE, and MAPE on the holdout period
accuracy_row <- function(actual, predicted) {
  actual <- as.numeric(actual)
  predicted <- as.numeric(predicted)
  mape_values <- abs((actual - predicted) / ifelse(actual == 0, NA, actual))

  tibble(
    RMSE = sqrt(mean((actual - predicted)^2, na.rm = TRUE)),
    MAE = mean(abs(actual - predicted), na.rm = TRUE),
    MAPE = ifelse(all(is.na(mape_values)), NA_real_, mean(mape_values, na.rm = TRUE) * 100)
  )
}

# compare candidate models using the same holdout period
evaluate_models <- function(y_values, series_name, frequency_value, holdout_n, model_names) {
  y_values <- as.numeric(y_values)
  y_values <- y_values[!is.na(y_values)]
  n <- length(y_values)
  holdout_n <- min(holdout_n, n - max(10, frequency_value * 2))

  if (holdout_n < 1) {
    return(tibble(
      Series = series_name,
      Model = model_names,
      RMSE = NA_real_,
      MAE = NA_real_,
      MAPE = NA_real_
    ))
  }

  train_values <- y_values[1:(n - holdout_n)]
  test_values <- y_values[(n - holdout_n + 1):n]
  train_ts <- ts(train_values, frequency = frequency_value)

  bind_rows(lapply(model_names, function(model_name) {
    fc <- tryCatch(
      forecast_candidate(train_ts, h = length(test_values), model_name = model_name),
      error = function(e) NULL
    )

    if (is.null(fc)) {
      tibble(
        Series = series_name,
        Model = model_name,
        RMSE = NA_real_,
        MAE = NA_real_,
        MAPE = NA_real_
      )
    } else {
      accuracy_row(test_values, as.numeric(fc$mean)) %>%
        mutate(
          Series = series_name,
          Model = model_name
        ) %>%
        select(Series, Model, RMSE, MAE, MAPE)
    }
  }))
}

4 Part A

The goal in Part A is to forecast how much cash will be taken out of four different ATM machines for May 2010. The cash variable is recorded in hundreds of dollars.

I treated each ATM as a separate time series because the four machines do not behave the same way. ATM1, ATM2, and ATM4 show short term day of week behavior. ATM3 is very unusual because it is inactive for almost the whole data set and only becomes active very late in the series. Because of that, I did not force one single model onto all four machines.

4.1 Clean & Explore The ATM Data

atm_base <- atm_raw %>%
  filter(!is.na(ATM)) %>%
  mutate(
    DATE = as.Date(DATE),
    Cash = as.numeric(Cash)
  ) %>%
  arrange(ATM, DATE)

clean_atm_series <- function(data, atm_name) {
  one_atm <- data %>%
    filter(ATM == atm_name) %>%
    select(DATE, ATM, Cash) %>%
    distinct(DATE, .keep_all = TRUE) %>%
    arrange(DATE)
  
  # create a complete daily sequence so missing ATM dates can be counted
  full_series <- tibble(DATE = seq(min(one_atm$DATE), max(one_atm$DATE), by = "day")) %>%
    left_join(one_atm, by = "DATE") %>%
    mutate(
      ATM = atm_name,
      missing_flag = is.na(Cash)
    )

  if (atm_name == "ATM3") {
    clean_values <- na.approx(full_series$Cash, na.rm = FALSE)
    clean_values <- na.locf(clean_values, na.rm = FALSE)
    clean_values <- na.locf(clean_values, fromLast = TRUE, na.rm = FALSE)

    full_series %>%
      mutate(
        Cash_Clean = as.numeric(clean_values),
        outlier_flag = FALSE
      )
  } else {
    
    # use tsclean to handle missing values and possible outliers for the stable ATM series
    clean_ts <- tsclean(ts(full_series$Cash, frequency = 7), replace.missing = TRUE)
    full_series %>%
      mutate(
        Cash_Clean = as.numeric(clean_ts),
        outlier_flag = !missing_flag & !is.na(Cash) & abs(Cash - Cash_Clean) > 1e-8
      )
  }
}

atm_names <- sort(unique(atm_base$ATM))
atm_clean <- bind_rows(lapply(atm_names, function(x) clean_atm_series(atm_base, x)))

atm_audit <- atm_clean %>%
  group_by(ATM) %>%
  summarise(
    First_Date = min(DATE),
    Last_Date = max(DATE),
    Daily_Rows_After_Completion = n(),
    Missing_Values_or_Missing_Dates = sum(missing_flag),
    Extreme_Outliers_Replaced = sum(outlier_flag),
    .groups = "drop"
  )

atm_audit %>%
  kable(caption = "Part A data quality audit after completing daily series")

Part A data quality audit after completing daily series
ATM	First_Date	Last_Date	Daily_Rows_After_Completion	Missing_Values_or_Missing_Dates	Extreme_Outliers_Replaced
ATM1	2009-05-01	2010-04-30	365	3	26
ATM2	2009-05-01	2010-04-30	365	2	10
ATM3	2009-05-01	2010-04-30	365	0	0
ATM4	2009-05-01	2010-04-30	365	0	1

atm_clean %>%
  filter(missing_flag | outlier_flag) %>%
  select(ATM, DATE, Cash, Cash_Clean, missing_flag, outlier_flag) %>%
  arrange(ATM, DATE) %>%
  head(20) %>%
  kable(digits = 2, caption = "First missing values or outliers handled in Part A")

First missing values or outliers handled in Part A
ATM	DATE	Cash	Cash_Clean	missing_flag	outlier_flag
ATM1	2009-06-05	3	102.99	FALSE	TRUE
ATM1	2009-06-13	NA	125.59	TRUE	FALSE
ATM1	2009-06-16	NA	131.13	TRUE	FALSE
ATM1	2009-06-22	NA	97.69	TRUE	FALSE
ATM1	2009-07-10	19	108.58	FALSE	TRUE
ATM1	2009-08-02	68	99.88	FALSE	TRUE
ATM1	2009-08-25	174	133.47	FALSE	TRUE
ATM1	2009-09-11	1	115.44	FALSE	TRUE
ATM1	2009-09-22	180	119.37	FALSE	TRUE
ATM1	2009-11-30	6	105.15	FALSE	TRUE
ATM1	2009-12-01	140	98.69	FALSE	TRUE
ATM1	2009-12-04	140	95.86	FALSE	TRUE
ATM1	2009-12-08	135	86.02	FALSE	TRUE
ATM1	2009-12-29	42	92.20	FALSE	TRUE
ATM1	2009-12-30	2	72.76	FALSE	TRUE
ATM1	2010-01-01	132	94.00	FALSE	TRUE
ATM1	2010-01-11	123	105.79	FALSE	TRUE
ATM1	2010-01-29	152	93.12	FALSE	TRUE
ATM1	2010-02-02	123	74.03	FALSE	TRUE
ATM1	2010-02-05	138	87.88	FALSE	TRUE

The audit table documents how missing observations and extreme outliers were handled. For ATM1, ATM2, and ATM4, I use the cleaned value for modeling. For ATM3, I keep the long sequence of zeros because it looks like the machine was inactive for most of the history rather than randomly missing.

# Compare raw cash values with the cleaned values used in modeling.
atm_clean %>%
  ggplot(aes(x = DATE)) +
  geom_line(aes(y = Cash), alpha = 0.45) +
  geom_line(aes(y = Cash_Clean), linewidth = 0.5) +
  facet_wrap(~ ATM, scales = "free_y", ncol = 2) +
  labs(
    title = "ATM daily cash withdrawals: raw and cleaned series",
    x = "Date",
    y = "Cash in hundreds of dollars"
  )

The plot shows that ATM1, ATM2, and ATM4 have weekly movement but different levels. ATM4 has the largest scale and the most volatility. ATM3 is different because it is almost all zeros and only becomes active near the end of the data set.

4.2 Compare Forecasting Models for ATM1, ATM2, and ATM4

I compared ATM1, ATM2, and ATM4 using the last 31 days as a holdout set. The models tested were mean, naive, seasonal naive, ETS, ARIMA, and STL plus ETS. This makes the model choice more transparent because each ATM is evaluated using the same holdout period.

atm_model_names <- c("Mean", "Naive", "Seasonal Naive", "ETS", "ARIMA", "STL + ETS")

# compare the class models using the last 31 days as a holdout set
atm_model_metrics <- bind_rows(lapply(c("ATM1", "ATM2", "ATM4"), function(atm_name) {
  y <- atm_clean %>%
    filter(ATM == atm_name) %>%
    arrange(DATE) %>%
    pull(Cash_Clean)

  evaluate_models(
    y_values = y,
    series_name = atm_name,
    frequency_value = 7,
    holdout_n = 31,
    model_names = atm_model_names
  )
})) %>%
  arrange(Series, RMSE)

atm_model_metrics %>%
  mutate(across(c(RMSE, MAE, MAPE), ~ round(.x, 2))) %>%
  kable(caption = "Part A holdout model comparison for ATM1, ATM2, and ATM4")

Part A holdout model comparison for ATM1, ATM2, and ATM4
Series	Model	RMSE	MAE	MAPE
ATM1	ARIMA	13.39	10.42	49.71
ATM1	ETS	13.77	10.68	53.72
ATM1	Seasonal Naive	15.86	13.16	70.79
ATM1	STL + ETS	16.75	13.09	85.40
ATM1	Mean	32.16	20.21	314.36
ATM1	Naive	67.20	63.29	126.73
ATM2	ETS	13.41	10.46	26.15
ATM2	STL + ETS	15.92	12.49	102.61
ATM2	ARIMA	16.16	11.88	88.32
ATM2	Seasonal Naive	16.39	13.52	40.36
ATM2	Mean	37.39	32.16	693.07
ATM2	Naive	71.32	60.83	100.00
ATM4	STL + ETS	245.09	203.15	525.96
ATM4	ARIMA	275.94	233.11	1030.13
ATM4	Mean	281.24	235.57	1119.09
ATM4	ETS	303.89	255.06	1227.94
ATM4	Seasonal Naive	329.58	260.18	372.84
ATM4	Naive	466.06	378.19	117.24

# select the lowest RMSE model for each ATM
atm_chosen_non3 <- atm_model_metrics %>%
  filter(!is.na(RMSE)) %>%
  group_by(Series) %>%
  arrange(RMSE, MAE, .by_group = TRUE) %>%
  slice(1) %>%
  ungroup() %>%
  transmute(ATM = Series, Model = Model, RMSE = RMSE, MAE = MAE, MAPE = MAPE)

Based on the holdout test, I select the lowest RMSE model for ATM1, ATM2, and ATM4. If two models are close, I would prefer the simpler one, but the table above keeps the selection transparent.

4.3 ATM3 Treatment

ATM3 needs separate treatment because it does not have a normal full-year operating history. A mean forecast is reasonable here, but I also document why models like ETS, ARIMA, and STL are not reliable for this machine.

atm3_active <- atm_clean %>%
  filter(ATM == "ATM3", Cash_Clean > 0) %>%
  arrange(DATE)

atm3_active_n <- nrow(atm3_active)
atm3_level <- mean(atm3_active$Cash_Clean, na.rm = TRUE)

atm3_model_note <- tibble(
  ATM = "ATM3",
  Model = c("Mean of active days", "ETS", "ARIMA", "STL + ETS"),
  Decision = c("Selected", "Not selected", "Not selected", "Not selected"),
  Reason = c(
    paste0("Only ", atm3_active_n, " active observations are available, so the mean of active days is the most stable choice."),
    "The active history is too short for a dependable exponential smoothing model.",
    "The active history is too short for a dependable ARIMA model.",
    "STL requires a meaningful seasonal history, which ATM3 does not have after activation."
  )
)

atm3_active %>%
  select(DATE, Cash_Clean) %>%
  kable(digits = 2, caption = "ATM3 active observations used for the mean forecast")

ATM3 active observations used for the mean forecast
DATE	Cash_Clean
2010-04-28	96
2010-04-29	82
2010-04-30	85

atm3_model_note %>%
  kable(caption = "ATM3 model decision")

ATM3 model decision
ATM	Model	Decision	Reason
ATM3	Mean of active days	Selected	Only 3 active observations are available, so the mean of active days is the most stable choice.
ATM3	ETS	Not selected	The active history is too short for a dependable exponential smoothing model.
ATM3	ARIMA	Not selected	The active history is too short for a dependable ARIMA model.
ATM3	STL + ETS	Not selected	STL requires a meaningful seasonal history, which ATM3 does not have after activation.

atm3_choice <- tibble(
  ATM = "ATM3",
  Model = "Mean of active days",
  RMSE = NA_real_,
  MAE = NA_real_,
  MAPE = NA_real_
)

atm_chosen <- bind_rows(atm_chosen_non3, atm3_choice) %>%
  arrange(ATM)

For ATM3, I use the mean of the active observations. I do not use the long zero period to pull the forecast down because those zeros appear to represent a machine that was not active, not normal customer behavior after activation.

4.4 Final ATM Forecast for May 2010

may_dates <- seq(as.Date("2010-05-01"), as.Date("2010-05-31"), by = "day")

make_atm_final_forecast <- function(atm_name) {
  chosen_model <- atm_chosen %>%
    filter(ATM == atm_name) %>%
    pull(Model)

  if (atm_name == "ATM3") {
    return(tibble(
      Date = may_dates,
      ATM = atm_name,
      Model = chosen_model,
      Forecast_Cash_Hundreds = atm3_level
    ))
  }

  y <- atm_clean %>%
    filter(ATM == atm_name) %>%
    arrange(DATE) %>%
    pull(Cash_Clean)

  fc <- forecast_candidate(ts(y, frequency = 7), h = length(may_dates), model_name = chosen_model)

  tibble(
    Date = may_dates,
    ATM = atm_name,
    Model = chosen_model,
    Forecast_Cash_Hundreds = as.numeric(fc$mean)
  )
}

atm_final <- bind_rows(lapply(atm_names, make_atm_final_forecast)) %>%
  arrange(Date, ATM)

atm_final %>%
  filter(Date <= as.Date("2010-05-03")) %>%
  kable(digits = 2, caption = "Part A forecast sample for the first three days of May 2010")

Part A forecast sample for the first three days of May 2010
Date	ATM	Model	Forecast_Cash_Hundreds
2010-05-01	ATM1	ARIMA	86.42
2010-05-01	ATM2	ETS	71.67
2010-05-01	ATM3	Mean of active days	87.67
2010-05-01	ATM4	STL + ETS	410.89
2010-05-02	ATM1	ARIMA	100.34
2010-05-02	ATM2	ETS	75.28
2010-05-02	ATM3	Mean of active days	87.67
2010-05-02	ATM4	STL + ETS	454.86
2010-05-03	ATM1	ARIMA	75.36
2010-05-03	ATM2	ETS	11.07
2010-05-03	ATM3	Mean of active days	87.67
2010-05-03	ATM4	STL + ETS	446.56

The final ATM forecast uses the cleaned daily series after missing values and possible outliers were handled. ATM1, ATM2, and ATM4 use the model with the lowest holdout RMSE. ATM3 is handled separately because it was inactive for most of the year, so using the mean of the active days is more reasonable than forcing a full seasonal model.

5 Part B

The goal in Part B is to model residential power usage from January 1998 through December 2013 and forecast the monthly values for 2014.

This series is monthly, long, and seasonal. For utility usage, a model that captures a smooth trend and repeated yearly seasonality is a natural choice.

5.0.1 Clean & Explore The Power Data

power_data <- power_raw %>%
  transmute(
    Date = as.Date(paste0(`YYYY-MMM`, "-01"), format = "%Y-%b-%d"),
    KWH = as.numeric(KWH)
  ) %>%
  arrange(Date)

power_ts_raw <- ts(power_data$KWH, start = c(1998, 1), frequency = 12)

# clean the power series by filling the missing month and replacing possible outliers
power_ts_clean <- tsclean(power_ts_raw, replace.missing = TRUE)

power_data <- power_data %>%
  mutate(
    KWH_Clean = as.numeric(power_ts_clean),
    missing_flag = is.na(KWH),
    outlier_flag = !missing_flag & !is.na(KWH) & abs(KWH - KWH_Clean) > 1e-8
  )

power_audit <- power_data %>%
  summarise(
    First_Month = min(Date),
    Last_Month = max(Date),
    Months = n(),
    Missing_Months = sum(missing_flag),
    Extreme_Outliers_Replaced = sum(outlier_flag)
  )

power_audit %>%
  kable(caption = "Part B data quality audit")

Part B data quality audit
First_Month	Last_Month	Months	Missing_Months	Extreme_Outliers_Replaced
1998-01-01	2013-12-01	192	1	2

power_data %>%
  filter(missing_flag | outlier_flag) %>%
  select(Date, KWH, KWH_Clean, missing_flag, outlier_flag) %>%
  kable(digits = 0, caption = "Part B missing values and outliers handled")

Part B missing values and outliers handled
Date	KWH	KWH_Clean	missing_flag	outlier_flag
2008-09-01	NA	7436335	TRUE	FALSE
2010-07-01	770523	7686742	FALSE	TRUE
2013-12-01	9606304	6987874	FALSE	TRUE

power_data %>%
  ggplot(aes(x = Date)) +
  geom_line(aes(y = KWH), alpha = 0.45) +
  geom_line(aes(y = KWH_Clean), linewidth = 0.5) +
  labs(
    title = "Residential monthly power usage: raw and cleaned series",
    x = "Date",
    y = "KWH"
  )

The plot shows a strong yearly seasonal pattern and a changing long-term level. This supports testing models that can handle seasonality, such as ETS, ARIMA, seasonal naive, and STL-based forecasting.

5.1 Compare Forecasting Models for Power Usage

I use the last 12 months as a holdout set. This is a natural choice because the final forecast also needs 12 months ahead.

power_model_names <- c("Mean", "Naive", "Seasonal Naive", "ETS", "ARIMA", "STL + ETS", "Holt-Winters Additive")

# compare power models using the last 12 months as the holdout period
power_model_metrics <- evaluate_models(
  y_values = as.numeric(power_ts_clean),
  series_name = "Residential Power",
  frequency_value = 12,
  holdout_n = 12,
  model_names = power_model_names
) %>%
  arrange(RMSE)

power_model_metrics %>%
  mutate(across(c(RMSE, MAE, MAPE), ~ round(.x, 2))) %>%
  kable(caption = "Part B holdout model comparison")

Part B holdout model comparison
Series	Model	RMSE	MAE	MAPE
Residential Power	Seasonal Naive	579612.4	400403.1	4.91
Residential Power	STL + ETS	620155.5	402249.2	5.04
Residential Power	ARIMA	628999.0	444805.7	5.51
Residential Power	Holt-Winters Additive	698867.0	431084.6	5.37
Residential Power	ETS	705554.1	422014.3	5.18
Residential Power	Naive	1651544.2	1283217.5	15.82
Residential Power	Mean	1721859.6	1313853.7	15.96

power_chosen <- power_model_metrics %>%
  filter(!is.na(RMSE)) %>%
  arrange(RMSE, MAE) %>%
  slice(1)

power_chosen_model <- power_chosen$Model[1]

power_chosen %>%
  mutate(across(c(RMSE, MAE, MAPE), ~ round(.x, 2))) %>%
  kable(caption = "Part B selected model based on holdout RMSE")

Part B selected model based on holdout RMSE
Series	Model	RMSE	MAE	MAPE
Residential Power	Seasonal Naive	579612.4	400403.1	4.91

This comparison gives a direct reason for the final model choice. I use the model with the lowest holdout RMSE for the 2014 forecast.

5.2 Final Power Forecast for 2014

power_fit <- forecast_candidate(power_ts_clean, h = 12, model_name = power_chosen_model)

power_final <- tibble(
  Month = seq(as.Date("2014-01-01"), as.Date("2014-12-01"), by = "month"),
  Model = power_chosen_model,
  Forecast_KWH = as.numeric(power_fit$mean)
)

power_final %>%
  mutate(
    Month = format(Month, "%Y-%m"),
    Forecast_KWH = format(round(Forecast_KWH, 0), big.mark = ",", scientific = FALSE)
  ) %>%
  kable(caption = "Part B 2014 monthly forecast")

Part B 2014 monthly forecast
Month	Model	Forecast_KWH
2014-01	Seasonal Naive	10,655,730
2014-02	Seasonal Naive	7,681,798
2014-03	Seasonal Naive	6,517,514
2014-04	Seasonal Naive	6,105,359
2014-05	Seasonal Naive	5,940,475
2014-06	Seasonal Naive	7,920,627
2014-07	Seasonal Naive	8,415,321
2014-08	Seasonal Naive	9,080,226
2014-09	Seasonal Naive	7,968,220
2014-10	Seasonal Naive	5,759,367
2014-11	Seasonal Naive	5,769,083
2014-12	Seasonal Naive	6,987,874

plot(power_fit, main = paste("Residential Power Forecast for 2014 -", power_chosen_model))

The residential power forecast now uses a more complete process than my first submission. I filled the missing month, checked possible outliers, and compared several seasonal forecasting models. The final 2014 forecast uses the model with the lowest holdout RMSE.

6 Part C

The goal in Part C is to time base sequence the waterflow data, aggregate the readings by hour using the mean, check whether the data is stable enough to forecast, and then produce a one week forecast.

The two waterflow files have different timestamp structures. Pipe1 has irregular timestamps and Pipe2 is already hourly, but I still aggregate both to hourly mean flow so the treatment is consistent.

6.0.1 Clean & Explore The Waterflow Data

prepare_pipe <- function(data) {
  data %>%
    mutate(hour = floor_date(`Date Time`, unit = "hour")) %>%
    group_by(hour) %>%
    summarise(WaterFlow = mean(WaterFlow, na.rm = TRUE), .groups = "drop") %>%
    arrange(hour) %>%
    complete(hour = seq(min(hour), max(hour), by = "hour")) %>%
    mutate(
      missing_flag = is.na(WaterFlow),
      WaterFlow = na.approx(WaterFlow, na.rm = FALSE),
      WaterFlow = na.locf(WaterFlow, na.rm = FALSE),
      WaterFlow = na.locf(WaterFlow, fromLast = TRUE, na.rm = FALSE)
    )
}

pipe1_hourly <- prepare_pipe(pipe1_raw)
pipe2_hourly <- prepare_pipe(pipe2_raw)

pipe_audit <- tibble(
  Pipe = c("Pipe1", "Pipe2"),
  First_Hour = c(min(pipe1_hourly$hour), min(pipe2_hourly$hour)),
  Last_Hour = c(max(pipe1_hourly$hour), max(pipe2_hourly$hour)),
  Hours = c(nrow(pipe1_hourly), nrow(pipe2_hourly)),
  Filled_Hours = c(sum(pipe1_hourly$missing_flag), sum(pipe2_hourly$missing_flag))
)

pipe_audit %>%
  kable(caption = "Part C hourly aggregation audit")

Part C hourly aggregation audit
Pipe	First_Hour	Last_Hour	Hours	Filled_Hours
Pipe1	2015-10-23 00:00:00	2015-11-01 23:00:00	240	4
Pipe2	2015-10-23 01:00:00	2015-12-03 16:00:00	1000	333

head(pipe1_hourly)

head(pipe2_hourly)

par(mfrow = c(2, 1))

plot(pipe1_hourly$hour, pipe1_hourly$WaterFlow, type = "l",
     main = "Pipe1 hourly mean flow", xlab = "Hour", ylab = "WaterFlow")

plot(pipe2_hourly$hour, pipe2_hourly$WaterFlow, type = "l",
     main = "Pipe2 hourly mean flow", xlab = "Hour", ylab = "WaterFlow")

par(mfrow = c(1, 1))

After hourly aggregation, both pipe series are easier to model. Pipe1 has a shorter history and more visible level movement, while Pipe2 has a longer history.

6.0.2 Build The Waterflow Forecasts

pipe1_ts <- ts(pipe1_hourly$WaterFlow, frequency = 24)
pipe2_ts <- ts(pipe2_hourly$WaterFlow, frequency = 24)

pipe_stationarity <- tibble(
  Series = c("Pipe1", "Pipe2"),
  ndiffs = c(ndiffs(pipe1_ts), ndiffs(pipe2_ts)),
  nsdiffs = c(nsdiffs(pipe1_ts), nsdiffs(pipe2_ts))
)

pipe_stationarity %>%
  kable(caption = "Part C differencing check")

Part C differencing check
Series	ndiffs	nsdiffs
Pipe1	0	0
Pipe2	1	0

pipe1_fit <- holt(pipe1_ts, h = 168)

pipe2_fit <- forecast(ets(pipe2_ts), h = 168)

pipe1_future_time <- seq(max(pipe1_hourly$hour) + hours(1), by = "hour", length.out = 168)
pipe2_future_time <- seq(max(pipe2_hourly$hour) + hours(1), by = "hour", length.out = 168)

pipe1_final <- tibble(
  DateTime = pipe1_future_time,
  Model = "Holt linear trend on hourly mean flow",
  Forecast_WaterFlow = as.numeric(pipe1_fit$mean)
)

pipe2_final <- tibble(
  DateTime = pipe2_future_time,
  Model = "ETS on hourly mean flow",
  Forecast_WaterFlow = as.numeric(pipe2_fit$mean)
)

pipe1_final %>%
  head(12) %>%
  kable(digits = 4, caption = "Part C Pipe1 forecast sample")

Part C Pipe1 forecast sample
DateTime	Model	Forecast_WaterFlow
2015-11-02 00:00:00	Holt linear trend on hourly mean flow	20.1320
2015-11-02 01:00:00	Holt linear trend on hourly mean flow	20.1324
2015-11-02 02:00:00	Holt linear trend on hourly mean flow	20.1328
2015-11-02 03:00:00	Holt linear trend on hourly mean flow	20.1333
2015-11-02 04:00:00	Holt linear trend on hourly mean flow	20.1337
2015-11-02 05:00:00	Holt linear trend on hourly mean flow	20.1341
2015-11-02 06:00:00	Holt linear trend on hourly mean flow	20.1345
2015-11-02 07:00:00	Holt linear trend on hourly mean flow	20.1350
2015-11-02 08:00:00	Holt linear trend on hourly mean flow	20.1354
2015-11-02 09:00:00	Holt linear trend on hourly mean flow	20.1358
2015-11-02 10:00:00	Holt linear trend on hourly mean flow	20.1363
2015-11-02 11:00:00	Holt linear trend on hourly mean flow	20.1367

pipe2_final %>%
  head(12) %>%
  kable(digits = 4, caption = "Part C Pipe2 forecast sample")

Part C Pipe2 forecast sample
DateTime	Model	Forecast_WaterFlow
2015-12-03 17:00:00	ETS on hourly mean flow	44.3658
2015-12-03 18:00:00	ETS on hourly mean flow	44.3658
2015-12-03 19:00:00	ETS on hourly mean flow	44.3658
2015-12-03 20:00:00	ETS on hourly mean flow	44.3658
2015-12-03 21:00:00	ETS on hourly mean flow	44.3658
2015-12-03 22:00:00	ETS on hourly mean flow	44.3658
2015-12-03 23:00:00	ETS on hourly mean flow	44.3658
2015-12-04 00:00:00	ETS on hourly mean flow	44.3658
2015-12-04 01:00:00	ETS on hourly mean flow	44.3658
2015-12-04 02:00:00	ETS on hourly mean flow	44.3658
2015-12-04 03:00:00	ETS on hourly mean flow	44.3658
2015-12-04 04:00:00	ETS on hourly mean flow	44.3658

I converted both waterflow files into regular hourly series by taking the mean of all readings within each hour. After filling the missing hourly gaps, I checked whether differencing was needed. Since this was only a one-week forecast, I kept the final methods simple and easy to interpret.

7 Export The Final Forecasts To Excel

The next chunk saves all final forecast tables into one Excel workbook. This is useful because the project instructions ask for an Excel readable forecast file.

model_summary <- bind_rows(
  atm_chosen %>%
    transmute(
      Part = paste("Part A", ATM),
      Chosen_Model = Model,
      Holdout_RMSE = RMSE,
      Holdout_MAE = MAE,
      Holdout_MAPE = MAPE
    ),
  tibble(
    Part = "Part B Power",
    Chosen_Model = power_chosen_model,
    Holdout_RMSE = power_chosen$RMSE[1],
    Holdout_MAE = power_chosen$MAE[1],
    Holdout_MAPE = power_chosen$MAPE[1]
  ),
  tibble(
    Part = c("Part C Pipe1", "Part C Pipe2"),
    Chosen_Model = c("Holt linear trend on hourly mean flow", "ETS on hourly mean flow"),
    Holdout_RMSE = NA_real_,
    Holdout_MAE = NA_real_,
    Holdout_MAPE = NA_real_
  )
)

data_quality_audit <- bind_rows(
  atm_audit %>%
    transmute(
      Section = paste("Part A", ATM),
      Item_1 = as.character(First_Date),
      Item_2 = as.character(Last_Date),
      Count_1_Name = "Missing values or missing dates",
      Count_1 = Missing_Values_or_Missing_Dates,
      Count_2_Name = "Extreme outliers replaced",
      Count_2 = Extreme_Outliers_Replaced
    ),
  power_audit %>%
    transmute(
      Section = "Part B Power",
      Item_1 = as.character(First_Month),
      Item_2 = as.character(Last_Month),
      Count_1_Name = "Missing months",
      Count_1 = Missing_Months,
      Count_2_Name = "Extreme outliers replaced",
      Count_2 = Extreme_Outliers_Replaced
    ),
  pipe_audit %>%
    transmute(
      Section = paste("Part C", Pipe),
      Item_1 = as.character(First_Hour),
      Item_2 = as.character(Last_Hour),
      Count_1_Name = "Filled hourly gaps",
      Count_1 = Filled_Hours,
      Count_2_Name = "Extreme outliers replaced",
      Count_2 = NA_real_
    )
)

write_xlsx(
  list(
    Model_Summary = model_summary,
    Data_Quality_Audit = data_quality_audit,
    Part_A_Model_Comparison = atm_model_metrics,
    Part_A_ATM3_Decision = atm3_model_note,
    Part_B_Model_Comparison = power_model_metrics,
    ATM_May2010_Forecast = atm_final,
    Power_2014_Forecast = power_final,
    Pipe1_1Week_Forecast = pipe1_final,
    Pipe2_1Week_Forecast = pipe2_final
  ),
  "Project1_Revised_Final_Forecasts.xlsx"
)

Data 624 Project 1

Jayden Jiang

2026-03-23