1 Overview

This project has three parts. Part A forecasts daily ATM cash withdrawals for May 2010. Part B forecasts monthly residential electricity usage for 2014. Part C is the bonus section and forecasts one week of hourly waterflow.

2 Load Packages

library(readxl)
library(dplyr)
library(tidyr)
library(lubridate)
library(forecast)
library(writexl)
library(zoo)
library(ggplot2)
library(knitr)

3 Load Data

# read the Excel files from the same folder as the R Markdown file.
atm_raw <- read_excel("ATM624Data.xlsx")
power_raw <- read_excel("ResidentialCustomerForecastLoad-624.xlsx")
pipe1_raw <- read_excel("Waterflow_Pipe1.xlsx")
pipe2_raw <- read_excel("Waterflow_Pipe2.xlsx")

# convert Excel serial dates to R dates.
atm_raw <- atm_raw %>%
  mutate(DATE = as.Date(DATE, origin = "1899-12-30"))

# convert Excel date-time serial values to POSIXct date-time values.
pipe1_raw <- pipe1_raw %>%
  mutate(`Date Time` = as.POSIXct(`Date Time` * 86400,
                                  origin = "1899-12-30",
                                  tz = "UTC")) %>%
  mutate(`Date Time` = floor_date(`Date Time`, unit = "minute"))

pipe2_raw <- pipe2_raw %>%
  mutate(`Date Time` = as.POSIXct(`Date Time` * 86400,
                                  origin = "1899-12-30",
                                  tz = "UTC")) %>%
  mutate(`Date Time` = floor_date(`Date Time`, unit = "minute"))

# quick previews.
head(atm_raw)
head(power_raw)
head(pipe1_raw)
head(pipe2_raw)
# fit one selected forecasting method and return the forecast
forecast_candidate <- function(y_train, h, model_name) {
  switch(
    model_name,
    "Mean" = meanf(y_train, h = h),
    "Naive" = naive(y_train, h = h),
    "Seasonal Naive" = snaive(y_train, h = h),
    "ETS" = forecast(ets(y_train), h = h),
    "ARIMA" = forecast(auto.arima(y_train), h = h),
    "STL + ETS" = stlf(y_train, h = h, method = "ets"),
    "Holt-Winters Additive" = hw(y_train, h = h, seasonal = "additive", damped = TRUE)
  )
}

# calculate RMSE, MAE, and MAPE on the holdout period
accuracy_row <- function(actual, predicted) {
  actual <- as.numeric(actual)
  predicted <- as.numeric(predicted)
  mape_values <- abs((actual - predicted) / ifelse(actual == 0, NA, actual))

  tibble(
    RMSE = sqrt(mean((actual - predicted)^2, na.rm = TRUE)),
    MAE = mean(abs(actual - predicted), na.rm = TRUE),
    MAPE = ifelse(all(is.na(mape_values)), NA_real_, mean(mape_values, na.rm = TRUE) * 100)
  )
}

# compare candidate models using the same holdout period
evaluate_models <- function(y_values, series_name, frequency_value, holdout_n, model_names) {
  y_values <- as.numeric(y_values)
  y_values <- y_values[!is.na(y_values)]
  n <- length(y_values)
  holdout_n <- min(holdout_n, n - max(10, frequency_value * 2))

  if (holdout_n < 1) {
    return(tibble(
      Series = series_name,
      Model = model_names,
      RMSE = NA_real_,
      MAE = NA_real_,
      MAPE = NA_real_
    ))
  }

  train_values <- y_values[1:(n - holdout_n)]
  test_values <- y_values[(n - holdout_n + 1):n]
  train_ts <- ts(train_values, frequency = frequency_value)

  bind_rows(lapply(model_names, function(model_name) {
    fc <- tryCatch(
      forecast_candidate(train_ts, h = length(test_values), model_name = model_name),
      error = function(e) NULL
    )

    if (is.null(fc)) {
      tibble(
        Series = series_name,
        Model = model_name,
        RMSE = NA_real_,
        MAE = NA_real_,
        MAPE = NA_real_
      )
    } else {
      accuracy_row(test_values, as.numeric(fc$mean)) %>%
        mutate(
          Series = series_name,
          Model = model_name
        ) %>%
        select(Series, Model, RMSE, MAE, MAPE)
    }
  }))
}

4 Part A

The goal in Part A is to forecast how much cash will be taken out of four different ATM machines for May 2010. The cash variable is recorded in hundreds of dollars.

I treated each ATM as a separate time series because the four machines do not behave the same way. ATM1, ATM2, and ATM4 show short term day of week behavior. ATM3 is very unusual because it is inactive for almost the whole data set and only becomes active very late in the series. Because of that, I did not force one single model onto all four machines.

4.1 Clean & Explore The ATM Data

atm_base <- atm_raw %>%
  filter(!is.na(ATM)) %>%
  mutate(
    DATE = as.Date(DATE),
    Cash = as.numeric(Cash)
  ) %>%
  arrange(ATM, DATE)

clean_atm_series <- function(data, atm_name) {
  one_atm <- data %>%
    filter(ATM == atm_name) %>%
    select(DATE, ATM, Cash) %>%
    distinct(DATE, .keep_all = TRUE) %>%
    arrange(DATE)
  
  # create a complete daily sequence so missing ATM dates can be counted
  full_series <- tibble(DATE = seq(min(one_atm$DATE), max(one_atm$DATE), by = "day")) %>%
    left_join(one_atm, by = "DATE") %>%
    mutate(
      ATM = atm_name,
      missing_flag = is.na(Cash)
    )

  if (atm_name == "ATM3") {
    clean_values <- na.approx(full_series$Cash, na.rm = FALSE)
    clean_values <- na.locf(clean_values, na.rm = FALSE)
    clean_values <- na.locf(clean_values, fromLast = TRUE, na.rm = FALSE)

    full_series %>%
      mutate(
        Cash_Clean = as.numeric(clean_values),
        outlier_flag = FALSE
      )
  } else {
    
    # use tsclean to handle missing values and possible outliers for the stable ATM series
    clean_ts <- tsclean(ts(full_series$Cash, frequency = 7), replace.missing = TRUE)
    full_series %>%
      mutate(
        Cash_Clean = as.numeric(clean_ts),
        outlier_flag = !missing_flag & !is.na(Cash) & abs(Cash - Cash_Clean) > 1e-8
      )
  }
}

atm_names <- sort(unique(atm_base$ATM))
atm_clean <- bind_rows(lapply(atm_names, function(x) clean_atm_series(atm_base, x)))

atm_audit <- atm_clean %>%
  group_by(ATM) %>%
  summarise(
    First_Date = min(DATE),
    Last_Date = max(DATE),
    Daily_Rows_After_Completion = n(),
    Missing_Values_or_Missing_Dates = sum(missing_flag),
    Extreme_Outliers_Replaced = sum(outlier_flag),
    .groups = "drop"
  )

atm_audit %>%
  kable(caption = "Part A data quality audit after completing daily series")
Part A data quality audit after completing daily series
ATM First_Date Last_Date Daily_Rows_After_Completion Missing_Values_or_Missing_Dates Extreme_Outliers_Replaced
ATM1 2009-05-01 2010-04-30 365 3 26
ATM2 2009-05-01 2010-04-30 365 2 10
ATM3 2009-05-01 2010-04-30 365 0 0
ATM4 2009-05-01 2010-04-30 365 0 1
atm_clean %>%
  filter(missing_flag | outlier_flag) %>%
  select(ATM, DATE, Cash, Cash_Clean, missing_flag, outlier_flag) %>%
  arrange(ATM, DATE) %>%
  head(20) %>%
  kable(digits = 2, caption = "First missing values or outliers handled in Part A")
First missing values or outliers handled in Part A
ATM DATE Cash Cash_Clean missing_flag outlier_flag
ATM1 2009-06-05 3 102.99 FALSE TRUE
ATM1 2009-06-13 NA 125.59 TRUE FALSE
ATM1 2009-06-16 NA 131.13 TRUE FALSE
ATM1 2009-06-22 NA 97.69 TRUE FALSE
ATM1 2009-07-10 19 108.58 FALSE TRUE
ATM1 2009-08-02 68 99.88 FALSE TRUE
ATM1 2009-08-25 174 133.47 FALSE TRUE
ATM1 2009-09-11 1 115.44 FALSE TRUE
ATM1 2009-09-22 180 119.37 FALSE TRUE
ATM1 2009-11-30 6 105.15 FALSE TRUE
ATM1 2009-12-01 140 98.69 FALSE TRUE
ATM1 2009-12-04 140 95.86 FALSE TRUE
ATM1 2009-12-08 135 86.02 FALSE TRUE
ATM1 2009-12-29 42 92.20 FALSE TRUE
ATM1 2009-12-30 2 72.76 FALSE TRUE
ATM1 2010-01-01 132 94.00 FALSE TRUE
ATM1 2010-01-11 123 105.79 FALSE TRUE
ATM1 2010-01-29 152 93.12 FALSE TRUE
ATM1 2010-02-02 123 74.03 FALSE TRUE
ATM1 2010-02-05 138 87.88 FALSE TRUE

The audit table documents how missing observations and extreme outliers were handled. For ATM1, ATM2, and ATM4, I use the cleaned value for modeling. For ATM3, I keep the long sequence of zeros because it looks like the machine was inactive for most of the history rather than randomly missing.

# Compare raw cash values with the cleaned values used in modeling.
atm_clean %>%
  ggplot(aes(x = DATE)) +
  geom_line(aes(y = Cash), alpha = 0.45) +
  geom_line(aes(y = Cash_Clean), linewidth = 0.5) +
  facet_wrap(~ ATM, scales = "free_y", ncol = 2) +
  labs(
    title = "ATM daily cash withdrawals: raw and cleaned series",
    x = "Date",
    y = "Cash in hundreds of dollars"
  )

The plot shows that ATM1, ATM2, and ATM4 have weekly movement but different levels. ATM4 has the largest scale and the most volatility. ATM3 is different because it is almost all zeros and only becomes active near the end of the data set.

4.2 Compare Forecasting Models for ATM1, ATM2, and ATM4

I compared ATM1, ATM2, and ATM4 using the last 31 days as a holdout set. The models tested were mean, naive, seasonal naive, ETS, ARIMA, and STL plus ETS. This makes the model choice more transparent because each ATM is evaluated using the same holdout period.

atm_model_names <- c("Mean", "Naive", "Seasonal Naive", "ETS", "ARIMA", "STL + ETS")

# compare the class models using the last 31 days as a holdout set
atm_model_metrics <- bind_rows(lapply(c("ATM1", "ATM2", "ATM4"), function(atm_name) {
  y <- atm_clean %>%
    filter(ATM == atm_name) %>%
    arrange(DATE) %>%
    pull(Cash_Clean)

  evaluate_models(
    y_values = y,
    series_name = atm_name,
    frequency_value = 7,
    holdout_n = 31,
    model_names = atm_model_names
  )
})) %>%
  arrange(Series, RMSE)

atm_model_metrics %>%
  mutate(across(c(RMSE, MAE, MAPE), ~ round(.x, 2))) %>%
  kable(caption = "Part A holdout model comparison for ATM1, ATM2, and ATM4")
Part A holdout model comparison for ATM1, ATM2, and ATM4
Series Model RMSE MAE MAPE
ATM1 ARIMA 13.39 10.42 49.71
ATM1 ETS 13.77 10.68 53.72
ATM1 Seasonal Naive 15.86 13.16 70.79
ATM1 STL + ETS 16.75 13.09 85.40
ATM1 Mean 32.16 20.21 314.36
ATM1 Naive 67.20 63.29 126.73
ATM2 ETS 13.41 10.46 26.15
ATM2 STL + ETS 15.92 12.49 102.61
ATM2 ARIMA 16.16 11.88 88.32
ATM2 Seasonal Naive 16.39 13.52 40.36
ATM2 Mean 37.39 32.16 693.07
ATM2 Naive 71.32 60.83 100.00
ATM4 STL + ETS 245.09 203.15 525.96
ATM4 ARIMA 275.94 233.11 1030.13
ATM4 Mean 281.24 235.57 1119.09
ATM4 ETS 303.89 255.06 1227.94
ATM4 Seasonal Naive 329.58 260.18 372.84
ATM4 Naive 466.06 378.19 117.24
# select the lowest RMSE model for each ATM
atm_chosen_non3 <- atm_model_metrics %>%
  filter(!is.na(RMSE)) %>%
  group_by(Series) %>%
  arrange(RMSE, MAE, .by_group = TRUE) %>%
  slice(1) %>%
  ungroup() %>%
  transmute(ATM = Series, Model = Model, RMSE = RMSE, MAE = MAE, MAPE = MAPE)

Based on the holdout test, I select the lowest RMSE model for ATM1, ATM2, and ATM4. If two models are close, I would prefer the simpler one, but the table above keeps the selection transparent.

4.3 ATM3 Treatment

ATM3 needs separate treatment because it does not have a normal full-year operating history. A mean forecast is reasonable here, but I also document why models like ETS, ARIMA, and STL are not reliable for this machine.

atm3_active <- atm_clean %>%
  filter(ATM == "ATM3", Cash_Clean > 0) %>%
  arrange(DATE)

atm3_active_n <- nrow(atm3_active)
atm3_level <- mean(atm3_active$Cash_Clean, na.rm = TRUE)

atm3_model_note <- tibble(
  ATM = "ATM3",
  Model = c("Mean of active days", "ETS", "ARIMA", "STL + ETS"),
  Decision = c("Selected", "Not selected", "Not selected", "Not selected"),
  Reason = c(
    paste0("Only ", atm3_active_n, " active observations are available, so the mean of active days is the most stable choice."),
    "The active history is too short for a dependable exponential smoothing model.",
    "The active history is too short for a dependable ARIMA model.",
    "STL requires a meaningful seasonal history, which ATM3 does not have after activation."
  )
)

atm3_active %>%
  select(DATE, Cash_Clean) %>%
  kable(digits = 2, caption = "ATM3 active observations used for the mean forecast")
ATM3 active observations used for the mean forecast
DATE Cash_Clean
2010-04-28 96
2010-04-29 82
2010-04-30 85
atm3_model_note %>%
  kable(caption = "ATM3 model decision")
ATM3 model decision
ATM Model Decision Reason
ATM3 Mean of active days Selected Only 3 active observations are available, so the mean of active days is the most stable choice.
ATM3 ETS Not selected The active history is too short for a dependable exponential smoothing model.
ATM3 ARIMA Not selected The active history is too short for a dependable ARIMA model.
ATM3 STL + ETS Not selected STL requires a meaningful seasonal history, which ATM3 does not have after activation.
atm3_choice <- tibble(
  ATM = "ATM3",
  Model = "Mean of active days",
  RMSE = NA_real_,
  MAE = NA_real_,
  MAPE = NA_real_
)

atm_chosen <- bind_rows(atm_chosen_non3, atm3_choice) %>%
  arrange(ATM)

For ATM3, I use the mean of the active observations. I do not use the long zero period to pull the forecast down because those zeros appear to represent a machine that was not active, not normal customer behavior after activation.

4.4 Final ATM Forecast for May 2010

may_dates <- seq(as.Date("2010-05-01"), as.Date("2010-05-31"), by = "day")

make_atm_final_forecast <- function(atm_name) {
  chosen_model <- atm_chosen %>%
    filter(ATM == atm_name) %>%
    pull(Model)

  if (atm_name == "ATM3") {
    return(tibble(
      Date = may_dates,
      ATM = atm_name,
      Model = chosen_model,
      Forecast_Cash_Hundreds = atm3_level
    ))
  }

  y <- atm_clean %>%
    filter(ATM == atm_name) %>%
    arrange(DATE) %>%
    pull(Cash_Clean)

  fc <- forecast_candidate(ts(y, frequency = 7), h = length(may_dates), model_name = chosen_model)

  tibble(
    Date = may_dates,
    ATM = atm_name,
    Model = chosen_model,
    Forecast_Cash_Hundreds = as.numeric(fc$mean)
  )
}

atm_final <- bind_rows(lapply(atm_names, make_atm_final_forecast)) %>%
  arrange(Date, ATM)

atm_final %>%
  filter(Date <= as.Date("2010-05-03")) %>%
  kable(digits = 2, caption = "Part A forecast sample for the first three days of May 2010")
Part A forecast sample for the first three days of May 2010
Date ATM Model Forecast_Cash_Hundreds
2010-05-01 ATM1 ARIMA 86.42
2010-05-01 ATM2 ETS 71.67
2010-05-01 ATM3 Mean of active days 87.67
2010-05-01 ATM4 STL + ETS 410.89
2010-05-02 ATM1 ARIMA 100.34
2010-05-02 ATM2 ETS 75.28
2010-05-02 ATM3 Mean of active days 87.67
2010-05-02 ATM4 STL + ETS 454.86
2010-05-03 ATM1 ARIMA 75.36
2010-05-03 ATM2 ETS 11.07
2010-05-03 ATM3 Mean of active days 87.67
2010-05-03 ATM4 STL + ETS 446.56

The final ATM forecast uses the cleaned daily series after missing values and possible outliers were handled. ATM1, ATM2, and ATM4 use the model with the lowest holdout RMSE. ATM3 is handled separately because it was inactive for most of the year, so using the mean of the active days is more reasonable than forcing a full seasonal model.

5 Part B

The goal in Part B is to model residential power usage from January 1998 through December 2013 and forecast the monthly values for 2014.

This series is monthly, long, and seasonal. For utility usage, a model that captures a smooth trend and repeated yearly seasonality is a natural choice.

5.0.1 Clean & Explore The Power Data

power_data <- power_raw %>%
  transmute(
    Date = as.Date(paste0(`YYYY-MMM`, "-01"), format = "%Y-%b-%d"),
    KWH = as.numeric(KWH)
  ) %>%
  arrange(Date)

power_ts_raw <- ts(power_data$KWH, start = c(1998, 1), frequency = 12)

# clean the power series by filling the missing month and replacing possible outliers
power_ts_clean <- tsclean(power_ts_raw, replace.missing = TRUE)

power_data <- power_data %>%
  mutate(
    KWH_Clean = as.numeric(power_ts_clean),
    missing_flag = is.na(KWH),
    outlier_flag = !missing_flag & !is.na(KWH) & abs(KWH - KWH_Clean) > 1e-8
  )

power_audit <- power_data %>%
  summarise(
    First_Month = min(Date),
    Last_Month = max(Date),
    Months = n(),
    Missing_Months = sum(missing_flag),
    Extreme_Outliers_Replaced = sum(outlier_flag)
  )

power_audit %>%
  kable(caption = "Part B data quality audit")
Part B data quality audit
First_Month Last_Month Months Missing_Months Extreme_Outliers_Replaced
1998-01-01 2013-12-01 192 1 2
power_data %>%
  filter(missing_flag | outlier_flag) %>%
  select(Date, KWH, KWH_Clean, missing_flag, outlier_flag) %>%
  kable(digits = 0, caption = "Part B missing values and outliers handled")
Part B missing values and outliers handled
Date KWH KWH_Clean missing_flag outlier_flag
2008-09-01 NA 7436335 TRUE FALSE
2010-07-01 770523 7686742 FALSE TRUE
2013-12-01 9606304 6987874 FALSE TRUE
power_data %>%
  ggplot(aes(x = Date)) +
  geom_line(aes(y = KWH), alpha = 0.45) +
  geom_line(aes(y = KWH_Clean), linewidth = 0.5) +
  labs(
    title = "Residential monthly power usage: raw and cleaned series",
    x = "Date",
    y = "KWH"
  )

The plot shows a strong yearly seasonal pattern and a changing long-term level. This supports testing models that can handle seasonality, such as ETS, ARIMA, seasonal naive, and STL-based forecasting.

5.1 Compare Forecasting Models for Power Usage

I use the last 12 months as a holdout set. This is a natural choice because the final forecast also needs 12 months ahead.

power_model_names <- c("Mean", "Naive", "Seasonal Naive", "ETS", "ARIMA", "STL + ETS", "Holt-Winters Additive")

# compare power models using the last 12 months as the holdout period
power_model_metrics <- evaluate_models(
  y_values = as.numeric(power_ts_clean),
  series_name = "Residential Power",
  frequency_value = 12,
  holdout_n = 12,
  model_names = power_model_names
) %>%
  arrange(RMSE)

power_model_metrics %>%
  mutate(across(c(RMSE, MAE, MAPE), ~ round(.x, 2))) %>%
  kable(caption = "Part B holdout model comparison")
Part B holdout model comparison
Series Model RMSE MAE MAPE
Residential Power Seasonal Naive 579612.4 400403.1 4.91
Residential Power STL + ETS 620155.5 402249.2 5.04
Residential Power ARIMA 628999.0 444805.7 5.51
Residential Power Holt-Winters Additive 698867.0 431084.6 5.37
Residential Power ETS 705554.1 422014.3 5.18
Residential Power Naive 1651544.2 1283217.5 15.82
Residential Power Mean 1721859.6 1313853.7 15.96
power_chosen <- power_model_metrics %>%
  filter(!is.na(RMSE)) %>%
  arrange(RMSE, MAE) %>%
  slice(1)

power_chosen_model <- power_chosen$Model[1]

power_chosen %>%
  mutate(across(c(RMSE, MAE, MAPE), ~ round(.x, 2))) %>%
  kable(caption = "Part B selected model based on holdout RMSE")
Part B selected model based on holdout RMSE
Series Model RMSE MAE MAPE
Residential Power Seasonal Naive 579612.4 400403.1 4.91

This comparison gives a direct reason for the final model choice. I use the model with the lowest holdout RMSE for the 2014 forecast.

5.2 Final Power Forecast for 2014

power_fit <- forecast_candidate(power_ts_clean, h = 12, model_name = power_chosen_model)

power_final <- tibble(
  Month = seq(as.Date("2014-01-01"), as.Date("2014-12-01"), by = "month"),
  Model = power_chosen_model,
  Forecast_KWH = as.numeric(power_fit$mean)
)

power_final %>%
  mutate(
    Month = format(Month, "%Y-%m"),
    Forecast_KWH = format(round(Forecast_KWH, 0), big.mark = ",", scientific = FALSE)
  ) %>%
  kable(caption = "Part B 2014 monthly forecast")
Part B 2014 monthly forecast
Month Model Forecast_KWH
2014-01 Seasonal Naive 10,655,730
2014-02 Seasonal Naive 7,681,798
2014-03 Seasonal Naive 6,517,514
2014-04 Seasonal Naive 6,105,359
2014-05 Seasonal Naive 5,940,475
2014-06 Seasonal Naive 7,920,627
2014-07 Seasonal Naive 8,415,321
2014-08 Seasonal Naive 9,080,226
2014-09 Seasonal Naive 7,968,220
2014-10 Seasonal Naive 5,759,367
2014-11 Seasonal Naive 5,769,083
2014-12 Seasonal Naive 6,987,874
plot(power_fit, main = paste("Residential Power Forecast for 2014 -", power_chosen_model))

The residential power forecast now uses a more complete process than my first submission. I filled the missing month, checked possible outliers, and compared several seasonal forecasting models. The final 2014 forecast uses the model with the lowest holdout RMSE.

6 Part C

The goal in Part C is to time base sequence the waterflow data, aggregate the readings by hour using the mean, check whether the data is stable enough to forecast, and then produce a one week forecast.

The two waterflow files have different timestamp structures. Pipe1 has irregular timestamps and Pipe2 is already hourly, but I still aggregate both to hourly mean flow so the treatment is consistent.

6.0.1 Clean & Explore The Waterflow Data

prepare_pipe <- function(data) {
  data %>%
    mutate(hour = floor_date(`Date Time`, unit = "hour")) %>%
    group_by(hour) %>%
    summarise(WaterFlow = mean(WaterFlow, na.rm = TRUE), .groups = "drop") %>%
    arrange(hour) %>%
    complete(hour = seq(min(hour), max(hour), by = "hour")) %>%
    mutate(
      missing_flag = is.na(WaterFlow),
      WaterFlow = na.approx(WaterFlow, na.rm = FALSE),
      WaterFlow = na.locf(WaterFlow, na.rm = FALSE),
      WaterFlow = na.locf(WaterFlow, fromLast = TRUE, na.rm = FALSE)
    )
}

pipe1_hourly <- prepare_pipe(pipe1_raw)
pipe2_hourly <- prepare_pipe(pipe2_raw)

pipe_audit <- tibble(
  Pipe = c("Pipe1", "Pipe2"),
  First_Hour = c(min(pipe1_hourly$hour), min(pipe2_hourly$hour)),
  Last_Hour = c(max(pipe1_hourly$hour), max(pipe2_hourly$hour)),
  Hours = c(nrow(pipe1_hourly), nrow(pipe2_hourly)),
  Filled_Hours = c(sum(pipe1_hourly$missing_flag), sum(pipe2_hourly$missing_flag))
)

pipe_audit %>%
  kable(caption = "Part C hourly aggregation audit")
Part C hourly aggregation audit
Pipe First_Hour Last_Hour Hours Filled_Hours
Pipe1 2015-10-23 00:00:00 2015-11-01 23:00:00 240 4
Pipe2 2015-10-23 01:00:00 2015-12-03 16:00:00 1000 333
head(pipe1_hourly)
head(pipe2_hourly)
par(mfrow = c(2, 1))

plot(pipe1_hourly$hour, pipe1_hourly$WaterFlow, type = "l",
     main = "Pipe1 hourly mean flow", xlab = "Hour", ylab = "WaterFlow")

plot(pipe2_hourly$hour, pipe2_hourly$WaterFlow, type = "l",
     main = "Pipe2 hourly mean flow", xlab = "Hour", ylab = "WaterFlow")

par(mfrow = c(1, 1))

After hourly aggregation, both pipe series are easier to model. Pipe1 has a shorter history and more visible level movement, while Pipe2 has a longer history.

6.0.2 Build The Waterflow Forecasts

pipe1_ts <- ts(pipe1_hourly$WaterFlow, frequency = 24)
pipe2_ts <- ts(pipe2_hourly$WaterFlow, frequency = 24)

pipe_stationarity <- tibble(
  Series = c("Pipe1", "Pipe2"),
  ndiffs = c(ndiffs(pipe1_ts), ndiffs(pipe2_ts)),
  nsdiffs = c(nsdiffs(pipe1_ts), nsdiffs(pipe2_ts))
)

pipe_stationarity %>%
  kable(caption = "Part C differencing check")
Part C differencing check
Series ndiffs nsdiffs
Pipe1 0 0
Pipe2 1 0
pipe1_fit <- holt(pipe1_ts, h = 168)

pipe2_fit <- forecast(ets(pipe2_ts), h = 168)

pipe1_future_time <- seq(max(pipe1_hourly$hour) + hours(1), by = "hour", length.out = 168)
pipe2_future_time <- seq(max(pipe2_hourly$hour) + hours(1), by = "hour", length.out = 168)

pipe1_final <- tibble(
  DateTime = pipe1_future_time,
  Model = "Holt linear trend on hourly mean flow",
  Forecast_WaterFlow = as.numeric(pipe1_fit$mean)
)

pipe2_final <- tibble(
  DateTime = pipe2_future_time,
  Model = "ETS on hourly mean flow",
  Forecast_WaterFlow = as.numeric(pipe2_fit$mean)
)
pipe1_final %>%
  head(12) %>%
  kable(digits = 4, caption = "Part C Pipe1 forecast sample")
Part C Pipe1 forecast sample
DateTime Model Forecast_WaterFlow
2015-11-02 00:00:00 Holt linear trend on hourly mean flow 20.1320
2015-11-02 01:00:00 Holt linear trend on hourly mean flow 20.1324
2015-11-02 02:00:00 Holt linear trend on hourly mean flow 20.1328
2015-11-02 03:00:00 Holt linear trend on hourly mean flow 20.1333
2015-11-02 04:00:00 Holt linear trend on hourly mean flow 20.1337
2015-11-02 05:00:00 Holt linear trend on hourly mean flow 20.1341
2015-11-02 06:00:00 Holt linear trend on hourly mean flow 20.1345
2015-11-02 07:00:00 Holt linear trend on hourly mean flow 20.1350
2015-11-02 08:00:00 Holt linear trend on hourly mean flow 20.1354
2015-11-02 09:00:00 Holt linear trend on hourly mean flow 20.1358
2015-11-02 10:00:00 Holt linear trend on hourly mean flow 20.1363
2015-11-02 11:00:00 Holt linear trend on hourly mean flow 20.1367
pipe2_final %>%
  head(12) %>%
  kable(digits = 4, caption = "Part C Pipe2 forecast sample")
Part C Pipe2 forecast sample
DateTime Model Forecast_WaterFlow
2015-12-03 17:00:00 ETS on hourly mean flow 44.3658
2015-12-03 18:00:00 ETS on hourly mean flow 44.3658
2015-12-03 19:00:00 ETS on hourly mean flow 44.3658
2015-12-03 20:00:00 ETS on hourly mean flow 44.3658
2015-12-03 21:00:00 ETS on hourly mean flow 44.3658
2015-12-03 22:00:00 ETS on hourly mean flow 44.3658
2015-12-03 23:00:00 ETS on hourly mean flow 44.3658
2015-12-04 00:00:00 ETS on hourly mean flow 44.3658
2015-12-04 01:00:00 ETS on hourly mean flow 44.3658
2015-12-04 02:00:00 ETS on hourly mean flow 44.3658
2015-12-04 03:00:00 ETS on hourly mean flow 44.3658
2015-12-04 04:00:00 ETS on hourly mean flow 44.3658

I converted both waterflow files into regular hourly series by taking the mean of all readings within each hour. After filling the missing hourly gaps, I checked whether differencing was needed. Since this was only a one-week forecast, I kept the final methods simple and easy to interpret.

7 Export The Final Forecasts To Excel

The next chunk saves all final forecast tables into one Excel workbook. This is useful because the project instructions ask for an Excel readable forecast file.

model_summary <- bind_rows(
  atm_chosen %>%
    transmute(
      Part = paste("Part A", ATM),
      Chosen_Model = Model,
      Holdout_RMSE = RMSE,
      Holdout_MAE = MAE,
      Holdout_MAPE = MAPE
    ),
  tibble(
    Part = "Part B Power",
    Chosen_Model = power_chosen_model,
    Holdout_RMSE = power_chosen$RMSE[1],
    Holdout_MAE = power_chosen$MAE[1],
    Holdout_MAPE = power_chosen$MAPE[1]
  ),
  tibble(
    Part = c("Part C Pipe1", "Part C Pipe2"),
    Chosen_Model = c("Holt linear trend on hourly mean flow", "ETS on hourly mean flow"),
    Holdout_RMSE = NA_real_,
    Holdout_MAE = NA_real_,
    Holdout_MAPE = NA_real_
  )
)

data_quality_audit <- bind_rows(
  atm_audit %>%
    transmute(
      Section = paste("Part A", ATM),
      Item_1 = as.character(First_Date),
      Item_2 = as.character(Last_Date),
      Count_1_Name = "Missing values or missing dates",
      Count_1 = Missing_Values_or_Missing_Dates,
      Count_2_Name = "Extreme outliers replaced",
      Count_2 = Extreme_Outliers_Replaced
    ),
  power_audit %>%
    transmute(
      Section = "Part B Power",
      Item_1 = as.character(First_Month),
      Item_2 = as.character(Last_Month),
      Count_1_Name = "Missing months",
      Count_1 = Missing_Months,
      Count_2_Name = "Extreme outliers replaced",
      Count_2 = Extreme_Outliers_Replaced
    ),
  pipe_audit %>%
    transmute(
      Section = paste("Part C", Pipe),
      Item_1 = as.character(First_Hour),
      Item_2 = as.character(Last_Hour),
      Count_1_Name = "Filled hourly gaps",
      Count_1 = Filled_Hours,
      Count_2_Name = "Extreme outliers replaced",
      Count_2 = NA_real_
    )
)

write_xlsx(
  list(
    Model_Summary = model_summary,
    Data_Quality_Audit = data_quality_audit,
    Part_A_Model_Comparison = atm_model_metrics,
    Part_A_ATM3_Decision = atm3_model_note,
    Part_B_Model_Comparison = power_model_metrics,
    ATM_May2010_Forecast = atm_final,
    Power_2014_Forecast = power_final,
    Pipe1_1Week_Forecast = pipe1_final,
    Pipe2_1Week_Forecast = pipe2_final
  ),
  "Project1_Revised_Final_Forecasts.xlsx"
)