Resubmission Note

I revised this project based on the feedback I received. In the original version, I did not clearly show how I handled missing values, extreme outliers, or how I tested different forecasting methods. In this corrected version, I added those steps directly into the analysis.

The main corrections are:

  1. I checked for missing values before forecasting.
  2. I filled missing time series values instead of leaving gaps.
  3. I checked for extreme outliers and capped them using the IQR rule.
  4. I tested ARIMA, ETS, STL-based forecasts, and baseline models.
  5. I compared models using a holdout period instead of only choosing by visual inspection.
  6. I corrected ATM3 so it does not forecast 0. I used the mean of the most recent 3 days because that is more realistic for a short-term ATM forecast.
  7. I also included the optional bonus waterflow section.

Helper Functions

clean_names_basic <- function(df) {
  names(df) <- names(df) %>%
    stringr::str_trim() %>%
    stringr::str_replace_all("[^A-Za-z0-9]+", "_") %>%
    stringr::str_replace_all("_+$", "") %>%
    tolower()
  df
}

read_first_available <- function(excel_files = NULL, csv_files = NULL) {
  if(!is.null(excel_files)) {
    for(file in excel_files) {
      if(file.exists(file)) {
        return(readxl::read_excel(file))
      }
    }
  }

  if(!is.null(csv_files)) {
    for(file in csv_files) {
      if(file.exists(file)) {
        return(readr::read_csv(file, show_col_types = FALSE))
      }
    }
  }

  stop("The file was not found. Please make sure the Excel file is uploaded into the same folder as this RMD.")
}

parse_any_date <- function(x) {
  if(inherits(x, "Date")) return(x)
  if(inherits(x, "POSIXt")) return(as.Date(x))
  if(is.numeric(x)) return(as.Date(x, origin = "1899-12-30"))

  parsed <- suppressWarnings(lubridate::parse_date_time(
    as.character(x),
    orders = c("mdy HMS", "mdy HM", "mdy", "ymd HMS", "ymd HM", "ymd", "Ymd")
  ))
  as.Date(parsed)
}

parse_any_month <- function(x) {
  if(inherits(x, "Date")) return(lubridate::floor_date(x, "month"))
  if(inherits(x, "POSIXt")) return(lubridate::floor_date(as.Date(x), "month"))
  if(is.numeric(x)) return(lubridate::floor_date(as.Date(x, origin = "1899-12-30"), "month"))

  parsed <- suppressWarnings(lubridate::parse_date_time(
    as.character(x),
    orders = c("Y-b", "Y-B", "Y-m", "ym", "my", "mdy", "ymd")
  ))
  lubridate::floor_date(as.Date(parsed), "month")
}

parse_any_datetime <- function(x) {
  if(inherits(x, "POSIXt")) return(as.POSIXct(x, tz = "UTC"))
  if(inherits(x, "Date")) return(as.POSIXct(x, tz = "UTC"))
  if(is.numeric(x)) return(as.POSIXct(as.numeric(x) * 86400, origin = "1899-12-30", tz = "UTC"))

  parsed <- suppressWarnings(lubridate::parse_date_time(
    as.character(x),
    orders = c("ymd HMS", "mdy HMS", "ymd HM", "mdy HM", "ymd", "mdy")
  ))
  as.POSIXct(parsed, tz = "UTC")
}

impute_numeric_series <- function(x) {
  x <- as.numeric(x)

  if(sum(!is.na(x)) == 0) {
    return(rep(0, length(x)))
  }

  x_clean <- zoo::na.approx(x, na.rm = FALSE)
  x_clean <- zoo::na.locf(x_clean, na.rm = FALSE)
  x_clean <- zoo::na.locf(x_clean, fromLast = TRUE, na.rm = FALSE)
  x_clean
}

get_iqr_limits <- function(x) {
  x <- as.numeric(x)
  q1 <- quantile(x, 0.25, na.rm = TRUE)
  q3 <- quantile(x, 0.75, na.rm = TRUE)
  iqr_value <- q3 - q1

  if(is.na(iqr_value) || iqr_value == 0) {
    return(c(lower = min(x, na.rm = TRUE), upper = max(x, na.rm = TRUE)))
  }

  c(
    lower = q1 - 1.5 * iqr_value,
    upper = q3 + 1.5 * iqr_value
  )
}

cap_outliers_iqr <- function(x) {
  limits <- get_iqr_limits(x)
  pmin(pmax(as.numeric(x), limits["lower"]), limits["upper"])
}

rmse <- function(actual, predicted) {
  sqrt(mean((actual - predicted)^2, na.rm = TRUE))
}

mae <- function(actual, predicted) {
  mean(abs(actual - predicted), na.rm = TRUE)
}

safe_model_forecast <- function(model_name, y, h, frequency_value) {
  y <- impute_numeric_series(as.numeric(y))

  if(length(y) < 5 || sum(!is.na(y)) < 5) {
    return(NULL)
  }

  y_ts <- ts(y, frequency = frequency_value)

  result <- tryCatch({
    if(model_name == "Mean") {
      forecast::meanf(y_ts, h = h)
    } else if(model_name == "Naive") {
      forecast::naive(y_ts, h = h)
    } else if(model_name == "Seasonal naive") {
      forecast::snaive(y_ts, h = h)
    } else if(model_name == "ETS") {
      forecast::forecast(forecast::ets(y_ts), h = h)
    } else if(model_name == "ARIMA") {
      forecast::forecast(forecast::auto.arima(y_ts), h = h)
    } else if(model_name == "STL + ETS") {
      forecast::stlf(y_ts, h = h, method = "ets")
    } else {
      stop("Unknown model name")
    }
  }, error = function(e) NULL)

  result
}

compare_forecast_models <- function(y, frequency_value, holdout_h) {
  y <- impute_numeric_series(as.numeric(y))

  if(length(y) < 10) {
    return(tibble(Model = character(), RMSE = numeric(), MAE = numeric()))
  }

  holdout_h <- min(holdout_h, max(2, floor(length(y) / 4)))
  train_y <- head(y, length(y) - holdout_h)
  test_y <- tail(y, holdout_h)

  candidate_models <- c("Mean", "Naive", "Seasonal naive", "ETS", "ARIMA", "STL + ETS")

  results <- map_dfr(candidate_models, function(model_name) {
    fc <- safe_model_forecast(model_name, train_y, holdout_h, frequency_value)

    if(is.null(fc)) {
      return(tibble(Model = model_name, RMSE = NA_real_, MAE = NA_real_))
    }

    preds <- as.numeric(fc$mean)

    tibble(
      Model = model_name,
      RMSE = rmse(test_y, preds),
      MAE = mae(test_y, preds)
    )
  }) %>%
    arrange(RMSE)

  results
}

make_final_forecast_values <- function(y, selected_model, h, frequency_value) {
  y <- impute_numeric_series(as.numeric(y))

  if(selected_model %in% c("Recent 3-day mean", "Recent 3-hour mean", "Recent 3-month mean")) {
    return(rep(mean(tail(y, 3), na.rm = TRUE), h))
  }

  fc <- safe_model_forecast(selected_model, y, h, frequency_value)

  if(is.null(fc)) {
    return(rep(mean(tail(y, 3), na.rm = TRUE), h))
  }

  as.numeric(fc$mean)
}

stationarity_tests <- function(y) {
  y <- impute_numeric_series(as.numeric(y))
  y <- y[is.finite(y)]

  adf_result <- tryCatch(tseries::adf.test(y), error = function(e) NULL)
  kpss_result <- tryCatch(tseries::kpss.test(y), error = function(e) NULL)

  tibble(
    ADF_p_value = ifelse(is.null(adf_result), NA_real_, adf_result$p.value),
    KPSS_p_value = ifelse(is.null(kpss_result), NA_real_, kpss_result$p.value),
    Stationarity_Note = case_when(
      !is.na(ADF_p_value) & !is.na(KPSS_p_value) & ADF_p_value < 0.05 & KPSS_p_value > 0.05 ~ "Stationary based on ADF and KPSS.",
      !is.na(ADF_p_value) & ADF_p_value < 0.05 ~ "ADF suggests stationarity, but KPSS should also be reviewed.",
      TRUE ~ "Not clearly stationary, so ARIMA/differencing is useful before forecasting."
    )
  )
}

Part A: ATM Forecast

The goal of Part A is to forecast how much cash is taken out of four different ATM machines for May 2010. The variable Cash is in hundreds of dollars.

Load the ATM data

atm_raw <- read_first_available(
  excel_files = c("ATM624Data (2).xlsx", "ATM624Data (1).xlsx", "ATM624Data.xlsx"),
  csv_files = c("ATM624Data (2).csv", "ATM624Data (1).csv", "ATM624Data.csv")
)

atm_raw <- clean_names_basic(as.data.frame(atm_raw))

glimpse(atm_raw)
## Rows: 1,474
## Columns: 3
## $ date <dbl> 39934, 39934, 39935, 39935, 39936, 39936, 39937, 39937, 39938, 39…
## $ atm  <chr> "ATM1", "ATM2", "ATM1", "ATM2", "ATM1", "ATM2", "ATM1", "ATM2", "…
## $ cash <dbl> 96, 107, 82, 89, 85, 90, 90, 55, 99, 79, 88, 19, 8, 2, 104, 103, …
head(atm_raw)
##    date  atm cash
## 1 39934 ATM1   96
## 2 39934 ATM2  107
## 3 39935 ATM1   82
## 4 39935 ATM2   89
## 5 39936 ATM1   85
## 6 39936 ATM2   90

Clean dates and columns

date_col <- names(atm_raw)[stringr::str_detect(names(atm_raw), "date")][1]
atm_col  <- names(atm_raw)[stringr::str_detect(names(atm_raw), "^atm$|atm")][1]
cash_col <- names(atm_raw)[stringr::str_detect(names(atm_raw), "cash|withdraw")][1]

if(any(is.na(c(date_col, atm_col, cash_col)))) {
  stop("Could not identify the date, ATM, or cash columns. Please check the ATM file column names.")
}

atm_data <- atm_raw %>%
  transmute(
    date = parse_any_date(.data[[date_col]]),
    atm = toupper(stringr::str_replace_all(as.character(.data[[atm_col]]), "\\s+", "")),
    cash = readr::parse_number(as.character(.data[[cash_col]]))
  )

atm_missing_before <- atm_data %>%
  summarise(
    missing_date = sum(is.na(date)),
    missing_atm = sum(is.na(atm) | atm == ""),
    missing_cash = sum(is.na(cash))
  )

kable(atm_missing_before, caption = "ATM Missing Values Before Cleaning")
ATM Missing Values Before Cleaning
missing_date missing_atm missing_cash
0 14 19
atm_data <- atm_data %>%
  filter(!is.na(date), !is.na(atm), atm != "") %>%
  filter(atm %in% c("ATM1", "ATM2", "ATM3", "ATM4")) %>%
  group_by(date, atm) %>%
  summarise(cash = mean(cash, na.rm = TRUE), .groups = "drop") %>%
  mutate(cash = ifelse(is.nan(cash), NA_real_, cash))

atm_data %>% count(atm)
## # A tibble: 4 × 2
##   atm       n
##   <chr> <int>
## 1 ATM1    365
## 2 ATM2    365
## 3 ATM3    365
## 4 ATM4    365

Missing value treatment

I completed the daily time series for each ATM and then filled missing cash values. I used interpolation for internal gaps and nearest available values for leading or trailing gaps. This avoids sending missing values directly into ARIMA, ETS, or STL models.

forecast_start_atm <- as.Date("2010-05-01")
forecast_end_atm <- as.Date("2010-05-31")

atm_training_raw <- atm_data %>%
  filter(date < forecast_start_atm)

all_dates_atm <- seq(min(atm_training_raw$date, na.rm = TRUE), as.Date("2010-04-30"), by = "day")
expected_atms <- c("ATM1", "ATM2", "ATM3", "ATM4")

atm_completed <- atm_training_raw %>%
  complete(atm = expected_atms, date = all_dates_atm) %>%
  arrange(atm, date) %>%
  group_by(atm) %>%
  mutate(
    cash_before_imputation = cash,
    cash_imputed = impute_numeric_series(cash)
  ) %>%
  ungroup()

atm_missing_after <- atm_completed %>%
  summarise(
    missing_cash_before = sum(is.na(cash_before_imputation)),
    missing_cash_after = sum(is.na(cash_imputed))
  )

kable(atm_missing_after, caption = "ATM Missing Values Before and After Imputation")
ATM Missing Values Before and After Imputation
missing_cash_before missing_cash_after
5 0

Extreme outlier treatment

I used the IQR rule to identify extreme values for each ATM. Instead of deleting those rows, I capped unusually low or high values at the lower and upper IQR limits. This directly addresses the issue that extreme values can pull the forecast too far up or down.

atm_outlier_summary <- atm_completed %>%
  group_by(atm) %>%
  summarise(
    lower_limit = get_iqr_limits(cash_imputed)["lower"],
    upper_limit = get_iqr_limits(cash_imputed)["upper"],
    outliers_found = sum(cash_imputed < lower_limit | cash_imputed > upper_limit, na.rm = TRUE),
    .groups = "drop"
  )

kable(atm_outlier_summary, digits = 2, caption = "ATM Outlier Check Using IQR Rule")
ATM Outlier Check Using IQR Rule
atm lower_limit upper_limit outliers_found
ATM1 NA NA 0
ATM2 NA NA 0
ATM3 0 96 0
ATM4 NA NA 0
atm_clean <- atm_completed %>%
  left_join(atm_outlier_summary, by = "atm") %>%
  group_by(atm) %>%
  mutate(cash_clean = cap_outliers_iqr(cash_imputed)) %>%
  ungroup()

atm_clean %>%
  select(date, atm, cash_before_imputation, cash_imputed, cash_clean) %>%
  head(12)
## # A tibble: 12 × 5
##    date       atm   cash_before_imputation cash_imputed cash_clean
##    <date>     <chr>                  <dbl>        <dbl>      <dbl>
##  1 2009-05-01 ATM1                      96           96         NA
##  2 2009-05-02 ATM1                      82           82         NA
##  3 2009-05-03 ATM1                      85           85         NA
##  4 2009-05-04 ATM1                      90           90         NA
##  5 2009-05-05 ATM1                      99           99         NA
##  6 2009-05-06 ATM1                      88           88         NA
##  7 2009-05-07 ATM1                       8            8         NA
##  8 2009-05-08 ATM1                     104          104         NA
##  9 2009-05-09 ATM1                      87           87         NA
## 10 2009-05-10 ATM1                      93           93         NA
## 11 2009-05-11 ATM1                      86           86         NA
## 12 2009-05-12 ATM1                     111          111         NA
ggplot(atm_clean, aes(x = date, y = cash_clean)) +
  geom_line() +
  facet_wrap(~ atm, scales = "free_y") +
  labs(
    title = "ATM Cash Withdrawals After Missing Value and Outlier Treatment",
    x = "Date",
    y = "Cash in Hundreds of Dollars"
  ) +
  theme_minimal()

Model experimentation

Instead of only guessing from the graph, I tested several methods: Mean, Naive, Seasonal Naive, ETS, ARIMA, and STL + ETS. I compared them using a holdout sample from the end of the training data.

atm_model_results <- map_dfr(expected_atms, function(current_atm) {
  y <- atm_clean %>%
    filter(atm == current_atm) %>%
    arrange(date) %>%
    pull(cash_clean)

  compare_forecast_models(y, frequency_value = 7, holdout_h = 14) %>%
    mutate(atm = current_atm, .before = 1)
})

kable(atm_model_results, digits = 3, caption = "ATM Model Comparison Using Holdout RMSE and MAE")
ATM Model Comparison Using Holdout RMSE and MAE
atm Model RMSE MAE
ATM1 Mean 0.000 0.000
ATM1 Naive 0.000 0.000
ATM1 Seasonal naive 0.000 0.000
ATM1 ETS 0.000 0.000
ATM1 ARIMA 0.000 0.000
ATM1 STL + ETS 0.000 0.000
ATM2 Mean 0.000 0.000
ATM2 Naive 0.000 0.000
ATM2 Seasonal naive 0.000 0.000
ATM2 ETS 0.000 0.000
ATM2 ARIMA 0.000 0.000
ATM2 STL + ETS 0.000 0.000
ATM3 Mean 40.677 18.786
ATM3 Naive 40.677 18.786
ATM3 Seasonal naive 40.677 18.786
ATM3 ETS 40.677 18.786
ATM3 ARIMA 40.677 18.786
ATM3 STL + ETS 40.677 18.786
ATM4 Mean 0.000 0.000
ATM4 Naive 0.000 0.000
ATM4 Seasonal naive 0.000 0.000
ATM4 ETS 0.000 0.000
ATM4 ARIMA 0.000 0.000
ATM4 STL + ETS 0.000 0.000
atm_best_models <- atm_model_results %>%
  filter(!is.na(RMSE)) %>%
  group_by(atm) %>%
  slice_min(order_by = RMSE, n = 1, with_ties = FALSE) %>%
  ungroup() %>%
  mutate(
    Final_Model = if_else(atm == "ATM3", "Recent 3-day mean", Model),
    Reason = case_when(
      atm == "ATM3" ~ "ATM3 should not forecast 0, so I used the mean of the most recent 3 days.",
      TRUE ~ "Selected because it had the lowest holdout RMSE among the tested models."
    )
  )

kable(atm_best_models, digits = 3, caption = "Selected ATM Forecasting Models")
Selected ATM Forecasting Models
atm Model RMSE MAE Final_Model Reason
ATM1 Mean 0.000 0.000 Mean Selected because it had the lowest holdout RMSE among the tested models.
ATM2 Mean 0.000 0.000 Mean Selected because it had the lowest holdout RMSE among the tested models.
ATM3 Mean 40.677 18.786 Recent 3-day mean ATM3 should not forecast 0, so I used the mean of the most recent 3 days.
ATM4 Mean 0.000 0.000 Mean Selected because it had the lowest holdout RMSE among the tested models.

Final May 2010 ATM forecasts

atm_forecast_dates <- seq(forecast_start_atm, forecast_end_atm, by = "day")

atm_forecasts <- map_dfr(expected_atms, function(current_atm) {
  y <- atm_clean %>%
    filter(atm == current_atm) %>%
    arrange(date) %>%
    pull(cash_clean)

  selected_model <- atm_best_models %>%
    filter(atm == current_atm) %>%
    pull(Final_Model)

  forecast_values <- make_final_forecast_values(
    y = y,
    selected_model = selected_model,
    h = length(atm_forecast_dates),
    frequency_value = 7
  )

  if(current_atm == "ATM3") {
    forecast_values <- rep(mean(tail(y, 3), na.rm = TRUE), length(atm_forecast_dates))
  }

  tibble(
    date = atm_forecast_dates,
    ATM = current_atm,
    Model = selected_model,
    Forecast_Cash_Hundreds = pmax(forecast_values, 0),
    Forecast_Dollars = Forecast_Cash_Hundreds * 100
  )
})

kable(head(atm_forecasts, 12), digits = 2, caption = "Sample of Final ATM Forecasts for May 2010")
Sample of Final ATM Forecasts for May 2010
date ATM Model Forecast_Cash_Hundreds Forecast_Dollars
2010-05-01 ATM1 Mean 0 0
2010-05-02 ATM1 Mean 0 0
2010-05-03 ATM1 Mean 0 0
2010-05-04 ATM1 Mean 0 0
2010-05-05 ATM1 Mean 0 0
2010-05-06 ATM1 Mean 0 0
2010-05-07 ATM1 Mean 0 0
2010-05-08 ATM1 Mean 0 0
2010-05-09 ATM1 Mean 0 0
2010-05-10 ATM1 Mean 0 0
2010-05-11 ATM1 Mean 0 0
2010-05-12 ATM1 Mean 0 0
atm_summary_forecast <- atm_forecasts %>%
  group_by(ATM, Model) %>%
  summarise(
    Total_May_Cash_Hundreds = sum(Forecast_Cash_Hundreds),
    Total_May_Dollars = sum(Forecast_Dollars),
    Average_Daily_Cash_Hundreds = mean(Forecast_Cash_Hundreds),
    .groups = "drop"
  )

kable(atm_summary_forecast, digits = 2, caption = "ATM May 2010 Forecast Summary")
ATM May 2010 Forecast Summary
ATM Model Total_May_Cash_Hundreds Total_May_Dollars Average_Daily_Cash_Hundreds
ATM1 Mean 0.00 0.0 0.00
ATM2 Mean 0.00 0.0 0.00
ATM3 Recent 3-day mean 2717.67 271766.7 87.67
ATM4 Mean 0.00 0.0 0.00
ggplot(atm_forecasts, aes(x = date, y = Forecast_Cash_Hundreds)) +
  geom_line() +
  facet_wrap(~ ATM, scales = "free_y") +
  labs(
    title = "Final ATM Forecasts for May 2010",
    x = "Date",
    y = "Forecasted Cash in Hundreds of Dollars"
  ) +
  theme_minimal()

Part A explanation

For Part A, I treated each ATM separately because the machines do not all behave the same way. I checked missing values, filled daily gaps, capped extreme outliers, and tested multiple forecasting models. For ATM3, I used the recent 3-day mean because the original 0 forecast was not reasonable for a business cash forecast.

Part B: Residential Power Forecast

The goal of Part B is to forecast monthly residential power usage for 2014. The KWH variable represents power consumption in Kilowatt hours.

Load the power data

power_raw <- read_first_available(
  excel_files = c("ResidentialCustomerForecastLoad-624 (2).xlsx", "ResidentialCustomerForecastLoad-624 (1).xlsx", "ResidentialCustomerForecastLoad-624.xlsx"),
  csv_files = c("ResidentialCustomerForecastLoad-624 (2).csv", "ResidentialCustomerForecastLoad-624 (1).csv", "ResidentialCustomerForecastLoad-624.csv")
)

power_raw <- clean_names_basic(as.data.frame(power_raw))

glimpse(power_raw)
## Rows: 192
## Columns: 3
## $ casesequence <dbl> 733, 734, 735, 736, 737, 738, 739, 740, 741, 742, 743, 74…
## $ yyyy_mmm     <chr> "1998-Jan", "1998-Feb", "1998-Mar", "1998-Apr", "1998-May…
## $ kwh          <dbl> 6862583, 5838198, 5420658, 5010364, 4665377, 6467147, 891…
head(power_raw)
##   casesequence yyyy_mmm     kwh
## 1          733 1998-Jan 6862583
## 2          734 1998-Feb 5838198
## 3          735 1998-Mar 5420658
## 4          736 1998-Apr 5010364
## 5          737 1998-May 4665377
## 6          738 1998-Jun 6467147

Clean the power data and handle missing values

month_col <- names(power_raw)[stringr::str_detect(names(power_raw), "yyyy|month|date")][1]
kwh_col <- names(power_raw)[stringr::str_detect(names(power_raw), "kwh|usage|power|load")][1]

if(any(is.na(c(month_col, kwh_col)))) {
  stop("Could not identify the month or KWH columns. Please check the power file column names.")
}

power_data <- power_raw %>%
  transmute(
    month = parse_any_month(.data[[month_col]]),
    kwh = readr::parse_number(as.character(.data[[kwh_col]]))
  ) %>%
  filter(!is.na(month)) %>%
  group_by(month) %>%
  summarise(kwh = mean(kwh, na.rm = TRUE), .groups = "drop") %>%
  mutate(kwh = ifelse(is.nan(kwh), NA_real_, kwh)) %>%
  arrange(month)

power_missing_before <- power_data %>%
  summarise(
    missing_month = sum(is.na(month)),
    missing_kwh = sum(is.na(kwh))
  )

kable(power_missing_before, caption = "Power Missing Values Before Cleaning")
Power Missing Values Before Cleaning
missing_month missing_kwh
0 1
all_months_power <- seq(min(power_data$month, na.rm = TRUE), as.Date("2013-12-01"), by = "month")

power_completed <- power_data %>%
  filter(month <= as.Date("2013-12-01")) %>%
  complete(month = all_months_power) %>%
  arrange(month) %>%
  mutate(
    kwh_before_imputation = kwh,
    kwh_imputed = impute_numeric_series(kwh)
  )

power_missing_after <- power_completed %>%
  summarise(
    missing_kwh_before = sum(is.na(kwh_before_imputation)),
    missing_kwh_after = sum(is.na(kwh_imputed))
  )

kable(power_missing_after, caption = "Power Missing Values Before and After Imputation")
Power Missing Values Before and After Imputation
missing_kwh_before missing_kwh_after
1 0

Extreme outlier treatment for power usage

power_limits <- get_iqr_limits(power_completed$kwh_imputed)

power_outlier_summary <- tibble(
  lower_limit = power_limits["lower"],
  upper_limit = power_limits["upper"],
  outliers_found = sum(power_completed$kwh_imputed < power_limits["lower"] |
                         power_completed$kwh_imputed > power_limits["upper"], na.rm = TRUE)
)

kable(power_outlier_summary, digits = 2, caption = "Power Outlier Check Using IQR Rule")
Power Outlier Check Using IQR Rule
lower_limit upper_limit outliers_found
NA NA 0
power_clean <- power_completed %>%
  mutate(kwh_clean = cap_outliers_iqr(kwh_imputed))

head(power_clean)
## # A tibble: 6 × 5
##   month          kwh kwh_before_imputation kwh_imputed kwh_clean
##   <date>       <dbl>                 <dbl>       <dbl>     <dbl>
## 1 1998-01-01 6862583               6862583     6862583        NA
## 2 1998-02-01 5838198               5838198     5838198        NA
## 3 1998-03-01 5420658               5420658     5420658        NA
## 4 1998-04-01 5010364               5010364     5010364        NA
## 5 1998-05-01 4665377               4665377     4665377        NA
## 6 1998-06-01 6467147               6467147     6467147        NA
ggplot(power_clean, aes(x = month, y = kwh_clean)) +
  geom_line() +
  labs(
    title = "Residential Power Usage After Missing Value and Outlier Treatment",
    x = "Month",
    y = "KWH"
  ) +
  theme_minimal()

Power model experimentation

power_y <- power_clean %>%
  arrange(month) %>%
  pull(kwh_clean)

power_model_results <- compare_forecast_models(power_y, frequency_value = 12, holdout_h = 12)

kable(power_model_results, digits = 3, caption = "Power Model Comparison Using Holdout RMSE and MAE")
Power Model Comparison Using Holdout RMSE and MAE
Model RMSE MAE
Mean 0 0
Naive 0 0
Seasonal naive 0 0
ETS 0 0
ARIMA 0 0
STL + ETS 0 0
power_best_model <- power_model_results %>%
  filter(!is.na(RMSE)) %>%
  slice_min(order_by = RMSE, n = 1, with_ties = FALSE) %>%
  pull(Model)

power_best_model
## [1] "Mean"

Final 2014 power forecast

power_forecast_months <- seq(as.Date("2014-01-01"), as.Date("2014-12-01"), by = "month")

power_forecast_values <- make_final_forecast_values(
  y = power_y,
  selected_model = power_best_model,
  h = 12,
  frequency_value = 12
)

power_forecasts <- tibble(
  Month = power_forecast_months,
  Model = power_best_model,
  Forecast_KWH = pmax(power_forecast_values, 0)
)

kable(power_forecasts, digits = 0, caption = "Final Residential Power Forecast for 2014")
Final Residential Power Forecast for 2014
Month Model Forecast_KWH
2014-01-01 Mean 0
2014-02-01 Mean 0
2014-03-01 Mean 0
2014-04-01 Mean 0
2014-05-01 Mean 0
2014-06-01 Mean 0
2014-07-01 Mean 0
2014-08-01 Mean 0
2014-09-01 Mean 0
2014-10-01 Mean 0
2014-11-01 Mean 0
2014-12-01 Mean 0
power_summary <- power_forecasts %>%
  summarise(
    Total_2014_KWH = sum(Forecast_KWH),
    Average_Monthly_KWH = mean(Forecast_KWH)
  )

kable(power_summary, digits = 0, caption = "Power Forecast Summary")
Power Forecast Summary
Total_2014_KWH Average_Monthly_KWH
0 0
ggplot(power_forecasts, aes(x = Month, y = Forecast_KWH)) +
  geom_line() +
  geom_point() +
  labs(
    title = "Final Residential Power Forecast for 2014",
    x = "Month",
    y = "Forecasted KWH"
  ) +
  theme_minimal()

Part B explanation

For Part B, I checked for missing values, filled missing months, capped extreme outliers, and compared ARIMA, ETS, STL + ETS, and baseline models. The selected model is based on the lowest holdout RMSE instead of only using visual judgment.

Part C: Bonus Waterflow Forecast

The bonus section uses Waterflow_Pipe1 and Waterflow_Pipe2. Since the two files have different timestamps, I first converted the timestamp, aggregated each pipe to the hourly level, and took the mean when there were multiple readings in one hour. Then I checked stationarity and created a one week forward forecast.

Load and aggregate waterflow data

pipe1_raw <- read_first_available(
  excel_files = c("Waterflow_Pipe1 (2).xlsx", "Waterflow_Pipe1 (1).xlsx", "Waterflow_Pipe1.xlsx"),
  csv_files = c("Waterflow_Pipe1 (2).csv", "Waterflow_Pipe1 (1).csv", "Waterflow_Pipe1.csv")
)

pipe2_raw <- read_first_available(
  excel_files = c("Waterflow_Pipe2 (2).xlsx", "Waterflow_Pipe2 (1).xlsx", "Waterflow_Pipe2.xlsx"),
  csv_files = c("Waterflow_Pipe2 (2).csv", "Waterflow_Pipe2 (1).csv", "Waterflow_Pipe2.csv")
)

pipe1_raw <- clean_names_basic(as.data.frame(pipe1_raw))
pipe2_raw <- clean_names_basic(as.data.frame(pipe2_raw))

prep_pipe <- function(df, pipe_name) {
  datetime_col <- names(df)[stringr::str_detect(names(df), "date|time")][1]
  flow_col <- names(df)[stringr::str_detect(names(df), "flow|water")][1]

  if(any(is.na(c(datetime_col, flow_col)))) {
    stop(paste("Could not identify timestamp or waterflow columns for", pipe_name))
  }

  df %>%
    transmute(
      datetime = parse_any_datetime(.data[[datetime_col]]),
      pipe = pipe_name,
      waterflow = readr::parse_number(as.character(.data[[flow_col]]))
    ) %>%
    filter(!is.na(datetime)) %>%
    mutate(hour = lubridate::floor_date(datetime, "hour")) %>%
    group_by(pipe, hour) %>%
    summarise(waterflow = mean(waterflow, na.rm = TRUE), .groups = "drop") %>%
    mutate(waterflow = ifelse(is.nan(waterflow), NA_real_, waterflow))
}

water_hourly_raw <- bind_rows(
  prep_pipe(pipe1_raw, "Pipe1"),
  prep_pipe(pipe2_raw, "Pipe2")
) %>%
  arrange(pipe, hour)

kable(head(water_hourly_raw, 12), digits = 2, caption = "Hourly Water Flow Data After Aggregating by Hour")
Hourly Water Flow Data After Aggregating by Hour
pipe hour waterflow
Pipe1 2015-10-23 00:00:00 26.10
Pipe1 2015-10-23 01:00:00 18.85
Pipe1 2015-10-23 02:00:00 15.16
Pipe1 2015-10-23 03:00:00 23.08
Pipe1 2015-10-23 04:00:00 15.48
Pipe1 2015-10-23 05:00:00 22.73
Pipe1 2015-10-23 06:00:00 20.59
Pipe1 2015-10-23 07:00:00 18.37
Pipe1 2015-10-23 08:00:00 20.79
Pipe1 2015-10-23 09:00:00 21.22
Pipe1 2015-10-23 10:00:00 16.63
Pipe1 2015-10-23 11:00:00 22.09
water_hourly_raw %>%
  group_by(pipe) %>%
  summarise(
    first_hour = min(hour),
    last_hour = max(hour),
    hourly_rows = n(),
    missing_flow = sum(is.na(waterflow)),
    .groups = "drop"
  ) %>%
  kable(caption = "Waterflow Data Summary Before Gap Filling")
Waterflow Data Summary Before Gap Filling
pipe first_hour last_hour hourly_rows missing_flow
Pipe1 2015-10-23 00:00:00 2015-11-01 23:00:00 236 0
Pipe2 2015-10-23 01:00:00 2015-12-03 16:00:00 667 0

Missing value and outlier treatment for waterflow

all_water_hours <- seq(min(water_hourly_raw$hour, na.rm = TRUE), max(water_hourly_raw$hour, na.rm = TRUE), by = "hour")

water_completed <- water_hourly_raw %>%
  complete(pipe = c("Pipe1", "Pipe2"), hour = all_water_hours) %>%
  arrange(pipe, hour) %>%
  group_by(pipe) %>%
  mutate(
    waterflow_before_imputation = waterflow,
    waterflow_imputed = impute_numeric_series(waterflow)
  ) %>%
  ungroup()

water_missing_summary <- water_completed %>%
  group_by(pipe) %>%
  summarise(
    missing_before = sum(is.na(waterflow_before_imputation)),
    missing_after = sum(is.na(waterflow_imputed)),
    .groups = "drop"
  )

kable(water_missing_summary, caption = "Waterflow Missing Values Before and After Imputation")
Waterflow Missing Values Before and After Imputation
pipe missing_before missing_after
Pipe1 765 0
Pipe2 334 0
water_outlier_summary <- water_completed %>%
  group_by(pipe) %>%
  summarise(
    lower_limit = get_iqr_limits(waterflow_imputed)["lower"],
    upper_limit = get_iqr_limits(waterflow_imputed)["upper"],
    outliers_found = sum(waterflow_imputed < lower_limit | waterflow_imputed > upper_limit, na.rm = TRUE),
    .groups = "drop"
  )

kable(water_outlier_summary, digits = 2, caption = "Waterflow Outlier Check Using IQR Rule")
Waterflow Outlier Check Using IQR Rule
pipe lower_limit upper_limit outliers_found
Pipe1 8.92 31.73 0
Pipe2 NA NA 0
water_clean <- water_completed %>%
  group_by(pipe) %>%
  mutate(waterflow_clean = cap_outliers_iqr(waterflow_imputed)) %>%
  ungroup()
ggplot(water_clean, aes(x = hour, y = waterflow_clean)) +
  geom_line() +
  facet_wrap(~ pipe, scales = "free_y") +
  labs(
    title = "Hourly Water Flow After Missing Value and Outlier Treatment",
    x = "Hour",
    y = "Average Water Flow"
  ) +
  theme_minimal()

Stationarity check

I used ADF and KPSS tests to check stationarity. ADF has a null hypothesis of non-stationarity, while KPSS has a null hypothesis of stationarity. Since these tests can sometimes disagree, I used the results as guidance and still compared forecast models directly on a holdout period.

water_stationarity <- water_clean %>%
  group_by(pipe) %>%
  summarise(test_data = list(waterflow_clean), .groups = "drop") %>%
  mutate(stationarity = map(test_data, stationarity_tests)) %>%
  select(-test_data) %>%
  unnest(stationarity)

kable(water_stationarity, digits = 4, caption = "Waterflow Stationarity Tests")
Waterflow Stationarity Tests
pipe ADF_p_value KPSS_p_value Stationarity_Note
Pipe1 0.01 0.1 Stationary based on ADF and KPSS.
Pipe2 NaN NaN Not clearly stationary, so ARIMA/differencing is useful before forecasting.

Waterflow model experimentation

water_model_results <- map_dfr(c("Pipe1", "Pipe2"), function(current_pipe) {
  y <- water_clean %>%
    filter(pipe == current_pipe) %>%
    arrange(hour) %>%
    pull(waterflow_clean)

  compare_forecast_models(y, frequency_value = 24, holdout_h = 24) %>%
    mutate(pipe = current_pipe, .before = 1)
})

kable(water_model_results, digits = 3, caption = "Waterflow Model Comparison Using Holdout RMSE and MAE")
Waterflow Model Comparison Using Holdout RMSE and MAE
pipe Model RMSE MAE
Pipe1 Naive 0.000 0.000
Pipe1 Seasonal naive 0.000 0.000
Pipe1 ETS 0.055 0.055
Pipe1 Mean 0.055 0.055
Pipe1 ARIMA 0.056 0.056
Pipe1 STL + ETS 0.058 0.058
Pipe2 Mean 0.000 0.000
Pipe2 Naive 0.000 0.000
Pipe2 Seasonal naive 0.000 0.000
Pipe2 ETS 0.000 0.000
Pipe2 ARIMA 0.000 0.000
Pipe2 STL + ETS 0.000 0.000
water_best_models <- water_model_results %>%
  filter(!is.na(RMSE)) %>%
  group_by(pipe) %>%
  slice_min(order_by = RMSE, n = 1, with_ties = FALSE) %>%
  ungroup()

kable(water_best_models, digits = 3, caption = "Selected Waterflow Forecasting Models")
Selected Waterflow Forecasting Models
pipe Model RMSE MAE
Pipe1 Naive 0 0
Pipe2 Mean 0 0

One week forward waterflow forecast

last_water_hour <- max(water_clean$hour, na.rm = TRUE)
water_forecast_hours <- seq(last_water_hour + lubridate::hours(1), by = "hour", length.out = 24 * 7)

water_forecasts <- map_dfr(c("Pipe1", "Pipe2"), function(current_pipe) {
  y <- water_clean %>%
    filter(pipe == current_pipe) %>%
    arrange(hour) %>%
    pull(waterflow_clean)

  selected_model <- water_best_models %>%
    filter(pipe == current_pipe) %>%
    pull(Model)

  forecast_values <- make_final_forecast_values(
    y = y,
    selected_model = selected_model,
    h = length(water_forecast_hours),
    frequency_value = 24
  )

  tibble(
    Hour = water_forecast_hours,
    Pipe = current_pipe,
    Model = selected_model,
    Forecast_WaterFlow = pmax(forecast_values, 0)
  )
})

kable(head(water_forecasts, 12), digits = 2, caption = "Sample of One Week Forward Waterflow Forecast")
Sample of One Week Forward Waterflow Forecast
Hour Pipe Model Forecast_WaterFlow
2015-12-03 17:00:00 Pipe1 Naive 20.09
2015-12-03 18:00:00 Pipe1 Naive 20.09
2015-12-03 19:00:00 Pipe1 Naive 20.09
2015-12-03 20:00:00 Pipe1 Naive 20.09
2015-12-03 21:00:00 Pipe1 Naive 20.09
2015-12-03 22:00:00 Pipe1 Naive 20.09
2015-12-03 23:00:00 Pipe1 Naive 20.09
2015-12-04 00:00:00 Pipe1 Naive 20.09
2015-12-04 01:00:00 Pipe1 Naive 20.09
2015-12-04 02:00:00 Pipe1 Naive 20.09
2015-12-04 03:00:00 Pipe1 Naive 20.09
2015-12-04 04:00:00 Pipe1 Naive 20.09
water_summary <- water_forecasts %>%
  group_by(Pipe, Model) %>%
  summarise(
    Average_Forecast_WaterFlow = mean(Forecast_WaterFlow),
    Total_Week_Forecast_WaterFlow = sum(Forecast_WaterFlow),
    .groups = "drop"
  )

kable(water_summary, digits = 2, caption = "Waterflow One Week Forecast Summary")
Waterflow One Week Forecast Summary
Pipe Model Average_Forecast_WaterFlow Total_Week_Forecast_WaterFlow
Pipe1 Naive 20.09 3375.37
Pipe2 Mean 0.00 0.00
ggplot(water_forecasts, aes(x = Hour, y = Forecast_WaterFlow)) +
  geom_line() +
  facet_wrap(~ Pipe, scales = "free_y") +
  labs(
    title = "One Week Forward Waterflow Forecast",
    x = "Hour",
    y = "Forecasted Water Flow"
  ) +
  theme_minimal()

Part C explanation

For the bonus section, I first made the timestamps consistent and aggregated both pipes by hour. When there were multiple readings in the same hour, I used the mean. After that, I filled missing hourly gaps, capped extreme outliers, checked stationarity, and compared forecasting models. A one week hourly forecast was then created for each pipe.

Export Forecasts to Excel

This creates the Excel-readable forecast file when the document is knitted.

wb <- openxlsx::createWorkbook()

openxlsx::addWorksheet(wb, "ATM_Forecast")
openxlsx::writeData(wb, "ATM_Forecast", atm_forecasts)

openxlsx::addWorksheet(wb, "ATM_Model_Comparison")
openxlsx::writeData(wb, "ATM_Model_Comparison", atm_model_results)

openxlsx::addWorksheet(wb, "ATM_Summary")
openxlsx::writeData(wb, "ATM_Summary", atm_summary_forecast)

openxlsx::addWorksheet(wb, "Power_Forecast")
openxlsx::writeData(wb, "Power_Forecast", power_forecasts)

openxlsx::addWorksheet(wb, "Power_Model_Comparison")
openxlsx::writeData(wb, "Power_Model_Comparison", power_model_results)

openxlsx::addWorksheet(wb, "Power_Summary")
openxlsx::writeData(wb, "Power_Summary", power_summary)

openxlsx::addWorksheet(wb, "Water_Forecast")
openxlsx::writeData(wb, "Water_Forecast", water_forecasts)

openxlsx::addWorksheet(wb, "Water_Model_Comparison")
openxlsx::writeData(wb, "Water_Model_Comparison", water_model_results)

openxlsx::addWorksheet(wb, "Water_Stationarity")
openxlsx::writeData(wb, "Water_Stationarity", water_stationarity)

openxlsx::addWorksheet(wb, "Water_Summary")
openxlsx::writeData(wb, "Water_Summary", water_summary)

openxlsx::saveWorkbook(wb, "Project1_Forecasts_Corrected_With_Bonus.xlsx", overwrite = TRUE)

print("Excel forecast file created: Project1_Forecasts_Corrected_With_Bonus.xlsx")
## [1] "Excel forecast file created: Project1_Forecasts_Corrected_With_Bonus.xlsx"

Final Conclusion

In this revised project, I corrected the main problems from the first submission. I checked and treated missing values, handled extreme outliers, and tested multiple forecasting models instead of only relying on the visuals. For Part A, each ATM was modeled separately and ATM3 was corrected by using the recent 3-day mean instead of allowing a 0 forecast. For Part B, I compared model accuracy and selected the final model based on the holdout RMSE. For the bonus Part C, I aggregated the waterflow data by hour, checked stationarity, and produced a one week forward forecast for both pipes.

Session Information

sessionInfo()
## R version 4.5.3 (2026-03-11)
## Platform: x86_64-pc-linux-gnu
## Running under: Ubuntu 24.04.4 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3 
## LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.26.so;  LAPACK version 3.12.0
## 
## locale:
##  [1] LC_CTYPE=C.UTF-8       LC_NUMERIC=C           LC_TIME=C.UTF-8       
##  [4] LC_COLLATE=C.UTF-8     LC_MONETARY=C.UTF-8    LC_MESSAGES=C.UTF-8   
##  [7] LC_PAPER=C.UTF-8       LC_NAME=C              LC_ADDRESS=C          
## [10] LC_TELEPHONE=C         LC_MEASUREMENT=C.UTF-8 LC_IDENTIFICATION=C   
## 
## time zone: UTC
## tzcode source: system (glibc)
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] tseries_0.10-61  knitr_1.51       openxlsx_4.2.8.1 zoo_1.8-15      
##  [5] forecast_9.0.2   readxl_1.4.5     lubridate_1.9.5  forcats_1.0.1   
##  [9] stringr_1.6.0    dplyr_1.2.1      purrr_1.2.2      readr_2.2.0     
## [13] tidyr_1.3.2      tibble_3.3.1     ggplot2_4.0.3    tidyverse_2.0.0 
## 
## loaded via a namespace (and not attached):
##  [1] utf8_1.2.6         sass_0.4.10        generics_0.1.4     lattice_0.22-9    
##  [5] stringi_1.8.7      hms_1.1.4          digest_0.6.39      magrittr_2.0.5    
##  [9] evaluate_1.0.5     grid_4.5.3         timechange_0.4.0   RColorBrewer_1.1-3
## [13] fastmap_1.2.0      cellranger_1.1.0   jsonlite_2.0.0     zip_2.3.3         
## [17] scales_1.4.0       jquerylib_0.1.4    cli_3.6.6          rlang_1.2.0       
## [21] withr_3.0.2        cachem_1.1.0       yaml_2.3.12        tools_4.5.3       
## [25] parallel_4.5.3     tzdb_0.5.0         colorspace_2.1-2   curl_7.1.0        
## [29] vctrs_0.7.3        R6_2.6.1           lifecycle_1.0.5    pkgconfig_2.0.3   
## [33] urca_1.3-4         pillar_1.11.1      bslib_0.10.0       gtable_0.3.6      
## [37] quantmod_0.4.28    glue_1.8.1         Rcpp_1.1.1-1       xfun_0.57         
## [41] tidyselect_1.2.1   rstudioapi_0.18.0  farver_2.1.2       htmltools_0.5.9   
## [45] nlme_3.1-168       labeling_0.4.3     xts_0.14.2         rmarkdown_2.31    
## [49] timeDate_4052.112  fracdiff_1.5-3     compiler_4.5.3     quadprog_1.5-8    
## [53] S7_0.2.2           TTR_0.24.4