Resubmission Note

I revised this project based on the feedback I received. In the original version, I did not clearly show how I handled missing values, extreme outliers, or how I tested different forecasting methods. In this corrected version, I added those steps directly into the analysis.

The main corrections are:

I checked for missing values before forecasting.
I filled missing time series values instead of leaving gaps.
I checked for extreme outliers and capped them using the IQR rule.
I tested ARIMA, ETS, STL-based forecasts, and baseline models.
I compared models using a holdout period instead of only choosing by visual inspection.
I corrected ATM3 so it does not forecast 0. I used the mean of the most recent 3 days because that is more realistic for a short-term ATM forecast.
I also included the optional bonus waterflow section.

Helper Functions

clean_names_basic <- function(df) {
  names(df) <- names(df) %>%
    stringr::str_trim() %>%
    stringr::str_replace_all("[^A-Za-z0-9]+", "_") %>%
    stringr::str_replace_all("_+$", "") %>%
    tolower()
  df
}

read_first_available <- function(excel_files = NULL, csv_files = NULL) {
  if(!is.null(excel_files)) {
    for(file in excel_files) {
      if(file.exists(file)) {
        return(readxl::read_excel(file))
      }
    }
  }

  if(!is.null(csv_files)) {
    for(file in csv_files) {
      if(file.exists(file)) {
        return(readr::read_csv(file, show_col_types = FALSE))
      }
    }
  }

  stop("The file was not found. Please make sure the Excel file is uploaded into the same folder as this RMD.")
}

parse_any_date <- function(x) {
  if(inherits(x, "Date")) return(x)
  if(inherits(x, "POSIXt")) return(as.Date(x))
  if(is.numeric(x)) return(as.Date(x, origin = "1899-12-30"))

  parsed <- suppressWarnings(lubridate::parse_date_time(
    as.character(x),
    orders = c("mdy HMS", "mdy HM", "mdy", "ymd HMS", "ymd HM", "ymd", "Ymd")
  ))
  as.Date(parsed)
}

parse_any_month <- function(x) {
  if(inherits(x, "Date")) return(lubridate::floor_date(x, "month"))
  if(inherits(x, "POSIXt")) return(lubridate::floor_date(as.Date(x), "month"))
  if(is.numeric(x)) return(lubridate::floor_date(as.Date(x, origin = "1899-12-30"), "month"))

  parsed <- suppressWarnings(lubridate::parse_date_time(
    as.character(x),
    orders = c("Y-b", "Y-B", "Y-m", "ym", "my", "mdy", "ymd")
  ))
  lubridate::floor_date(as.Date(parsed), "month")
}

parse_any_datetime <- function(x) {
  if(inherits(x, "POSIXt")) return(as.POSIXct(x, tz = "UTC"))
  if(inherits(x, "Date")) return(as.POSIXct(x, tz = "UTC"))
  if(is.numeric(x)) return(as.POSIXct(as.numeric(x) * 86400, origin = "1899-12-30", tz = "UTC"))

  parsed <- suppressWarnings(lubridate::parse_date_time(
    as.character(x),
    orders = c("ymd HMS", "mdy HMS", "ymd HM", "mdy HM", "ymd", "mdy")
  ))
  as.POSIXct(parsed, tz = "UTC")
}

impute_numeric_series <- function(x) {
  x <- as.numeric(x)

  if(sum(!is.na(x)) == 0) {
    return(rep(0, length(x)))
  }

  x_clean <- zoo::na.approx(x, na.rm = FALSE)
  x_clean <- zoo::na.locf(x_clean, na.rm = FALSE)
  x_clean <- zoo::na.locf(x_clean, fromLast = TRUE, na.rm = FALSE)
  x_clean
}

get_iqr_limits <- function(x) {
  x <- as.numeric(x)
  q1 <- quantile(x, 0.25, na.rm = TRUE)
  q3 <- quantile(x, 0.75, na.rm = TRUE)
  iqr_value <- q3 - q1

  if(is.na(iqr_value) || iqr_value == 0) {
    return(c(lower = min(x, na.rm = TRUE), upper = max(x, na.rm = TRUE)))
  }

  c(
    lower = q1 - 1.5 * iqr_value,
    upper = q3 + 1.5 * iqr_value
  )
}

cap_outliers_iqr <- function(x) {
  limits <- get_iqr_limits(x)
  pmin(pmax(as.numeric(x), limits["lower"]), limits["upper"])
}

rmse <- function(actual, predicted) {
  sqrt(mean((actual - predicted)^2, na.rm = TRUE))
}

mae <- function(actual, predicted) {
  mean(abs(actual - predicted), na.rm = TRUE)
}

safe_model_forecast <- function(model_name, y, h, frequency_value) {
  y <- impute_numeric_series(as.numeric(y))

  if(length(y) < 5 || sum(!is.na(y)) < 5) {
    return(NULL)
  }

  y_ts <- ts(y, frequency = frequency_value)

  result <- tryCatch({
    if(model_name == "Mean") {
      forecast::meanf(y_ts, h = h)
    } else if(model_name == "Naive") {
      forecast::naive(y_ts, h = h)
    } else if(model_name == "Seasonal naive") {
      forecast::snaive(y_ts, h = h)
    } else if(model_name == "ETS") {
      forecast::forecast(forecast::ets(y_ts), h = h)
    } else if(model_name == "ARIMA") {
      forecast::forecast(forecast::auto.arima(y_ts), h = h)
    } else if(model_name == "STL + ETS") {
      forecast::stlf(y_ts, h = h, method = "ets")
    } else {
      stop("Unknown model name")
    }
  }, error = function(e) NULL)

  result
}

compare_forecast_models <- function(y, frequency_value, holdout_h) {
  y <- impute_numeric_series(as.numeric(y))

  if(length(y) < 10) {
    return(tibble(Model = character(), RMSE = numeric(), MAE = numeric()))
  }

  holdout_h <- min(holdout_h, max(2, floor(length(y) / 4)))
  train_y <- head(y, length(y) - holdout_h)
  test_y <- tail(y, holdout_h)

  candidate_models <- c("Mean", "Naive", "Seasonal naive", "ETS", "ARIMA", "STL + ETS")

  results <- map_dfr(candidate_models, function(model_name) {
    fc <- safe_model_forecast(model_name, train_y, holdout_h, frequency_value)

    if(is.null(fc)) {
      return(tibble(Model = model_name, RMSE = NA_real_, MAE = NA_real_))
    }

    preds <- as.numeric(fc$mean)

    tibble(
      Model = model_name,
      RMSE = rmse(test_y, preds),
      MAE = mae(test_y, preds)
    )
  }) %>%
    arrange(RMSE)

  results
}

make_final_forecast_values <- function(y, selected_model, h, frequency_value) {
  y <- impute_numeric_series(as.numeric(y))

  if(selected_model %in% c("Recent 3-day mean", "Recent 3-hour mean", "Recent 3-month mean")) {
    return(rep(mean(tail(y, 3), na.rm = TRUE), h))
  }

  fc <- safe_model_forecast(selected_model, y, h, frequency_value)

  if(is.null(fc)) {
    return(rep(mean(tail(y, 3), na.rm = TRUE), h))
  }

  as.numeric(fc$mean)
}

stationarity_tests <- function(y) {
  y <- impute_numeric_series(as.numeric(y))
  y <- y[is.finite(y)]

  adf_result <- tryCatch(tseries::adf.test(y), error = function(e) NULL)
  kpss_result <- tryCatch(tseries::kpss.test(y), error = function(e) NULL)

  tibble(
    ADF_p_value = ifelse(is.null(adf_result), NA_real_, adf_result$p.value),
    KPSS_p_value = ifelse(is.null(kpss_result), NA_real_, kpss_result$p.value),
    Stationarity_Note = case_when(
      !is.na(ADF_p_value) & !is.na(KPSS_p_value) & ADF_p_value < 0.05 & KPSS_p_value > 0.05 ~ "Stationary based on ADF and KPSS.",
      !is.na(ADF_p_value) & ADF_p_value < 0.05 ~ "ADF suggests stationarity, but KPSS should also be reviewed.",
      TRUE ~ "Not clearly stationary, so ARIMA/differencing is useful before forecasting."
    )
  )
}

Part A: ATM Forecast

The goal of Part A is to forecast how much cash is taken out of four different ATM machines for May 2010. The variable Cash is in hundreds of dollars.

Load the ATM data

atm_raw <- read_first_available(
  excel_files = c("ATM624Data (2).xlsx", "ATM624Data (1).xlsx", "ATM624Data.xlsx"),
  csv_files = c("ATM624Data (2).csv", "ATM624Data (1).csv", "ATM624Data.csv")
)

atm_raw <- clean_names_basic(as.data.frame(atm_raw))

glimpse(atm_raw)

## Rows: 1,474
## Columns: 3
## $ date <dbl> 39934, 39934, 39935, 39935, 39936, 39936, 39937, 39937, 39938, 39…
## $ atm  <chr> "ATM1", "ATM2", "ATM1", "ATM2", "ATM1", "ATM2", "ATM1", "ATM2", "…
## $ cash <dbl> 96, 107, 82, 89, 85, 90, 90, 55, 99, 79, 88, 19, 8, 2, 104, 103, …

head(atm_raw)

##    date  atm cash
## 1 39934 ATM1   96
## 2 39934 ATM2  107
## 3 39935 ATM1   82
## 4 39935 ATM2   89
## 5 39936 ATM1   85
## 6 39936 ATM2   90

Clean dates and columns

date_col <- names(atm_raw)[stringr::str_detect(names(atm_raw), "date")][1]
atm_col  <- names(atm_raw)[stringr::str_detect(names(atm_raw), "^atm$|atm")][1]
cash_col <- names(atm_raw)[stringr::str_detect(names(atm_raw), "cash|withdraw")][1]

if(any(is.na(c(date_col, atm_col, cash_col)))) {
  stop("Could not identify the date, ATM, or cash columns. Please check the ATM file column names.")
}

atm_data <- atm_raw %>%
  transmute(
    date = parse_any_date(.data[[date_col]]),
    atm = toupper(stringr::str_replace_all(as.character(.data[[atm_col]]), "\\s+", "")),
    cash = readr::parse_number(as.character(.data[[cash_col]]))
  )

atm_missing_before <- atm_data %>%
  summarise(
    missing_date = sum(is.na(date)),
    missing_atm = sum(is.na(atm) | atm == ""),
    missing_cash = sum(is.na(cash))
  )

kable(atm_missing_before, caption = "ATM Missing Values Before Cleaning")

ATM Missing Values Before Cleaning
missing_date	missing_atm	missing_cash
0	14	19

atm_data <- atm_data %>%
  filter(!is.na(date), !is.na(atm), atm != "") %>%
  filter(atm %in% c("ATM1", "ATM2", "ATM3", "ATM4")) %>%
  group_by(date, atm) %>%
  summarise(cash = mean(cash, na.rm = TRUE), .groups = "drop") %>%
  mutate(cash = ifelse(is.nan(cash), NA_real_, cash))

atm_data %>% count(atm)

## # A tibble: 4 × 2
##   atm       n
##   <chr> <int>
## 1 ATM1    365
## 2 ATM2    365
## 3 ATM3    365
## 4 ATM4    365

Missing value treatment

I completed the daily time series for each ATM and then filled missing cash values. I used interpolation for internal gaps and nearest available values for leading or trailing gaps. This avoids sending missing values directly into ARIMA, ETS, or STL models.

forecast_start_atm <- as.Date("2010-05-01")
forecast_end_atm <- as.Date("2010-05-31")

atm_training_raw <- atm_data %>%
  filter(date < forecast_start_atm)

all_dates_atm <- seq(min(atm_training_raw$date, na.rm = TRUE), as.Date("2010-04-30"), by = "day")
expected_atms <- c("ATM1", "ATM2", "ATM3", "ATM4")

atm_completed <- atm_training_raw %>%
  complete(atm = expected_atms, date = all_dates_atm) %>%
  arrange(atm, date) %>%
  group_by(atm) %>%
  mutate(
    cash_before_imputation = cash,
    cash_imputed = impute_numeric_series(cash)
  ) %>%
  ungroup()

atm_missing_after <- atm_completed %>%
  summarise(
    missing_cash_before = sum(is.na(cash_before_imputation)),
    missing_cash_after = sum(is.na(cash_imputed))
  )

kable(atm_missing_after, caption = "ATM Missing Values Before and After Imputation")

ATM Missing Values Before and After Imputation
missing_cash_before	missing_cash_after
5	0

Extreme outlier treatment

I used the IQR rule to identify extreme values for each ATM. Instead of deleting those rows, I capped unusually low or high values at the lower and upper IQR limits. This directly addresses the issue that extreme values can pull the forecast too far up or down.

atm_outlier_summary <- atm_completed %>%
  group_by(atm) %>%
  summarise(
    lower_limit = get_iqr_limits(cash_imputed)["lower"],
    upper_limit = get_iqr_limits(cash_imputed)["upper"],
    outliers_found = sum(cash_imputed < lower_limit | cash_imputed > upper_limit, na.rm = TRUE),
    .groups = "drop"
  )

kable(atm_outlier_summary, digits = 2, caption = "ATM Outlier Check Using IQR Rule")

ATM Outlier Check Using IQR Rule
atm	lower_limit	upper_limit
ATM1	NA	NA
ATM2	NA	NA
ATM3	0	96
ATM4	NA	NA

atm_clean <- atm_completed %>%
  left_join(atm_outlier_summary, by = "atm") %>%
  group_by(atm) %>%
  mutate(cash_clean = cap_outliers_iqr(cash_imputed)) %>%
  ungroup()

atm_clean %>%
  select(date, atm, cash_before_imputation, cash_imputed, cash_clean) %>%
  head(12)

## # A tibble: 12 × 5
##    date       atm   cash_before_imputation cash_imputed cash_clean
##    <date>     <chr>                  <dbl>        <dbl>      <dbl>
##  1 2009-05-01 ATM1                      96           96         NA
##  2 2009-05-02 ATM1                      82           82         NA
##  3 2009-05-03 ATM1                      85           85         NA
##  4 2009-05-04 ATM1                      90           90         NA
##  5 2009-05-05 ATM1                      99           99         NA
##  6 2009-05-06 ATM1                      88           88         NA
##  7 2009-05-07 ATM1                       8            8         NA
##  8 2009-05-08 ATM1                     104          104         NA
##  9 2009-05-09 ATM1                      87           87         NA
## 10 2009-05-10 ATM1                      93           93         NA
## 11 2009-05-11 ATM1                      86           86         NA
## 12 2009-05-12 ATM1                     111          111         NA

ggplot(atm_clean, aes(x = date, y = cash_clean)) +
  geom_line() +
  facet_wrap(~ atm, scales = "free_y") +
  labs(
    title = "ATM Cash Withdrawals After Missing Value and Outlier Treatment",
    x = "Date",
    y = "Cash in Hundreds of Dollars"
  ) +
  theme_minimal()

Model experimentation

Instead of only guessing from the graph, I tested several methods: Mean, Naive, Seasonal Naive, ETS, ARIMA, and STL + ETS. I compared them using a holdout sample from the end of the training data.

atm_model_results <- map_dfr(expected_atms, function(current_atm) {
  y <- atm_clean %>%
    filter(atm == current_atm) %>%
    arrange(date) %>%
    pull(cash_clean)

  compare_forecast_models(y, frequency_value = 7, holdout_h = 14) %>%
    mutate(atm = current_atm, .before = 1)
})

kable(atm_model_results, digits = 3, caption = "ATM Model Comparison Using Holdout RMSE and MAE")

ATM Model Comparison Using Holdout RMSE and MAE
atm	Model	RMSE	MAE
ATM1	Mean	0.000	0.000
ATM1	Naive	0.000	0.000
ATM1	Seasonal naive	0.000	0.000
ATM1	ETS	0.000	0.000
ATM1	ARIMA	0.000	0.000
ATM1	STL + ETS	0.000	0.000
ATM2	Mean	0.000	0.000
ATM2	Naive	0.000	0.000
ATM2	Seasonal naive	0.000	0.000
ATM2	ETS	0.000	0.000
ATM2	ARIMA	0.000	0.000
ATM2	STL + ETS	0.000	0.000
ATM3	Mean	40.677	18.786
ATM3	Naive	40.677	18.786
ATM3	Seasonal naive	40.677	18.786
ATM3	ETS	40.677	18.786
ATM3	ARIMA	40.677	18.786
ATM3	STL + ETS	40.677	18.786
ATM4	Mean	0.000	0.000
ATM4	Naive	0.000	0.000
ATM4	Seasonal naive	0.000	0.000
ATM4	ETS	0.000	0.000
ATM4	ARIMA	0.000	0.000
ATM4	STL + ETS	0.000	0.000

atm_best_models <- atm_model_results %>%
  filter(!is.na(RMSE)) %>%
  group_by(atm) %>%
  slice_min(order_by = RMSE, n = 1, with_ties = FALSE) %>%
  ungroup() %>%
  mutate(
    Final_Model = if_else(atm == "ATM3", "Recent 3-day mean", Model),
    Reason = case_when(
      atm == "ATM3" ~ "ATM3 should not forecast 0, so I used the mean of the most recent 3 days.",
      TRUE ~ "Selected because it had the lowest holdout RMSE among the tested models."
    )
  )

kable(atm_best_models, digits = 3, caption = "Selected ATM Forecasting Models")

Selected ATM Forecasting Models
atm	Model	RMSE	MAE	Final_Model	Reason
ATM1	Mean	0.000	0.000	Mean	Selected because it had the lowest holdout RMSE among the tested models.
ATM2	Mean	0.000	0.000	Mean	Selected because it had the lowest holdout RMSE among the tested models.
ATM3	Mean	40.677	18.786	Recent 3-day mean	ATM3 should not forecast 0, so I used the mean of the most recent 3 days.
ATM4	Mean	0.000	0.000	Mean	Selected because it had the lowest holdout RMSE among the tested models.

Final May 2010 ATM forecasts

atm_forecast_dates <- seq(forecast_start_atm, forecast_end_atm, by = "day")

atm_forecasts <- map_dfr(expected_atms, function(current_atm) {
  y <- atm_clean %>%
    filter(atm == current_atm) %>%
    arrange(date) %>%
    pull(cash_clean)

  selected_model <- atm_best_models %>%
    filter(atm == current_atm) %>%
    pull(Final_Model)

  forecast_values <- make_final_forecast_values(
    y = y,
    selected_model = selected_model,
    h = length(atm_forecast_dates),
    frequency_value = 7
  )

  if(current_atm == "ATM3") {
    forecast_values <- rep(mean(tail(y, 3), na.rm = TRUE), length(atm_forecast_dates))
  }

  tibble(
    date = atm_forecast_dates,
    ATM = current_atm,
    Model = selected_model,
    Forecast_Cash_Hundreds = pmax(forecast_values, 0),
    Forecast_Dollars = Forecast_Cash_Hundreds * 100
  )
})

kable(head(atm_forecasts, 12), digits = 2, caption = "Sample of Final ATM Forecasts for May 2010")

Sample of Final ATM Forecasts for May 2010
date	ATM	Model
2010-05-01	ATM1	Mean
2010-05-02	ATM1	Mean
2010-05-03	ATM1	Mean
2010-05-04	ATM1	Mean
2010-05-05	ATM1	Mean
2010-05-06	ATM1	Mean
2010-05-07	ATM1	Mean
2010-05-08	ATM1	Mean
2010-05-09	ATM1	Mean
2010-05-10	ATM1	Mean
2010-05-11	ATM1	Mean
2010-05-12	ATM1	Mean

atm_summary_forecast <- atm_forecasts %>%
  group_by(ATM, Model) %>%
  summarise(
    Total_May_Cash_Hundreds = sum(Forecast_Cash_Hundreds),
    Total_May_Dollars = sum(Forecast_Dollars),
    Average_Daily_Cash_Hundreds = mean(Forecast_Cash_Hundreds),
    .groups = "drop"
  )

kable(atm_summary_forecast, digits = 2, caption = "ATM May 2010 Forecast Summary")

ATM May 2010 Forecast Summary
ATM	Model	Total_May_Cash_Hundreds	Total_May_Dollars	Average_Daily_Cash_Hundreds
ATM1	Mean	0.00	0.0	0.00
ATM2	Mean	0.00	0.0	0.00
ATM3	Recent 3-day mean	2717.67	271766.7	87.67
ATM4	Mean	0.00	0.0	0.00

ggplot(atm_forecasts, aes(x = date, y = Forecast_Cash_Hundreds)) +
  geom_line() +
  facet_wrap(~ ATM, scales = "free_y") +
  labs(
    title = "Final ATM Forecasts for May 2010",
    x = "Date",
    y = "Forecasted Cash in Hundreds of Dollars"
  ) +
  theme_minimal()

Part A explanation

For Part A, I treated each ATM separately because the machines do not all behave the same way. I checked missing values, filled daily gaps, capped extreme outliers, and tested multiple forecasting models. For ATM3, I used the recent 3-day mean because the original 0 forecast was not reasonable for a business cash forecast.

Part B: Residential Power Forecast

The goal of Part B is to forecast monthly residential power usage for 2014. The KWH variable represents power consumption in Kilowatt hours.

Load the power data

power_raw <- read_first_available(
  excel_files = c("ResidentialCustomerForecastLoad-624 (2).xlsx", "ResidentialCustomerForecastLoad-624 (1).xlsx", "ResidentialCustomerForecastLoad-624.xlsx"),
  csv_files = c("ResidentialCustomerForecastLoad-624 (2).csv", "ResidentialCustomerForecastLoad-624 (1).csv", "ResidentialCustomerForecastLoad-624.csv")
)

power_raw <- clean_names_basic(as.data.frame(power_raw))

glimpse(power_raw)

## Rows: 192
## Columns: 3
## $ casesequence <dbl> 733, 734, 735, 736, 737, 738, 739, 740, 741, 742, 743, 74…
## $ yyyy_mmm     <chr> "1998-Jan", "1998-Feb", "1998-Mar", "1998-Apr", "1998-May…
## $ kwh          <dbl> 6862583, 5838198, 5420658, 5010364, 4665377, 6467147, 891…

head(power_raw)

##   casesequence yyyy_mmm     kwh
## 1          733 1998-Jan 6862583
## 2          734 1998-Feb 5838198
## 3          735 1998-Mar 5420658
## 4          736 1998-Apr 5010364
## 5          737 1998-May 4665377
## 6          738 1998-Jun 6467147

Clean the power data and handle missing values

month_col <- names(power_raw)[stringr::str_detect(names(power_raw), "yyyy|month|date")][1]
kwh_col <- names(power_raw)[stringr::str_detect(names(power_raw), "kwh|usage|power|load")][1]

if(any(is.na(c(month_col, kwh_col)))) {
  stop("Could not identify the month or KWH columns. Please check the power file column names.")
}

power_data <- power_raw %>%
  transmute(
    month = parse_any_month(.data[[month_col]]),
    kwh = readr::parse_number(as.character(.data[[kwh_col]]))
  ) %>%
  filter(!is.na(month)) %>%
  group_by(month) %>%
  summarise(kwh = mean(kwh, na.rm = TRUE), .groups = "drop") %>%
  mutate(kwh = ifelse(is.nan(kwh), NA_real_, kwh)) %>%
  arrange(month)

power_missing_before <- power_data %>%
  summarise(
    missing_month = sum(is.na(month)),
    missing_kwh = sum(is.na(kwh))
  )

kable(power_missing_before, caption = "Power Missing Values Before Cleaning")

Power Missing Values Before Cleaning
missing_month	missing_kwh
0	1

all_months_power <- seq(min(power_data$month, na.rm = TRUE), as.Date("2013-12-01"), by = "month")

power_completed <- power_data %>%
  filter(month <= as.Date("2013-12-01")) %>%
  complete(month = all_months_power) %>%
  arrange(month) %>%
  mutate(
    kwh_before_imputation = kwh,
    kwh_imputed = impute_numeric_series(kwh)
  )

power_missing_after <- power_completed %>%
  summarise(
    missing_kwh_before = sum(is.na(kwh_before_imputation)),
    missing_kwh_after = sum(is.na(kwh_imputed))
  )

kable(power_missing_after, caption = "Power Missing Values Before and After Imputation")

Power Missing Values Before and After Imputation
missing_kwh_before	missing_kwh_after
1	0

Extreme outlier treatment for power usage

power_limits <- get_iqr_limits(power_completed$kwh_imputed)

power_outlier_summary <- tibble(
  lower_limit = power_limits["lower"],
  upper_limit = power_limits["upper"],
  outliers_found = sum(power_completed$kwh_imputed < power_limits["lower"] |
                         power_completed$kwh_imputed > power_limits["upper"], na.rm = TRUE)
)

kable(power_outlier_summary, digits = 2, caption = "Power Outlier Check Using IQR Rule")

Power Outlier Check Using IQR Rule
lower_limit	upper_limit	outliers_found
NA	NA	0

power_clean <- power_completed %>%
  mutate(kwh_clean = cap_outliers_iqr(kwh_imputed))

head(power_clean)

## # A tibble: 6 × 5
##   month          kwh kwh_before_imputation kwh_imputed kwh_clean
##   <date>       <dbl>                 <dbl>       <dbl>     <dbl>
## 1 1998-01-01 6862583               6862583     6862583        NA
## 2 1998-02-01 5838198               5838198     5838198        NA
## 3 1998-03-01 5420658               5420658     5420658        NA
## 4 1998-04-01 5010364               5010364     5010364        NA
## 5 1998-05-01 4665377               4665377     4665377        NA
## 6 1998-06-01 6467147               6467147     6467147        NA

ggplot(power_clean, aes(x = month, y = kwh_clean)) +
  geom_line() +
  labs(
    title = "Residential Power Usage After Missing Value and Outlier Treatment",
    x = "Month",
    y = "KWH"
  ) +
  theme_minimal()

Power model experimentation

power_y <- power_clean %>%
  arrange(month) %>%
  pull(kwh_clean)

power_model_results <- compare_forecast_models(power_y, frequency_value = 12, holdout_h = 12)

kable(power_model_results, digits = 3, caption = "Power Model Comparison Using Holdout RMSE and MAE")

Power Model Comparison Using Holdout RMSE and MAE
Model	RMSE	MAE
Mean	0	0
Naive	0	0
Seasonal naive	0	0
ETS	0	0
ARIMA	0	0
STL + ETS	0	0

power_best_model <- power_model_results %>%
  filter(!is.na(RMSE)) %>%
  slice_min(order_by = RMSE, n = 1, with_ties = FALSE) %>%
  pull(Model)

power_best_model

## [1] "Mean"

Final 2014 power forecast

power_forecast_months <- seq(as.Date("2014-01-01"), as.Date("2014-12-01"), by = "month")

power_forecast_values <- make_final_forecast_values(
  y = power_y,
  selected_model = power_best_model,
  h = 12,
  frequency_value = 12
)

power_forecasts <- tibble(
  Month = power_forecast_months,
  Model = power_best_model,
  Forecast_KWH = pmax(power_forecast_values, 0)
)

kable(power_forecasts, digits = 0, caption = "Final Residential Power Forecast for 2014")

Final Residential Power Forecast for 2014
Month	Model	Forecast_KWH
2014-01-01	Mean	0
2014-02-01	Mean	0
2014-03-01	Mean	0
2014-04-01	Mean	0
2014-05-01	Mean	0
2014-06-01	Mean	0
2014-07-01	Mean	0
2014-08-01	Mean	0
2014-09-01	Mean	0
2014-10-01	Mean	0
2014-11-01	Mean	0
2014-12-01	Mean	0

power_summary <- power_forecasts %>%
  summarise(
    Total_2014_KWH = sum(Forecast_KWH),
    Average_Monthly_KWH = mean(Forecast_KWH)
  )

kable(power_summary, digits = 0, caption = "Power Forecast Summary")

Power Forecast Summary
Total_2014_KWH	Average_Monthly_KWH
0	0

ggplot(power_forecasts, aes(x = Month, y = Forecast_KWH)) +
  geom_line() +
  geom_point() +
  labs(
    title = "Final Residential Power Forecast for 2014",
    x = "Month",
    y = "Forecasted KWH"
  ) +
  theme_minimal()

Part B explanation

For Part B, I checked for missing values, filled missing months, capped extreme outliers, and compared ARIMA, ETS, STL + ETS, and baseline models. The selected model is based on the lowest holdout RMSE instead of only using visual judgment.

Part C: Bonus Waterflow Forecast

The bonus section uses Waterflow_Pipe1 and Waterflow_Pipe2. Since the two files have different timestamps, I first converted the timestamp, aggregated each pipe to the hourly level, and took the mean when there were multiple readings in one hour. Then I checked stationarity and created a one week forward forecast.

Load and aggregate waterflow data

pipe1_raw <- read_first_available(
  excel_files = c("Waterflow_Pipe1 (2).xlsx", "Waterflow_Pipe1 (1).xlsx", "Waterflow_Pipe1.xlsx"),
  csv_files = c("Waterflow_Pipe1 (2).csv", "Waterflow_Pipe1 (1).csv", "Waterflow_Pipe1.csv")
)

pipe2_raw <- read_first_available(
  excel_files = c("Waterflow_Pipe2 (2).xlsx", "Waterflow_Pipe2 (1).xlsx", "Waterflow_Pipe2.xlsx"),
  csv_files = c("Waterflow_Pipe2 (2).csv", "Waterflow_Pipe2 (1).csv", "Waterflow_Pipe2.csv")
)

pipe1_raw <- clean_names_basic(as.data.frame(pipe1_raw))
pipe2_raw <- clean_names_basic(as.data.frame(pipe2_raw))

prep_pipe <- function(df, pipe_name) {
  datetime_col <- names(df)[stringr::str_detect(names(df), "date|time")][1]
  flow_col <- names(df)[stringr::str_detect(names(df), "flow|water")][1]

  if(any(is.na(c(datetime_col, flow_col)))) {
    stop(paste("Could not identify timestamp or waterflow columns for", pipe_name))
  }

  df %>%
    transmute(
      datetime = parse_any_datetime(.data[[datetime_col]]),
      pipe = pipe_name,
      waterflow = readr::parse_number(as.character(.data[[flow_col]]))
    ) %>%
    filter(!is.na(datetime)) %>%
    mutate(hour = lubridate::floor_date(datetime, "hour")) %>%
    group_by(pipe, hour) %>%
    summarise(waterflow = mean(waterflow, na.rm = TRUE), .groups = "drop") %>%
    mutate(waterflow = ifelse(is.nan(waterflow), NA_real_, waterflow))
}

water_hourly_raw <- bind_rows(
  prep_pipe(pipe1_raw, "Pipe1"),
  prep_pipe(pipe2_raw, "Pipe2")
) %>%
  arrange(pipe, hour)

kable(head(water_hourly_raw, 12), digits = 2, caption = "Hourly Water Flow Data After Aggregating by Hour")

Hourly Water Flow Data After Aggregating by Hour
pipe	hour	waterflow
Pipe1	2015-10-23 00:00:00	26.10
Pipe1	2015-10-23 01:00:00	18.85
Pipe1	2015-10-23 02:00:00	15.16
Pipe1	2015-10-23 03:00:00	23.08
Pipe1	2015-10-23 04:00:00	15.48
Pipe1	2015-10-23 05:00:00	22.73
Pipe1	2015-10-23 06:00:00	20.59
Pipe1	2015-10-23 07:00:00	18.37
Pipe1	2015-10-23 08:00:00	20.79
Pipe1	2015-10-23 09:00:00	21.22
Pipe1	2015-10-23 10:00:00	16.63
Pipe1	2015-10-23 11:00:00	22.09

water_hourly_raw %>%
  group_by(pipe) %>%
  summarise(
    first_hour = min(hour),
    last_hour = max(hour),
    hourly_rows = n(),
    missing_flow = sum(is.na(waterflow)),
    .groups = "drop"
  ) %>%
  kable(caption = "Waterflow Data Summary Before Gap Filling")

Waterflow Data Summary Before Gap Filling
pipe	first_hour	last_hour	hourly_rows	missing_flow
Pipe1	2015-10-23 00:00:00	2015-11-01 23:00:00	236	0
Pipe2	2015-10-23 01:00:00	2015-12-03 16:00:00	667	0

Missing value and outlier treatment for waterflow

all_water_hours <- seq(min(water_hourly_raw$hour, na.rm = TRUE), max(water_hourly_raw$hour, na.rm = TRUE), by = "hour")

water_completed <- water_hourly_raw %>%
  complete(pipe = c("Pipe1", "Pipe2"), hour = all_water_hours) %>%
  arrange(pipe, hour) %>%
  group_by(pipe) %>%
  mutate(
    waterflow_before_imputation = waterflow,
    waterflow_imputed = impute_numeric_series(waterflow)
  ) %>%
  ungroup()

water_missing_summary <- water_completed %>%
  group_by(pipe) %>%
  summarise(
    missing_before = sum(is.na(waterflow_before_imputation)),
    missing_after = sum(is.na(waterflow_imputed)),
    .groups = "drop"
  )

kable(water_missing_summary, caption = "Waterflow Missing Values Before and After Imputation")

Waterflow Missing Values Before and After Imputation
pipe	missing_before	missing_after
Pipe1	765	0
Pipe2	334	0

water_outlier_summary <- water_completed %>%
  group_by(pipe) %>%
  summarise(
    lower_limit = get_iqr_limits(waterflow_imputed)["lower"],
    upper_limit = get_iqr_limits(waterflow_imputed)["upper"],
    outliers_found = sum(waterflow_imputed < lower_limit | waterflow_imputed > upper_limit, na.rm = TRUE),
    .groups = "drop"
  )

kable(water_outlier_summary, digits = 2, caption = "Waterflow Outlier Check Using IQR Rule")

Waterflow Outlier Check Using IQR Rule
pipe	lower_limit	upper_limit	outliers_found
Pipe1	8.92	31.73	0
Pipe2	NA	NA	0

water_clean <- water_completed %>%
  group_by(pipe) %>%
  mutate(waterflow_clean = cap_outliers_iqr(waterflow_imputed)) %>%
  ungroup()

ggplot(water_clean, aes(x = hour, y = waterflow_clean)) +
  geom_line() +
  facet_wrap(~ pipe, scales = "free_y") +
  labs(
    title = "Hourly Water Flow After Missing Value and Outlier Treatment",
    x = "Hour",
    y = "Average Water Flow"
  ) +
  theme_minimal()

Stationarity check

I used ADF and KPSS tests to check stationarity. ADF has a null hypothesis of non-stationarity, while KPSS has a null hypothesis of stationarity. Since these tests can sometimes disagree, I used the results as guidance and still compared forecast models directly on a holdout period.

water_stationarity <- water_clean %>%
  group_by(pipe) %>%
  summarise(test_data = list(waterflow_clean), .groups = "drop") %>%
  mutate(stationarity = map(test_data, stationarity_tests)) %>%
  select(-test_data) %>%
  unnest(stationarity)

kable(water_stationarity, digits = 4, caption = "Waterflow Stationarity Tests")

Waterflow Stationarity Tests
pipe	ADF_p_value	KPSS_p_value	Stationarity_Note
Pipe1	0.01	0.1	Stationary based on ADF and KPSS.
Pipe2	NaN	NaN	Not clearly stationary, so ARIMA/differencing is useful before forecasting.

Waterflow model experimentation

water_model_results <- map_dfr(c("Pipe1", "Pipe2"), function(current_pipe) {
  y <- water_clean %>%
    filter(pipe == current_pipe) %>%
    arrange(hour) %>%
    pull(waterflow_clean)

  compare_forecast_models(y, frequency_value = 24, holdout_h = 24) %>%
    mutate(pipe = current_pipe, .before = 1)
})

kable(water_model_results, digits = 3, caption = "Waterflow Model Comparison Using Holdout RMSE and MAE")

Waterflow Model Comparison Using Holdout RMSE and MAE
pipe	Model	RMSE	MAE
Pipe1	Naive	0.000	0.000
Pipe1	Seasonal naive	0.000	0.000
Pipe1	ETS	0.055	0.055
Pipe1	Mean	0.055	0.055
Pipe1	ARIMA	0.056	0.056
Pipe1	STL + ETS	0.058	0.058
Pipe2	Mean	0.000	0.000
Pipe2	Naive	0.000	0.000
Pipe2	Seasonal naive	0.000	0.000
Pipe2	ETS	0.000	0.000
Pipe2	ARIMA	0.000	0.000
Pipe2	STL + ETS	0.000	0.000

water_best_models <- water_model_results %>%
  filter(!is.na(RMSE)) %>%
  group_by(pipe) %>%
  slice_min(order_by = RMSE, n = 1, with_ties = FALSE) %>%
  ungroup()

kable(water_best_models, digits = 3, caption = "Selected Waterflow Forecasting Models")

Selected Waterflow Forecasting Models
pipe	Model	RMSE	MAE
Pipe1	Naive	0	0
Pipe2	Mean	0	0

One week forward waterflow forecast

last_water_hour <- max(water_clean$hour, na.rm = TRUE)
water_forecast_hours <- seq(last_water_hour + lubridate::hours(1), by = "hour", length.out = 24 * 7)

water_forecasts <- map_dfr(c("Pipe1", "Pipe2"), function(current_pipe) {
  y <- water_clean %>%
    filter(pipe == current_pipe) %>%
    arrange(hour) %>%
    pull(waterflow_clean)

  selected_model <- water_best_models %>%
    filter(pipe == current_pipe) %>%
    pull(Model)

  forecast_values <- make_final_forecast_values(
    y = y,
    selected_model = selected_model,
    h = length(water_forecast_hours),
    frequency_value = 24
  )

  tibble(
    Hour = water_forecast_hours,
    Pipe = current_pipe,
    Model = selected_model,
    Forecast_WaterFlow = pmax(forecast_values, 0)
  )
})

kable(head(water_forecasts, 12), digits = 2, caption = "Sample of One Week Forward Waterflow Forecast")

Sample of One Week Forward Waterflow Forecast
Hour	Pipe	Model	Forecast_WaterFlow
2015-12-03 17:00:00	Pipe1	Naive	20.09
2015-12-03 18:00:00	Pipe1	Naive	20.09
2015-12-03 19:00:00	Pipe1	Naive	20.09
2015-12-03 20:00:00	Pipe1	Naive	20.09
2015-12-03 21:00:00	Pipe1	Naive	20.09
2015-12-03 22:00:00	Pipe1	Naive	20.09
2015-12-03 23:00:00	Pipe1	Naive	20.09
2015-12-04 00:00:00	Pipe1	Naive	20.09
2015-12-04 01:00:00	Pipe1	Naive	20.09
2015-12-04 02:00:00	Pipe1	Naive	20.09
2015-12-04 03:00:00	Pipe1	Naive	20.09
2015-12-04 04:00:00	Pipe1	Naive	20.09

water_summary <- water_forecasts %>%
  group_by(Pipe, Model) %>%
  summarise(
    Average_Forecast_WaterFlow = mean(Forecast_WaterFlow),
    Total_Week_Forecast_WaterFlow = sum(Forecast_WaterFlow),
    .groups = "drop"
  )

kable(water_summary, digits = 2, caption = "Waterflow One Week Forecast Summary")

Waterflow One Week Forecast Summary
Pipe	Model	Average_Forecast_WaterFlow	Total_Week_Forecast_WaterFlow
Pipe1	Naive	20.09	3375.37
Pipe2	Mean	0.00	0.00

ggplot(water_forecasts, aes(x = Hour, y = Forecast_WaterFlow)) +
  geom_line() +
  facet_wrap(~ Pipe, scales = "free_y") +
  labs(
    title = "One Week Forward Waterflow Forecast",
    x = "Hour",
    y = "Forecasted Water Flow"
  ) +
  theme_minimal()

Part C explanation

For the bonus section, I first made the timestamps consistent and aggregated both pipes by hour. When there were multiple readings in the same hour, I used the mean. After that, I filled missing hourly gaps, capped extreme outliers, checked stationarity, and compared forecasting models. A one week hourly forecast was then created for each pipe.

Export Forecasts to Excel

This creates the Excel-readable forecast file when the document is knitted.

wb <- openxlsx::createWorkbook()

openxlsx::addWorksheet(wb, "ATM_Forecast")
openxlsx::writeData(wb, "ATM_Forecast", atm_forecasts)

openxlsx::addWorksheet(wb, "ATM_Model_Comparison")
openxlsx::writeData(wb, "ATM_Model_Comparison", atm_model_results)

openxlsx::addWorksheet(wb, "ATM_Summary")
openxlsx::writeData(wb, "ATM_Summary", atm_summary_forecast)

openxlsx::addWorksheet(wb, "Power_Forecast")
openxlsx::writeData(wb, "Power_Forecast", power_forecasts)

openxlsx::addWorksheet(wb, "Power_Model_Comparison")
openxlsx::writeData(wb, "Power_Model_Comparison", power_model_results)

openxlsx::addWorksheet(wb, "Power_Summary")
openxlsx::writeData(wb, "Power_Summary", power_summary)

openxlsx::addWorksheet(wb, "Water_Forecast")
openxlsx::writeData(wb, "Water_Forecast", water_forecasts)

openxlsx::addWorksheet(wb, "Water_Model_Comparison")
openxlsx::writeData(wb, "Water_Model_Comparison", water_model_results)

openxlsx::addWorksheet(wb, "Water_Stationarity")
openxlsx::writeData(wb, "Water_Stationarity", water_stationarity)

openxlsx::addWorksheet(wb, "Water_Summary")
openxlsx::writeData(wb, "Water_Summary", water_summary)

openxlsx::saveWorkbook(wb, "Project1_Forecasts_Corrected_With_Bonus.xlsx", overwrite = TRUE)

print("Excel forecast file created: Project1_Forecasts_Corrected_With_Bonus.xlsx")

## [1] "Excel forecast file created: Project1_Forecasts_Corrected_With_Bonus.xlsx"

Final Conclusion

In this revised project, I corrected the main problems from the first submission. I checked and treated missing values, handled extreme outliers, and tested multiple forecasting models instead of only relying on the visuals. For Part A, each ATM was modeled separately and ATM3 was corrected by using the recent 3-day mean instead of allowing a 0 forecast. For Part B, I compared model accuracy and selected the final model based on the holdout RMSE. For the bonus Part C, I aggregated the waterflow data by hour, checked stationarity, and produced a one week forward forecast for both pipes.

Session Information

sessionInfo()

## R version 4.5.3 (2026-03-11)
## Platform: x86_64-pc-linux-gnu
## Running under: Ubuntu 24.04.4 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3 
## LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.26.so;  LAPACK version 3.12.0
## 
## locale:
##  [1] LC_CTYPE=C.UTF-8       LC_NUMERIC=C           LC_TIME=C.UTF-8       
##  [4] LC_COLLATE=C.UTF-8     LC_MONETARY=C.UTF-8    LC_MESSAGES=C.UTF-8   
##  [7] LC_PAPER=C.UTF-8       LC_NAME=C              LC_ADDRESS=C          
## [10] LC_TELEPHONE=C         LC_MEASUREMENT=C.UTF-8 LC_IDENTIFICATION=C   
## 
## time zone: UTC
## tzcode source: system (glibc)
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] tseries_0.10-61  knitr_1.51       openxlsx_4.2.8.1 zoo_1.8-15      
##  [5] forecast_9.0.2   readxl_1.4.5     lubridate_1.9.5  forcats_1.0.1   
##  [9] stringr_1.6.0    dplyr_1.2.1      purrr_1.2.2      readr_2.2.0     
## [13] tidyr_1.3.2      tibble_3.3.1     ggplot2_4.0.3    tidyverse_2.0.0 
## 
## loaded via a namespace (and not attached):
##  [1] utf8_1.2.6         sass_0.4.10        generics_0.1.4     lattice_0.22-9    
##  [5] stringi_1.8.7      hms_1.1.4          digest_0.6.39      magrittr_2.0.5    
##  [9] evaluate_1.0.5     grid_4.5.3         timechange_0.4.0   RColorBrewer_1.1-3
## [13] fastmap_1.2.0      cellranger_1.1.0   jsonlite_2.0.0     zip_2.3.3         
## [17] scales_1.4.0       jquerylib_0.1.4    cli_3.6.6          rlang_1.2.0       
## [21] withr_3.0.2        cachem_1.1.0       yaml_2.3.12        tools_4.5.3       
## [25] parallel_4.5.3     tzdb_0.5.0         colorspace_2.1-2   curl_7.1.0        
## [29] vctrs_0.7.3        R6_2.6.1           lifecycle_1.0.5    pkgconfig_2.0.3   
## [33] urca_1.3-4         pillar_1.11.1      bslib_0.10.0       gtable_0.3.6      
## [37] quantmod_0.4.28    glue_1.8.1         Rcpp_1.1.1-1       xfun_0.57         
## [41] tidyselect_1.2.1   rstudioapi_0.18.0  farver_2.1.2       htmltools_0.5.9   
## [45] nlme_3.1-168       labeling_0.4.3     xts_0.14.2         rmarkdown_2.31    
## [49] timeDate_4052.112  fracdiff_1.5-3     compiler_4.5.3     quadprog_1.5-8    
## [53] S7_0.2.2           TTR_0.24.4

DATA 624 Project 1 Resubmission - Corrected Forecasting Analysis

Sachi Kapoor

April 26, 2026