Resubmission Note

I revised this project based on the feedback I received. In the original version, I did not clearly show how I handled missing values, extreme outliers, or how I tested different forecasting methods. In this corrected version, I added those steps directly into the analysis.

The main corrections are:

  1. I checked for missing values before forecasting.
  2. I filled missing time series values instead of leaving gaps.
  3. I checked for extreme outliers and capped them using the IQR rule.
  4. I tested ARIMA, ETS, STL-based forecasts, and baseline models.
  5. I compared models using a holdout period instead of only choosing by visual inspection.
  6. I corrected ATM3 so it does not forecast 0. I used the mean of the most recent 3 days because that is more realistic for a short-term ATM forecast.
  7. I also included the optional bonus waterflow section.

Helper Functions

clean_names_basic <- function(df) {
  names(df) <- names(df) %>%
    stringr::str_trim() %>%
    stringr::str_replace_all("[^A-Za-z0-9]+", "_") %>%
    stringr::str_replace_all("_+$", "") %>%
    tolower()
  df
}

read_first_available <- function(excel_files = NULL, csv_files = NULL) {
  if(!is.null(excel_files)) {
    for(file in excel_files) {
      if(file.exists(file)) {
        return(readxl::read_excel(file))
      }
    }
  }

  if(!is.null(csv_files)) {
    for(file in csv_files) {
      if(file.exists(file)) {
        return(readr::read_csv(file, show_col_types = FALSE))
      }
    }
  }

  stop("The file was not found. Please make sure the Excel file is uploaded into the same folder as this RMD.")
}

parse_any_date <- function(x) {
  if(inherits(x, "Date")) return(x)
  if(inherits(x, "POSIXt")) return(as.Date(x))
  if(is.numeric(x)) return(as.Date(x, origin = "1899-12-30"))

  parsed <- suppressWarnings(lubridate::parse_date_time(
    as.character(x),
    orders = c("mdy HMS", "mdy HM", "mdy", "ymd HMS", "ymd HM", "ymd", "Ymd")
  ))
  as.Date(parsed)
}

parse_any_month <- function(x) {
  if(inherits(x, "Date")) return(lubridate::floor_date(x, "month"))
  if(inherits(x, "POSIXt")) return(lubridate::floor_date(as.Date(x), "month"))
  if(is.numeric(x)) return(lubridate::floor_date(as.Date(x, origin = "1899-12-30"), "month"))

  parsed <- suppressWarnings(lubridate::parse_date_time(
    as.character(x),
    orders = c("Y-b", "Y-B", "Y-m", "ym", "my", "mdy", "ymd")
  ))
  lubridate::floor_date(as.Date(parsed), "month")
}

parse_any_datetime <- function(x) {
  if(inherits(x, "POSIXt")) return(as.POSIXct(x, tz = "UTC"))
  if(inherits(x, "Date")) return(as.POSIXct(x, tz = "UTC"))
  if(is.numeric(x)) return(as.POSIXct(as.numeric(x) * 86400, origin = "1899-12-30", tz = "UTC"))

  parsed <- suppressWarnings(lubridate::parse_date_time(
    as.character(x),
    orders = c("ymd HMS", "mdy HMS", "ymd HM", "mdy HM", "ymd", "mdy")
  ))
  as.POSIXct(parsed, tz = "UTC")
}

impute_numeric_series <- function(x) {
  x <- as.numeric(x)

  if(sum(!is.na(x)) == 0) {
    return(rep(0, length(x)))
  }

  x_clean <- zoo::na.approx(x, na.rm = FALSE)
  x_clean <- zoo::na.locf(x_clean, na.rm = FALSE)
  x_clean <- zoo::na.locf(x_clean, fromLast = TRUE, na.rm = FALSE)
  x_clean
}

get_iqr_limits <- function(x) {
  x <- as.numeric(x)
  x_finite <- x[is.finite(x)]

  if(length(x_finite) == 0) {
    return(c(lower = 0, upper = 0))
  }

  q1 <- as.numeric(quantile(x_finite, 0.25, na.rm = TRUE, names = FALSE))
  q3 <- as.numeric(quantile(x_finite, 0.75, na.rm = TRUE, names = FALSE))
  iqr_value <- q3 - q1

  if(is.na(iqr_value) || iqr_value == 0) {
    return(c(lower = min(x_finite, na.rm = TRUE), upper = max(x_finite, na.rm = TRUE)))
  }

  lower_value <- q1 - 1.5 * iqr_value
  upper_value <- q3 + 1.5 * iqr_value

  c(lower = lower_value, upper = upper_value)
}

cap_outliers_iqr <- function(x) {
  x <- as.numeric(x)
  limits <- get_iqr_limits(x)
  pmin(pmax(x, limits[["lower"]]), limits[["upper"]])
}

rmse <- function(actual, predicted) {
  sqrt(mean((actual - predicted)^2, na.rm = TRUE))
}

mae <- function(actual, predicted) {
  mean(abs(actual - predicted), na.rm = TRUE)
}

safe_model_forecast <- function(model_name, y, h, frequency_value) {
  y <- impute_numeric_series(as.numeric(y))

  if(length(y) < 5 || sum(!is.na(y)) < 5) {
    return(NULL)
  }

  y_ts <- ts(y, frequency = frequency_value)

  result <- tryCatch({
    if(model_name == "Mean") {
      forecast::meanf(y_ts, h = h)
    } else if(model_name == "Naive") {
      forecast::naive(y_ts, h = h)
    } else if(model_name == "Seasonal naive") {
      forecast::snaive(y_ts, h = h)
    } else if(model_name == "ETS") {
      forecast::forecast(forecast::ets(y_ts), h = h)
    } else if(model_name == "ARIMA") {
      forecast::forecast(forecast::auto.arima(y_ts), h = h)
    } else if(model_name == "STL + ETS") {
      forecast::stlf(y_ts, h = h, method = "ets")
    } else {
      stop("Unknown model name")
    }
  }, error = function(e) NULL)

  result
}

compare_forecast_models <- function(y, frequency_value, holdout_h) {
  y <- impute_numeric_series(as.numeric(y))

  if(length(y) < 10) {
    return(tibble(Model = character(), RMSE = numeric(), MAE = numeric()))
  }

  holdout_h <- min(holdout_h, max(2, floor(length(y) / 4)))
  train_y <- head(y, length(y) - holdout_h)
  test_y <- tail(y, holdout_h)

  candidate_models <- c("Mean", "Naive", "Seasonal naive", "ETS", "ARIMA", "STL + ETS")

  results <- map_dfr(candidate_models, function(model_name) {
    fc <- safe_model_forecast(model_name, train_y, holdout_h, frequency_value)

    if(is.null(fc)) {
      return(tibble(Model = model_name, RMSE = NA_real_, MAE = NA_real_))
    }

    preds <- as.numeric(fc$mean)

    tibble(
      Model = model_name,
      RMSE = rmse(test_y, preds),
      MAE = mae(test_y, preds)
    )
  }) %>%
    arrange(RMSE)

  results
}

make_final_forecast_values <- function(y, selected_model, h, frequency_value) {
  y <- impute_numeric_series(as.numeric(y))

  if(selected_model %in% c("Recent 3-day mean", "Recent 3-hour mean", "Recent 3-month mean")) {
    return(rep(mean(tail(y, 3), na.rm = TRUE), h))
  }

  fc <- safe_model_forecast(selected_model, y, h, frequency_value)

  if(is.null(fc)) {
    return(rep(mean(tail(y, 3), na.rm = TRUE), h))
  }

  as.numeric(fc$mean)
}

stationarity_tests <- function(y) {
  y <- impute_numeric_series(as.numeric(y))
  y <- y[is.finite(y)]

  adf_result <- tryCatch(tseries::adf.test(y), error = function(e) NULL)
  kpss_result <- tryCatch(tseries::kpss.test(y), error = function(e) NULL)

  tibble(
    ADF_p_value = ifelse(is.null(adf_result), NA_real_, adf_result$p.value),
    KPSS_p_value = ifelse(is.null(kpss_result), NA_real_, kpss_result$p.value),
    Stationarity_Note = case_when(
      !is.na(ADF_p_value) & !is.na(KPSS_p_value) & ADF_p_value < 0.05 & KPSS_p_value > 0.05 ~ "Stationary based on ADF and KPSS.",
      !is.na(ADF_p_value) & ADF_p_value < 0.05 ~ "ADF suggests stationarity, but KPSS should also be reviewed.",
      TRUE ~ "Not clearly stationary, so ARIMA/differencing is useful before forecasting."
    )
  )
}

Part A: ATM Forecast

The goal of Part A is to forecast how much cash is taken out of four different ATM machines for May 2010. The variable Cash is in hundreds of dollars.

Load the ATM data

atm_raw <- read_first_available(
  excel_files = c("ATM624Data (2).xlsx", "ATM624Data (1).xlsx", "ATM624Data.xlsx"),
  csv_files = c("ATM624Data (2).csv", "ATM624Data (1).csv", "ATM624Data.csv")
)

atm_raw <- clean_names_basic(as.data.frame(atm_raw))

glimpse(atm_raw)
## Rows: 1,474
## Columns: 3
## $ date <dbl> 39934, 39934, 39935, 39935, 39936, 39936, 39937, 39937, 39938, 39…
## $ atm  <chr> "ATM1", "ATM2", "ATM1", "ATM2", "ATM1", "ATM2", "ATM1", "ATM2", "…
## $ cash <dbl> 96, 107, 82, 89, 85, 90, 90, 55, 99, 79, 88, 19, 8, 2, 104, 103, …
head(atm_raw)
##    date  atm cash
## 1 39934 ATM1   96
## 2 39934 ATM2  107
## 3 39935 ATM1   82
## 4 39935 ATM2   89
## 5 39936 ATM1   85
## 6 39936 ATM2   90

Clean dates and columns

date_col <- names(atm_raw)[stringr::str_detect(names(atm_raw), "date")][1]
atm_col  <- names(atm_raw)[stringr::str_detect(names(atm_raw), "^atm$|atm")][1]
cash_col <- names(atm_raw)[stringr::str_detect(names(atm_raw), "cash|withdraw")][1]

if(any(is.na(c(date_col, atm_col, cash_col)))) {
  stop("Could not identify the date, ATM, or cash columns. Please check the ATM file column names.")
}

atm_data <- atm_raw %>%
  transmute(
    date = parse_any_date(.data[[date_col]]),
    atm = toupper(stringr::str_replace_all(as.character(.data[[atm_col]]), "\\s+", "")),
    cash = readr::parse_number(as.character(.data[[cash_col]]))
  )

atm_missing_before <- atm_data %>%
  summarise(
    missing_date = sum(is.na(date)),
    missing_atm = sum(is.na(atm) | atm == ""),
    missing_cash = sum(is.na(cash))
  )

kable(atm_missing_before, caption = "ATM Missing Values Before Cleaning")
ATM Missing Values Before Cleaning
missing_date missing_atm missing_cash
0 14 19
atm_data <- atm_data %>%
  filter(!is.na(date), !is.na(atm), atm != "") %>%
  filter(atm %in% c("ATM1", "ATM2", "ATM3", "ATM4")) %>%
  group_by(date, atm) %>%
  summarise(cash = mean(cash, na.rm = TRUE), .groups = "drop") %>%
  mutate(cash = ifelse(is.nan(cash), NA_real_, cash))

atm_data %>% count(atm)
## # A tibble: 4 × 2
##   atm       n
##   <chr> <int>
## 1 ATM1    365
## 2 ATM2    365
## 3 ATM3    365
## 4 ATM4    365

Missing value treatment

I completed the daily time series for each ATM and then filled missing cash values. I used interpolation for internal gaps and nearest available values for leading or trailing gaps. This avoids sending missing values directly into ARIMA, ETS, or STL models.

forecast_start_atm <- as.Date("2010-05-01")
forecast_end_atm <- as.Date("2010-05-31")

atm_training_raw <- atm_data %>%
  filter(date < forecast_start_atm)

all_dates_atm <- seq(min(atm_training_raw$date, na.rm = TRUE), as.Date("2010-04-30"), by = "day")
expected_atms <- c("ATM1", "ATM2", "ATM3", "ATM4")

atm_completed <- atm_training_raw %>%
  complete(atm = expected_atms, date = all_dates_atm) %>%
  arrange(atm, date) %>%
  group_by(atm) %>%
  mutate(
    cash_before_imputation = cash,
    cash_imputed = impute_numeric_series(cash)
  ) %>%
  ungroup()

atm_missing_after <- atm_completed %>%
  summarise(
    missing_cash_before = sum(is.na(cash_before_imputation)),
    missing_cash_after = sum(is.na(cash_imputed))
  )

kable(atm_missing_after, caption = "ATM Missing Values Before and After Imputation")
ATM Missing Values Before and After Imputation
missing_cash_before missing_cash_after
5 0

Extreme outlier treatment

I used the IQR rule to identify extreme values for each ATM. Instead of deleting those rows, I capped unusually low or high values at the lower and upper IQR limits. This directly addresses the issue that extreme values can pull the forecast too far up or down.

atm_outlier_summary <- atm_completed %>%
  group_by(atm) %>%
  summarise(
    lower_limit = get_iqr_limits(cash_imputed)[["lower"]],
    upper_limit = get_iqr_limits(cash_imputed)[["upper"]],
    outliers_found = sum(cash_imputed < lower_limit | cash_imputed > upper_limit, na.rm = TRUE),
    .groups = "drop"
  )

kable(atm_outlier_summary, digits = 2, caption = "ATM Outlier Check Using IQR Rule")
ATM Outlier Check Using IQR Rule
atm lower_limit upper_limit outliers_found
ATM1 20.50 160.50 51
ATM2 -74.50 193.50 0
ATM3 0.00 96.00 0
ATM4 -745.92 1574.77 3
atm_clean <- atm_completed %>%
  left_join(atm_outlier_summary, by = "atm") %>%
  group_by(atm) %>%
  mutate(cash_clean = cap_outliers_iqr(cash_imputed)) %>%
  ungroup()

atm_clean %>%
  select(date, atm, cash_before_imputation, cash_imputed, cash_clean) %>%
  head(12)
## # A tibble: 12 × 5
##    date       atm   cash_before_imputation cash_imputed cash_clean
##    <date>     <chr>                  <dbl>        <dbl>      <dbl>
##  1 2009-05-01 ATM1                      96           96       96  
##  2 2009-05-02 ATM1                      82           82       82  
##  3 2009-05-03 ATM1                      85           85       85  
##  4 2009-05-04 ATM1                      90           90       90  
##  5 2009-05-05 ATM1                      99           99       99  
##  6 2009-05-06 ATM1                      88           88       88  
##  7 2009-05-07 ATM1                       8            8       20.5
##  8 2009-05-08 ATM1                     104          104      104  
##  9 2009-05-09 ATM1                      87           87       87  
## 10 2009-05-10 ATM1                      93           93       93  
## 11 2009-05-11 ATM1                      86           86       86  
## 12 2009-05-12 ATM1                     111          111      111
atm_clean_check <- atm_clean %>%
  group_by(atm) %>%
  summarise(
    missing_clean_values = sum(is.na(cash_clean)),
    minimum_clean_cash = min(cash_clean, na.rm = TRUE),
    maximum_clean_cash = max(cash_clean, na.rm = TRUE),
    .groups = "drop"
  )

kable(atm_clean_check, digits = 2, caption = "ATM Cleaned Data Check Before Modeling")
ATM Cleaned Data Check Before Modeling
atm missing_clean_values minimum_clean_cash maximum_clean_cash
ATM1 0 20.50 160.50
ATM2 0 0.00 147.00
ATM3 0 0.00 96.00
ATM4 0 1.56 1574.77
ggplot(atm_clean, aes(x = date, y = cash_clean)) +
  geom_line() +
  facet_wrap(~ atm, scales = "free_y") +
  labs(
    title = "ATM Cash Withdrawals After Missing Value and Outlier Treatment",
    x = "Date",
    y = "Cash in Hundreds of Dollars"
  ) +
  theme_minimal()

Model experimentation

Instead of only guessing from the graph, I tested several methods: Mean, Naive, Seasonal Naive, ETS, ARIMA, and STL + ETS. I compared them using a holdout sample from the end of the training data.

atm_model_results <- map_dfr(expected_atms, function(current_atm) {
  y <- atm_clean %>%
    filter(atm == current_atm) %>%
    arrange(date) %>%
    pull(cash_clean)

  compare_forecast_models(y, frequency_value = 7, holdout_h = 14) %>%
    mutate(atm = current_atm, .before = 1)
})

kable(atm_model_results, digits = 3, caption = "ATM Model Comparison Using Holdout RMSE and MAE")
ATM Model Comparison Using Holdout RMSE and MAE
atm Model RMSE MAE
ATM1 Seasonal naive 10.007 7.286
ATM1 ARIMA 10.089 7.720
ATM1 ETS 10.336 7.370
ATM1 STL + ETS 11.307 7.014
ATM1 Naive 25.206 16.500
ATM1 Mean 26.467 15.782
ATM2 ETS 13.129 9.823
ATM2 Seasonal naive 13.488 10.071
ATM2 STL + ETS 14.804 9.878
ATM2 ARIMA 14.970 12.614
ATM2 Mean 38.446 34.071
ATM2 Naive 40.611 34.071
ATM3 Mean 40.677 18.786
ATM3 Naive 40.677 18.786
ATM3 Seasonal naive 40.677 18.786
ATM3 ETS 40.677 18.786
ATM3 ARIMA 40.677 18.786
ATM3 STL + ETS 40.677 18.786
ATM4 ARIMA 262.512 213.811
ATM4 Mean 267.344 217.079
ATM4 STL + ETS 268.890 207.192
ATM4 ETS 286.723 234.770
ATM4 Seasonal naive 432.328 347.756
ATM4 Naive 565.702 520.298
atm_best_models <- atm_model_results %>%
  filter(!is.na(RMSE)) %>%
  group_by(atm) %>%
  slice_min(order_by = RMSE, n = 1, with_ties = FALSE) %>%
  ungroup() %>%
  mutate(
    Final_Model = if_else(atm == "ATM3", "Recent 3-day mean", Model),
    Reason = case_when(
      atm == "ATM3" ~ "ATM3 should not forecast 0, so I used the mean of the most recent 3 days.",
      TRUE ~ "Selected because it had the lowest holdout RMSE among the tested models."
    )
  )

kable(atm_best_models, digits = 3, caption = "Selected ATM Forecasting Models")
Selected ATM Forecasting Models
atm Model RMSE MAE Final_Model Reason
ATM1 Seasonal naive 10.007 7.286 Seasonal naive Selected because it had the lowest holdout RMSE among the tested models.
ATM2 ETS 13.129 9.823 ETS Selected because it had the lowest holdout RMSE among the tested models.
ATM3 Mean 40.677 18.786 Recent 3-day mean ATM3 should not forecast 0, so I used the mean of the most recent 3 days.
ATM4 ARIMA 262.512 213.811 ARIMA Selected because it had the lowest holdout RMSE among the tested models.

Final May 2010 ATM forecasts

atm_forecast_dates <- seq(forecast_start_atm, forecast_end_atm, by = "day")

atm_forecasts <- map_dfr(expected_atms, function(current_atm) {
  y <- atm_clean %>%
    filter(atm == current_atm) %>%
    arrange(date) %>%
    pull(cash_clean)

  selected_model <- atm_best_models %>%
    filter(atm == current_atm) %>%
    pull(Final_Model)

  forecast_values <- make_final_forecast_values(
    y = y,
    selected_model = selected_model,
    h = length(atm_forecast_dates),
    frequency_value = 7
  )

  if(current_atm == "ATM3") {
    forecast_values <- rep(mean(tail(y, 3), na.rm = TRUE), length(atm_forecast_dates))
  }

  tibble(
    date = atm_forecast_dates,
    ATM = current_atm,
    Model = selected_model,
    Forecast_Cash_Hundreds = pmax(forecast_values, 0),
    Forecast_Dollars = Forecast_Cash_Hundreds * 100
  )
})

kable(head(atm_forecasts, 12), digits = 2, caption = "Sample of Final ATM Forecasts for May 2010")
Sample of Final ATM Forecasts for May 2010
date ATM Model Forecast_Cash_Hundreds Forecast_Dollars
2010-05-01 ATM1 Seasonal naive 85.0 8500
2010-05-02 ATM1 Seasonal naive 109.0 10900
2010-05-03 ATM1 Seasonal naive 74.0 7400
2010-05-04 ATM1 Seasonal naive 20.5 2050
2010-05-05 ATM1 Seasonal naive 96.0 9600
2010-05-06 ATM1 Seasonal naive 82.0 8200
2010-05-07 ATM1 Seasonal naive 85.0 8500
2010-05-08 ATM1 Seasonal naive 85.0 8500
2010-05-09 ATM1 Seasonal naive 109.0 10900
2010-05-10 ATM1 Seasonal naive 74.0 7400
2010-05-11 ATM1 Seasonal naive 20.5 2050
2010-05-12 ATM1 Seasonal naive 96.0 9600
atm_summary_forecast <- atm_forecasts %>%
  group_by(ATM, Model) %>%
  summarise(
    Total_May_Cash_Hundreds = sum(Forecast_Cash_Hundreds),
    Total_May_Dollars = sum(Forecast_Dollars),
    Average_Daily_Cash_Hundreds = mean(Forecast_Cash_Hundreds),
    .groups = "drop"
  )

kable(atm_summary_forecast, digits = 2, caption = "ATM May 2010 Forecast Summary")
ATM May 2010 Forecast Summary
ATM Model Total_May_Cash_Hundreds Total_May_Dollars Average_Daily_Cash_Hundreds
ATM1 Seasonal naive 2474.00 247400.0 79.81
ATM2 ETS 1827.57 182757.3 58.95
ATM3 Recent 3-day mean 2717.67 271766.7 87.67
ATM4 ARIMA 13445.40 1344540.3 433.72
ggplot(atm_forecasts, aes(x = date, y = Forecast_Cash_Hundreds)) +
  geom_line() +
  facet_wrap(~ ATM, scales = "free_y") +
  labs(
    title = "Final ATM Forecasts for May 2010",
    x = "Date",
    y = "Forecasted Cash in Hundreds of Dollars"
  ) +
  theme_minimal()

Part A explanation

For Part A, I treated each ATM separately because the machines do not all behave the same way. I checked missing values, filled daily gaps, capped extreme outliers, and tested multiple forecasting models. For ATM3, I used the recent 3-day mean because the original 0 forecast was not reasonable for a business cash forecast.

Part B: Residential Power Forecast

The goal of Part B is to forecast monthly residential power usage for 2014. The KWH variable represents power consumption in Kilowatt hours.

Load the power data

power_raw <- read_first_available(
  excel_files = c("ResidentialCustomerForecastLoad-624 (2).xlsx", "ResidentialCustomerForecastLoad-624 (1).xlsx", "ResidentialCustomerForecastLoad-624.xlsx"),
  csv_files = c("ResidentialCustomerForecastLoad-624 (2).csv", "ResidentialCustomerForecastLoad-624 (1).csv", "ResidentialCustomerForecastLoad-624.csv")
)

power_raw <- clean_names_basic(as.data.frame(power_raw))

glimpse(power_raw)
## Rows: 192
## Columns: 3
## $ casesequence <dbl> 733, 734, 735, 736, 737, 738, 739, 740, 741, 742, 743, 74…
## $ yyyy_mmm     <chr> "1998-Jan", "1998-Feb", "1998-Mar", "1998-Apr", "1998-May…
## $ kwh          <dbl> 6862583, 5838198, 5420658, 5010364, 4665377, 6467147, 891…
head(power_raw)
##   casesequence yyyy_mmm     kwh
## 1          733 1998-Jan 6862583
## 2          734 1998-Feb 5838198
## 3          735 1998-Mar 5420658
## 4          736 1998-Apr 5010364
## 5          737 1998-May 4665377
## 6          738 1998-Jun 6467147

Clean the power data and handle missing values

month_col <- names(power_raw)[stringr::str_detect(names(power_raw), "yyyy|month|date")][1]
kwh_col <- names(power_raw)[stringr::str_detect(names(power_raw), "kwh|usage|power|load")][1]

if(any(is.na(c(month_col, kwh_col)))) {
  stop("Could not identify the month or KWH columns. Please check the power file column names.")
}

power_data <- power_raw %>%
  transmute(
    month = parse_any_month(.data[[month_col]]),
    kwh = readr::parse_number(as.character(.data[[kwh_col]]))
  ) %>%
  filter(!is.na(month)) %>%
  group_by(month) %>%
  summarise(kwh = mean(kwh, na.rm = TRUE), .groups = "drop") %>%
  mutate(kwh = ifelse(is.nan(kwh), NA_real_, kwh)) %>%
  arrange(month)

power_missing_before <- power_data %>%
  summarise(
    missing_month = sum(is.na(month)),
    missing_kwh = sum(is.na(kwh))
  )

kable(power_missing_before, caption = "Power Missing Values Before Cleaning")
Power Missing Values Before Cleaning
missing_month missing_kwh
0 1
all_months_power <- seq(min(power_data$month, na.rm = TRUE), as.Date("2013-12-01"), by = "month")

power_completed <- power_data %>%
  filter(month <= as.Date("2013-12-01")) %>%
  complete(month = all_months_power) %>%
  arrange(month) %>%
  mutate(
    kwh_before_imputation = kwh,
    kwh_imputed = impute_numeric_series(kwh)
  )

power_missing_after <- power_completed %>%
  summarise(
    missing_kwh_before = sum(is.na(kwh_before_imputation)),
    missing_kwh_after = sum(is.na(kwh_imputed))
  )

kable(power_missing_after, caption = "Power Missing Values Before and After Imputation")
Power Missing Values Before and After Imputation
missing_kwh_before missing_kwh_after
1 0

Extreme outlier treatment for power usage

power_limits <- get_iqr_limits(power_completed$kwh_imputed)

power_outlier_summary <- tibble(
  lower_limit = power_limits[["lower"]],
  upper_limit = power_limits[["upper"]],
  outliers_found = sum(power_completed$kwh_imputed < power_limits[["lower"]] |
                         power_completed$kwh_imputed > power_limits[["upper"]], na.rm = TRUE)
)

kable(power_outlier_summary, digits = 2, caption = "Power Outlier Check Using IQR Rule")
Power Outlier Check Using IQR Rule
lower_limit upper_limit outliers_found
2173160 10870171 1
power_clean <- power_completed %>%
  mutate(kwh_clean = cap_outliers_iqr(kwh_imputed))

head(power_clean)
## # A tibble: 6 × 5
##   month          kwh kwh_before_imputation kwh_imputed kwh_clean
##   <date>       <dbl>                 <dbl>       <dbl>     <dbl>
## 1 1998-01-01 6862583               6862583     6862583   6862583
## 2 1998-02-01 5838198               5838198     5838198   5838198
## 3 1998-03-01 5420658               5420658     5420658   5420658
## 4 1998-04-01 5010364               5010364     5010364   5010364
## 5 1998-05-01 4665377               4665377     4665377   4665377
## 6 1998-06-01 6467147               6467147     6467147   6467147
power_clean_check <- power_clean %>%
  summarise(
    missing_clean_values = sum(is.na(kwh_clean)),
    minimum_clean_kwh = min(kwh_clean, na.rm = TRUE),
    maximum_clean_kwh = max(kwh_clean, na.rm = TRUE)
  )

kable(power_clean_check, digits = 0, caption = "Power Cleaned Data Check Before Modeling")
Power Cleaned Data Check Before Modeling
missing_clean_values minimum_clean_kwh maximum_clean_kwh
0 2173160 10655730
ggplot(power_clean, aes(x = month, y = kwh_clean)) +
  geom_line() +
  labs(
    title = "Residential Power Usage After Missing Value and Outlier Treatment",
    x = "Month",
    y = "KWH"
  ) +
  theme_minimal()

Power model experimentation

power_y <- power_clean %>%
  arrange(month) %>%
  pull(kwh_clean)

power_model_results <- compare_forecast_models(power_y, frequency_value = 12, holdout_h = 12)

kable(power_model_results, digits = 3, caption = "Power Model Comparison Using Holdout RMSE and MAE")
Power Model Comparison Using Holdout RMSE and MAE
Model RMSE MAE
STL + ETS 934619.3 620347.2
Seasonal naive 1035537.6 618605.6
ETS 1061192.1 674086.6
ARIMA 1336314.1 917975.7
Naive 1861178.6 1501420.0
Mean 1960601.4 1543871.8
power_best_model <- power_model_results %>%
  filter(!is.na(RMSE)) %>%
  slice_min(order_by = RMSE, n = 1, with_ties = FALSE) %>%
  pull(Model)

power_best_model
## [1] "STL + ETS"

Final 2014 power forecast

power_forecast_months <- seq(as.Date("2014-01-01"), as.Date("2014-12-01"), by = "month")

power_forecast_values <- make_final_forecast_values(
  y = power_y,
  selected_model = power_best_model,
  h = 12,
  frequency_value = 12
)

power_forecasts <- tibble(
  Month = power_forecast_months,
  Model = power_best_model,
  Forecast_KWH = pmax(power_forecast_values, 0)
)

kable(power_forecasts, digits = 0, caption = "Final Residential Power Forecast for 2014")
Final Residential Power Forecast for 2014
Month Model Forecast_KWH
2014-01-01 STL + ETS 9698698
2014-02-01 STL + ETS 8666786
2014-03-01 STL + ETS 7199920
2014-04-01 STL + ETS 6405994
2014-05-01 STL + ETS 6085305
2014-06-01 STL + ETS 7800311
2014-07-01 STL + ETS 8230325
2014-08-01 STL + ETS 9513015
2014-09-01 STL + ETS 8465142
2014-10-01 STL + ETS 6302063
2014-11-01 STL + ETS 6001809
2014-12-01 STL + ETS 7888830
power_summary <- power_forecasts %>%
  summarise(
    Total_2014_KWH = sum(Forecast_KWH),
    Average_Monthly_KWH = mean(Forecast_KWH)
  )

kable(power_summary, digits = 0, caption = "Power Forecast Summary")
Power Forecast Summary
Total_2014_KWH Average_Monthly_KWH
92258197 7688183
ggplot(power_forecasts, aes(x = Month, y = Forecast_KWH)) +
  geom_line() +
  geom_point() +
  labs(
    title = "Final Residential Power Forecast for 2014",
    x = "Month",
    y = "Forecasted KWH"
  ) +
  theme_minimal()

Part B explanation

For Part B, I checked for missing values, filled missing months, capped extreme outliers, and compared ARIMA, ETS, STL + ETS, and baseline models. The selected model is based on the lowest holdout RMSE instead of only using visual judgment.

Part C: Bonus Waterflow Forecast

The bonus section uses Waterflow_Pipe1 and Waterflow_Pipe2. Since the two files have different timestamps, I first converted the timestamp, aggregated each pipe to the hourly level, and took the mean when there were multiple readings in one hour. Then I checked stationarity and created a one week forward forecast.

Load and aggregate waterflow data

pipe1_raw <- read_first_available(
  excel_files = c("Waterflow_Pipe1 (2).xlsx", "Waterflow_Pipe1 (1).xlsx", "Waterflow_Pipe1.xlsx"),
  csv_files = c("Waterflow_Pipe1 (2).csv", "Waterflow_Pipe1 (1).csv", "Waterflow_Pipe1.csv")
)

pipe2_raw <- read_first_available(
  excel_files = c("Waterflow_Pipe2 (2).xlsx", "Waterflow_Pipe2 (1).xlsx", "Waterflow_Pipe2.xlsx"),
  csv_files = c("Waterflow_Pipe2 (2).csv", "Waterflow_Pipe2 (1).csv", "Waterflow_Pipe2.csv")
)

pipe1_raw <- clean_names_basic(as.data.frame(pipe1_raw))
pipe2_raw <- clean_names_basic(as.data.frame(pipe2_raw))

prep_pipe <- function(df, pipe_name) {
  datetime_col <- names(df)[stringr::str_detect(names(df), "date|time")][1]
  flow_col <- names(df)[stringr::str_detect(names(df), "flow|water")][1]

  if(any(is.na(c(datetime_col, flow_col)))) {
    stop(paste("Could not identify timestamp or waterflow columns for", pipe_name))
  }

  df %>%
    transmute(
      datetime = parse_any_datetime(.data[[datetime_col]]),
      pipe = pipe_name,
      waterflow = readr::parse_number(as.character(.data[[flow_col]]))
    ) %>%
    filter(!is.na(datetime)) %>%
    mutate(hour = lubridate::floor_date(datetime, "hour")) %>%
    group_by(pipe, hour) %>%
    summarise(waterflow = mean(waterflow, na.rm = TRUE), .groups = "drop") %>%
    mutate(waterflow = ifelse(is.nan(waterflow), NA_real_, waterflow))
}

water_hourly_raw <- bind_rows(
  prep_pipe(pipe1_raw, "Pipe1"),
  prep_pipe(pipe2_raw, "Pipe2")
) %>%
  arrange(pipe, hour)

kable(head(water_hourly_raw, 12), digits = 2, caption = "Hourly Water Flow Data After Aggregating by Hour")
Hourly Water Flow Data After Aggregating by Hour
pipe hour waterflow
Pipe1 2015-10-23 00:00:00 26.10
Pipe1 2015-10-23 01:00:00 18.85
Pipe1 2015-10-23 02:00:00 15.16
Pipe1 2015-10-23 03:00:00 23.08
Pipe1 2015-10-23 04:00:00 15.48
Pipe1 2015-10-23 05:00:00 22.73
Pipe1 2015-10-23 06:00:00 20.59
Pipe1 2015-10-23 07:00:00 18.37
Pipe1 2015-10-23 08:00:00 20.79
Pipe1 2015-10-23 09:00:00 21.22
Pipe1 2015-10-23 10:00:00 16.63
Pipe1 2015-10-23 11:00:00 22.09
water_hourly_raw %>%
  group_by(pipe) %>%
  summarise(
    first_hour = min(hour),
    last_hour = max(hour),
    hourly_rows = n(),
    missing_flow = sum(is.na(waterflow)),
    .groups = "drop"
  ) %>%
  kable(caption = "Waterflow Data Summary Before Gap Filling")
Waterflow Data Summary Before Gap Filling
pipe first_hour last_hour hourly_rows missing_flow
Pipe1 2015-10-23 00:00:00 2015-11-01 23:00:00 236 0
Pipe2 2015-10-23 01:00:00 2015-12-03 16:00:00 667 0

Missing value and outlier treatment for waterflow

all_water_hours <- seq(min(water_hourly_raw$hour, na.rm = TRUE), max(water_hourly_raw$hour, na.rm = TRUE), by = "hour")

water_completed <- water_hourly_raw %>%
  complete(pipe = c("Pipe1", "Pipe2"), hour = all_water_hours) %>%
  arrange(pipe, hour) %>%
  group_by(pipe) %>%
  mutate(
    waterflow_before_imputation = waterflow,
    waterflow_imputed = impute_numeric_series(waterflow)
  ) %>%
  ungroup()

water_missing_summary <- water_completed %>%
  group_by(pipe) %>%
  summarise(
    missing_before = sum(is.na(waterflow_before_imputation)),
    missing_after = sum(is.na(waterflow_imputed)),
    .groups = "drop"
  )

kable(water_missing_summary, caption = "Waterflow Missing Values Before and After Imputation")
Waterflow Missing Values Before and After Imputation
pipe missing_before missing_after
Pipe1 765 0
Pipe2 334 0
water_outlier_summary <- water_completed %>%
  group_by(pipe) %>%
  summarise(
    lower_limit = get_iqr_limits(waterflow_imputed)[["lower"]],
    upper_limit = get_iqr_limits(waterflow_imputed)[["upper"]],
    outliers_found = sum(waterflow_imputed < lower_limit | waterflow_imputed > upper_limit, na.rm = TRUE),
    .groups = "drop"
  )

kable(water_outlier_summary, digits = 2, caption = "Waterflow Outlier Check Using IQR Rule")
Waterflow Outlier Check Using IQR Rule
pipe lower_limit upper_limit outliers_found
Pipe1 8.92 31.73 0
Pipe2 6.86 72.53 7
water_clean <- water_completed %>%
  group_by(pipe) %>%
  mutate(waterflow_clean = cap_outliers_iqr(waterflow_imputed)) %>%
  ungroup()

water_clean_check <- water_clean %>%
  group_by(pipe) %>%
  summarise(
    missing_clean_values = sum(is.na(waterflow_clean)),
    minimum_clean_waterflow = min(waterflow_clean, na.rm = TRUE),
    maximum_clean_waterflow = max(waterflow_clean, na.rm = TRUE),
    .groups = "drop"
  )

kable(water_clean_check, digits = 2, caption = "Waterflow Cleaned Data Check Before Modeling")
Waterflow Cleaned Data Check Before Modeling
pipe missing_clean_values minimum_clean_waterflow maximum_clean_waterflow
Pipe1 0 8.92 31.73
Pipe2 0 6.86 72.53
ggplot(water_clean, aes(x = hour, y = waterflow_clean)) +
  geom_line() +
  facet_wrap(~ pipe, scales = "free_y") +
  labs(
    title = "Hourly Water Flow After Missing Value and Outlier Treatment",
    x = "Hour",
    y = "Average Water Flow"
  ) +
  theme_minimal()

Stationarity check

I used ADF and KPSS tests to check stationarity. ADF has a null hypothesis of non-stationarity, while KPSS has a null hypothesis of stationarity. Since these tests can sometimes disagree, I used the results as guidance and still compared forecast models directly on a holdout period.

water_stationarity <- water_clean %>%
  group_by(pipe) %>%
  summarise(test_data = list(waterflow_clean), .groups = "drop") %>%
  mutate(stationarity = map(test_data, stationarity_tests)) %>%
  select(-test_data) %>%
  unnest(stationarity)

kable(water_stationarity, digits = 4, caption = "Waterflow Stationarity Tests")
Waterflow Stationarity Tests
pipe ADF_p_value KPSS_p_value Stationarity_Note
Pipe1 0.01 0.1000 Stationary based on ADF and KPSS.
Pipe2 0.01 0.0535 Stationary based on ADF and KPSS.

Waterflow model experimentation

water_model_results <- map_dfr(c("Pipe1", "Pipe2"), function(current_pipe) {
  y <- water_clean %>%
    filter(pipe == current_pipe) %>%
    arrange(hour) %>%
    pull(waterflow_clean)

  compare_forecast_models(y, frequency_value = 24, holdout_h = 24) %>%
    mutate(pipe = current_pipe, .before = 1)
})

kable(water_model_results, digits = 3, caption = "Waterflow Model Comparison Using Holdout RMSE and MAE")
Waterflow Model Comparison Using Holdout RMSE and MAE
pipe Model RMSE MAE
Pipe1 Naive 0.000 0.000
Pipe1 Seasonal naive 0.000 0.000
Pipe1 ETS 0.055 0.055
Pipe1 Mean 0.055 0.055
Pipe1 ARIMA 0.056 0.056
Pipe1 STL + ETS 0.058 0.058
Pipe2 Mean 15.513 13.213
Pipe2 STL + ETS 15.730 13.036
Pipe2 ETS 15.981 13.586
Pipe2 ARIMA 16.033 13.850
Pipe2 Naive 16.907 14.256
Pipe2 Seasonal naive 19.415 16.483
water_best_models <- water_model_results %>%
  filter(!is.na(RMSE)) %>%
  group_by(pipe) %>%
  slice_min(order_by = RMSE, n = 1, with_ties = FALSE) %>%
  ungroup()

kable(water_best_models, digits = 3, caption = "Selected Waterflow Forecasting Models")
Selected Waterflow Forecasting Models
pipe Model RMSE MAE
Pipe1 Naive 0.000 0.000
Pipe2 Mean 15.513 13.213

One week forward waterflow forecast

last_water_hour <- max(water_clean$hour, na.rm = TRUE)
water_forecast_hours <- seq(last_water_hour + lubridate::hours(1), by = "hour", length.out = 24 * 7)

water_forecasts <- map_dfr(c("Pipe1", "Pipe2"), function(current_pipe) {
  y <- water_clean %>%
    filter(pipe == current_pipe) %>%
    arrange(hour) %>%
    pull(waterflow_clean)

  selected_model <- water_best_models %>%
    filter(pipe == current_pipe) %>%
    pull(Model)

  forecast_values <- make_final_forecast_values(
    y = y,
    selected_model = selected_model,
    h = length(water_forecast_hours),
    frequency_value = 24
  )

  tibble(
    Hour = water_forecast_hours,
    Pipe = current_pipe,
    Model = selected_model,
    Forecast_WaterFlow = pmax(forecast_values, 0)
  )
})

kable(head(water_forecasts, 12), digits = 2, caption = "Sample of One Week Forward Waterflow Forecast")
Sample of One Week Forward Waterflow Forecast
Hour Pipe Model Forecast_WaterFlow
2015-12-03 17:00:00 Pipe1 Naive 20.09
2015-12-03 18:00:00 Pipe1 Naive 20.09
2015-12-03 19:00:00 Pipe1 Naive 20.09
2015-12-03 20:00:00 Pipe1 Naive 20.09
2015-12-03 21:00:00 Pipe1 Naive 20.09
2015-12-03 22:00:00 Pipe1 Naive 20.09
2015-12-03 23:00:00 Pipe1 Naive 20.09
2015-12-04 00:00:00 Pipe1 Naive 20.09
2015-12-04 01:00:00 Pipe1 Naive 20.09
2015-12-04 02:00:00 Pipe1 Naive 20.09
2015-12-04 03:00:00 Pipe1 Naive 20.09
2015-12-04 04:00:00 Pipe1 Naive 20.09
water_summary <- water_forecasts %>%
  group_by(Pipe, Model) %>%
  summarise(
    Average_Forecast_WaterFlow = mean(Forecast_WaterFlow),
    Total_Week_Forecast_WaterFlow = sum(Forecast_WaterFlow),
    .groups = "drop"
  )

kable(water_summary, digits = 2, caption = "Waterflow One Week Forecast Summary")
Waterflow One Week Forecast Summary
Pipe Model Average_Forecast_WaterFlow Total_Week_Forecast_WaterFlow
Pipe1 Naive 20.09 3375.37
Pipe2 Mean 39.76 6679.11
ggplot(water_forecasts, aes(x = Hour, y = Forecast_WaterFlow)) +
  geom_line() +
  facet_wrap(~ Pipe, scales = "free_y") +
  labs(
    title = "One Week Forward Waterflow Forecast",
    x = "Hour",
    y = "Forecasted Water Flow"
  ) +
  theme_minimal()

Part C explanation

For the bonus section, I first made the timestamps consistent and aggregated both pipes by hour. When there were multiple readings in the same hour, I used the mean. After that, I filled missing hourly gaps, capped extreme outliers, checked stationarity, and compared forecasting models. A one week hourly forecast was then created for each pipe.

Export Forecasts to Excel

This creates the Excel-readable forecast file when the document is knitted.

wb <- openxlsx::createWorkbook()

openxlsx::addWorksheet(wb, "ATM_Forecast")
openxlsx::writeData(wb, "ATM_Forecast", atm_forecasts)

openxlsx::addWorksheet(wb, "ATM_Model_Comparison")
openxlsx::writeData(wb, "ATM_Model_Comparison", atm_model_results)

openxlsx::addWorksheet(wb, "ATM_Summary")
openxlsx::writeData(wb, "ATM_Summary", atm_summary_forecast)

openxlsx::addWorksheet(wb, "Power_Forecast")
openxlsx::writeData(wb, "Power_Forecast", power_forecasts)

openxlsx::addWorksheet(wb, "Power_Model_Comparison")
openxlsx::writeData(wb, "Power_Model_Comparison", power_model_results)

openxlsx::addWorksheet(wb, "Power_Summary")
openxlsx::writeData(wb, "Power_Summary", power_summary)

openxlsx::addWorksheet(wb, "Water_Forecast")
openxlsx::writeData(wb, "Water_Forecast", water_forecasts)

openxlsx::addWorksheet(wb, "Water_Model_Comparison")
openxlsx::writeData(wb, "Water_Model_Comparison", water_model_results)

openxlsx::addWorksheet(wb, "Water_Stationarity")
openxlsx::writeData(wb, "Water_Stationarity", water_stationarity)

openxlsx::addWorksheet(wb, "Water_Summary")
openxlsx::writeData(wb, "Water_Summary", water_summary)

openxlsx::saveWorkbook(wb, "Project1_Forecasts_Corrected_With_Bonus.xlsx", overwrite = TRUE)

print("Excel forecast file created: Project1_Forecasts_Corrected_With_Bonus.xlsx")
## [1] "Excel forecast file created: Project1_Forecasts_Corrected_With_Bonus.xlsx"

Final Conclusion

In this revised project, I corrected the main problems from the first submission. I checked and treated missing values, handled extreme outliers, and tested multiple forecasting models instead of only relying on the visuals. For Part A, each ATM was modeled separately and ATM3 was corrected by using the recent 3-day mean instead of allowing a 0 forecast. For Part B, I compared model accuracy and selected the final model based on the holdout RMSE. For the bonus Part C, I aggregated the waterflow data by hour, checked stationarity, and produced a one week forward forecast for both pipes.

Session Information

sessionInfo()
## R version 4.5.3 (2026-03-11)
## Platform: x86_64-pc-linux-gnu
## Running under: Ubuntu 24.04.4 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3 
## LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.26.so;  LAPACK version 3.12.0
## 
## locale:
##  [1] LC_CTYPE=C.UTF-8       LC_NUMERIC=C           LC_TIME=C.UTF-8       
##  [4] LC_COLLATE=C.UTF-8     LC_MONETARY=C.UTF-8    LC_MESSAGES=C.UTF-8   
##  [7] LC_PAPER=C.UTF-8       LC_NAME=C              LC_ADDRESS=C          
## [10] LC_TELEPHONE=C         LC_MEASUREMENT=C.UTF-8 LC_IDENTIFICATION=C   
## 
## time zone: UTC
## tzcode source: system (glibc)
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] tseries_0.10-61  knitr_1.51       openxlsx_4.2.8.1 zoo_1.8-15      
##  [5] forecast_9.0.2   readxl_1.4.5     lubridate_1.9.5  forcats_1.0.1   
##  [9] stringr_1.6.0    dplyr_1.2.1      purrr_1.2.2      readr_2.2.0     
## [13] tidyr_1.3.2      tibble_3.3.1     ggplot2_4.0.3    tidyverse_2.0.0 
## 
## loaded via a namespace (and not attached):
##  [1] utf8_1.2.6         sass_0.4.10        generics_0.1.4     lattice_0.22-9    
##  [5] stringi_1.8.7      hms_1.1.4          digest_0.6.39      magrittr_2.0.5    
##  [9] evaluate_1.0.5     grid_4.5.3         timechange_0.4.0   RColorBrewer_1.1-3
## [13] fastmap_1.2.0      cellranger_1.1.0   jsonlite_2.0.0     zip_2.3.3         
## [17] scales_1.4.0       jquerylib_0.1.4    cli_3.6.6          rlang_1.2.0       
## [21] withr_3.0.2        cachem_1.1.0       yaml_2.3.12        tools_4.5.3       
## [25] parallel_4.5.3     tzdb_0.5.0         colorspace_2.1-2   curl_7.1.0        
## [29] vctrs_0.7.3        R6_2.6.1           lifecycle_1.0.5    pkgconfig_2.0.3   
## [33] urca_1.3-4         pillar_1.11.1      bslib_0.10.0       gtable_0.3.6      
## [37] quantmod_0.4.28    glue_1.8.1         Rcpp_1.1.1-1       xfun_0.57         
## [41] tidyselect_1.2.1   rstudioapi_0.18.0  farver_2.1.2       htmltools_0.5.9   
## [45] nlme_3.1-168       labeling_0.4.3     xts_0.14.2         rmarkdown_2.31    
## [49] timeDate_4052.112  fracdiff_1.5-3     compiler_4.5.3     quadprog_1.5-8    
## [53] S7_0.2.2           TTR_0.24.4