Resubmission Note

I revised this project based on the feedback I received. In the original version, I did not clearly show how I handled missing values, extreme outliers, or how I tested different forecasting methods. In this corrected version, I added those steps directly into the analysis.

The main corrections are:

I checked for missing values before forecasting.
I filled missing time series values instead of leaving gaps.
I checked for extreme outliers and capped them using the IQR rule.
I tested ARIMA, ETS, STL-based forecasts, and baseline models.
I compared models using a holdout period instead of only choosing by visual inspection.
I corrected ATM3 so it does not forecast 0. I used the mean of the most recent 3 days because that is more realistic for a short-term ATM forecast.
I also included the optional bonus waterflow section.

Helper Functions

clean_names_basic <- function(df) {
  names(df) <- names(df) %>%
    stringr::str_trim() %>%
    stringr::str_replace_all("[^A-Za-z0-9]+", "_") %>%
    stringr::str_replace_all("_+$", "") %>%
    tolower()
  df
}

read_first_available <- function(excel_files = NULL, csv_files = NULL) {
  if(!is.null(excel_files)) {
    for(file in excel_files) {
      if(file.exists(file)) {
        return(readxl::read_excel(file))
      }
    }
  }

  if(!is.null(csv_files)) {
    for(file in csv_files) {
      if(file.exists(file)) {
        return(readr::read_csv(file, show_col_types = FALSE))
      }
    }
  }

  stop("The file was not found. Please make sure the Excel file is uploaded into the same folder as this RMD.")
}

parse_any_date <- function(x) {
  if(inherits(x, "Date")) return(x)
  if(inherits(x, "POSIXt")) return(as.Date(x))
  if(is.numeric(x)) return(as.Date(x, origin = "1899-12-30"))

  parsed <- suppressWarnings(lubridate::parse_date_time(
    as.character(x),
    orders = c("mdy HMS", "mdy HM", "mdy", "ymd HMS", "ymd HM", "ymd", "Ymd")
  ))
  as.Date(parsed)
}

parse_any_month <- function(x) {
  if(inherits(x, "Date")) return(lubridate::floor_date(x, "month"))
  if(inherits(x, "POSIXt")) return(lubridate::floor_date(as.Date(x), "month"))
  if(is.numeric(x)) return(lubridate::floor_date(as.Date(x, origin = "1899-12-30"), "month"))

  parsed <- suppressWarnings(lubridate::parse_date_time(
    as.character(x),
    orders = c("Y-b", "Y-B", "Y-m", "ym", "my", "mdy", "ymd")
  ))
  lubridate::floor_date(as.Date(parsed), "month")
}

parse_any_datetime <- function(x) {
  if(inherits(x, "POSIXt")) return(as.POSIXct(x, tz = "UTC"))
  if(inherits(x, "Date")) return(as.POSIXct(x, tz = "UTC"))
  if(is.numeric(x)) return(as.POSIXct(as.numeric(x) * 86400, origin = "1899-12-30", tz = "UTC"))

  parsed <- suppressWarnings(lubridate::parse_date_time(
    as.character(x),
    orders = c("ymd HMS", "mdy HMS", "ymd HM", "mdy HM", "ymd", "mdy")
  ))
  as.POSIXct(parsed, tz = "UTC")
}

impute_numeric_series <- function(x) {
  x <- as.numeric(x)

  if(sum(!is.na(x)) == 0) {
    return(rep(0, length(x)))
  }

  x_clean <- zoo::na.approx(x, na.rm = FALSE)
  x_clean <- zoo::na.locf(x_clean, na.rm = FALSE)
  x_clean <- zoo::na.locf(x_clean, fromLast = TRUE, na.rm = FALSE)
  x_clean
}

get_iqr_limits <- function(x) {
  x <- as.numeric(x)
  x_finite <- x[is.finite(x)]

  if(length(x_finite) == 0) {
    return(c(lower = 0, upper = 0))
  }

  q1 <- as.numeric(quantile(x_finite, 0.25, na.rm = TRUE, names = FALSE))
  q3 <- as.numeric(quantile(x_finite, 0.75, na.rm = TRUE, names = FALSE))
  iqr_value <- q3 - q1

  if(is.na(iqr_value) || iqr_value == 0) {
    return(c(lower = min(x_finite, na.rm = TRUE), upper = max(x_finite, na.rm = TRUE)))
  }

  lower_value <- q1 - 1.5 * iqr_value
  upper_value <- q3 + 1.5 * iqr_value

  c(lower = lower_value, upper = upper_value)
}

cap_outliers_iqr <- function(x) {
  x <- as.numeric(x)
  limits <- get_iqr_limits(x)
  pmin(pmax(x, limits[["lower"]]), limits[["upper"]])
}

rmse <- function(actual, predicted) {
  sqrt(mean((actual - predicted)^2, na.rm = TRUE))
}

mae <- function(actual, predicted) {
  mean(abs(actual - predicted), na.rm = TRUE)
}

safe_model_forecast <- function(model_name, y, h, frequency_value) {
  y <- impute_numeric_series(as.numeric(y))

  if(length(y) < 5 || sum(!is.na(y)) < 5) {
    return(NULL)
  }

  y_ts <- ts(y, frequency = frequency_value)

  result <- tryCatch({
    if(model_name == "Mean") {
      forecast::meanf(y_ts, h = h)
    } else if(model_name == "Naive") {
      forecast::naive(y_ts, h = h)
    } else if(model_name == "Seasonal naive") {
      forecast::snaive(y_ts, h = h)
    } else if(model_name == "ETS") {
      forecast::forecast(forecast::ets(y_ts), h = h)
    } else if(model_name == "ARIMA") {
      forecast::forecast(forecast::auto.arima(y_ts), h = h)
    } else if(model_name == "STL + ETS") {
      forecast::stlf(y_ts, h = h, method = "ets")
    } else {
      stop("Unknown model name")
    }
  }, error = function(e) NULL)

  result
}

compare_forecast_models <- function(y, frequency_value, holdout_h) {
  y <- impute_numeric_series(as.numeric(y))

  if(length(y) < 10) {
    return(tibble(Model = character(), RMSE = numeric(), MAE = numeric()))
  }

  holdout_h <- min(holdout_h, max(2, floor(length(y) / 4)))
  train_y <- head(y, length(y) - holdout_h)
  test_y <- tail(y, holdout_h)

  candidate_models <- c("Mean", "Naive", "Seasonal naive", "ETS", "ARIMA", "STL + ETS")

  results <- map_dfr(candidate_models, function(model_name) {
    fc <- safe_model_forecast(model_name, train_y, holdout_h, frequency_value)

    if(is.null(fc)) {
      return(tibble(Model = model_name, RMSE = NA_real_, MAE = NA_real_))
    }

    preds <- as.numeric(fc$mean)

    tibble(
      Model = model_name,
      RMSE = rmse(test_y, preds),
      MAE = mae(test_y, preds)
    )
  }) %>%
    arrange(RMSE)

  results
}

make_final_forecast_values <- function(y, selected_model, h, frequency_value) {
  y <- impute_numeric_series(as.numeric(y))

  if(selected_model %in% c("Recent 3-day mean", "Recent 3-hour mean", "Recent 3-month mean")) {
    return(rep(mean(tail(y, 3), na.rm = TRUE), h))
  }

  fc <- safe_model_forecast(selected_model, y, h, frequency_value)

  if(is.null(fc)) {
    return(rep(mean(tail(y, 3), na.rm = TRUE), h))
  }

  as.numeric(fc$mean)
}

stationarity_tests <- function(y) {
  y <- impute_numeric_series(as.numeric(y))
  y <- y[is.finite(y)]

  adf_result <- tryCatch(tseries::adf.test(y), error = function(e) NULL)
  kpss_result <- tryCatch(tseries::kpss.test(y), error = function(e) NULL)

  tibble(
    ADF_p_value = ifelse(is.null(adf_result), NA_real_, adf_result$p.value),
    KPSS_p_value = ifelse(is.null(kpss_result), NA_real_, kpss_result$p.value),
    Stationarity_Note = case_when(
      !is.na(ADF_p_value) & !is.na(KPSS_p_value) & ADF_p_value < 0.05 & KPSS_p_value > 0.05 ~ "Stationary based on ADF and KPSS.",
      !is.na(ADF_p_value) & ADF_p_value < 0.05 ~ "ADF suggests stationarity, but KPSS should also be reviewed.",
      TRUE ~ "Not clearly stationary, so ARIMA/differencing is useful before forecasting."
    )
  )
}

Part A: ATM Forecast

The goal of Part A is to forecast how much cash is taken out of four different ATM machines for May 2010. The variable Cash is in hundreds of dollars.

Load the ATM data

atm_raw <- read_first_available(
  excel_files = c("ATM624Data (2).xlsx", "ATM624Data (1).xlsx", "ATM624Data.xlsx"),
  csv_files = c("ATM624Data (2).csv", "ATM624Data (1).csv", "ATM624Data.csv")
)

atm_raw <- clean_names_basic(as.data.frame(atm_raw))

glimpse(atm_raw)

## Rows: 1,474
## Columns: 3
## $ date <dbl> 39934, 39934, 39935, 39935, 39936, 39936, 39937, 39937, 39938, 39…
## $ atm  <chr> "ATM1", "ATM2", "ATM1", "ATM2", "ATM1", "ATM2", "ATM1", "ATM2", "…
## $ cash <dbl> 96, 107, 82, 89, 85, 90, 90, 55, 99, 79, 88, 19, 8, 2, 104, 103, …

head(atm_raw)

##    date  atm cash
## 1 39934 ATM1   96
## 2 39934 ATM2  107
## 3 39935 ATM1   82
## 4 39935 ATM2   89
## 5 39936 ATM1   85
## 6 39936 ATM2   90

Clean dates and columns

date_col <- names(atm_raw)[stringr::str_detect(names(atm_raw), "date")][1]
atm_col  <- names(atm_raw)[stringr::str_detect(names(atm_raw), "^atm$|atm")][1]
cash_col <- names(atm_raw)[stringr::str_detect(names(atm_raw), "cash|withdraw")][1]

if(any(is.na(c(date_col, atm_col, cash_col)))) {
  stop("Could not identify the date, ATM, or cash columns. Please check the ATM file column names.")
}

atm_data <- atm_raw %>%
  transmute(
    date = parse_any_date(.data[[date_col]]),
    atm = toupper(stringr::str_replace_all(as.character(.data[[atm_col]]), "\\s+", "")),
    cash = readr::parse_number(as.character(.data[[cash_col]]))
  )

atm_missing_before <- atm_data %>%
  summarise(
    missing_date = sum(is.na(date)),
    missing_atm = sum(is.na(atm) | atm == ""),
    missing_cash = sum(is.na(cash))
  )

kable(atm_missing_before, caption = "ATM Missing Values Before Cleaning")

ATM Missing Values Before Cleaning
missing_date	missing_atm	missing_cash
0	14	19

atm_data <- atm_data %>%
  filter(!is.na(date), !is.na(atm), atm != "") %>%
  filter(atm %in% c("ATM1", "ATM2", "ATM3", "ATM4")) %>%
  group_by(date, atm) %>%
  summarise(cash = mean(cash, na.rm = TRUE), .groups = "drop") %>%
  mutate(cash = ifelse(is.nan(cash), NA_real_, cash))

atm_data %>% count(atm)

## # A tibble: 4 × 2
##   atm       n
##   <chr> <int>
## 1 ATM1    365
## 2 ATM2    365
## 3 ATM3    365
## 4 ATM4    365

Missing value treatment

I completed the daily time series for each ATM and then filled missing cash values. I used interpolation for internal gaps and nearest available values for leading or trailing gaps. This avoids sending missing values directly into ARIMA, ETS, or STL models.

forecast_start_atm <- as.Date("2010-05-01")
forecast_end_atm <- as.Date("2010-05-31")

atm_training_raw <- atm_data %>%
  filter(date < forecast_start_atm)

all_dates_atm <- seq(min(atm_training_raw$date, na.rm = TRUE), as.Date("2010-04-30"), by = "day")
expected_atms <- c("ATM1", "ATM2", "ATM3", "ATM4")

atm_completed <- atm_training_raw %>%
  complete(atm = expected_atms, date = all_dates_atm) %>%
  arrange(atm, date) %>%
  group_by(atm) %>%
  mutate(
    cash_before_imputation = cash,
    cash_imputed = impute_numeric_series(cash)
  ) %>%
  ungroup()

atm_missing_after <- atm_completed %>%
  summarise(
    missing_cash_before = sum(is.na(cash_before_imputation)),
    missing_cash_after = sum(is.na(cash_imputed))
  )

kable(atm_missing_after, caption = "ATM Missing Values Before and After Imputation")

ATM Missing Values Before and After Imputation
missing_cash_before	missing_cash_after
5	0

Extreme outlier treatment

I used the IQR rule to identify extreme values for each ATM. Instead of deleting those rows, I capped unusually low or high values at the lower and upper IQR limits. This directly addresses the issue that extreme values can pull the forecast too far up or down.

atm_outlier_summary <- atm_completed %>%
  group_by(atm) %>%
  summarise(
    lower_limit = get_iqr_limits(cash_imputed)[["lower"]],
    upper_limit = get_iqr_limits(cash_imputed)[["upper"]],
    outliers_found = sum(cash_imputed < lower_limit | cash_imputed > upper_limit, na.rm = TRUE),
    .groups = "drop"
  )

kable(atm_outlier_summary, digits = 2, caption = "ATM Outlier Check Using IQR Rule")

ATM Outlier Check Using IQR Rule
atm	lower_limit	upper_limit	outliers_found
ATM1	20.50	160.50	51
ATM2	-74.50	193.50	0
ATM3	0.00	96.00	0
ATM4	-745.92	1574.77	3

atm_clean <- atm_completed %>%
  left_join(atm_outlier_summary, by = "atm") %>%
  group_by(atm) %>%
  mutate(cash_clean = cap_outliers_iqr(cash_imputed)) %>%
  ungroup()

atm_clean %>%
  select(date, atm, cash_before_imputation, cash_imputed, cash_clean) %>%
  head(12)

## # A tibble: 12 × 5
##    date       atm   cash_before_imputation cash_imputed cash_clean
##    <date>     <chr>                  <dbl>        <dbl>      <dbl>
##  1 2009-05-01 ATM1                      96           96       96  
##  2 2009-05-02 ATM1                      82           82       82  
##  3 2009-05-03 ATM1                      85           85       85  
##  4 2009-05-04 ATM1                      90           90       90  
##  5 2009-05-05 ATM1                      99           99       99  
##  6 2009-05-06 ATM1                      88           88       88  
##  7 2009-05-07 ATM1                       8            8       20.5
##  8 2009-05-08 ATM1                     104          104      104  
##  9 2009-05-09 ATM1                      87           87       87  
## 10 2009-05-10 ATM1                      93           93       93  
## 11 2009-05-11 ATM1                      86           86       86  
## 12 2009-05-12 ATM1                     111          111      111

atm_clean_check <- atm_clean %>%
  group_by(atm) %>%
  summarise(
    missing_clean_values = sum(is.na(cash_clean)),
    minimum_clean_cash = min(cash_clean, na.rm = TRUE),
    maximum_clean_cash = max(cash_clean, na.rm = TRUE),
    .groups = "drop"
  )

kable(atm_clean_check, digits = 2, caption = "ATM Cleaned Data Check Before Modeling")

ATM Cleaned Data Check Before Modeling
atm	minimum_clean_cash	maximum_clean_cash
ATM1	20.50	160.50
ATM2	0.00	147.00
ATM3	0.00	96.00
ATM4	1.56	1574.77

ggplot(atm_clean, aes(x = date, y = cash_clean)) +
  geom_line() +
  facet_wrap(~ atm, scales = "free_y") +
  labs(
    title = "ATM Cash Withdrawals After Missing Value and Outlier Treatment",
    x = "Date",
    y = "Cash in Hundreds of Dollars"
  ) +
  theme_minimal()

Model experimentation

Instead of only guessing from the graph, I tested several methods: Mean, Naive, Seasonal Naive, ETS, ARIMA, and STL + ETS. I compared them using a holdout sample from the end of the training data.

atm_model_results <- map_dfr(expected_atms, function(current_atm) {
  y <- atm_clean %>%
    filter(atm == current_atm) %>%
    arrange(date) %>%
    pull(cash_clean)

  compare_forecast_models(y, frequency_value = 7, holdout_h = 14) %>%
    mutate(atm = current_atm, .before = 1)
})

kable(atm_model_results, digits = 3, caption = "ATM Model Comparison Using Holdout RMSE and MAE")

ATM Model Comparison Using Holdout RMSE and MAE
atm	Model	RMSE	MAE
ATM1	Seasonal naive	10.007	7.286
ATM1	ARIMA	10.089	7.720
ATM1	ETS	10.336	7.370
ATM1	STL + ETS	11.307	7.014
ATM1	Naive	25.206	16.500
ATM1	Mean	26.467	15.782
ATM2	ETS	13.129	9.823
ATM2	Seasonal naive	13.488	10.071
ATM2	STL + ETS	14.804	9.878
ATM2	ARIMA	14.970	12.614
ATM2	Mean	38.446	34.071
ATM2	Naive	40.611	34.071
ATM3	Mean	40.677	18.786
ATM3	Naive	40.677	18.786
ATM3	Seasonal naive	40.677	18.786
ATM3	ETS	40.677	18.786
ATM3	ARIMA	40.677	18.786
ATM3	STL + ETS	40.677	18.786
ATM4	ARIMA	262.512	213.811
ATM4	Mean	267.344	217.079
ATM4	STL + ETS	268.890	207.192
ATM4	ETS	286.723	234.770
ATM4	Seasonal naive	432.328	347.756
ATM4	Naive	565.702	520.298

atm_best_models <- atm_model_results %>%
  filter(!is.na(RMSE)) %>%
  group_by(atm) %>%
  slice_min(order_by = RMSE, n = 1, with_ties = FALSE) %>%
  ungroup() %>%
  mutate(
    Final_Model = if_else(atm == "ATM3", "Recent 3-day mean", Model),
    Reason = case_when(
      atm == "ATM3" ~ "ATM3 should not forecast 0, so I used the mean of the most recent 3 days.",
      TRUE ~ "Selected because it had the lowest holdout RMSE among the tested models."
    )
  )

kable(atm_best_models, digits = 3, caption = "Selected ATM Forecasting Models")

Selected ATM Forecasting Models
atm	Model	RMSE	MAE	Final_Model	Reason
ATM1	Seasonal naive	10.007	7.286	Seasonal naive	Selected because it had the lowest holdout RMSE among the tested models.
ATM2	ETS	13.129	9.823	ETS	Selected because it had the lowest holdout RMSE among the tested models.
ATM3	Mean	40.677	18.786	Recent 3-day mean	ATM3 should not forecast 0, so I used the mean of the most recent 3 days.
ATM4	ARIMA	262.512	213.811	ARIMA	Selected because it had the lowest holdout RMSE among the tested models.

Final May 2010 ATM forecasts

atm_forecast_dates <- seq(forecast_start_atm, forecast_end_atm, by = "day")

atm_forecasts <- map_dfr(expected_atms, function(current_atm) {
  y <- atm_clean %>%
    filter(atm == current_atm) %>%
    arrange(date) %>%
    pull(cash_clean)

  selected_model <- atm_best_models %>%
    filter(atm == current_atm) %>%
    pull(Final_Model)

  forecast_values <- make_final_forecast_values(
    y = y,
    selected_model = selected_model,
    h = length(atm_forecast_dates),
    frequency_value = 7
  )

  if(current_atm == "ATM3") {
    forecast_values <- rep(mean(tail(y, 3), na.rm = TRUE), length(atm_forecast_dates))
  }

  tibble(
    date = atm_forecast_dates,
    ATM = current_atm,
    Model = selected_model,
    Forecast_Cash_Hundreds = pmax(forecast_values, 0),
    Forecast_Dollars = Forecast_Cash_Hundreds * 100
  )
})

kable(head(atm_forecasts, 12), digits = 2, caption = "Sample of Final ATM Forecasts for May 2010")

Sample of Final ATM Forecasts for May 2010
date	ATM	Model	Forecast_Cash_Hundreds	Forecast_Dollars
2010-05-01	ATM1	Seasonal naive	85.0	8500
2010-05-02	ATM1	Seasonal naive	109.0	10900
2010-05-03	ATM1	Seasonal naive	74.0	7400
2010-05-04	ATM1	Seasonal naive	20.5	2050
2010-05-05	ATM1	Seasonal naive	96.0	9600
2010-05-06	ATM1	Seasonal naive	82.0	8200
2010-05-07	ATM1	Seasonal naive	85.0	8500
2010-05-08	ATM1	Seasonal naive	85.0	8500
2010-05-09	ATM1	Seasonal naive	109.0	10900
2010-05-10	ATM1	Seasonal naive	74.0	7400
2010-05-11	ATM1	Seasonal naive	20.5	2050
2010-05-12	ATM1	Seasonal naive	96.0	9600

atm_summary_forecast <- atm_forecasts %>%
  group_by(ATM, Model) %>%
  summarise(
    Total_May_Cash_Hundreds = sum(Forecast_Cash_Hundreds),
    Total_May_Dollars = sum(Forecast_Dollars),
    Average_Daily_Cash_Hundreds = mean(Forecast_Cash_Hundreds),
    .groups = "drop"
  )

kable(atm_summary_forecast, digits = 2, caption = "ATM May 2010 Forecast Summary")

ATM May 2010 Forecast Summary
ATM	Model	Total_May_Cash_Hundreds	Total_May_Dollars	Average_Daily_Cash_Hundreds
ATM1	Seasonal naive	2474.00	247400.0	79.81
ATM2	ETS	1827.57	182757.3	58.95
ATM3	Recent 3-day mean	2717.67	271766.7	87.67
ATM4	ARIMA	13445.40	1344540.3	433.72

ggplot(atm_forecasts, aes(x = date, y = Forecast_Cash_Hundreds)) +
  geom_line() +
  facet_wrap(~ ATM, scales = "free_y") +
  labs(
    title = "Final ATM Forecasts for May 2010",
    x = "Date",
    y = "Forecasted Cash in Hundreds of Dollars"
  ) +
  theme_minimal()

Part A explanation

For Part A, I treated each ATM separately because the machines do not all behave the same way. I checked missing values, filled daily gaps, capped extreme outliers, and tested multiple forecasting models. For ATM3, I used the recent 3-day mean because the original 0 forecast was not reasonable for a business cash forecast.

Part B: Residential Power Forecast

The goal of Part B is to forecast monthly residential power usage for 2014. The KWH variable represents power consumption in Kilowatt hours.

Load the power data

power_raw <- read_first_available(
  excel_files = c("ResidentialCustomerForecastLoad-624 (2).xlsx", "ResidentialCustomerForecastLoad-624 (1).xlsx", "ResidentialCustomerForecastLoad-624.xlsx"),
  csv_files = c("ResidentialCustomerForecastLoad-624 (2).csv", "ResidentialCustomerForecastLoad-624 (1).csv", "ResidentialCustomerForecastLoad-624.csv")
)

power_raw <- clean_names_basic(as.data.frame(power_raw))

glimpse(power_raw)

## Rows: 192
## Columns: 3
## $ casesequence <dbl> 733, 734, 735, 736, 737, 738, 739, 740, 741, 742, 743, 74…
## $ yyyy_mmm     <chr> "1998-Jan", "1998-Feb", "1998-Mar", "1998-Apr", "1998-May…
## $ kwh          <dbl> 6862583, 5838198, 5420658, 5010364, 4665377, 6467147, 891…

head(power_raw)

##   casesequence yyyy_mmm     kwh
## 1          733 1998-Jan 6862583
## 2          734 1998-Feb 5838198
## 3          735 1998-Mar 5420658
## 4          736 1998-Apr 5010364
## 5          737 1998-May 4665377
## 6          738 1998-Jun 6467147

Clean the power data and handle missing values

month_col <- names(power_raw)[stringr::str_detect(names(power_raw), "yyyy|month|date")][1]
kwh_col <- names(power_raw)[stringr::str_detect(names(power_raw), "kwh|usage|power|load")][1]

if(any(is.na(c(month_col, kwh_col)))) {
  stop("Could not identify the month or KWH columns. Please check the power file column names.")
}

power_data <- power_raw %>%
  transmute(
    month = parse_any_month(.data[[month_col]]),
    kwh = readr::parse_number(as.character(.data[[kwh_col]]))
  ) %>%
  filter(!is.na(month)) %>%
  group_by(month) %>%
  summarise(kwh = mean(kwh, na.rm = TRUE), .groups = "drop") %>%
  mutate(kwh = ifelse(is.nan(kwh), NA_real_, kwh)) %>%
  arrange(month)

power_missing_before <- power_data %>%
  summarise(
    missing_month = sum(is.na(month)),
    missing_kwh = sum(is.na(kwh))
  )

kable(power_missing_before, caption = "Power Missing Values Before Cleaning")

Power Missing Values Before Cleaning
missing_month	missing_kwh
0	1

all_months_power <- seq(min(power_data$month, na.rm = TRUE), as.Date("2013-12-01"), by = "month")

power_completed <- power_data %>%
  filter(month <= as.Date("2013-12-01")) %>%
  complete(month = all_months_power) %>%
  arrange(month) %>%
  mutate(
    kwh_before_imputation = kwh,
    kwh_imputed = impute_numeric_series(kwh)
  )

power_missing_after <- power_completed %>%
  summarise(
    missing_kwh_before = sum(is.na(kwh_before_imputation)),
    missing_kwh_after = sum(is.na(kwh_imputed))
  )

kable(power_missing_after, caption = "Power Missing Values Before and After Imputation")

Power Missing Values Before and After Imputation
missing_kwh_before	missing_kwh_after
1	0

Extreme outlier treatment for power usage

power_limits <- get_iqr_limits(power_completed$kwh_imputed)

power_outlier_summary <- tibble(
  lower_limit = power_limits[["lower"]],
  upper_limit = power_limits[["upper"]],
  outliers_found = sum(power_completed$kwh_imputed < power_limits[["lower"]] |
                         power_completed$kwh_imputed > power_limits[["upper"]], na.rm = TRUE)
)

kable(power_outlier_summary, digits = 2, caption = "Power Outlier Check Using IQR Rule")

Power Outlier Check Using IQR Rule
lower_limit	upper_limit	outliers_found
2173160	10870171	1

power_clean <- power_completed %>%
  mutate(kwh_clean = cap_outliers_iqr(kwh_imputed))

head(power_clean)

## # A tibble: 6 × 5
##   month          kwh kwh_before_imputation kwh_imputed kwh_clean
##   <date>       <dbl>                 <dbl>       <dbl>     <dbl>
## 1 1998-01-01 6862583               6862583     6862583   6862583
## 2 1998-02-01 5838198               5838198     5838198   5838198
## 3 1998-03-01 5420658               5420658     5420658   5420658
## 4 1998-04-01 5010364               5010364     5010364   5010364
## 5 1998-05-01 4665377               4665377     4665377   4665377
## 6 1998-06-01 6467147               6467147     6467147   6467147

power_clean_check <- power_clean %>%
  summarise(
    missing_clean_values = sum(is.na(kwh_clean)),
    minimum_clean_kwh = min(kwh_clean, na.rm = TRUE),
    maximum_clean_kwh = max(kwh_clean, na.rm = TRUE)
  )

kable(power_clean_check, digits = 0, caption = "Power Cleaned Data Check Before Modeling")

Power Cleaned Data Check Before Modeling
missing_clean_values	minimum_clean_kwh	maximum_clean_kwh
0	2173160	10655730

ggplot(power_clean, aes(x = month, y = kwh_clean)) +
  geom_line() +
  labs(
    title = "Residential Power Usage After Missing Value and Outlier Treatment",
    x = "Month",
    y = "KWH"
  ) +
  theme_minimal()

Power model experimentation

power_y <- power_clean %>%
  arrange(month) %>%
  pull(kwh_clean)

power_model_results <- compare_forecast_models(power_y, frequency_value = 12, holdout_h = 12)

kable(power_model_results, digits = 3, caption = "Power Model Comparison Using Holdout RMSE and MAE")

Power Model Comparison Using Holdout RMSE and MAE
Model	RMSE	MAE
STL + ETS	934619.3	620347.2
Seasonal naive	1035537.6	618605.6
ETS	1061192.1	674086.6
ARIMA	1336314.1	917975.7
Naive	1861178.6	1501420.0
Mean	1960601.4	1543871.8

power_best_model <- power_model_results %>%
  filter(!is.na(RMSE)) %>%
  slice_min(order_by = RMSE, n = 1, with_ties = FALSE) %>%
  pull(Model)

power_best_model

## [1] "STL + ETS"

Final 2014 power forecast

power_forecast_months <- seq(as.Date("2014-01-01"), as.Date("2014-12-01"), by = "month")

power_forecast_values <- make_final_forecast_values(
  y = power_y,
  selected_model = power_best_model,
  h = 12,
  frequency_value = 12
)

power_forecasts <- tibble(
  Month = power_forecast_months,
  Model = power_best_model,
  Forecast_KWH = pmax(power_forecast_values, 0)
)

kable(power_forecasts, digits = 0, caption = "Final Residential Power Forecast for 2014")

Final Residential Power Forecast for 2014
Month	Model	Forecast_KWH
2014-01-01	STL + ETS	9698698
2014-02-01	STL + ETS	8666786
2014-03-01	STL + ETS	7199920
2014-04-01	STL + ETS	6405994
2014-05-01	STL + ETS	6085305
2014-06-01	STL + ETS	7800311
2014-07-01	STL + ETS	8230325
2014-08-01	STL + ETS	9513015
2014-09-01	STL + ETS	8465142
2014-10-01	STL + ETS	6302063
2014-11-01	STL + ETS	6001809
2014-12-01	STL + ETS	7888830

power_summary <- power_forecasts %>%
  summarise(
    Total_2014_KWH = sum(Forecast_KWH),
    Average_Monthly_KWH = mean(Forecast_KWH)
  )

kable(power_summary, digits = 0, caption = "Power Forecast Summary")

Power Forecast Summary
Total_2014_KWH	Average_Monthly_KWH
92258197	7688183

ggplot(power_forecasts, aes(x = Month, y = Forecast_KWH)) +
  geom_line() +
  geom_point() +
  labs(
    title = "Final Residential Power Forecast for 2014",
    x = "Month",
    y = "Forecasted KWH"
  ) +
  theme_minimal()

Part B explanation

For Part B, I checked for missing values, filled missing months, capped extreme outliers, and compared ARIMA, ETS, STL + ETS, and baseline models. The selected model is based on the lowest holdout RMSE instead of only using visual judgment.

Part C: Bonus Waterflow Forecast

The bonus section uses Waterflow_Pipe1 and Waterflow_Pipe2. Since the two files have different timestamps, I first converted the timestamp, aggregated each pipe to the hourly level, and took the mean when there were multiple readings in one hour. Then I checked stationarity and created a one week forward forecast.

Load and aggregate waterflow data

pipe1_raw <- read_first_available(
  excel_files = c("Waterflow_Pipe1 (2).xlsx", "Waterflow_Pipe1 (1).xlsx", "Waterflow_Pipe1.xlsx"),
  csv_files = c("Waterflow_Pipe1 (2).csv", "Waterflow_Pipe1 (1).csv", "Waterflow_Pipe1.csv")
)

pipe2_raw <- read_first_available(
  excel_files = c("Waterflow_Pipe2 (2).xlsx", "Waterflow_Pipe2 (1).xlsx", "Waterflow_Pipe2.xlsx"),
  csv_files = c("Waterflow_Pipe2 (2).csv", "Waterflow_Pipe2 (1).csv", "Waterflow_Pipe2.csv")
)

pipe1_raw <- clean_names_basic(as.data.frame(pipe1_raw))
pipe2_raw <- clean_names_basic(as.data.frame(pipe2_raw))

prep_pipe <- function(df, pipe_name) {
  datetime_col <- names(df)[stringr::str_detect(names(df), "date|time")][1]
  flow_col <- names(df)[stringr::str_detect(names(df), "flow|water")][1]

  if(any(is.na(c(datetime_col, flow_col)))) {
    stop(paste("Could not identify timestamp or waterflow columns for", pipe_name))
  }

  df %>%
    transmute(
      datetime = parse_any_datetime(.data[[datetime_col]]),
      pipe = pipe_name,
      waterflow = readr::parse_number(as.character(.data[[flow_col]]))
    ) %>%
    filter(!is.na(datetime)) %>%
    mutate(hour = lubridate::floor_date(datetime, "hour")) %>%
    group_by(pipe, hour) %>%
    summarise(waterflow = mean(waterflow, na.rm = TRUE), .groups = "drop") %>%
    mutate(waterflow = ifelse(is.nan(waterflow), NA_real_, waterflow))
}

water_hourly_raw <- bind_rows(
  prep_pipe(pipe1_raw, "Pipe1"),
  prep_pipe(pipe2_raw, "Pipe2")
) %>%
  arrange(pipe, hour)

kable(head(water_hourly_raw, 12), digits = 2, caption = "Hourly Water Flow Data After Aggregating by Hour")

Hourly Water Flow Data After Aggregating by Hour
pipe	hour	waterflow
Pipe1	2015-10-23 00:00:00	26.10
Pipe1	2015-10-23 01:00:00	18.85
Pipe1	2015-10-23 02:00:00	15.16
Pipe1	2015-10-23 03:00:00	23.08
Pipe1	2015-10-23 04:00:00	15.48
Pipe1	2015-10-23 05:00:00	22.73
Pipe1	2015-10-23 06:00:00	20.59
Pipe1	2015-10-23 07:00:00	18.37
Pipe1	2015-10-23 08:00:00	20.79
Pipe1	2015-10-23 09:00:00	21.22
Pipe1	2015-10-23 10:00:00	16.63
Pipe1	2015-10-23 11:00:00	22.09

water_hourly_raw %>%
  group_by(pipe) %>%
  summarise(
    first_hour = min(hour),
    last_hour = max(hour),
    hourly_rows = n(),
    missing_flow = sum(is.na(waterflow)),
    .groups = "drop"
  ) %>%
  kable(caption = "Waterflow Data Summary Before Gap Filling")

Waterflow Data Summary Before Gap Filling
pipe	first_hour	last_hour	hourly_rows	missing_flow
Pipe1	2015-10-23 00:00:00	2015-11-01 23:00:00	236	0
Pipe2	2015-10-23 01:00:00	2015-12-03 16:00:00	667	0

Missing value and outlier treatment for waterflow

all_water_hours <- seq(min(water_hourly_raw$hour, na.rm = TRUE), max(water_hourly_raw$hour, na.rm = TRUE), by = "hour")

water_completed <- water_hourly_raw %>%
  complete(pipe = c("Pipe1", "Pipe2"), hour = all_water_hours) %>%
  arrange(pipe, hour) %>%
  group_by(pipe) %>%
  mutate(
    waterflow_before_imputation = waterflow,
    waterflow_imputed = impute_numeric_series(waterflow)
  ) %>%
  ungroup()

water_missing_summary <- water_completed %>%
  group_by(pipe) %>%
  summarise(
    missing_before = sum(is.na(waterflow_before_imputation)),
    missing_after = sum(is.na(waterflow_imputed)),
    .groups = "drop"
  )

kable(water_missing_summary, caption = "Waterflow Missing Values Before and After Imputation")

Waterflow Missing Values Before and After Imputation
pipe	missing_before	missing_after
Pipe1	765	0
Pipe2	334	0

water_outlier_summary <- water_completed %>%
  group_by(pipe) %>%
  summarise(
    lower_limit = get_iqr_limits(waterflow_imputed)[["lower"]],
    upper_limit = get_iqr_limits(waterflow_imputed)[["upper"]],
    outliers_found = sum(waterflow_imputed < lower_limit | waterflow_imputed > upper_limit, na.rm = TRUE),
    .groups = "drop"
  )

kable(water_outlier_summary, digits = 2, caption = "Waterflow Outlier Check Using IQR Rule")

Waterflow Outlier Check Using IQR Rule
pipe	lower_limit	upper_limit	outliers_found
Pipe1	8.92	31.73	0
Pipe2	6.86	72.53	7

water_clean <- water_completed %>%
  group_by(pipe) %>%
  mutate(waterflow_clean = cap_outliers_iqr(waterflow_imputed)) %>%
  ungroup()

water_clean_check <- water_clean %>%
  group_by(pipe) %>%
  summarise(
    missing_clean_values = sum(is.na(waterflow_clean)),
    minimum_clean_waterflow = min(waterflow_clean, na.rm = TRUE),
    maximum_clean_waterflow = max(waterflow_clean, na.rm = TRUE),
    .groups = "drop"
  )

kable(water_clean_check, digits = 2, caption = "Waterflow Cleaned Data Check Before Modeling")

Waterflow Cleaned Data Check Before Modeling
pipe	missing_clean_values	minimum_clean_waterflow	maximum_clean_waterflow
Pipe1	0	8.92	31.73
Pipe2	0	6.86	72.53

ggplot(water_clean, aes(x = hour, y = waterflow_clean)) +
  geom_line() +
  facet_wrap(~ pipe, scales = "free_y") +
  labs(
    title = "Hourly Water Flow After Missing Value and Outlier Treatment",
    x = "Hour",
    y = "Average Water Flow"
  ) +
  theme_minimal()

Stationarity check

I used ADF and KPSS tests to check stationarity. ADF has a null hypothesis of non-stationarity, while KPSS has a null hypothesis of stationarity. Since these tests can sometimes disagree, I used the results as guidance and still compared forecast models directly on a holdout period.

water_stationarity <- water_clean %>%
  group_by(pipe) %>%
  summarise(test_data = list(waterflow_clean), .groups = "drop") %>%
  mutate(stationarity = map(test_data, stationarity_tests)) %>%
  select(-test_data) %>%
  unnest(stationarity)

kable(water_stationarity, digits = 4, caption = "Waterflow Stationarity Tests")

Waterflow Stationarity Tests
pipe	ADF_p_value	KPSS_p_value	Stationarity_Note
Pipe1	0.01	0.1000	Stationary based on ADF and KPSS.
Pipe2	0.01	0.0535	Stationary based on ADF and KPSS.

Waterflow model experimentation

water_model_results <- map_dfr(c("Pipe1", "Pipe2"), function(current_pipe) {
  y <- water_clean %>%
    filter(pipe == current_pipe) %>%
    arrange(hour) %>%
    pull(waterflow_clean)

  compare_forecast_models(y, frequency_value = 24, holdout_h = 24) %>%
    mutate(pipe = current_pipe, .before = 1)
})

kable(water_model_results, digits = 3, caption = "Waterflow Model Comparison Using Holdout RMSE and MAE")

Waterflow Model Comparison Using Holdout RMSE and MAE
pipe	Model	RMSE	MAE
Pipe1	Naive	0.000	0.000
Pipe1	Seasonal naive	0.000	0.000
Pipe1	ETS	0.055	0.055
Pipe1	Mean	0.055	0.055
Pipe1	ARIMA	0.056	0.056
Pipe1	STL + ETS	0.058	0.058
Pipe2	Mean	15.513	13.213
Pipe2	STL + ETS	15.730	13.036
Pipe2	ETS	15.981	13.586
Pipe2	ARIMA	16.033	13.850
Pipe2	Naive	16.907	14.256
Pipe2	Seasonal naive	19.415	16.483

water_best_models <- water_model_results %>%
  filter(!is.na(RMSE)) %>%
  group_by(pipe) %>%
  slice_min(order_by = RMSE, n = 1, with_ties = FALSE) %>%
  ungroup()

kable(water_best_models, digits = 3, caption = "Selected Waterflow Forecasting Models")

Selected Waterflow Forecasting Models
pipe	Model	RMSE	MAE
Pipe1	Naive	0.000	0.000
Pipe2	Mean	15.513	13.213

One week forward waterflow forecast

last_water_hour <- max(water_clean$hour, na.rm = TRUE)
water_forecast_hours <- seq(last_water_hour + lubridate::hours(1), by = "hour", length.out = 24 * 7)

water_forecasts <- map_dfr(c("Pipe1", "Pipe2"), function(current_pipe) {
  y <- water_clean %>%
    filter(pipe == current_pipe) %>%
    arrange(hour) %>%
    pull(waterflow_clean)

  selected_model <- water_best_models %>%
    filter(pipe == current_pipe) %>%
    pull(Model)

  forecast_values <- make_final_forecast_values(
    y = y,
    selected_model = selected_model,
    h = length(water_forecast_hours),
    frequency_value = 24
  )

  tibble(
    Hour = water_forecast_hours,
    Pipe = current_pipe,
    Model = selected_model,
    Forecast_WaterFlow = pmax(forecast_values, 0)
  )
})

kable(head(water_forecasts, 12), digits = 2, caption = "Sample of One Week Forward Waterflow Forecast")

Sample of One Week Forward Waterflow Forecast
Hour	Pipe	Model	Forecast_WaterFlow
2015-12-03 17:00:00	Pipe1	Naive	20.09
2015-12-03 18:00:00	Pipe1	Naive	20.09
2015-12-03 19:00:00	Pipe1	Naive	20.09
2015-12-03 20:00:00	Pipe1	Naive	20.09
2015-12-03 21:00:00	Pipe1	Naive	20.09
2015-12-03 22:00:00	Pipe1	Naive	20.09
2015-12-03 23:00:00	Pipe1	Naive	20.09
2015-12-04 00:00:00	Pipe1	Naive	20.09
2015-12-04 01:00:00	Pipe1	Naive	20.09
2015-12-04 02:00:00	Pipe1	Naive	20.09
2015-12-04 03:00:00	Pipe1	Naive	20.09
2015-12-04 04:00:00	Pipe1	Naive	20.09

water_summary <- water_forecasts %>%
  group_by(Pipe, Model) %>%
  summarise(
    Average_Forecast_WaterFlow = mean(Forecast_WaterFlow),
    Total_Week_Forecast_WaterFlow = sum(Forecast_WaterFlow),
    .groups = "drop"
  )

kable(water_summary, digits = 2, caption = "Waterflow One Week Forecast Summary")

Waterflow One Week Forecast Summary
Pipe	Model	Average_Forecast_WaterFlow	Total_Week_Forecast_WaterFlow
Pipe1	Naive	20.09	3375.37
Pipe2	Mean	39.76	6679.11

ggplot(water_forecasts, aes(x = Hour, y = Forecast_WaterFlow)) +
  geom_line() +
  facet_wrap(~ Pipe, scales = "free_y") +
  labs(
    title = "One Week Forward Waterflow Forecast",
    x = "Hour",
    y = "Forecasted Water Flow"
  ) +
  theme_minimal()

Part C explanation

For the bonus section, I first made the timestamps consistent and aggregated both pipes by hour. When there were multiple readings in the same hour, I used the mean. After that, I filled missing hourly gaps, capped extreme outliers, checked stationarity, and compared forecasting models. A one week hourly forecast was then created for each pipe.

Export Forecasts to Excel

This creates the Excel-readable forecast file when the document is knitted.

wb <- openxlsx::createWorkbook()

openxlsx::addWorksheet(wb, "ATM_Forecast")
openxlsx::writeData(wb, "ATM_Forecast", atm_forecasts)

openxlsx::addWorksheet(wb, "ATM_Model_Comparison")
openxlsx::writeData(wb, "ATM_Model_Comparison", atm_model_results)

openxlsx::addWorksheet(wb, "ATM_Summary")
openxlsx::writeData(wb, "ATM_Summary", atm_summary_forecast)

openxlsx::addWorksheet(wb, "Power_Forecast")
openxlsx::writeData(wb, "Power_Forecast", power_forecasts)

openxlsx::addWorksheet(wb, "Power_Model_Comparison")
openxlsx::writeData(wb, "Power_Model_Comparison", power_model_results)

openxlsx::addWorksheet(wb, "Power_Summary")
openxlsx::writeData(wb, "Power_Summary", power_summary)

openxlsx::addWorksheet(wb, "Water_Forecast")
openxlsx::writeData(wb, "Water_Forecast", water_forecasts)

openxlsx::addWorksheet(wb, "Water_Model_Comparison")
openxlsx::writeData(wb, "Water_Model_Comparison", water_model_results)

openxlsx::addWorksheet(wb, "Water_Stationarity")
openxlsx::writeData(wb, "Water_Stationarity", water_stationarity)

openxlsx::addWorksheet(wb, "Water_Summary")
openxlsx::writeData(wb, "Water_Summary", water_summary)

openxlsx::saveWorkbook(wb, "Project1_Forecasts_Corrected_With_Bonus.xlsx", overwrite = TRUE)

print("Excel forecast file created: Project1_Forecasts_Corrected_With_Bonus.xlsx")

## [1] "Excel forecast file created: Project1_Forecasts_Corrected_With_Bonus.xlsx"

Final Conclusion

In this revised project, I corrected the main problems from the first submission. I checked and treated missing values, handled extreme outliers, and tested multiple forecasting models instead of only relying on the visuals. For Part A, each ATM was modeled separately and ATM3 was corrected by using the recent 3-day mean instead of allowing a 0 forecast. For Part B, I compared model accuracy and selected the final model based on the holdout RMSE. For the bonus Part C, I aggregated the waterflow data by hour, checked stationarity, and produced a one week forward forecast for both pipes.

Session Information

sessionInfo()

## R version 4.5.3 (2026-03-11)
## Platform: x86_64-pc-linux-gnu
## Running under: Ubuntu 24.04.4 LTS
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/openblas-pthread/libblas.so.3 
## LAPACK: /usr/lib/x86_64-linux-gnu/openblas-pthread/libopenblasp-r0.3.26.so;  LAPACK version 3.12.0
## 
## locale:
##  [1] LC_CTYPE=C.UTF-8       LC_NUMERIC=C           LC_TIME=C.UTF-8       
##  [4] LC_COLLATE=C.UTF-8     LC_MONETARY=C.UTF-8    LC_MESSAGES=C.UTF-8   
##  [7] LC_PAPER=C.UTF-8       LC_NAME=C              LC_ADDRESS=C          
## [10] LC_TELEPHONE=C         LC_MEASUREMENT=C.UTF-8 LC_IDENTIFICATION=C   
## 
## time zone: UTC
## tzcode source: system (glibc)
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] tseries_0.10-61  knitr_1.51       openxlsx_4.2.8.1 zoo_1.8-15      
##  [5] forecast_9.0.2   readxl_1.4.5     lubridate_1.9.5  forcats_1.0.1   
##  [9] stringr_1.6.0    dplyr_1.2.1      purrr_1.2.2      readr_2.2.0     
## [13] tidyr_1.3.2      tibble_3.3.1     ggplot2_4.0.3    tidyverse_2.0.0 
## 
## loaded via a namespace (and not attached):
##  [1] utf8_1.2.6         sass_0.4.10        generics_0.1.4     lattice_0.22-9    
##  [5] stringi_1.8.7      hms_1.1.4          digest_0.6.39      magrittr_2.0.5    
##  [9] evaluate_1.0.5     grid_4.5.3         timechange_0.4.0   RColorBrewer_1.1-3
## [13] fastmap_1.2.0      cellranger_1.1.0   jsonlite_2.0.0     zip_2.3.3         
## [17] scales_1.4.0       jquerylib_0.1.4    cli_3.6.6          rlang_1.2.0       
## [21] withr_3.0.2        cachem_1.1.0       yaml_2.3.12        tools_4.5.3       
## [25] parallel_4.5.3     tzdb_0.5.0         colorspace_2.1-2   curl_7.1.0        
## [29] vctrs_0.7.3        R6_2.6.1           lifecycle_1.0.5    pkgconfig_2.0.3   
## [33] urca_1.3-4         pillar_1.11.1      bslib_0.10.0       gtable_0.3.6      
## [37] quantmod_0.4.28    glue_1.8.1         Rcpp_1.1.1-1       xfun_0.57         
## [41] tidyselect_1.2.1   rstudioapi_0.18.0  farver_2.1.2       htmltools_0.5.9   
## [45] nlme_3.1-168       labeling_0.4.3     xts_0.14.2         rmarkdown_2.31    
## [49] timeDate_4052.112  fracdiff_1.5-3     compiler_4.5.3     quadprog_1.5-8    
## [53] S7_0.2.2           TTR_0.24.4

DATA 624 Project 1 Resubmission - Corrected Forecasting Analysis

Sachi Kapoor

April 26, 2026