Resubmission Note
I revised this project based on the feedback I received. In the
original version, I did not clearly show how I handled missing values,
extreme outliers, or how I tested different forecasting methods. In this
corrected version, I added those steps directly into the analysis.
The main corrections are:
- I checked for missing values before forecasting.
- I filled missing time series values instead of leaving gaps.
- I checked for extreme outliers and capped them using the IQR
rule.
- I tested ARIMA, ETS, STL-based forecasts, and baseline models.
- I compared models using a holdout period instead of only choosing by
visual inspection.
- I corrected ATM3 so it does not forecast 0. I used the mean of the
most recent 3 days because that is more realistic for a short-term ATM
forecast.
- I also included the optional bonus waterflow section.
Helper Functions
clean_names_basic <- function(df) {
names(df) <- names(df) %>%
stringr::str_trim() %>%
stringr::str_replace_all("[^A-Za-z0-9]+", "_") %>%
stringr::str_replace_all("_+$", "") %>%
tolower()
df
}
read_first_available <- function(excel_files = NULL, csv_files = NULL) {
if(!is.null(excel_files)) {
for(file in excel_files) {
if(file.exists(file)) {
return(readxl::read_excel(file))
}
}
}
if(!is.null(csv_files)) {
for(file in csv_files) {
if(file.exists(file)) {
return(readr::read_csv(file, show_col_types = FALSE))
}
}
}
stop("The file was not found. Please make sure the Excel file is uploaded into the same folder as this RMD.")
}
parse_any_date <- function(x) {
if(inherits(x, "Date")) return(x)
if(inherits(x, "POSIXt")) return(as.Date(x))
if(is.numeric(x)) return(as.Date(x, origin = "1899-12-30"))
parsed <- suppressWarnings(lubridate::parse_date_time(
as.character(x),
orders = c("mdy HMS", "mdy HM", "mdy", "ymd HMS", "ymd HM", "ymd", "Ymd")
))
as.Date(parsed)
}
parse_any_month <- function(x) {
if(inherits(x, "Date")) return(lubridate::floor_date(x, "month"))
if(inherits(x, "POSIXt")) return(lubridate::floor_date(as.Date(x), "month"))
if(is.numeric(x)) return(lubridate::floor_date(as.Date(x, origin = "1899-12-30"), "month"))
parsed <- suppressWarnings(lubridate::parse_date_time(
as.character(x),
orders = c("Y-b", "Y-B", "Y-m", "ym", "my", "mdy", "ymd")
))
lubridate::floor_date(as.Date(parsed), "month")
}
parse_any_datetime <- function(x) {
if(inherits(x, "POSIXt")) return(as.POSIXct(x, tz = "UTC"))
if(inherits(x, "Date")) return(as.POSIXct(x, tz = "UTC"))
if(is.numeric(x)) return(as.POSIXct(as.numeric(x) * 86400, origin = "1899-12-30", tz = "UTC"))
parsed <- suppressWarnings(lubridate::parse_date_time(
as.character(x),
orders = c("ymd HMS", "mdy HMS", "ymd HM", "mdy HM", "ymd", "mdy")
))
as.POSIXct(parsed, tz = "UTC")
}
impute_numeric_series <- function(x) {
x <- as.numeric(x)
if(sum(!is.na(x)) == 0) {
return(rep(0, length(x)))
}
x_clean <- zoo::na.approx(x, na.rm = FALSE)
x_clean <- zoo::na.locf(x_clean, na.rm = FALSE)
x_clean <- zoo::na.locf(x_clean, fromLast = TRUE, na.rm = FALSE)
x_clean
}
get_iqr_limits <- function(x) {
x <- as.numeric(x)
q1 <- quantile(x, 0.25, na.rm = TRUE)
q3 <- quantile(x, 0.75, na.rm = TRUE)
iqr_value <- q3 - q1
if(is.na(iqr_value) || iqr_value == 0) {
return(c(lower = min(x, na.rm = TRUE), upper = max(x, na.rm = TRUE)))
}
c(
lower = q1 - 1.5 * iqr_value,
upper = q3 + 1.5 * iqr_value
)
}
cap_outliers_iqr <- function(x) {
limits <- get_iqr_limits(x)
pmin(pmax(as.numeric(x), limits["lower"]), limits["upper"])
}
rmse <- function(actual, predicted) {
sqrt(mean((actual - predicted)^2, na.rm = TRUE))
}
mae <- function(actual, predicted) {
mean(abs(actual - predicted), na.rm = TRUE)
}
safe_model_forecast <- function(model_name, y, h, frequency_value) {
y <- impute_numeric_series(as.numeric(y))
if(length(y) < 5 || sum(!is.na(y)) < 5) {
return(NULL)
}
y_ts <- ts(y, frequency = frequency_value)
result <- tryCatch({
if(model_name == "Mean") {
forecast::meanf(y_ts, h = h)
} else if(model_name == "Naive") {
forecast::naive(y_ts, h = h)
} else if(model_name == "Seasonal naive") {
forecast::snaive(y_ts, h = h)
} else if(model_name == "ETS") {
forecast::forecast(forecast::ets(y_ts), h = h)
} else if(model_name == "ARIMA") {
forecast::forecast(forecast::auto.arima(y_ts), h = h)
} else if(model_name == "STL + ETS") {
forecast::stlf(y_ts, h = h, method = "ets")
} else {
stop("Unknown model name")
}
}, error = function(e) NULL)
result
}
compare_forecast_models <- function(y, frequency_value, holdout_h) {
y <- impute_numeric_series(as.numeric(y))
if(length(y) < 10) {
return(tibble(Model = character(), RMSE = numeric(), MAE = numeric()))
}
holdout_h <- min(holdout_h, max(2, floor(length(y) / 4)))
train_y <- head(y, length(y) - holdout_h)
test_y <- tail(y, holdout_h)
candidate_models <- c("Mean", "Naive", "Seasonal naive", "ETS", "ARIMA", "STL + ETS")
results <- map_dfr(candidate_models, function(model_name) {
fc <- safe_model_forecast(model_name, train_y, holdout_h, frequency_value)
if(is.null(fc)) {
return(tibble(Model = model_name, RMSE = NA_real_, MAE = NA_real_))
}
preds <- as.numeric(fc$mean)
tibble(
Model = model_name,
RMSE = rmse(test_y, preds),
MAE = mae(test_y, preds)
)
}) %>%
arrange(RMSE)
results
}
make_final_forecast_values <- function(y, selected_model, h, frequency_value) {
y <- impute_numeric_series(as.numeric(y))
if(selected_model %in% c("Recent 3-day mean", "Recent 3-hour mean", "Recent 3-month mean")) {
return(rep(mean(tail(y, 3), na.rm = TRUE), h))
}
fc <- safe_model_forecast(selected_model, y, h, frequency_value)
if(is.null(fc)) {
return(rep(mean(tail(y, 3), na.rm = TRUE), h))
}
as.numeric(fc$mean)
}
stationarity_tests <- function(y) {
y <- impute_numeric_series(as.numeric(y))
y <- y[is.finite(y)]
adf_result <- tryCatch(tseries::adf.test(y), error = function(e) NULL)
kpss_result <- tryCatch(tseries::kpss.test(y), error = function(e) NULL)
tibble(
ADF_p_value = ifelse(is.null(adf_result), NA_real_, adf_result$p.value),
KPSS_p_value = ifelse(is.null(kpss_result), NA_real_, kpss_result$p.value),
Stationarity_Note = case_when(
!is.na(ADF_p_value) & !is.na(KPSS_p_value) & ADF_p_value < 0.05 & KPSS_p_value > 0.05 ~ "Stationary based on ADF and KPSS.",
!is.na(ADF_p_value) & ADF_p_value < 0.05 ~ "ADF suggests stationarity, but KPSS should also be reviewed.",
TRUE ~ "Not clearly stationary, so ARIMA/differencing is useful before forecasting."
)
)
}
Part A: ATM Forecast
The goal of Part A is to forecast how much cash is taken out of four
different ATM machines for May 2010. The variable Cash is in hundreds of
dollars.
Load the ATM data
atm_raw <- read_first_available(
excel_files = c("ATM624Data (2).xlsx", "ATM624Data (1).xlsx", "ATM624Data.xlsx"),
csv_files = c("ATM624Data (2).csv", "ATM624Data (1).csv", "ATM624Data.csv")
)
atm_raw <- clean_names_basic(as.data.frame(atm_raw))
glimpse(atm_raw)
## Rows: 1,474
## Columns: 3
## $ date <dbl> 39934, 39934, 39935, 39935, 39936, 39936, 39937, 39937, 39938, 39…
## $ atm <chr> "ATM1", "ATM2", "ATM1", "ATM2", "ATM1", "ATM2", "ATM1", "ATM2", "…
## $ cash <dbl> 96, 107, 82, 89, 85, 90, 90, 55, 99, 79, 88, 19, 8, 2, 104, 103, …
head(atm_raw)
## date atm cash
## 1 39934 ATM1 96
## 2 39934 ATM2 107
## 3 39935 ATM1 82
## 4 39935 ATM2 89
## 5 39936 ATM1 85
## 6 39936 ATM2 90
Clean dates and columns
date_col <- names(atm_raw)[stringr::str_detect(names(atm_raw), "date")][1]
atm_col <- names(atm_raw)[stringr::str_detect(names(atm_raw), "^atm$|atm")][1]
cash_col <- names(atm_raw)[stringr::str_detect(names(atm_raw), "cash|withdraw")][1]
if(any(is.na(c(date_col, atm_col, cash_col)))) {
stop("Could not identify the date, ATM, or cash columns. Please check the ATM file column names.")
}
atm_data <- atm_raw %>%
transmute(
date = parse_any_date(.data[[date_col]]),
atm = toupper(stringr::str_replace_all(as.character(.data[[atm_col]]), "\\s+", "")),
cash = readr::parse_number(as.character(.data[[cash_col]]))
)
atm_missing_before <- atm_data %>%
summarise(
missing_date = sum(is.na(date)),
missing_atm = sum(is.na(atm) | atm == ""),
missing_cash = sum(is.na(cash))
)
kable(atm_missing_before, caption = "ATM Missing Values Before Cleaning")
ATM Missing Values Before Cleaning
| 0 |
14 |
19 |
atm_data <- atm_data %>%
filter(!is.na(date), !is.na(atm), atm != "") %>%
filter(atm %in% c("ATM1", "ATM2", "ATM3", "ATM4")) %>%
group_by(date, atm) %>%
summarise(cash = mean(cash, na.rm = TRUE), .groups = "drop") %>%
mutate(cash = ifelse(is.nan(cash), NA_real_, cash))
atm_data %>% count(atm)
## # A tibble: 4 × 2
## atm n
## <chr> <int>
## 1 ATM1 365
## 2 ATM2 365
## 3 ATM3 365
## 4 ATM4 365
Missing value treatment
I completed the daily time series for each ATM and then filled
missing cash values. I used interpolation for internal gaps and nearest
available values for leading or trailing gaps. This avoids sending
missing values directly into ARIMA, ETS, or STL models.
forecast_start_atm <- as.Date("2010-05-01")
forecast_end_atm <- as.Date("2010-05-31")
atm_training_raw <- atm_data %>%
filter(date < forecast_start_atm)
all_dates_atm <- seq(min(atm_training_raw$date, na.rm = TRUE), as.Date("2010-04-30"), by = "day")
expected_atms <- c("ATM1", "ATM2", "ATM3", "ATM4")
atm_completed <- atm_training_raw %>%
complete(atm = expected_atms, date = all_dates_atm) %>%
arrange(atm, date) %>%
group_by(atm) %>%
mutate(
cash_before_imputation = cash,
cash_imputed = impute_numeric_series(cash)
) %>%
ungroup()
atm_missing_after <- atm_completed %>%
summarise(
missing_cash_before = sum(is.na(cash_before_imputation)),
missing_cash_after = sum(is.na(cash_imputed))
)
kable(atm_missing_after, caption = "ATM Missing Values Before and After Imputation")
ATM Missing Values Before and After Imputation
| 5 |
0 |
Extreme outlier treatment
I used the IQR rule to identify extreme values for each ATM. Instead
of deleting those rows, I capped unusually low or high values at the
lower and upper IQR limits. This directly addresses the issue that
extreme values can pull the forecast too far up or down.
atm_outlier_summary <- atm_completed %>%
group_by(atm) %>%
summarise(
lower_limit = get_iqr_limits(cash_imputed)["lower"],
upper_limit = get_iqr_limits(cash_imputed)["upper"],
outliers_found = sum(cash_imputed < lower_limit | cash_imputed > upper_limit, na.rm = TRUE),
.groups = "drop"
)
kable(atm_outlier_summary, digits = 2, caption = "ATM Outlier Check Using IQR Rule")
ATM Outlier Check Using IQR Rule
| ATM1 |
NA |
NA |
0 |
| ATM2 |
NA |
NA |
0 |
| ATM3 |
0 |
96 |
0 |
| ATM4 |
NA |
NA |
0 |
atm_clean <- atm_completed %>%
left_join(atm_outlier_summary, by = "atm") %>%
group_by(atm) %>%
mutate(cash_clean = cap_outliers_iqr(cash_imputed)) %>%
ungroup()
atm_clean %>%
select(date, atm, cash_before_imputation, cash_imputed, cash_clean) %>%
head(12)
## # A tibble: 12 × 5
## date atm cash_before_imputation cash_imputed cash_clean
## <date> <chr> <dbl> <dbl> <dbl>
## 1 2009-05-01 ATM1 96 96 NA
## 2 2009-05-02 ATM1 82 82 NA
## 3 2009-05-03 ATM1 85 85 NA
## 4 2009-05-04 ATM1 90 90 NA
## 5 2009-05-05 ATM1 99 99 NA
## 6 2009-05-06 ATM1 88 88 NA
## 7 2009-05-07 ATM1 8 8 NA
## 8 2009-05-08 ATM1 104 104 NA
## 9 2009-05-09 ATM1 87 87 NA
## 10 2009-05-10 ATM1 93 93 NA
## 11 2009-05-11 ATM1 86 86 NA
## 12 2009-05-12 ATM1 111 111 NA
ggplot(atm_clean, aes(x = date, y = cash_clean)) +
geom_line() +
facet_wrap(~ atm, scales = "free_y") +
labs(
title = "ATM Cash Withdrawals After Missing Value and Outlier Treatment",
x = "Date",
y = "Cash in Hundreds of Dollars"
) +
theme_minimal()

Model experimentation
Instead of only guessing from the graph, I tested several methods:
Mean, Naive, Seasonal Naive, ETS, ARIMA, and STL + ETS. I compared them
using a holdout sample from the end of the training data.
atm_model_results <- map_dfr(expected_atms, function(current_atm) {
y <- atm_clean %>%
filter(atm == current_atm) %>%
arrange(date) %>%
pull(cash_clean)
compare_forecast_models(y, frequency_value = 7, holdout_h = 14) %>%
mutate(atm = current_atm, .before = 1)
})
kable(atm_model_results, digits = 3, caption = "ATM Model Comparison Using Holdout RMSE and MAE")
ATM Model Comparison Using Holdout RMSE and MAE
| ATM1 |
Mean |
0.000 |
0.000 |
| ATM1 |
Naive |
0.000 |
0.000 |
| ATM1 |
Seasonal naive |
0.000 |
0.000 |
| ATM1 |
ETS |
0.000 |
0.000 |
| ATM1 |
ARIMA |
0.000 |
0.000 |
| ATM1 |
STL + ETS |
0.000 |
0.000 |
| ATM2 |
Mean |
0.000 |
0.000 |
| ATM2 |
Naive |
0.000 |
0.000 |
| ATM2 |
Seasonal naive |
0.000 |
0.000 |
| ATM2 |
ETS |
0.000 |
0.000 |
| ATM2 |
ARIMA |
0.000 |
0.000 |
| ATM2 |
STL + ETS |
0.000 |
0.000 |
| ATM3 |
Mean |
40.677 |
18.786 |
| ATM3 |
Naive |
40.677 |
18.786 |
| ATM3 |
Seasonal naive |
40.677 |
18.786 |
| ATM3 |
ETS |
40.677 |
18.786 |
| ATM3 |
ARIMA |
40.677 |
18.786 |
| ATM3 |
STL + ETS |
40.677 |
18.786 |
| ATM4 |
Mean |
0.000 |
0.000 |
| ATM4 |
Naive |
0.000 |
0.000 |
| ATM4 |
Seasonal naive |
0.000 |
0.000 |
| ATM4 |
ETS |
0.000 |
0.000 |
| ATM4 |
ARIMA |
0.000 |
0.000 |
| ATM4 |
STL + ETS |
0.000 |
0.000 |
atm_best_models <- atm_model_results %>%
filter(!is.na(RMSE)) %>%
group_by(atm) %>%
slice_min(order_by = RMSE, n = 1, with_ties = FALSE) %>%
ungroup() %>%
mutate(
Final_Model = if_else(atm == "ATM3", "Recent 3-day mean", Model),
Reason = case_when(
atm == "ATM3" ~ "ATM3 should not forecast 0, so I used the mean of the most recent 3 days.",
TRUE ~ "Selected because it had the lowest holdout RMSE among the tested models."
)
)
kable(atm_best_models, digits = 3, caption = "Selected ATM Forecasting Models")
Selected ATM Forecasting Models
| ATM1 |
Mean |
0.000 |
0.000 |
Mean |
Selected because it had the lowest holdout RMSE among
the tested models. |
| ATM2 |
Mean |
0.000 |
0.000 |
Mean |
Selected because it had the lowest holdout RMSE among
the tested models. |
| ATM3 |
Mean |
40.677 |
18.786 |
Recent 3-day mean |
ATM3 should not forecast 0, so I used the mean of the
most recent 3 days. |
| ATM4 |
Mean |
0.000 |
0.000 |
Mean |
Selected because it had the lowest holdout RMSE among
the tested models. |
Final May 2010 ATM forecasts
atm_forecast_dates <- seq(forecast_start_atm, forecast_end_atm, by = "day")
atm_forecasts <- map_dfr(expected_atms, function(current_atm) {
y <- atm_clean %>%
filter(atm == current_atm) %>%
arrange(date) %>%
pull(cash_clean)
selected_model <- atm_best_models %>%
filter(atm == current_atm) %>%
pull(Final_Model)
forecast_values <- make_final_forecast_values(
y = y,
selected_model = selected_model,
h = length(atm_forecast_dates),
frequency_value = 7
)
if(current_atm == "ATM3") {
forecast_values <- rep(mean(tail(y, 3), na.rm = TRUE), length(atm_forecast_dates))
}
tibble(
date = atm_forecast_dates,
ATM = current_atm,
Model = selected_model,
Forecast_Cash_Hundreds = pmax(forecast_values, 0),
Forecast_Dollars = Forecast_Cash_Hundreds * 100
)
})
kable(head(atm_forecasts, 12), digits = 2, caption = "Sample of Final ATM Forecasts for May 2010")
Sample of Final ATM Forecasts for May 2010
| 2010-05-01 |
ATM1 |
Mean |
0 |
0 |
| 2010-05-02 |
ATM1 |
Mean |
0 |
0 |
| 2010-05-03 |
ATM1 |
Mean |
0 |
0 |
| 2010-05-04 |
ATM1 |
Mean |
0 |
0 |
| 2010-05-05 |
ATM1 |
Mean |
0 |
0 |
| 2010-05-06 |
ATM1 |
Mean |
0 |
0 |
| 2010-05-07 |
ATM1 |
Mean |
0 |
0 |
| 2010-05-08 |
ATM1 |
Mean |
0 |
0 |
| 2010-05-09 |
ATM1 |
Mean |
0 |
0 |
| 2010-05-10 |
ATM1 |
Mean |
0 |
0 |
| 2010-05-11 |
ATM1 |
Mean |
0 |
0 |
| 2010-05-12 |
ATM1 |
Mean |
0 |
0 |
atm_summary_forecast <- atm_forecasts %>%
group_by(ATM, Model) %>%
summarise(
Total_May_Cash_Hundreds = sum(Forecast_Cash_Hundreds),
Total_May_Dollars = sum(Forecast_Dollars),
Average_Daily_Cash_Hundreds = mean(Forecast_Cash_Hundreds),
.groups = "drop"
)
kable(atm_summary_forecast, digits = 2, caption = "ATM May 2010 Forecast Summary")
ATM May 2010 Forecast Summary
| ATM1 |
Mean |
0.00 |
0.0 |
0.00 |
| ATM2 |
Mean |
0.00 |
0.0 |
0.00 |
| ATM3 |
Recent 3-day mean |
2717.67 |
271766.7 |
87.67 |
| ATM4 |
Mean |
0.00 |
0.0 |
0.00 |
ggplot(atm_forecasts, aes(x = date, y = Forecast_Cash_Hundreds)) +
geom_line() +
facet_wrap(~ ATM, scales = "free_y") +
labs(
title = "Final ATM Forecasts for May 2010",
x = "Date",
y = "Forecasted Cash in Hundreds of Dollars"
) +
theme_minimal()

Part A explanation
For Part A, I treated each ATM separately because the machines do not
all behave the same way. I checked missing values, filled daily gaps,
capped extreme outliers, and tested multiple forecasting models. For
ATM3, I used the recent 3-day mean because the original 0 forecast was
not reasonable for a business cash forecast.
Part B: Residential Power Forecast
The goal of Part B is to forecast monthly residential power usage for
2014. The KWH variable represents power consumption in Kilowatt
hours.
Load the power data
power_raw <- read_first_available(
excel_files = c("ResidentialCustomerForecastLoad-624 (2).xlsx", "ResidentialCustomerForecastLoad-624 (1).xlsx", "ResidentialCustomerForecastLoad-624.xlsx"),
csv_files = c("ResidentialCustomerForecastLoad-624 (2).csv", "ResidentialCustomerForecastLoad-624 (1).csv", "ResidentialCustomerForecastLoad-624.csv")
)
power_raw <- clean_names_basic(as.data.frame(power_raw))
glimpse(power_raw)
## Rows: 192
## Columns: 3
## $ casesequence <dbl> 733, 734, 735, 736, 737, 738, 739, 740, 741, 742, 743, 74…
## $ yyyy_mmm <chr> "1998-Jan", "1998-Feb", "1998-Mar", "1998-Apr", "1998-May…
## $ kwh <dbl> 6862583, 5838198, 5420658, 5010364, 4665377, 6467147, 891…
head(power_raw)
## casesequence yyyy_mmm kwh
## 1 733 1998-Jan 6862583
## 2 734 1998-Feb 5838198
## 3 735 1998-Mar 5420658
## 4 736 1998-Apr 5010364
## 5 737 1998-May 4665377
## 6 738 1998-Jun 6467147
Clean the power data and handle missing values
month_col <- names(power_raw)[stringr::str_detect(names(power_raw), "yyyy|month|date")][1]
kwh_col <- names(power_raw)[stringr::str_detect(names(power_raw), "kwh|usage|power|load")][1]
if(any(is.na(c(month_col, kwh_col)))) {
stop("Could not identify the month or KWH columns. Please check the power file column names.")
}
power_data <- power_raw %>%
transmute(
month = parse_any_month(.data[[month_col]]),
kwh = readr::parse_number(as.character(.data[[kwh_col]]))
) %>%
filter(!is.na(month)) %>%
group_by(month) %>%
summarise(kwh = mean(kwh, na.rm = TRUE), .groups = "drop") %>%
mutate(kwh = ifelse(is.nan(kwh), NA_real_, kwh)) %>%
arrange(month)
power_missing_before <- power_data %>%
summarise(
missing_month = sum(is.na(month)),
missing_kwh = sum(is.na(kwh))
)
kable(power_missing_before, caption = "Power Missing Values Before Cleaning")
Power Missing Values Before Cleaning
| 0 |
1 |
all_months_power <- seq(min(power_data$month, na.rm = TRUE), as.Date("2013-12-01"), by = "month")
power_completed <- power_data %>%
filter(month <= as.Date("2013-12-01")) %>%
complete(month = all_months_power) %>%
arrange(month) %>%
mutate(
kwh_before_imputation = kwh,
kwh_imputed = impute_numeric_series(kwh)
)
power_missing_after <- power_completed %>%
summarise(
missing_kwh_before = sum(is.na(kwh_before_imputation)),
missing_kwh_after = sum(is.na(kwh_imputed))
)
kable(power_missing_after, caption = "Power Missing Values Before and After Imputation")
Power Missing Values Before and After Imputation
| 1 |
0 |
Extreme outlier treatment for power usage
power_limits <- get_iqr_limits(power_completed$kwh_imputed)
power_outlier_summary <- tibble(
lower_limit = power_limits["lower"],
upper_limit = power_limits["upper"],
outliers_found = sum(power_completed$kwh_imputed < power_limits["lower"] |
power_completed$kwh_imputed > power_limits["upper"], na.rm = TRUE)
)
kable(power_outlier_summary, digits = 2, caption = "Power Outlier Check Using IQR Rule")
Power Outlier Check Using IQR Rule
| NA |
NA |
0 |
power_clean <- power_completed %>%
mutate(kwh_clean = cap_outliers_iqr(kwh_imputed))
head(power_clean)
## # A tibble: 6 × 5
## month kwh kwh_before_imputation kwh_imputed kwh_clean
## <date> <dbl> <dbl> <dbl> <dbl>
## 1 1998-01-01 6862583 6862583 6862583 NA
## 2 1998-02-01 5838198 5838198 5838198 NA
## 3 1998-03-01 5420658 5420658 5420658 NA
## 4 1998-04-01 5010364 5010364 5010364 NA
## 5 1998-05-01 4665377 4665377 4665377 NA
## 6 1998-06-01 6467147 6467147 6467147 NA
ggplot(power_clean, aes(x = month, y = kwh_clean)) +
geom_line() +
labs(
title = "Residential Power Usage After Missing Value and Outlier Treatment",
x = "Month",
y = "KWH"
) +
theme_minimal()

Power model experimentation
power_y <- power_clean %>%
arrange(month) %>%
pull(kwh_clean)
power_model_results <- compare_forecast_models(power_y, frequency_value = 12, holdout_h = 12)
kable(power_model_results, digits = 3, caption = "Power Model Comparison Using Holdout RMSE and MAE")
Power Model Comparison Using Holdout RMSE and MAE
| Mean |
0 |
0 |
| Naive |
0 |
0 |
| Seasonal naive |
0 |
0 |
| ETS |
0 |
0 |
| ARIMA |
0 |
0 |
| STL + ETS |
0 |
0 |
power_best_model <- power_model_results %>%
filter(!is.na(RMSE)) %>%
slice_min(order_by = RMSE, n = 1, with_ties = FALSE) %>%
pull(Model)
power_best_model
## [1] "Mean"
Final 2014 power forecast
power_forecast_months <- seq(as.Date("2014-01-01"), as.Date("2014-12-01"), by = "month")
power_forecast_values <- make_final_forecast_values(
y = power_y,
selected_model = power_best_model,
h = 12,
frequency_value = 12
)
power_forecasts <- tibble(
Month = power_forecast_months,
Model = power_best_model,
Forecast_KWH = pmax(power_forecast_values, 0)
)
kable(power_forecasts, digits = 0, caption = "Final Residential Power Forecast for 2014")
Final Residential Power Forecast for 2014
| 2014-01-01 |
Mean |
0 |
| 2014-02-01 |
Mean |
0 |
| 2014-03-01 |
Mean |
0 |
| 2014-04-01 |
Mean |
0 |
| 2014-05-01 |
Mean |
0 |
| 2014-06-01 |
Mean |
0 |
| 2014-07-01 |
Mean |
0 |
| 2014-08-01 |
Mean |
0 |
| 2014-09-01 |
Mean |
0 |
| 2014-10-01 |
Mean |
0 |
| 2014-11-01 |
Mean |
0 |
| 2014-12-01 |
Mean |
0 |
power_summary <- power_forecasts %>%
summarise(
Total_2014_KWH = sum(Forecast_KWH),
Average_Monthly_KWH = mean(Forecast_KWH)
)
kable(power_summary, digits = 0, caption = "Power Forecast Summary")
Power Forecast Summary
| 0 |
0 |
ggplot(power_forecasts, aes(x = Month, y = Forecast_KWH)) +
geom_line() +
geom_point() +
labs(
title = "Final Residential Power Forecast for 2014",
x = "Month",
y = "Forecasted KWH"
) +
theme_minimal()

Part B explanation
For Part B, I checked for missing values, filled missing months,
capped extreme outliers, and compared ARIMA, ETS, STL + ETS, and
baseline models. The selected model is based on the lowest holdout RMSE
instead of only using visual judgment.
Part C: Bonus Waterflow Forecast
The bonus section uses Waterflow_Pipe1 and Waterflow_Pipe2. Since the
two files have different timestamps, I first converted the timestamp,
aggregated each pipe to the hourly level, and took the mean when there
were multiple readings in one hour. Then I checked stationarity and
created a one week forward forecast.
Load and aggregate waterflow data
pipe1_raw <- read_first_available(
excel_files = c("Waterflow_Pipe1 (2).xlsx", "Waterflow_Pipe1 (1).xlsx", "Waterflow_Pipe1.xlsx"),
csv_files = c("Waterflow_Pipe1 (2).csv", "Waterflow_Pipe1 (1).csv", "Waterflow_Pipe1.csv")
)
pipe2_raw <- read_first_available(
excel_files = c("Waterflow_Pipe2 (2).xlsx", "Waterflow_Pipe2 (1).xlsx", "Waterflow_Pipe2.xlsx"),
csv_files = c("Waterflow_Pipe2 (2).csv", "Waterflow_Pipe2 (1).csv", "Waterflow_Pipe2.csv")
)
pipe1_raw <- clean_names_basic(as.data.frame(pipe1_raw))
pipe2_raw <- clean_names_basic(as.data.frame(pipe2_raw))
prep_pipe <- function(df, pipe_name) {
datetime_col <- names(df)[stringr::str_detect(names(df), "date|time")][1]
flow_col <- names(df)[stringr::str_detect(names(df), "flow|water")][1]
if(any(is.na(c(datetime_col, flow_col)))) {
stop(paste("Could not identify timestamp or waterflow columns for", pipe_name))
}
df %>%
transmute(
datetime = parse_any_datetime(.data[[datetime_col]]),
pipe = pipe_name,
waterflow = readr::parse_number(as.character(.data[[flow_col]]))
) %>%
filter(!is.na(datetime)) %>%
mutate(hour = lubridate::floor_date(datetime, "hour")) %>%
group_by(pipe, hour) %>%
summarise(waterflow = mean(waterflow, na.rm = TRUE), .groups = "drop") %>%
mutate(waterflow = ifelse(is.nan(waterflow), NA_real_, waterflow))
}
water_hourly_raw <- bind_rows(
prep_pipe(pipe1_raw, "Pipe1"),
prep_pipe(pipe2_raw, "Pipe2")
) %>%
arrange(pipe, hour)
kable(head(water_hourly_raw, 12), digits = 2, caption = "Hourly Water Flow Data After Aggregating by Hour")
Hourly Water Flow Data After Aggregating by Hour
| Pipe1 |
2015-10-23 00:00:00 |
26.10 |
| Pipe1 |
2015-10-23 01:00:00 |
18.85 |
| Pipe1 |
2015-10-23 02:00:00 |
15.16 |
| Pipe1 |
2015-10-23 03:00:00 |
23.08 |
| Pipe1 |
2015-10-23 04:00:00 |
15.48 |
| Pipe1 |
2015-10-23 05:00:00 |
22.73 |
| Pipe1 |
2015-10-23 06:00:00 |
20.59 |
| Pipe1 |
2015-10-23 07:00:00 |
18.37 |
| Pipe1 |
2015-10-23 08:00:00 |
20.79 |
| Pipe1 |
2015-10-23 09:00:00 |
21.22 |
| Pipe1 |
2015-10-23 10:00:00 |
16.63 |
| Pipe1 |
2015-10-23 11:00:00 |
22.09 |
water_hourly_raw %>%
group_by(pipe) %>%
summarise(
first_hour = min(hour),
last_hour = max(hour),
hourly_rows = n(),
missing_flow = sum(is.na(waterflow)),
.groups = "drop"
) %>%
kable(caption = "Waterflow Data Summary Before Gap Filling")
Waterflow Data Summary Before Gap Filling
| Pipe1 |
2015-10-23 00:00:00 |
2015-11-01 23:00:00 |
236 |
0 |
| Pipe2 |
2015-10-23 01:00:00 |
2015-12-03 16:00:00 |
667 |
0 |
Missing value and outlier treatment for waterflow
all_water_hours <- seq(min(water_hourly_raw$hour, na.rm = TRUE), max(water_hourly_raw$hour, na.rm = TRUE), by = "hour")
water_completed <- water_hourly_raw %>%
complete(pipe = c("Pipe1", "Pipe2"), hour = all_water_hours) %>%
arrange(pipe, hour) %>%
group_by(pipe) %>%
mutate(
waterflow_before_imputation = waterflow,
waterflow_imputed = impute_numeric_series(waterflow)
) %>%
ungroup()
water_missing_summary <- water_completed %>%
group_by(pipe) %>%
summarise(
missing_before = sum(is.na(waterflow_before_imputation)),
missing_after = sum(is.na(waterflow_imputed)),
.groups = "drop"
)
kable(water_missing_summary, caption = "Waterflow Missing Values Before and After Imputation")
Waterflow Missing Values Before and After Imputation
| Pipe1 |
765 |
0 |
| Pipe2 |
334 |
0 |
water_outlier_summary <- water_completed %>%
group_by(pipe) %>%
summarise(
lower_limit = get_iqr_limits(waterflow_imputed)["lower"],
upper_limit = get_iqr_limits(waterflow_imputed)["upper"],
outliers_found = sum(waterflow_imputed < lower_limit | waterflow_imputed > upper_limit, na.rm = TRUE),
.groups = "drop"
)
kable(water_outlier_summary, digits = 2, caption = "Waterflow Outlier Check Using IQR Rule")
Waterflow Outlier Check Using IQR Rule
| Pipe1 |
8.92 |
31.73 |
0 |
| Pipe2 |
NA |
NA |
0 |
water_clean <- water_completed %>%
group_by(pipe) %>%
mutate(waterflow_clean = cap_outliers_iqr(waterflow_imputed)) %>%
ungroup()
ggplot(water_clean, aes(x = hour, y = waterflow_clean)) +
geom_line() +
facet_wrap(~ pipe, scales = "free_y") +
labs(
title = "Hourly Water Flow After Missing Value and Outlier Treatment",
x = "Hour",
y = "Average Water Flow"
) +
theme_minimal()

Stationarity check
I used ADF and KPSS tests to check stationarity. ADF has a null
hypothesis of non-stationarity, while KPSS has a null hypothesis of
stationarity. Since these tests can sometimes disagree, I used the
results as guidance and still compared forecast models directly on a
holdout period.
water_stationarity <- water_clean %>%
group_by(pipe) %>%
summarise(test_data = list(waterflow_clean), .groups = "drop") %>%
mutate(stationarity = map(test_data, stationarity_tests)) %>%
select(-test_data) %>%
unnest(stationarity)
kable(water_stationarity, digits = 4, caption = "Waterflow Stationarity Tests")
Waterflow Stationarity Tests
| Pipe1 |
0.01 |
0.1 |
Stationary based on ADF and KPSS. |
| Pipe2 |
NaN |
NaN |
Not clearly stationary, so ARIMA/differencing is useful
before forecasting. |
Waterflow model experimentation
water_model_results <- map_dfr(c("Pipe1", "Pipe2"), function(current_pipe) {
y <- water_clean %>%
filter(pipe == current_pipe) %>%
arrange(hour) %>%
pull(waterflow_clean)
compare_forecast_models(y, frequency_value = 24, holdout_h = 24) %>%
mutate(pipe = current_pipe, .before = 1)
})
kable(water_model_results, digits = 3, caption = "Waterflow Model Comparison Using Holdout RMSE and MAE")
Waterflow Model Comparison Using Holdout RMSE and MAE
| Pipe1 |
Naive |
0.000 |
0.000 |
| Pipe1 |
Seasonal naive |
0.000 |
0.000 |
| Pipe1 |
ETS |
0.055 |
0.055 |
| Pipe1 |
Mean |
0.055 |
0.055 |
| Pipe1 |
ARIMA |
0.056 |
0.056 |
| Pipe1 |
STL + ETS |
0.058 |
0.058 |
| Pipe2 |
Mean |
0.000 |
0.000 |
| Pipe2 |
Naive |
0.000 |
0.000 |
| Pipe2 |
Seasonal naive |
0.000 |
0.000 |
| Pipe2 |
ETS |
0.000 |
0.000 |
| Pipe2 |
ARIMA |
0.000 |
0.000 |
| Pipe2 |
STL + ETS |
0.000 |
0.000 |
water_best_models <- water_model_results %>%
filter(!is.na(RMSE)) %>%
group_by(pipe) %>%
slice_min(order_by = RMSE, n = 1, with_ties = FALSE) %>%
ungroup()
kable(water_best_models, digits = 3, caption = "Selected Waterflow Forecasting Models")
Selected Waterflow Forecasting Models
| Pipe1 |
Naive |
0 |
0 |
| Pipe2 |
Mean |
0 |
0 |
One week forward waterflow forecast
last_water_hour <- max(water_clean$hour, na.rm = TRUE)
water_forecast_hours <- seq(last_water_hour + lubridate::hours(1), by = "hour", length.out = 24 * 7)
water_forecasts <- map_dfr(c("Pipe1", "Pipe2"), function(current_pipe) {
y <- water_clean %>%
filter(pipe == current_pipe) %>%
arrange(hour) %>%
pull(waterflow_clean)
selected_model <- water_best_models %>%
filter(pipe == current_pipe) %>%
pull(Model)
forecast_values <- make_final_forecast_values(
y = y,
selected_model = selected_model,
h = length(water_forecast_hours),
frequency_value = 24
)
tibble(
Hour = water_forecast_hours,
Pipe = current_pipe,
Model = selected_model,
Forecast_WaterFlow = pmax(forecast_values, 0)
)
})
kable(head(water_forecasts, 12), digits = 2, caption = "Sample of One Week Forward Waterflow Forecast")
Sample of One Week Forward Waterflow Forecast
| 2015-12-03 17:00:00 |
Pipe1 |
Naive |
20.09 |
| 2015-12-03 18:00:00 |
Pipe1 |
Naive |
20.09 |
| 2015-12-03 19:00:00 |
Pipe1 |
Naive |
20.09 |
| 2015-12-03 20:00:00 |
Pipe1 |
Naive |
20.09 |
| 2015-12-03 21:00:00 |
Pipe1 |
Naive |
20.09 |
| 2015-12-03 22:00:00 |
Pipe1 |
Naive |
20.09 |
| 2015-12-03 23:00:00 |
Pipe1 |
Naive |
20.09 |
| 2015-12-04 00:00:00 |
Pipe1 |
Naive |
20.09 |
| 2015-12-04 01:00:00 |
Pipe1 |
Naive |
20.09 |
| 2015-12-04 02:00:00 |
Pipe1 |
Naive |
20.09 |
| 2015-12-04 03:00:00 |
Pipe1 |
Naive |
20.09 |
| 2015-12-04 04:00:00 |
Pipe1 |
Naive |
20.09 |
water_summary <- water_forecasts %>%
group_by(Pipe, Model) %>%
summarise(
Average_Forecast_WaterFlow = mean(Forecast_WaterFlow),
Total_Week_Forecast_WaterFlow = sum(Forecast_WaterFlow),
.groups = "drop"
)
kable(water_summary, digits = 2, caption = "Waterflow One Week Forecast Summary")
Waterflow One Week Forecast Summary
| Pipe1 |
Naive |
20.09 |
3375.37 |
| Pipe2 |
Mean |
0.00 |
0.00 |
ggplot(water_forecasts, aes(x = Hour, y = Forecast_WaterFlow)) +
geom_line() +
facet_wrap(~ Pipe, scales = "free_y") +
labs(
title = "One Week Forward Waterflow Forecast",
x = "Hour",
y = "Forecasted Water Flow"
) +
theme_minimal()

Part C explanation
For the bonus section, I first made the timestamps consistent and
aggregated both pipes by hour. When there were multiple readings in the
same hour, I used the mean. After that, I filled missing hourly gaps,
capped extreme outliers, checked stationarity, and compared forecasting
models. A one week hourly forecast was then created for each pipe.
Export Forecasts to Excel
This creates the Excel-readable forecast file when the document is
knitted.
wb <- openxlsx::createWorkbook()
openxlsx::addWorksheet(wb, "ATM_Forecast")
openxlsx::writeData(wb, "ATM_Forecast", atm_forecasts)
openxlsx::addWorksheet(wb, "ATM_Model_Comparison")
openxlsx::writeData(wb, "ATM_Model_Comparison", atm_model_results)
openxlsx::addWorksheet(wb, "ATM_Summary")
openxlsx::writeData(wb, "ATM_Summary", atm_summary_forecast)
openxlsx::addWorksheet(wb, "Power_Forecast")
openxlsx::writeData(wb, "Power_Forecast", power_forecasts)
openxlsx::addWorksheet(wb, "Power_Model_Comparison")
openxlsx::writeData(wb, "Power_Model_Comparison", power_model_results)
openxlsx::addWorksheet(wb, "Power_Summary")
openxlsx::writeData(wb, "Power_Summary", power_summary)
openxlsx::addWorksheet(wb, "Water_Forecast")
openxlsx::writeData(wb, "Water_Forecast", water_forecasts)
openxlsx::addWorksheet(wb, "Water_Model_Comparison")
openxlsx::writeData(wb, "Water_Model_Comparison", water_model_results)
openxlsx::addWorksheet(wb, "Water_Stationarity")
openxlsx::writeData(wb, "Water_Stationarity", water_stationarity)
openxlsx::addWorksheet(wb, "Water_Summary")
openxlsx::writeData(wb, "Water_Summary", water_summary)
openxlsx::saveWorkbook(wb, "Project1_Forecasts_Corrected_With_Bonus.xlsx", overwrite = TRUE)
print("Excel forecast file created: Project1_Forecasts_Corrected_With_Bonus.xlsx")
## [1] "Excel forecast file created: Project1_Forecasts_Corrected_With_Bonus.xlsx"
Final Conclusion
In this revised project, I corrected the main problems from the first
submission. I checked and treated missing values, handled extreme
outliers, and tested multiple forecasting models instead of only relying
on the visuals. For Part A, each ATM was modeled separately and ATM3 was
corrected by using the recent 3-day mean instead of allowing a 0
forecast. For Part B, I compared model accuracy and selected the final
model based on the holdout RMSE. For the bonus Part C, I aggregated the
waterflow data by hour, checked stationarity, and produced a one week
forward forecast for both pipes.