This project has three parts. Part A forecasts daily ATM cash withdrawals for May 2010. Part B forecasts monthly residential electricity usage for 2014. Part C is the bonus section and forecasts one week of hourly waterflow.
library(readxl)
library(dplyr)
library(tidyr)
library(lubridate)
library(forecast)
library(writexl)
library(zoo)
library(ggplot2)
library(knitr)
# read the Excel files from the same folder as the R Markdown file.
atm_raw <- read_excel("ATM624Data.xlsx")
power_raw <- read_excel("ResidentialCustomerForecastLoad-624.xlsx")
pipe1_raw <- read_excel("Waterflow_Pipe1.xlsx")
pipe2_raw <- read_excel("Waterflow_Pipe2.xlsx")
# convert Excel serial dates to R dates.
atm_raw <- atm_raw %>%
mutate(DATE = as.Date(DATE, origin = "1899-12-30"))
# convert Excel date-time serial values to POSIXct date-time values.
pipe1_raw <- pipe1_raw %>%
mutate(`Date Time` = as.POSIXct(`Date Time` * 86400,
origin = "1899-12-30",
tz = "UTC")) %>%
mutate(`Date Time` = floor_date(`Date Time`, unit = "minute"))
pipe2_raw <- pipe2_raw %>%
mutate(`Date Time` = as.POSIXct(`Date Time` * 86400,
origin = "1899-12-30",
tz = "UTC")) %>%
mutate(`Date Time` = floor_date(`Date Time`, unit = "minute"))
# quick previews.
head(atm_raw)
head(power_raw)
head(pipe1_raw)
head(pipe2_raw)
# fit one selected forecasting method and return the forecast
forecast_candidate <- function(y_train, h, model_name) {
switch(
model_name,
"Mean" = meanf(y_train, h = h),
"Naive" = naive(y_train, h = h),
"Seasonal Naive" = snaive(y_train, h = h),
"ETS" = forecast(ets(y_train), h = h),
"ARIMA" = forecast(auto.arima(y_train), h = h),
"STL + ETS" = stlf(y_train, h = h, method = "ets"),
"Holt-Winters Additive" = hw(y_train, h = h, seasonal = "additive", damped = TRUE)
)
}
# calculate RMSE, MAE, and MAPE on the holdout period
accuracy_row <- function(actual, predicted) {
actual <- as.numeric(actual)
predicted <- as.numeric(predicted)
mape_values <- abs((actual - predicted) / ifelse(actual == 0, NA, actual))
tibble(
RMSE = sqrt(mean((actual - predicted)^2, na.rm = TRUE)),
MAE = mean(abs(actual - predicted), na.rm = TRUE),
MAPE = ifelse(all(is.na(mape_values)), NA_real_, mean(mape_values, na.rm = TRUE) * 100)
)
}
# compare candidate models using the same holdout period
evaluate_models <- function(y_values, series_name, frequency_value, holdout_n, model_names) {
y_values <- as.numeric(y_values)
y_values <- y_values[!is.na(y_values)]
n <- length(y_values)
holdout_n <- min(holdout_n, n - max(10, frequency_value * 2))
if (holdout_n < 1) {
return(tibble(
Series = series_name,
Model = model_names,
RMSE = NA_real_,
MAE = NA_real_,
MAPE = NA_real_
))
}
train_values <- y_values[1:(n - holdout_n)]
test_values <- y_values[(n - holdout_n + 1):n]
train_ts <- ts(train_values, frequency = frequency_value)
bind_rows(lapply(model_names, function(model_name) {
fc <- tryCatch(
forecast_candidate(train_ts, h = length(test_values), model_name = model_name),
error = function(e) NULL
)
if (is.null(fc)) {
tibble(
Series = series_name,
Model = model_name,
RMSE = NA_real_,
MAE = NA_real_,
MAPE = NA_real_
)
} else {
accuracy_row(test_values, as.numeric(fc$mean)) %>%
mutate(
Series = series_name,
Model = model_name
) %>%
select(Series, Model, RMSE, MAE, MAPE)
}
}))
}
The goal in Part A is to forecast how much cash will be taken out of four different ATM machines for May 2010. The cash variable is recorded in hundreds of dollars.
I treated each ATM as a separate time series because the four machines do not behave the same way. ATM1, ATM2, and ATM4 show short term day of week behavior. ATM3 is very unusual because it is inactive for almost the whole data set and only becomes active very late in the series. Because of that, I did not force one single model onto all four machines.
atm_base <- atm_raw %>%
filter(!is.na(ATM)) %>%
mutate(
DATE = as.Date(DATE),
Cash = as.numeric(Cash)
) %>%
arrange(ATM, DATE)
clean_atm_series <- function(data, atm_name) {
one_atm <- data %>%
filter(ATM == atm_name) %>%
select(DATE, ATM, Cash) %>%
distinct(DATE, .keep_all = TRUE) %>%
arrange(DATE)
# create a complete daily sequence so missing ATM dates can be counted
full_series <- tibble(DATE = seq(min(one_atm$DATE), max(one_atm$DATE), by = "day")) %>%
left_join(one_atm, by = "DATE") %>%
mutate(
ATM = atm_name,
missing_flag = is.na(Cash)
)
if (atm_name == "ATM3") {
clean_values <- na.approx(full_series$Cash, na.rm = FALSE)
clean_values <- na.locf(clean_values, na.rm = FALSE)
clean_values <- na.locf(clean_values, fromLast = TRUE, na.rm = FALSE)
full_series %>%
mutate(
Cash_Clean = as.numeric(clean_values),
outlier_flag = FALSE
)
} else {
# use tsclean to handle missing values and possible outliers for the stable ATM series
clean_ts <- tsclean(ts(full_series$Cash, frequency = 7), replace.missing = TRUE)
full_series %>%
mutate(
Cash_Clean = as.numeric(clean_ts),
outlier_flag = !missing_flag & !is.na(Cash) & abs(Cash - Cash_Clean) > 1e-8
)
}
}
atm_names <- sort(unique(atm_base$ATM))
atm_clean <- bind_rows(lapply(atm_names, function(x) clean_atm_series(atm_base, x)))
atm_audit <- atm_clean %>%
group_by(ATM) %>%
summarise(
First_Date = min(DATE),
Last_Date = max(DATE),
Daily_Rows_After_Completion = n(),
Missing_Values_or_Missing_Dates = sum(missing_flag),
Extreme_Outliers_Replaced = sum(outlier_flag),
.groups = "drop"
)
atm_audit %>%
kable(caption = "Part A data quality audit after completing daily series")
| ATM | First_Date | Last_Date | Daily_Rows_After_Completion | Missing_Values_or_Missing_Dates | Extreme_Outliers_Replaced |
|---|---|---|---|---|---|
| ATM1 | 2009-05-01 | 2010-04-30 | 365 | 3 | 26 |
| ATM2 | 2009-05-01 | 2010-04-30 | 365 | 2 | 10 |
| ATM3 | 2009-05-01 | 2010-04-30 | 365 | 0 | 0 |
| ATM4 | 2009-05-01 | 2010-04-30 | 365 | 0 | 1 |
atm_clean %>%
filter(missing_flag | outlier_flag) %>%
select(ATM, DATE, Cash, Cash_Clean, missing_flag, outlier_flag) %>%
arrange(ATM, DATE) %>%
head(20) %>%
kable(digits = 2, caption = "First missing values or outliers handled in Part A")
| ATM | DATE | Cash | Cash_Clean | missing_flag | outlier_flag |
|---|---|---|---|---|---|
| ATM1 | 2009-06-05 | 3 | 102.99 | FALSE | TRUE |
| ATM1 | 2009-06-13 | NA | 125.59 | TRUE | FALSE |
| ATM1 | 2009-06-16 | NA | 131.13 | TRUE | FALSE |
| ATM1 | 2009-06-22 | NA | 97.69 | TRUE | FALSE |
| ATM1 | 2009-07-10 | 19 | 108.58 | FALSE | TRUE |
| ATM1 | 2009-08-02 | 68 | 99.88 | FALSE | TRUE |
| ATM1 | 2009-08-25 | 174 | 133.47 | FALSE | TRUE |
| ATM1 | 2009-09-11 | 1 | 115.44 | FALSE | TRUE |
| ATM1 | 2009-09-22 | 180 | 119.37 | FALSE | TRUE |
| ATM1 | 2009-11-30 | 6 | 105.15 | FALSE | TRUE |
| ATM1 | 2009-12-01 | 140 | 98.69 | FALSE | TRUE |
| ATM1 | 2009-12-04 | 140 | 95.86 | FALSE | TRUE |
| ATM1 | 2009-12-08 | 135 | 86.02 | FALSE | TRUE |
| ATM1 | 2009-12-29 | 42 | 92.20 | FALSE | TRUE |
| ATM1 | 2009-12-30 | 2 | 72.76 | FALSE | TRUE |
| ATM1 | 2010-01-01 | 132 | 94.00 | FALSE | TRUE |
| ATM1 | 2010-01-11 | 123 | 105.79 | FALSE | TRUE |
| ATM1 | 2010-01-29 | 152 | 93.12 | FALSE | TRUE |
| ATM1 | 2010-02-02 | 123 | 74.03 | FALSE | TRUE |
| ATM1 | 2010-02-05 | 138 | 87.88 | FALSE | TRUE |
The audit table documents how missing observations and extreme outliers were handled. For ATM1, ATM2, and ATM4, I use the cleaned value for modeling. For ATM3, I keep the long sequence of zeros because it looks like the machine was inactive for most of the history rather than randomly missing.
# Compare raw cash values with the cleaned values used in modeling.
atm_clean %>%
ggplot(aes(x = DATE)) +
geom_line(aes(y = Cash), alpha = 0.45) +
geom_line(aes(y = Cash_Clean), linewidth = 0.5) +
facet_wrap(~ ATM, scales = "free_y", ncol = 2) +
labs(
title = "ATM daily cash withdrawals: raw and cleaned series",
x = "Date",
y = "Cash in hundreds of dollars"
)
The plot shows that ATM1, ATM2, and ATM4 have weekly movement but different levels. ATM4 has the largest scale and the most volatility. ATM3 is different because it is almost all zeros and only becomes active near the end of the data set.
I compared ATM1, ATM2, and ATM4 using the last 31 days as a holdout set. The models tested were mean, naive, seasonal naive, ETS, ARIMA, and STL plus ETS. This makes the model choice more transparent because each ATM is evaluated using the same holdout period.
atm_model_names <- c("Mean", "Naive", "Seasonal Naive", "ETS", "ARIMA", "STL + ETS")
# compare the class models using the last 31 days as a holdout set
atm_model_metrics <- bind_rows(lapply(c("ATM1", "ATM2", "ATM4"), function(atm_name) {
y <- atm_clean %>%
filter(ATM == atm_name) %>%
arrange(DATE) %>%
pull(Cash_Clean)
evaluate_models(
y_values = y,
series_name = atm_name,
frequency_value = 7,
holdout_n = 31,
model_names = atm_model_names
)
})) %>%
arrange(Series, RMSE)
atm_model_metrics %>%
mutate(across(c(RMSE, MAE, MAPE), ~ round(.x, 2))) %>%
kable(caption = "Part A holdout model comparison for ATM1, ATM2, and ATM4")
| Series | Model | RMSE | MAE | MAPE |
|---|---|---|---|---|
| ATM1 | ARIMA | 13.39 | 10.42 | 49.71 |
| ATM1 | ETS | 13.77 | 10.68 | 53.72 |
| ATM1 | Seasonal Naive | 15.86 | 13.16 | 70.79 |
| ATM1 | STL + ETS | 16.75 | 13.09 | 85.40 |
| ATM1 | Mean | 32.16 | 20.21 | 314.36 |
| ATM1 | Naive | 67.20 | 63.29 | 126.73 |
| ATM2 | ETS | 13.41 | 10.46 | 26.15 |
| ATM2 | STL + ETS | 15.92 | 12.49 | 102.61 |
| ATM2 | ARIMA | 16.16 | 11.88 | 88.32 |
| ATM2 | Seasonal Naive | 16.39 | 13.52 | 40.36 |
| ATM2 | Mean | 37.39 | 32.16 | 693.07 |
| ATM2 | Naive | 71.32 | 60.83 | 100.00 |
| ATM4 | STL + ETS | 245.09 | 203.15 | 525.96 |
| ATM4 | ARIMA | 275.94 | 233.11 | 1030.13 |
| ATM4 | Mean | 281.24 | 235.57 | 1119.09 |
| ATM4 | ETS | 303.89 | 255.06 | 1227.94 |
| ATM4 | Seasonal Naive | 329.58 | 260.18 | 372.84 |
| ATM4 | Naive | 466.06 | 378.19 | 117.24 |
# select the lowest RMSE model for each ATM
atm_chosen_non3 <- atm_model_metrics %>%
filter(!is.na(RMSE)) %>%
group_by(Series) %>%
arrange(RMSE, MAE, .by_group = TRUE) %>%
slice(1) %>%
ungroup() %>%
transmute(ATM = Series, Model = Model, RMSE = RMSE, MAE = MAE, MAPE = MAPE)
Based on the holdout test, I select the lowest RMSE model for ATM1, ATM2, and ATM4. If two models are close, I would prefer the simpler one, but the table above keeps the selection transparent.
ATM3 needs separate treatment because it does not have a normal full-year operating history. A mean forecast is reasonable here, but I also document why models like ETS, ARIMA, and STL are not reliable for this machine.
atm3_active <- atm_clean %>%
filter(ATM == "ATM3", Cash_Clean > 0) %>%
arrange(DATE)
atm3_active_n <- nrow(atm3_active)
atm3_level <- mean(atm3_active$Cash_Clean, na.rm = TRUE)
atm3_model_note <- tibble(
ATM = "ATM3",
Model = c("Mean of active days", "ETS", "ARIMA", "STL + ETS"),
Decision = c("Selected", "Not selected", "Not selected", "Not selected"),
Reason = c(
paste0("Only ", atm3_active_n, " active observations are available, so the mean of active days is the most stable choice."),
"The active history is too short for a dependable exponential smoothing model.",
"The active history is too short for a dependable ARIMA model.",
"STL requires a meaningful seasonal history, which ATM3 does not have after activation."
)
)
atm3_active %>%
select(DATE, Cash_Clean) %>%
kable(digits = 2, caption = "ATM3 active observations used for the mean forecast")
| DATE | Cash_Clean |
|---|---|
| 2010-04-28 | 96 |
| 2010-04-29 | 82 |
| 2010-04-30 | 85 |
atm3_model_note %>%
kable(caption = "ATM3 model decision")
| ATM | Model | Decision | Reason |
|---|---|---|---|
| ATM3 | Mean of active days | Selected | Only 3 active observations are available, so the mean of active days is the most stable choice. |
| ATM3 | ETS | Not selected | The active history is too short for a dependable exponential smoothing model. |
| ATM3 | ARIMA | Not selected | The active history is too short for a dependable ARIMA model. |
| ATM3 | STL + ETS | Not selected | STL requires a meaningful seasonal history, which ATM3 does not have after activation. |
atm3_choice <- tibble(
ATM = "ATM3",
Model = "Mean of active days",
RMSE = NA_real_,
MAE = NA_real_,
MAPE = NA_real_
)
atm_chosen <- bind_rows(atm_chosen_non3, atm3_choice) %>%
arrange(ATM)
For ATM3, I use the mean of the active observations. I do not use the long zero period to pull the forecast down because those zeros appear to represent a machine that was not active, not normal customer behavior after activation.
may_dates <- seq(as.Date("2010-05-01"), as.Date("2010-05-31"), by = "day")
make_atm_final_forecast <- function(atm_name) {
chosen_model <- atm_chosen %>%
filter(ATM == atm_name) %>%
pull(Model)
if (atm_name == "ATM3") {
return(tibble(
Date = may_dates,
ATM = atm_name,
Model = chosen_model,
Forecast_Cash_Hundreds = atm3_level
))
}
y <- atm_clean %>%
filter(ATM == atm_name) %>%
arrange(DATE) %>%
pull(Cash_Clean)
fc <- forecast_candidate(ts(y, frequency = 7), h = length(may_dates), model_name = chosen_model)
tibble(
Date = may_dates,
ATM = atm_name,
Model = chosen_model,
Forecast_Cash_Hundreds = as.numeric(fc$mean)
)
}
atm_final <- bind_rows(lapply(atm_names, make_atm_final_forecast)) %>%
arrange(Date, ATM)
atm_final %>%
filter(Date <= as.Date("2010-05-03")) %>%
kable(digits = 2, caption = "Part A forecast sample for the first three days of May 2010")
| Date | ATM | Model | Forecast_Cash_Hundreds |
|---|---|---|---|
| 2010-05-01 | ATM1 | ARIMA | 86.42 |
| 2010-05-01 | ATM2 | ETS | 71.67 |
| 2010-05-01 | ATM3 | Mean of active days | 87.67 |
| 2010-05-01 | ATM4 | STL + ETS | 410.89 |
| 2010-05-02 | ATM1 | ARIMA | 100.34 |
| 2010-05-02 | ATM2 | ETS | 75.28 |
| 2010-05-02 | ATM3 | Mean of active days | 87.67 |
| 2010-05-02 | ATM4 | STL + ETS | 454.86 |
| 2010-05-03 | ATM1 | ARIMA | 75.36 |
| 2010-05-03 | ATM2 | ETS | 11.07 |
| 2010-05-03 | ATM3 | Mean of active days | 87.67 |
| 2010-05-03 | ATM4 | STL + ETS | 446.56 |
The final ATM forecast uses the cleaned daily series after missing values and possible outliers were handled. ATM1, ATM2, and ATM4 use the model with the lowest holdout RMSE. ATM3 is handled separately because it was inactive for most of the year, so using the mean of the active days is more reasonable than forcing a full seasonal model.
The goal in Part B is to model residential power usage from January 1998 through December 2013 and forecast the monthly values for 2014.
This series is monthly, long, and seasonal. For utility usage, a model that captures a smooth trend and repeated yearly seasonality is a natural choice.
power_data <- power_raw %>%
transmute(
Date = as.Date(paste0(`YYYY-MMM`, "-01"), format = "%Y-%b-%d"),
KWH = as.numeric(KWH)
) %>%
arrange(Date)
power_ts_raw <- ts(power_data$KWH, start = c(1998, 1), frequency = 12)
# clean the power series by filling the missing month and replacing possible outliers
power_ts_clean <- tsclean(power_ts_raw, replace.missing = TRUE)
power_data <- power_data %>%
mutate(
KWH_Clean = as.numeric(power_ts_clean),
missing_flag = is.na(KWH),
outlier_flag = !missing_flag & !is.na(KWH) & abs(KWH - KWH_Clean) > 1e-8
)
power_audit <- power_data %>%
summarise(
First_Month = min(Date),
Last_Month = max(Date),
Months = n(),
Missing_Months = sum(missing_flag),
Extreme_Outliers_Replaced = sum(outlier_flag)
)
power_audit %>%
kable(caption = "Part B data quality audit")
| First_Month | Last_Month | Months | Missing_Months | Extreme_Outliers_Replaced |
|---|---|---|---|---|
| 1998-01-01 | 2013-12-01 | 192 | 1 | 2 |
power_data %>%
filter(missing_flag | outlier_flag) %>%
select(Date, KWH, KWH_Clean, missing_flag, outlier_flag) %>%
kable(digits = 0, caption = "Part B missing values and outliers handled")
| Date | KWH | KWH_Clean | missing_flag | outlier_flag |
|---|---|---|---|---|
| 2008-09-01 | NA | 7436335 | TRUE | FALSE |
| 2010-07-01 | 770523 | 7686742 | FALSE | TRUE |
| 2013-12-01 | 9606304 | 6987874 | FALSE | TRUE |
power_data %>%
ggplot(aes(x = Date)) +
geom_line(aes(y = KWH), alpha = 0.45) +
geom_line(aes(y = KWH_Clean), linewidth = 0.5) +
labs(
title = "Residential monthly power usage: raw and cleaned series",
x = "Date",
y = "KWH"
)
The plot shows a strong yearly seasonal pattern and a changing long-term level. This supports testing models that can handle seasonality, such as ETS, ARIMA, seasonal naive, and STL-based forecasting.
I use the last 12 months as a holdout set. This is a natural choice because the final forecast also needs 12 months ahead.
power_model_names <- c("Mean", "Naive", "Seasonal Naive", "ETS", "ARIMA", "STL + ETS", "Holt-Winters Additive")
# compare power models using the last 12 months as the holdout period
power_model_metrics <- evaluate_models(
y_values = as.numeric(power_ts_clean),
series_name = "Residential Power",
frequency_value = 12,
holdout_n = 12,
model_names = power_model_names
) %>%
arrange(RMSE)
power_model_metrics %>%
mutate(across(c(RMSE, MAE, MAPE), ~ round(.x, 2))) %>%
kable(caption = "Part B holdout model comparison")
| Series | Model | RMSE | MAE | MAPE |
|---|---|---|---|---|
| Residential Power | Seasonal Naive | 579612.4 | 400403.1 | 4.91 |
| Residential Power | STL + ETS | 620155.5 | 402249.2 | 5.04 |
| Residential Power | ARIMA | 628999.0 | 444805.7 | 5.51 |
| Residential Power | Holt-Winters Additive | 698867.0 | 431084.6 | 5.37 |
| Residential Power | ETS | 705554.1 | 422014.3 | 5.18 |
| Residential Power | Naive | 1651544.2 | 1283217.5 | 15.82 |
| Residential Power | Mean | 1721859.6 | 1313853.7 | 15.96 |
power_chosen <- power_model_metrics %>%
filter(!is.na(RMSE)) %>%
arrange(RMSE, MAE) %>%
slice(1)
power_chosen_model <- power_chosen$Model[1]
power_chosen %>%
mutate(across(c(RMSE, MAE, MAPE), ~ round(.x, 2))) %>%
kable(caption = "Part B selected model based on holdout RMSE")
| Series | Model | RMSE | MAE | MAPE |
|---|---|---|---|---|
| Residential Power | Seasonal Naive | 579612.4 | 400403.1 | 4.91 |
This comparison gives a direct reason for the final model choice. I use the model with the lowest holdout RMSE for the 2014 forecast.
power_fit <- forecast_candidate(power_ts_clean, h = 12, model_name = power_chosen_model)
power_final <- tibble(
Month = seq(as.Date("2014-01-01"), as.Date("2014-12-01"), by = "month"),
Model = power_chosen_model,
Forecast_KWH = as.numeric(power_fit$mean)
)
power_final %>%
mutate(
Month = format(Month, "%Y-%m"),
Forecast_KWH = format(round(Forecast_KWH, 0), big.mark = ",", scientific = FALSE)
) %>%
kable(caption = "Part B 2014 monthly forecast")
| Month | Model | Forecast_KWH |
|---|---|---|
| 2014-01 | Seasonal Naive | 10,655,730 |
| 2014-02 | Seasonal Naive | 7,681,798 |
| 2014-03 | Seasonal Naive | 6,517,514 |
| 2014-04 | Seasonal Naive | 6,105,359 |
| 2014-05 | Seasonal Naive | 5,940,475 |
| 2014-06 | Seasonal Naive | 7,920,627 |
| 2014-07 | Seasonal Naive | 8,415,321 |
| 2014-08 | Seasonal Naive | 9,080,226 |
| 2014-09 | Seasonal Naive | 7,968,220 |
| 2014-10 | Seasonal Naive | 5,759,367 |
| 2014-11 | Seasonal Naive | 5,769,083 |
| 2014-12 | Seasonal Naive | 6,987,874 |
plot(power_fit, main = paste("Residential Power Forecast for 2014 -", power_chosen_model))
The residential power forecast now uses a more complete process than my first submission. I filled the missing month, checked possible outliers, and compared several seasonal forecasting models. The final 2014 forecast uses the model with the lowest holdout RMSE.
The goal in Part C is to time base sequence the waterflow data, aggregate the readings by hour using the mean, check whether the data is stable enough to forecast, and then produce a one week forecast.
The two waterflow files have different timestamp structures. Pipe1 has irregular timestamps and Pipe2 is already hourly, but I still aggregate both to hourly mean flow so the treatment is consistent.
prepare_pipe <- function(data) {
data %>%
mutate(hour = floor_date(`Date Time`, unit = "hour")) %>%
group_by(hour) %>%
summarise(WaterFlow = mean(WaterFlow, na.rm = TRUE), .groups = "drop") %>%
arrange(hour) %>%
complete(hour = seq(min(hour), max(hour), by = "hour")) %>%
mutate(
missing_flag = is.na(WaterFlow),
WaterFlow = na.approx(WaterFlow, na.rm = FALSE),
WaterFlow = na.locf(WaterFlow, na.rm = FALSE),
WaterFlow = na.locf(WaterFlow, fromLast = TRUE, na.rm = FALSE)
)
}
pipe1_hourly <- prepare_pipe(pipe1_raw)
pipe2_hourly <- prepare_pipe(pipe2_raw)
pipe_audit <- tibble(
Pipe = c("Pipe1", "Pipe2"),
First_Hour = c(min(pipe1_hourly$hour), min(pipe2_hourly$hour)),
Last_Hour = c(max(pipe1_hourly$hour), max(pipe2_hourly$hour)),
Hours = c(nrow(pipe1_hourly), nrow(pipe2_hourly)),
Filled_Hours = c(sum(pipe1_hourly$missing_flag), sum(pipe2_hourly$missing_flag))
)
pipe_audit %>%
kable(caption = "Part C hourly aggregation audit")
| Pipe | First_Hour | Last_Hour | Hours | Filled_Hours |
|---|---|---|---|---|
| Pipe1 | 2015-10-23 00:00:00 | 2015-11-01 23:00:00 | 240 | 4 |
| Pipe2 | 2015-10-23 01:00:00 | 2015-12-03 16:00:00 | 1000 | 333 |
head(pipe1_hourly)
head(pipe2_hourly)
par(mfrow = c(2, 1))
plot(pipe1_hourly$hour, pipe1_hourly$WaterFlow, type = "l",
main = "Pipe1 hourly mean flow", xlab = "Hour", ylab = "WaterFlow")
plot(pipe2_hourly$hour, pipe2_hourly$WaterFlow, type = "l",
main = "Pipe2 hourly mean flow", xlab = "Hour", ylab = "WaterFlow")
par(mfrow = c(1, 1))
After hourly aggregation, both pipe series are easier to model. Pipe1 has a shorter history and more visible level movement, while Pipe2 has a longer history.
pipe1_ts <- ts(pipe1_hourly$WaterFlow, frequency = 24)
pipe2_ts <- ts(pipe2_hourly$WaterFlow, frequency = 24)
pipe_stationarity <- tibble(
Series = c("Pipe1", "Pipe2"),
ndiffs = c(ndiffs(pipe1_ts), ndiffs(pipe2_ts)),
nsdiffs = c(nsdiffs(pipe1_ts), nsdiffs(pipe2_ts))
)
pipe_stationarity %>%
kable(caption = "Part C differencing check")
| Series | ndiffs | nsdiffs |
|---|---|---|
| Pipe1 | 0 | 0 |
| Pipe2 | 1 | 0 |
pipe1_fit <- holt(pipe1_ts, h = 168)
pipe2_fit <- forecast(ets(pipe2_ts), h = 168)
pipe1_future_time <- seq(max(pipe1_hourly$hour) + hours(1), by = "hour", length.out = 168)
pipe2_future_time <- seq(max(pipe2_hourly$hour) + hours(1), by = "hour", length.out = 168)
pipe1_final <- tibble(
DateTime = pipe1_future_time,
Model = "Holt linear trend on hourly mean flow",
Forecast_WaterFlow = as.numeric(pipe1_fit$mean)
)
pipe2_final <- tibble(
DateTime = pipe2_future_time,
Model = "ETS on hourly mean flow",
Forecast_WaterFlow = as.numeric(pipe2_fit$mean)
)
pipe1_final %>%
head(12) %>%
kable(digits = 4, caption = "Part C Pipe1 forecast sample")
| DateTime | Model | Forecast_WaterFlow |
|---|---|---|
| 2015-11-02 00:00:00 | Holt linear trend on hourly mean flow | 20.1320 |
| 2015-11-02 01:00:00 | Holt linear trend on hourly mean flow | 20.1324 |
| 2015-11-02 02:00:00 | Holt linear trend on hourly mean flow | 20.1328 |
| 2015-11-02 03:00:00 | Holt linear trend on hourly mean flow | 20.1333 |
| 2015-11-02 04:00:00 | Holt linear trend on hourly mean flow | 20.1337 |
| 2015-11-02 05:00:00 | Holt linear trend on hourly mean flow | 20.1341 |
| 2015-11-02 06:00:00 | Holt linear trend on hourly mean flow | 20.1345 |
| 2015-11-02 07:00:00 | Holt linear trend on hourly mean flow | 20.1350 |
| 2015-11-02 08:00:00 | Holt linear trend on hourly mean flow | 20.1354 |
| 2015-11-02 09:00:00 | Holt linear trend on hourly mean flow | 20.1358 |
| 2015-11-02 10:00:00 | Holt linear trend on hourly mean flow | 20.1363 |
| 2015-11-02 11:00:00 | Holt linear trend on hourly mean flow | 20.1367 |
pipe2_final %>%
head(12) %>%
kable(digits = 4, caption = "Part C Pipe2 forecast sample")
| DateTime | Model | Forecast_WaterFlow |
|---|---|---|
| 2015-12-03 17:00:00 | ETS on hourly mean flow | 44.3658 |
| 2015-12-03 18:00:00 | ETS on hourly mean flow | 44.3658 |
| 2015-12-03 19:00:00 | ETS on hourly mean flow | 44.3658 |
| 2015-12-03 20:00:00 | ETS on hourly mean flow | 44.3658 |
| 2015-12-03 21:00:00 | ETS on hourly mean flow | 44.3658 |
| 2015-12-03 22:00:00 | ETS on hourly mean flow | 44.3658 |
| 2015-12-03 23:00:00 | ETS on hourly mean flow | 44.3658 |
| 2015-12-04 00:00:00 | ETS on hourly mean flow | 44.3658 |
| 2015-12-04 01:00:00 | ETS on hourly mean flow | 44.3658 |
| 2015-12-04 02:00:00 | ETS on hourly mean flow | 44.3658 |
| 2015-12-04 03:00:00 | ETS on hourly mean flow | 44.3658 |
| 2015-12-04 04:00:00 | ETS on hourly mean flow | 44.3658 |
I converted both waterflow files into regular hourly series by taking the mean of all readings within each hour. After filling the missing hourly gaps, I checked whether differencing was needed. Since this was only a one-week forecast, I kept the final methods simple and easy to interpret.
The next chunk saves all final forecast tables into one Excel workbook. This is useful because the project instructions ask for an Excel readable forecast file.
model_summary <- bind_rows(
atm_chosen %>%
transmute(
Part = paste("Part A", ATM),
Chosen_Model = Model,
Holdout_RMSE = RMSE,
Holdout_MAE = MAE,
Holdout_MAPE = MAPE
),
tibble(
Part = "Part B Power",
Chosen_Model = power_chosen_model,
Holdout_RMSE = power_chosen$RMSE[1],
Holdout_MAE = power_chosen$MAE[1],
Holdout_MAPE = power_chosen$MAPE[1]
),
tibble(
Part = c("Part C Pipe1", "Part C Pipe2"),
Chosen_Model = c("Holt linear trend on hourly mean flow", "ETS on hourly mean flow"),
Holdout_RMSE = NA_real_,
Holdout_MAE = NA_real_,
Holdout_MAPE = NA_real_
)
)
data_quality_audit <- bind_rows(
atm_audit %>%
transmute(
Section = paste("Part A", ATM),
Item_1 = as.character(First_Date),
Item_2 = as.character(Last_Date),
Count_1_Name = "Missing values or missing dates",
Count_1 = Missing_Values_or_Missing_Dates,
Count_2_Name = "Extreme outliers replaced",
Count_2 = Extreme_Outliers_Replaced
),
power_audit %>%
transmute(
Section = "Part B Power",
Item_1 = as.character(First_Month),
Item_2 = as.character(Last_Month),
Count_1_Name = "Missing months",
Count_1 = Missing_Months,
Count_2_Name = "Extreme outliers replaced",
Count_2 = Extreme_Outliers_Replaced
),
pipe_audit %>%
transmute(
Section = paste("Part C", Pipe),
Item_1 = as.character(First_Hour),
Item_2 = as.character(Last_Hour),
Count_1_Name = "Filled hourly gaps",
Count_1 = Filled_Hours,
Count_2_Name = "Extreme outliers replaced",
Count_2 = NA_real_
)
)
write_xlsx(
list(
Model_Summary = model_summary,
Data_Quality_Audit = data_quality_audit,
Part_A_Model_Comparison = atm_model_metrics,
Part_A_ATM3_Decision = atm3_model_note,
Part_B_Model_Comparison = power_model_metrics,
ATM_May2010_Forecast = atm_final,
Power_2014_Forecast = power_final,
Pipe1_1Week_Forecast = pipe1_final,
Pipe2_1Week_Forecast = pipe2_final
),
"Project1_Revised_Final_Forecasts.xlsx"
)