Libraries

library(dplyr)
library(tidyverse)
library(ggplot2)
library(forecast)
library(lubridate)
library(ggfortify)
library(tsibble)
library(tseries)

Loading the data

df1 <- read.csv("data to csv/ATM624Data.csv")
dim(df1)
## [1] 1474    3
df2 <- read.csv("data to csv/ResidentialCustomerForecastLoad-624.csv")
dim(df2)
## [1] 192   3
df3 <- read.csv("data to csv/Waterflow_Pipe1.csv")
dim(df3)
## [1] 1000    2
df4 <- read.csv("data to csv/Waterflow_Pipe2.csv")
dim(df4)
## [1] 1000    2

Part A: ATM Forecast

This is the ATM data from 4 ATMs. I want to forecast cash withdrawals for May 2010 using the preceding 12 months of data.

str(df1)
## 'data.frame':    1474 obs. of  3 variables:
##  $ DATE: chr  "5/1/2009 12:00:00 AM" "5/1/2009 12:00:00 AM" "5/2/2009 12:00:00 AM" "5/2/2009 12:00:00 AM" ...
##  $ ATM : chr  "ATM1" "ATM2" "ATM1" "ATM2" ...
##  $ Cash: int  96 107 82 89 85 90 90 55 99 79 ...

Data preprocessing

atm_data <- df1

# working with a date-time object
atm_data$DATE <- as.POSIXct(atm_data$DATE, format = "%m/%d/%Y %I:%M:%S %p")

# summary
summary(atm_data)
##       DATE                           ATM                 Cash        
##  Min.   :2009-05-01 00:00:00.0   Length:1474        Min.   :    0.0  
##  1st Qu.:2009-08-01 00:00:00.0   Class :character   1st Qu.:    0.5  
##  Median :2009-11-01 00:00:00.0   Mode  :character   Median :   73.0  
##  Mean   :2009-10-31 19:33:27.5                      Mean   :  155.6  
##  3rd Qu.:2010-02-01 00:00:00.0                      3rd Qu.:  114.0  
##  Max.   :2010-05-14 00:00:00.0                      Max.   :10920.0  
##                                                     NA's   :19
# sum up daily cash usage per ATM
atm_daily <- atm_data %>%
  group_by(DATE, ATM) %>%
  summarise(DailyCash = sum(Cash), .groups = "drop")

head(atm_daily)
## # A tibble: 6 × 3
##   DATE                ATM   DailyCash
##   <dttm>              <chr>     <int>
## 1 2009-05-01 00:00:00 ATM1         96
## 2 2009-05-01 00:00:00 ATM2        107
## 3 2009-05-01 00:00:00 ATM3          0
## 4 2009-05-01 00:00:00 ATM4        777
## 5 2009-05-02 00:00:00 ATM1         82
## 6 2009-05-02 00:00:00 ATM2         89
unique(atm_data$ATM)
## [1] "ATM1" "ATM2" ""     "ATM3" "ATM4"
table(atm_data$ATM)
## 
##      ATM1 ATM2 ATM3 ATM4 
##   14  365  365  365  365
# the 14 blank-ATM rows are May 2010 placeholders (confirmed below)

# converting amounts to dollars rather than 100's of dollars
atm_daily <- atm_daily %>%
  mutate(DailyCash = DailyCash * 100)
head(atm_daily)
## # A tibble: 6 × 3
##   DATE                ATM   DailyCash
##   <dttm>              <chr>     <dbl>
## 1 2009-05-01 00:00:00 ATM1       9600
## 2 2009-05-01 00:00:00 ATM2      10700
## 3 2009-05-01 00:00:00 ATM3          0
## 4 2009-05-01 00:00:00 ATM4      77700
## 5 2009-05-02 00:00:00 ATM1       8200
## 6 2009-05-02 00:00:00 ATM2       8900

The data is now wrangled into a form that is easier to work with.

EDA

I want to explore the data further to get a better understanding of it.

# checking missing values in cash
sum(is.na(atm_data$Cash)) 
## [1] 19
atm_data %>% filter(is.na(Cash)) %>% arrange(ATM, DATE)
##          DATE  ATM Cash
## 1  2010-05-01        NA
## 2  2010-05-02        NA
## 3  2010-05-03        NA
## 4  2010-05-04        NA
## 5  2010-05-05        NA
## 6  2010-05-06        NA
## 7  2010-05-07        NA
## 8  2010-05-08        NA
## 9  2010-05-09        NA
## 10 2010-05-10        NA
## 11 2010-05-11        NA
## 12 2010-05-12        NA
## 13 2010-05-13        NA
## 14 2010-05-14        NA
## 15 2009-06-13 ATM1   NA
## 16 2009-06-16 ATM1   NA
## 17 2009-06-22 ATM1   NA
## 18 2009-06-18 ATM2   NA
## 19 2009-06-24 ATM2   NA
# save the May 2010 placeholder rows (blank ATM, Cash is NA) in case they're needed later
future_dates <- atm_data %>% filter(ATM == "")

atm_data %>%
  group_by(ATM) %>%
  summarise(
    first_date = min(DATE),
    last_date = max(DATE),
    num_days = n()
  )
## # A tibble: 5 × 4
##   ATM    first_date          last_date           num_days
##   <chr>  <dttm>              <dttm>                 <int>
## 1 ""     2010-05-01 00:00:00 2010-05-14 00:00:00       14
## 2 "ATM1" 2009-05-01 00:00:00 2010-04-30 00:00:00      365
## 3 "ATM2" 2009-05-01 00:00:00 2010-04-30 00:00:00      365
## 4 "ATM3" 2009-05-01 00:00:00 2010-04-30 00:00:00      365
## 5 "ATM4" 2009-05-01 00:00:00 2010-04-30 00:00:00      365

I’ll start by working on ATM1, and then see how the approach carries over to the other ATMs.

Creating a time series object for ATM1 to experiment with:

# ATM1 only
atm1_data <- atm_data %>%
  filter(ATM == "ATM1", !is.na(Cash)) %>%
  arrange(DATE)

# date range & rows
summary(atm1_data$DATE)
##                       Min.                    1st Qu. 
## "2009-05-01 00:00:00.0000" "2009-08-02 06:00:00.0000" 
##                     Median                       Mean 
## "2009-10-31 12:00:00.0000" "2009-10-31 03:13:05.6353" 
##                    3rd Qu.                       Max. 
## "2010-01-29 18:00:00.0000" "2010-04-30 00:00:00.0000"
nrow(atm1_data)
## [1] 362
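# note: dropping the 3 NA days shifts the 7-day cycle alignment after each
# gap; an alternative sketch is to keep all 365 days and interpolate the
# gaps instead (imputeTS is attached later, in Part B):
#   atm1_full <- atm_data %>% filter(ATM == "ATM1") %>% arrange(DATE)
#   ts_atm1 <- imputeTS::na_interpolation(ts(atm1_full$Cash, frequency = 7))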
# time series object
ts_atm1 <- ts(atm1_data$Cash, frequency = 7)
# explore it
autoplot(ts_atm1) +
  labs(title = "ATM1 Daily Withdrawals", y = "Cash (hundreds of dollars)", x = "Time")

The plot shows high-frequency fluctuation with a recurring weekly pattern. There is some moderate variation in the mean level over time (possibly calendar or event related), but no strong long-term trend.
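To make the weekly pattern explicit, a day-of-week subseries plot is a quick optional check (with frequency = 7, the “seasons” are days of the week):

# one panel per day of the week, with the per-day mean overlaid
ggsubseriesplot(ts_atm1) +
  labs(title = "ATM1 Withdrawals by Day of Week", y = "Cash (hundreds of dollars)")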

Now moving to decomposition:

atm1_decomp <- decompose(ts_atm1, type = "additive")
autoplot(atm1_decomp)
## Warning: Removed 12 rows containing missing values or values outside the scale range
## (`geom_line()`).

The trend component shows moderate undulation, possibly cyclical rather than directional. There is a clear weekly pattern (peaks and troughs roughly every 7 days). The seasonal + trend decomposition appears to capture the structure well.

Model Fitting and Forecasting

# ARIMA
model_arima <- auto.arima(ts_atm1)
summary(model_arima)
## Series: ts_atm1 
## ARIMA(0,0,1)(0,1,1)[7] 
## 
## Coefficients:
##          ma1     sma1
##       0.1674  -0.5813
## s.e.  0.0565   0.0472
## 
## sigma^2 = 666.6:  log likelihood = -1658.32
## AIC=3322.63   AICc=3322.7   BIC=3334.25
## 
## Training set error measures:
##                       ME     RMSE      MAE       MPE     MAPE      MASE
## Training set -0.09700089 25.49555 16.17552 -106.2792 122.4165 0.8594986
##                     ACF1
## Training set -0.01116544
# Forecasting next 31 days
forecast_arima <- forecast(model_arima, h = 31)

autoplot(forecast_arima) +
  labs(title = "ATM1 Forecast for May 2010 (ARIMA)", y = "Cash (hundreds)", x = "Date")

The ARIMA forecast captured seasonality and gave a reasonable 31-day projection. Confidence intervals expand (as expected), showing increasing uncertainty. The model summary was:

  • RMSE: ~25.5 (units = hundreds of dollars)
  • MAE: ~16.2 (same units)
  • MAPE: high (~122%), inflated by division by near-zero withdrawal values
  • ACF1 near zero → residuals are not strongly autocorrelated (checked directly below)
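To back up that last point, forecast’s built-in diagnostic combines a Ljung-Box test with residual plots; a quick optional check:

# residual time plot, ACF, histogram, and Ljung-Box test in one call
checkresiduals(model_arima)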

What I have learned so far about the data:

  • It is well suited to seasonal ARIMA (SARIMA)
  • A weekly seasonal pattern is very prominent and well captured above
  • The forecast seems stable and aligned with past trends
# ETS model (Exponential Smoothing)
model_ets <- ets(ts_atm1)
forecast_ets <- forecast(model_ets, h = 31)

autoplot(forecast_ets) +
  labs(title = "ATM1 Forecast for May 2010 (ETS)", y = "Cash (hundreds)", x = "Date")

The ETS forecast is visually very similar to ARIMA’s and still shows seasonality. ETS may be more responsive to recent levels than ARIMA; the side-by-side accuracy comparison below puts rough numbers on that impression.
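Comparing the in-sample accuracy of the two models side by side (training-set measures only, so this says nothing about out-of-sample performance):

# in-sample error measures for both models
acc <- rbind(accuracy(model_arima), accuracy(model_ets))
rownames(acc) <- c("ARIMA", "ETS")
acc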

# forecast values as a df
atm1_forecast_df <- data.frame(
  DATE = seq(as.Date("2010-05-01"), by = "day", length.out = 31),
  ATM = "ATM1",
  Forecast = as.numeric(forecast_arima$mean),
  Lo80 = as.numeric(forecast_arima$lower[, 1]),
  Hi80 = as.numeric(forecast_arima$upper[, 1]),
  Lo95 = as.numeric(forecast_arima$lower[, 2]),
  Hi95 = as.numeric(forecast_arima$upper[, 2])
)

# avoiding negative lower bounds
atm1_forecast_df <- atm1_forecast_df %>%
  mutate(Lo80 = pmax(0, Lo80),
         Lo95 = pmax(0, Lo95))

head(atm1_forecast_df)
##         DATE  ATM   Forecast     Lo80      Hi80     Lo95      Hi95
## 1 2010-05-01 ATM1  86.805607 53.71784 119.89337 36.20224 137.40898
## 2 2010-05-02 ATM1 100.640560 67.09247 134.18865 49.33319 151.94793
## 3 2010-05-03 ATM1  74.714560 41.16647 108.26265 23.40719 126.02193
## 4 2010-05-04 ATM1   4.762635  0.00000  38.31072  0.00000  56.07001
## 5 2010-05-05 ATM1 100.063192 66.51510 133.61128 48.75582 151.37057
## 6 2010-05-06 ATM1  79.356704 45.80862 112.90479 28.04933 130.66408

Now I am going to do this for all 4 ATMs:

# function to make things easier
forecast_one_atm <- function(atm_name, full_data, h = 31) {
  atm_df <- full_data %>%
    filter(ATM == atm_name, !is.na(Cash)) %>%
    arrange(DATE)

  if (nrow(atm_df) < 365) {
    warning(paste("ATM", atm_name, "has fewer than 365 days."))
  }

  ts_data <- ts(atm_df$Cash, frequency = 7)
  model <- auto.arima(ts_data)
  fc <- forecast(model, h = h)

  data.frame(
    DATE = seq(as.Date("2010-05-01"), by = "day", length.out = h),
    ATM = atm_name,
    Forecast = as.numeric(fc$mean),
    Lo80 = pmax(0, as.numeric(fc$lower[,1])),  # Cap lower bounds at 0
    Hi80 = as.numeric(fc$upper[,1]),
    Lo95 = pmax(0, as.numeric(fc$lower[,2])),
    Hi95 = as.numeric(fc$upper[,2])
  )
}

# apply function
atm_names <- c("ATM1", "ATM2", "ATM3", "ATM4")

all_forecasts <- lapply(atm_names, forecast_one_atm, full_data = atm_data)
## Warning in FUN(X[[i]], ...): ATM ATM1 has fewer than 365 days.
## Warning in FUN(X[[i]], ...): ATM ATM2 has fewer than 365 days.
combined_forecast_df <- bind_rows(all_forecasts)

# Preview
head(combined_forecast_df)
##         DATE  ATM   Forecast     Lo80      Hi80     Lo95      Hi95
## 1 2010-05-01 ATM1  86.805607 53.71784 119.89337 36.20224 137.40898
## 2 2010-05-02 ATM1 100.640560 67.09247 134.18865 49.33319 151.94793
## 3 2010-05-03 ATM1  74.714560 41.16647 108.26265 23.40719 126.02193
## 4 2010-05-04 ATM1   4.762635  0.00000  38.31072  0.00000  56.07001
## 5 2010-05-05 ATM1 100.063192 66.51510 133.61128 48.75582 151.37057
## 6 2010-05-06 ATM1  79.356704 45.80862 112.90479 28.04933 130.66408

Now I have a clean combined forecast dataset for all 4 ATMs for May 1–31, 2010 using ARIMA models.

ggplot(combined_forecast_df, aes(x = DATE, y = Forecast)) +
  geom_line(color = "blue") +
  geom_ribbon(aes(ymin = Lo95, ymax = Hi95), fill = "blue", alpha = 0.2) +
  facet_wrap(~ ATM, ncol = 2, scales = "free_y") +
  labs(title = "Forecasts for May 2010 – All ATMs (ARIMA)",
       y = "Cash (hundreds of dollars)",
       x = "Date")

Saving for potential use later:

write.csv(combined_forecast_df, "ATM_Forecast_May2010.csv", row.names = FALSE)

Problem

However, there is a problem. ATMs 1 and 2 show reasonable forecasts with weekly seasonal structure that matches historical values, but this is not true for ATMs 3 and 4! Their forecasts are flat and near zero, with very wide confidence intervals. I need to investigate further rather than apply the same method across all ATMs.

atm_data %>%
  filter(ATM %in% c("ATM1", "ATM2", "ATM3", "ATM4")) %>%
  group_by(ATM) %>%
  summarise(
    min_cash = min(Cash, na.rm = TRUE),
    max_cash = max(Cash, na.rm = TRUE),
    mean_cash = mean(Cash, na.rm = TRUE),
    median_cash = median(Cash, na.rm = TRUE),
    sd_cash = sd(Cash, na.rm = TRUE),
    q25 = quantile(Cash, 0.25, na.rm = TRUE),
    q75 = quantile(Cash, 0.75, na.rm = TRUE)
  )
## # A tibble: 4 × 8
##   ATM   min_cash max_cash mean_cash median_cash sd_cash   q25   q75
##   <chr>    <int>    <int>     <dbl>       <dbl>   <dbl> <dbl> <dbl>
## 1 ATM1         1      180    83.9            91   36.7   73     108
## 2 ATM2         0      147    62.6            67   38.9   25.5    93
## 3 ATM3         0       96     0.721           0    7.94   0       0
## 4 ATM4         2    10920   474.            404  651.   124     705
atm_data %>%
  filter(ATM %in% c("ATM1", "ATM2", "ATM3", "ATM4")) %>%
  ggplot(aes(x = Cash)) +
  geom_histogram(bins = 50, fill = "steelblue", color = "white") +
  facet_wrap(~ ATM, scales = "free") +
  labs(title = "Distribution of Cash Withdrawals by ATM",
       x = "Cash Withdrawn (hundreds of dollars)", y = "Count")
## Warning: Removed 5 rows containing non-finite outside the scale range
## (`stat_bin()`).

atm_data %>%
  filter(ATM %in% c("ATM1", "ATM2", "ATM3", "ATM4")) %>%
  ggplot(aes(x = ATM, y = Cash)) +
  geom_boxplot(fill = "skyblue") +
  scale_y_continuous(trans = "log1p") +  # Use log scale to show skew without dropping outliers
  labs(title = "Boxplot of Cash Withdrawals (Log Scale)",
       y = "Cash (hundreds of dollars, log1p transformed)",
       x = "ATM")
## Warning: Removed 5 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

The distributions for the cash withdrawals are very different. There is extreme skewness for ATMs 3 & 4.

  • ATM1: Reasonably symmetric, small spread, unimodal. Ideal for ARIMA or ETS. Weekly pattern visible. No transformation needed.
  • ATM2: Slight left skew (due to more small values), mild spread. Still ARIMA-compatible.
  • ATM3: Almost all withdrawals are zero (median = 0, mean < 1). This is not really a time series to model. It could in principle be framed as a binary classification problem (withdrawal yes/no), but with only three non-zero withdrawal days in the entire year (verified just below this list) there is far too little signal to predict anything. I will drop this ATM from the modeling and conclude that it needs investigation: either it is malfunctioning or it sits in a location with essentially no demand.
  • ATM4: Highly skewed, with massive outliers (max = 10,920) and SD > mean. A log transformation is needed; ARIMA on the raw scale would either overfit the spikes or consistently underpredict.
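Verifying the ATM3 observation directly:

# the only non-zero withdrawal days for ATM3 (just 3 in the whole year)
atm_data %>%
  filter(ATM == "ATM3", Cash > 0)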

OK, now let’s fix the models!

# ATM4 - apply log transformation
atm4_df <- atm_data %>%
  filter(ATM == "ATM4", !is.na(Cash)) %>%
  arrange(DATE) %>%
  mutate(log_cash = log1p(Cash))

ts_atm4 <- ts(atm4_df$log_cash, frequency = 7)

# ARIMA
model_arima4 <- auto.arima(ts_atm4)
fc_arima4 <- forecast(model_arima4, h = 31)

# ETS
model_ets4 <- ets(ts_atm4)
fc_ets4 <- forecast(model_ets4, h = 31)

## Back transform forecasts:
arima_df <- data.frame(
  DATE = seq(as.Date("2010-05-01"), by = "day", length.out = 31),
  ATM = "ATM4",
  Forecast = expm1(fc_arima4$mean),
  Lo80 = pmax(0, expm1(fc_arima4$lower[,1])),
  Hi80 = expm1(fc_arima4$upper[,1]),
  Lo95 = pmax(0, expm1(fc_arima4$lower[,2])),
  Hi95 = expm1(fc_arima4$upper[,2]),
  Method = "ARIMA"
)

ets_df <- data.frame(
  DATE = arima_df$DATE,
  ATM = "ATM4",
  Forecast = expm1(fc_ets4$mean),
  Lo80 = pmax(0, expm1(fc_ets4$lower[,1])),
  Hi80 = expm1(fc_ets4$upper[,1]),
  Lo95 = pmax(0, expm1(fc_ets4$lower[,2])),
  Hi95 = expm1(fc_ets4$upper[,2]),
  Method = "ETS"
)

bind_rows(arima_df, ets_df) %>%
  ggplot(aes(x = DATE, y = Forecast, color = Method)) +
  geom_line() +
  geom_ribbon(aes(ymin = Lo95, ymax = Hi95, fill = Method), alpha = 0.2, color = NA) +
  facet_wrap(~ Method) +
  labs(title = "ATM4 Forecast (Log-Transformed Models)", y = "Forecast (hundreds of dollars)", x = "Date")

AIC(model_arima4, model_ets4)
##              df      AIC
## model_arima4  4 1214.319
## model_ets4   10 2319.008
  • Visually, ARIMA shows tighter, flatter forecasts with narrower intervals, potentially underfitting the scale of past values.
  • ETS shows stronger weekly seasonality and higher peaks, better aligned with past high variability.
  • ETS is more responsive to the historical weekly spikes in ATM4, which initially made it look like the better choice.
  • Looking at the AIC, however, the seemingly more “responsive” ETS is overfitting; the ARIMA model provides a better balance of fit and parsimony (with the caveat that AIC is only loosely comparable across ARIMA and ETS model classes).
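One caveat on the manual back-transformation above: expm1() of a forecast mean returns (roughly) the median on the original scale, not the mean. The forecast package can carry a log transform internally with bias adjustment; a sketch, assuming a plain log (lambda = 0) is acceptable in place of log1p (ATM4’s minimum cash value is 2, so there are no zeros to protect against):

# fit on the raw series; lambda = 0 applies a log transform internally,
# and biasadj = TRUE back-transforms point forecasts to means
ts_atm4_raw <- ts(atm4_df$Cash, frequency = 7)
model_bc <- auto.arima(ts_atm4_raw, lambda = 0, biasadj = TRUE)
fc_bc <- forecast(model_bc, h = 31)
autoplot(fc_bc) +
  labs(title = "ATM4 Forecast (Box-Cox lambda = 0, bias-adjusted)",
       y = "Cash (hundreds of dollars)", x = "Date")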

Finalizing models

# ATM1 forecast
atm1_forecast_df$Method <- "ARIMA"

# ATM2 forecast
atm2_df <- atm_data %>%
  filter(ATM == "ATM2", !is.na(Cash)) %>%
  arrange(DATE)

ts_atm2 <- ts(atm2_df$Cash, frequency = 7)
model_atm2 <- auto.arima(ts_atm2)
fc_atm2 <- forecast(model_atm2, h = 31)

atm2_forecast_df <- data.frame(
  DATE = seq(as.Date("2010-05-01"), by = "day", length.out = 31),
  ATM = "ATM2",
  Forecast = as.numeric(fc_atm2$mean),
  Lo80 = as.numeric(fc_atm2$lower[, 1]),
  Hi80 = as.numeric(fc_atm2$upper[, 1]),
  Lo95 = as.numeric(fc_atm2$lower[, 2]),
  Hi95 = as.numeric(fc_atm2$upper[, 2]),
  Method = "ARIMA"
)

# Combine
combined_forecast_df <- bind_rows(atm1_forecast_df, atm2_forecast_df, arima_df)

# export forecasts
write.csv(combined_forecast_df, "ATM_Forecast_May2010_Final.csv", row.names = FALSE)

# visual summary
ggplot(combined_forecast_df, aes(x = DATE, y = Forecast)) +
  geom_line(color = "blue") +
  geom_ribbon(aes(ymin = Lo95, ymax = Hi95), fill = "lightblue", alpha = 0.3) +
  facet_wrap(~ ATM, scales = "free_y") +
  labs(title = "Final ATM Forecasts for May 2010 (ARIMA Models)",
       x = "Date", y = "Cash Withdrawn (hundreds of dollars)")

  • ATM1 and ATM2 show weekly seasonality with moderate forecast intervals — excellent ARIMA fit.
  • ATM4, after log transformation and back-transformation, has somewhat stabilized forecasts. The absolute values are much higher, but the scale and spread are now reasonable and interpretable.

Final Summary:

This analysis involved forecasting daily cash withdrawals for four ATMs using one year of historical data (May 1, 2009 – April 30, 2010). After data cleaning and exploratory analysis, I evaluated the usage patterns and distributional characteristics across the ATMs. ATM1 and ATM2 showed consistent daily activity with weekly seasonality and moderate variance. These were modeled using seasonal ARIMA models without transformation, which gave forecasts with relatively narrow prediction intervals.

ATM4, however, showed extreme right-skew with a small number of very large withdrawals. A log transformation was applied to stabilize the variance before modeling. After evaluating both ARIMA and ETS models, the log-transformed ARIMA was selected based on substantially lower AIC. ATM3 was excluded from modeling after exploratory analysis revealed only three non-zero withdrawal days in the entire year, suggesting a non-functional unit or placement in a location with no demand. Final forecasts for May 2010 were generated using the selected ARIMA models.

================================================================================

Part B: Residential Customer Forecast Load

str(df2)
## 'data.frame':    192 obs. of  3 variables:
##  $ CaseSequence: int  733 734 735 736 737 738 739 740 741 742 ...
##  $ YYYY.MMM    : chr  "1998-Jan" "1998-Feb" "1998-Mar" "1998-Apr" ...
##  $ KWH         : int  6862583 5838198 5420658 5010364 4665377 6467147 8914755 8607428 6989888 6345620 ...

Converting to time series and exploring the data

ts_kwh <- ts(df2$KWH, start = c(1998, 1), frequency = 12)

autoplot(ts_kwh) +
  labs(title = "Monthly Residential Power Consumption (1998–2013)",
       x = "Year", y = "KWH")

# decomp_kwh <- decompose(ts_kwh, type = "multiplicative")  # or "additive" if variance is constant
# autoplot(decomp_kwh)

# decompose() threw an error, so checking for NAs first:
which(is.na(ts_kwh))  
## [1] 129
sum(is.na(ts_kwh))    
## [1] 1
time(ts_kwh)[129]
## [1] 2008.667

There is a missing value at position 129 (September 2008). So I’ll try to fix it with interpolation:

library(imputeTS)
## 
## Attaching package: 'imputeTS'
## The following object is masked from 'package:tseries':
## 
##     na.remove
ts_kwh_clean <- na_interpolation(ts_kwh, option = "linear")
# recheck
sum(is.na(ts_kwh_clean))
## [1] 0
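Before re-running the decomposition, a season plot makes the monthly pattern explicit (a quick optional check):

# one line per year, months on the x-axis
ggseasonplot(ts_kwh_clean, year.labels = FALSE) +
  labs(title = "Seasonal Plot: Monthly Residential Power Consumption", y = "KWH")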

Re-try decomposition:

# classic decomposition
decomp_kwh <- decompose(ts_kwh_clean, type = "additive")
autoplot(decomp_kwh)
## Warning: Removed 24 rows containing missing values or values outside the scale range
## (`geom_line()`).

# STL decomposition
stl_kwh <- stl(ts_kwh_clean, s.window = "periodic")
autoplot(stl_kwh)

Both decompositions look broadly similar, but STL may be the better option here given the long 16-year horizon of the data, since it can accommodate a seasonal pattern that changes over time, as sketched below.
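A robust STL variant with a finite seasonal window lets the seasonal shape drift rather than forcing it to be strictly periodic (a sketch; s.window = 13 is an untuned assumption):

# finite seasonal window + robust loess, for a drifting seasonal pattern
stl_kwh_robust <- stl(ts_kwh_clean, s.window = 13, robust = TRUE)
autoplot(stl_kwh_robust)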

Let’s proceed with both ARIMA and ETS models:

ARIMA

model_arima_kwh <- auto.arima(ts_kwh_clean)
summary(model_arima_kwh)
## Series: ts_kwh_clean 
## ARIMA(0,0,1)(1,1,1)[12] with drift 
## 
## Coefficients:
##          ma1     sar1     sma1     drift
##       0.2439  -0.1518  -0.7311  7615.302
## s.e.  0.0723   0.0963   0.0821  1953.403
## 
## sigma^2 = 7.466e+11:  log likelihood = -2719.89
## AIC=5449.79   AICc=5450.13   BIC=5465.75
## 
## Training set error measures:
##                     ME     RMSE      MAE       MPE   MAPE      MASE       ACF1
## Training set -25089.69 827254.2 493308.5 -5.511184 11.685 0.7080556 0.01283694
forecast_arima_kwh <- forecast(model_arima_kwh, h = 12)

autoplot(forecast_arima_kwh) +
  labs(title = "ARIMA Forecast for 2014", y = "KWH", x = "Date")

ETS

model_ets_kwh <- ets(ts_kwh_clean)
summary(model_ets_kwh)
## ETS(M,N,M) 
## 
## Call:
## ets(y = ts_kwh_clean)
## 
##   Smoothing parameters:
##     alpha = 0.1137 
##     gamma = 1e-04 
## 
##   Initial states:
##     l = 6134772.7867 
##     s = 0.9547 0.7533 0.8603 1.1912 1.2556 1.1992
##            0.9982 0.7684 0.8161 0.9175 1.0618 1.2237
## 
##   sigma:  0.1199
## 
##      AIC     AICc      BIC 
## 6224.057 6226.784 6272.919 
## 
## Training set error measures:
##                    ME   RMSE      MAE      MPE     MAPE      MASE      ACF1
## Training set 61009.73 835107 503972.9 -4.39013 12.04006 0.7233624 0.1698584
forecast_ets_kwh <- forecast(model_ets_kwh, h = 12)

autoplot(forecast_ets_kwh) +
  labs(title = "ETS Forecast for 2014", y = "KWH", x = "Date")

STL + Forecast

forecast_stl <- stlf(ts_kwh_clean, h = 12)
autoplot(forecast_stl) +
  labs(title = "STL Forecast for 2014", y = "KWH", x = "Date")

Interpretation

  • The three models produce quite similar forecasts, with only slight differences.
  • ARIMA: AIC 5449.79, RMSE 827,254, MAPE 11.69%. It captures the trend well, with tight prediction intervals.
  • ETS (M,N,M): AIC 6224.06, RMSE 835,107, MAPE 12.04%. Also a reasonable fit.
  • STL + ETS (the default forecasting method used by stlf): visually also a good fit.
  • ARIMA has the lowest RMSE and the lowest AIC (though AIC comparisons across model classes should be read cautiously), making it the best model in terms of statistical fit; see the cross-validation sketch below.
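Because the ETS and ARIMA likelihoods are computed on differently prepared data (ARIMA differences the series), their AIC values are only loosely comparable; a more defensible tiebreaker is rolling-origin cross-validation. A sketch (slow to run on 192 months):

# rolling-origin CV errors for 12-month-ahead forecasts (can take a while)
farima <- function(x, h) forecast(auto.arima(x), h = h)
fets   <- function(x, h) forecast(ets(x), h = h)
e_arima <- tsCV(ts_kwh_clean, farima, h = 12)
e_ets   <- tsCV(ts_kwh_clean, fets, h = 12)
sqrt(mean(e_arima^2, na.rm = TRUE))  # CV RMSE, ARIMA
sqrt(mean(e_ets^2, na.rm = TRUE))    # CV RMSE, ETS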

Final output and saving it

forecast_df <- data.frame(
  DATE = seq(as.Date("2014-01-01"), by = "month", length.out = 12),
  Forecast = as.numeric(forecast_arima_kwh$mean),
  Lo80 = as.numeric(forecast_arima_kwh$lower[,1]),
  Hi80 = as.numeric(forecast_arima_kwh$upper[,1]),
  Lo95 = as.numeric(forecast_arima_kwh$lower[,2]),
  Hi95 = as.numeric(forecast_arima_kwh$upper[,2])
)

write.csv(forecast_df, "Residential_KWH_Forecast_2014_ARIMA.csv", row.names = FALSE)

autoplot(forecast_arima_kwh) +
  labs(title = "Final Residential Power Forecast for 2014 (ARIMA)",
       y = "KWH", x = "Date")

Conclusion:

This dataset presented monthly power usage over 16 years, with clear seasonal peaks and a visible post-2010 trend increase. The residential power consumption data from January 1998 to December 2013 was modeled using seasonal ARIMA, ETS, and STL decomposition approaches. After interpolating one missing month, decomposition revealed both a strong seasonal component and a post-2010 upward trend. Among the models tested, the ARIMA with drift produced the lowest AIC and RMSE and demonstrated stable predictive behavior. This model was selected for forecasting monthly KWH for 2014, which gave plausible projections consistent with historical patterns and seasonal effects.

================================================================================

Part C: Waterflow

str(df3)
## 'data.frame':    1000 obs. of  2 variables:
##  $ Date.Time: chr  "10/23/15 12:24 AM" "10/23/15 12:40 AM" "10/23/15 12:53 AM" "10/23/15 12:55 AM" ...
##  $ WaterFlow: num  23.4 28 23.1 30 6 ...
str(df4)
## 'data.frame':    1000 obs. of  2 variables:
##  $ Date.Time: chr  "10/23/15 1:00 AM" "10/23/15 2:00 AM" "10/23/15 3:00 AM" "10/23/15 4:00 AM" ...
##  $ WaterFlow: num  18.8 43.1 38 36.1 31.9 ...
head(df3$Date.Time, 10)
##  [1] "10/23/15 12:24 AM" "10/23/15 12:40 AM" "10/23/15 12:53 AM"
##  [4] "10/23/15 12:55 AM" "10/23/15 1:19 AM"  "10/23/15 1:23 AM" 
##  [7] "10/23/15 1:50 AM"  "10/23/15 1:55 AM"  "10/23/15 1:59 AM" 
## [10] "10/23/15 2:51 AM"
head(df4$Date.Time, 10)
##  [1] "10/23/15 1:00 AM"  "10/23/15 2:00 AM"  "10/23/15 3:00 AM" 
##  [4] "10/23/15 4:00 AM"  "10/23/15 5:00 AM"  "10/23/15 6:00 AM" 
##  [7] "10/23/15 7:00 AM"  "10/23/15 8:00 AM"  "10/23/15 9:00 AM" 
## [10] "10/23/15 10:00 AM"

Converting to time formats and combining

# timestamps
df3$DateTime <- as.POSIXct(df3$Date.Time, format = "%m/%d/%y %I:%M %p", tz = "UTC")
df4$DateTime <- as.POSIXct(df4$Date.Time, format = "%m/%d/%y %I:%M %p", tz = "UTC")

summary(df3$DateTime)
##                     Min.                  1st Qu.                   Median 
## "2015-10-23 00:24:00.00" "2015-10-25 11:21:15.00" "2015-10-27 20:07:00.00" 
##                     Mean                  3rd Qu.                     Max. 
## "2015-10-27 20:48:46.25" "2015-10-30 08:24:45.00" "2015-11-01 23:35:00.00"
summary(df4$DateTime)
##                  Min.               1st Qu.                Median 
## "2015-10-23 01:00:00" "2015-11-02 10:45:00" "2015-11-12 20:30:00" 
##                  Mean               3rd Qu.                  Max. 
## "2015-11-12 20:30:00" "2015-11-23 06:15:00" "2015-12-03 16:00:00"
# Combine
df_combined <- bind_rows(
  df3 %>% select(DateTime, WaterFlow),
  df4 %>% select(DateTime, WaterFlow)
)

# the hour and average within each hour
df_hourly <- df_combined %>%
  mutate(Hour = floor_date(DateTime, unit = "hour")) %>%
  group_by(Hour) %>%
  summarise(WaterFlow = mean(WaterFlow, na.rm = TRUE), .groups = "drop")

str(df_hourly)
## tibble [1,001 × 2] (S3: tbl_df/tbl/data.frame)
##  $ Hour     : POSIXct[1:1001], format: "2015-10-23 00:00:00" "2015-10-23 01:00:00" ...
##  $ WaterFlow: num [1:1001] 26.1 18.8 24.5 25.6 22.4 ...
head(df_hourly)
## # A tibble: 6 × 2
##   Hour                WaterFlow
##   <dttm>                  <dbl>
## 1 2015-10-23 00:00:00      26.1
## 2 2015-10-23 01:00:00      18.8
## 3 2015-10-23 02:00:00      24.5
## 4 2015-10-23 03:00:00      25.6
## 5 2015-10-23 04:00:00      22.4
## 6 2015-10-23 05:00:00      23.6
ggplot(df_hourly, aes(x = Hour, y = WaterFlow)) +
  geom_line() +
  labs(title = "Hourly Water Flow",
       x = "Time", y = "Flow (units)")

# aggregate to hourly level
df_hourly <- df_combined %>%
  mutate(Hour = floor_date(DateTime, unit = "hour")) %>%
  group_by(Hour) %>%
  summarise(WaterFlow = mean(WaterFlow, na.rm = TRUE), .groups = "drop") %>%
  arrange(Hour) 

# time interval regularity
summary(diff(df_hourly$Hour))
##   Length    Class     Mode 
##     1000 difftime  numeric
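# note: summary() of a difftime prints only length/class/mode, which is not
# very informative; a more direct regularity check (assumption: every gap
# should be exactly 1 hour) would be:
#   table(as.numeric(diff(df_hourly$Hour), units = "hours"))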
# a tsibble
ts_hourly <- df_hourly %>%
  as_tsibble(index = Hour)

# a base R ts() object
ts_regular <- ts(df_hourly$WaterFlow, frequency = 24)

# Check stationarity
adf_result <- adf.test(ts_regular)
## Warning in adf.test(ts_regular): p-value smaller than printed p-value
print(adf_result)
## 
##  Augmented Dickey-Fuller Test
## 
## data:  ts_regular
## Dickey-Fuller = -7.0271, Lag order = 9, p-value = 0.01
## alternative hypothesis: stationary
# the series
ggplot(df_hourly, aes(x = Hour, y = WaterFlow)) +
  geom_line(color = "steelblue") +
  labs(title = "Hourly Water Flow Time Series", x = "Time", y = "Flow (units)")


Forecasting the Next 168 Hours (1 Week)

I’ll start with an ARIMA model

model_arima <- auto.arima(ts_regular)
summary(model_arima)
## Series: ts_regular 
## ARIMA(0,1,1)(0,0,1)[24] 
## 
## Coefficients:
##           ma1    sma1
##       -0.9648  0.0779
## s.e.   0.0083  0.0316
## 
## sigma^2 = 211.8:  log likelihood = -4097.13
## AIC=8200.26   AICc=8200.29   BIC=8214.99
## 
## Training set error measures:
##                     ME     RMSE      MAE       MPE     MAPE      MASE
## Training set 0.5276691 14.53148 11.11968 -26.27769 47.80311 0.7354472
##                     ACF1
## Training set -0.02338918
forecast_arima <- forecast(model_arima, h = 168)

autoplot(forecast_arima) +
  labs(title = "Water Flow Forecast – ARIMA (Next 168 Hours)",
       y = "Water Flow", x = "Time")

Then I’ll do an ETS model

model_ets <- ets(ts_regular)
summary(model_ets)
## ETS(M,N,N) 
## 
## Call:
## ets(y = ts_regular)
## 
##   Smoothing parameters:
##     alpha = 0.0372 
## 
##   Initial states:
##     l = 20.6363 
## 
##   sigma:  0.3974
## 
##      AIC     AICc      BIC 
## 12168.50 12168.53 12183.23 
## 
## Training set error measures:
##                     ME     RMSE      MAE       MPE     MAPE      MASE
## Training set 0.6300612 14.58435 11.16381 -25.97375 47.94456 0.7383665
##                     ACF1
## Training set -0.02263184
forecast_ets <- forecast(model_ets, h = 168)

autoplot(forecast_ets) +
  labs(title = "Water Flow Forecast – ETS (Next 168 Hours)",
       y = "Water Flow", x = "Time")

Interpretation:

  • Both models fit without problems.
  • ARIMA is preferred: its RMSE and MAPE are slightly better, and its forecast is visually stable and adapts slightly better to recent variance. Its AIC is also far lower, though AIC is not directly comparable between ARIMA and ETS, so the error measures carry more weight.
  • The chosen ETS fit, ETS(M,N,N), is non-seasonal with multiplicative errors.
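Both MASE values are below 1, so each model beats an in-sample seasonal naive benchmark; as a quick visual sanity check, the benchmark forecast itself can be plotted (a sketch):

# seasonal naive benchmark: repeat the last observed 24-hour cycle
fc_snaive <- snaive(ts_regular, h = 168)
autoplot(fc_snaive) +
  labs(title = "Water Flow Forecast – Seasonal Naive Benchmark (Next 168 Hours)",
       y = "Water Flow", x = "Time")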

Export the forecast

forecast_df <- data.frame(
  Time = seq(from = max(df_hourly$Hour) + hours(1),
             by = "hour", length.out = 168),
  Forecast = as.numeric(forecast_arima$mean),
  Lo80 = as.numeric(forecast_arima$lower[, 1]),
  Hi80 = as.numeric(forecast_arima$upper[, 1]),
  Lo95 = as.numeric(forecast_arima$lower[, 2]),
  Hi95 = as.numeric(forecast_arima$upper[, 2])
)

write.csv(forecast_df, "WaterFlow_Forecast_1Week.csv", row.names = FALSE)

Conclusion:

This final part involved merging two datasets with different timestamp granularities. After aggregating to hourly intervals and confirming regular spacing, I checked for stationarity: the ADF test indicated the hourly series was stationary, and auto.arima still selected one order of differencing. I compared ARIMA and ETS models over a 168-hour (one-week) horizon. Both had similar RMSE; ARIMA’s error measures were slightly better and its forecast more stable and interpretable, leading me to use it for the final forecast.