library(tidyverse)
library(readxl)
library(forecast)
library(tseries)
library(lubridate)
url <- "https://github.com/vincent-usny/624_pro1/raw/refs/heads/main/ResidentialCustomerForecastLoad-624.xlsx"
tmp <- tempfile(fileext = ".xlsx")
download.file(url, tmp, mode = "wb")
df <- read_excel(tmp) %>%
rename(Period = `YYYY-MMM`) %>%
mutate(Date = parse_date_time(Period, orders = "Y-b")) %>%
arrange(Date) %>%
filter(!is.na(KWH))
df %>%
summarise(
Start = min(Date),
End = max(Date),
N = n(),
Mean = round(mean(KWH), 0),
SD = round(sd(KWH), 0),
Min = round(min(KWH), 0),
Max = round(max(KWH), 0),
Missing = sum(is.na(KWH))
)
## # A tibble: 1 × 8
## Start End N Mean SD Min Max
## <dttm> <dttm> <int> <dbl> <dbl> <dbl> <dbl>
## 1 1998-01-01 00:00:00 2013-12-01 00:00:00 191 6502475 1447571 770523 10655730
## # ℹ 1 more variable: Missing <int>
ggplot(df, aes(Date, KWH)) +
geom_line(color = "#2c7bb6") +
geom_smooth(se = FALSE, color = "orange", linetype = "dashed") +
scale_y_continuous(labels = scales::comma) +
labs(
title = "Residential Power Usage: Jan 1998 - Dec 2013",
x = NULL,
y = "KWH"
) +
theme_minimal()
The time series plot shows a stable repeating pattern from 1998 to 2013 with no clear upward or downward trend
df %>%
mutate(Month = month(Date, label = TRUE),
Year = factor(year(Date))) %>%
ggplot(aes(Month, KWH, group = Year, color = Year)) +
geom_line(alpha = 0.6) +
scale_y_continuous(labels = scales::comma) +
labs(title = "Seasonal Pattern by Year", x = NULL, y = "KWH") +
theme_minimal()
All years follow the same U-shaped curve with troughs in spring (April-May) and twin peaks in summer (July-August) and winter (December-January), confirming a robust and stable annual seasonal cycle.
kwh_ts <- ts(df$KWH, start = c(1998, 1), frequency = 12)
adf.test(kwh_ts)
##
## Augmented Dickey-Fuller Test
##
## data: kwh_ts
## Dickey-Fuller = -4.8347, Lag order = 5, p-value = 0.01
## alternative hypothesis: stationary
par(mfrow = c(1, 2))
acf(kwh_ts, main = "ACF", lag.max = 48)
pacf(kwh_ts, main = "PACF", lag.max = 48)
par(mfrow = c(1, 1))
The ACF shows strong, slowly decaying spikes at lags 12, 24, and 36, confirming dominant annual seasonality
kwh_ts %>%
decompose() %>%
autoplot() +
labs(title = "Classical Decomposition of KWH Series") +
theme_minimal()
The decomposistion trend component is essentially flat across the 16 years, the seasonal component is large and consistent
# SARIMA - auto selected
fit_arima <- auto.arima(kwh_ts, seasonal = TRUE, stepwise = FALSE, approximation = FALSE)
# ETS
fit_ets <- ets(kwh_ts)
summary(fit_arima)
## Series: kwh_ts
## ARIMA(0,0,1)(2,1,0)[12] with drift
##
## Coefficients:
## ma1 sar1 sar2 drift
## 0.2410 -0.6912 -0.3694 8777.733
## s.e. 0.0763 0.0708 0.0695 3720.505
##
## sigma^2 = 9.143e+11: log likelihood = -2720.48
## AIC=5450.96 AICc=5451.3 BIC=5466.89
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set -9931.666 915257.7 570670.9 -4.872097 12.23851 0.7669296
## ACF1
## Training set -0.00412785
summary(fit_ets)
## ETS(M,N,M)
##
## Call:
## ets(y = kwh_ts)
##
## Smoothing parameters:
## alpha = 0.1467
## gamma = 0.2628
##
## Initial states:
## l = 6039826.7914
## s = 0.9173 0.7963 0.9097 1.2039 1.2848 1.2511
## 0.9907 0.8011 0.8409 0.8827 0.9655 1.156
##
## sigma: 0.1601
##
## AIC AICc BIC
## 6302.917 6305.659 6351.701
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 29765.89 1006104 660162.2 -4.650549 13.79142 0.8871978 0.1563559
checkresiduals(fit_arima)
##
## Ljung-Box test
##
## data: Residuals from ARIMA(0,0,1)(2,1,0)[12] with drift
## Q* = 16.628, df = 21, p-value = 0.7334
##
## Model df: 3. Total lags used: 24
checkresiduals(fit_ets)
##
## Ljung-Box test
##
## data: Residuals from ETS(M,N,M)
## Q* = 54.91, df = 24, p-value = 0.0003219
##
## Model df: 0. Total lags used: 24
tibble(
Model = c("SARIMA", "ETS"),
AIC = c(AIC(fit_arima), AIC(fit_ets)),
RMSE = c(
accuracy(fit_arima)[, "RMSE"],
accuracy(fit_ets)[, "RMSE"]
)
) %>%
mutate(across(where(is.numeric), round, 2))
## # A tibble: 2 × 3
## Model AIC RMSE
## <chr> <dbl> <dbl>
## 1 SARIMA 5451. 915258.
## 2 ETS 6303. 1006104.
h <- 12
fc_arima <- forecast(fit_arima, h = h)
fc_ets <- forecast(fit_ets, h = h)
# Pick best model by AIC
best_fc <- if (AIC(fit_arima) <= AIC(fit_ets)) fc_arima else fc_ets
best_model_name <- if (AIC(fit_arima) <= AIC(fit_ets)) "SARIMA" else "ETS"
cat("Selected model:", best_model_name, "\n")
## Selected model: SARIMA
autoplot(best_fc) +
scale_y_continuous(labels = scales::comma) +
labs(
title = paste("2014 Monthly KWH Forecast --", best_model_name),
subtitle = "Shaded bands show 80% and 95% prediction intervals",
x = NULL,
y = "KWH"
) +
theme_minimal()
The 2014 forecast closely mirrors the historical seasonal pattern with summer and winter peaks, and the prediction intervals widen modestly through the year
tibble(
Month = month.abb,
Year = 2014,
Forecast = round(as.numeric(best_fc$mean), 0),
Lo80 = round(as.numeric(best_fc$lower[, 1]), 0),
Hi80 = round(as.numeric(best_fc$upper[, 1]), 0),
Lo95 = round(as.numeric(best_fc$lower[, 2]), 0),
Hi95 = round(as.numeric(best_fc$upper[, 2]), 0)
)
## # A tibble: 12 × 7
## Month Year Forecast Lo80 Hi80 Lo95 Hi95
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Jan 2014 10080231 8854833 11305629 8206147 11954315
## 2 Feb 2014 8435125 7174640 9695610 6507379 10362871
## 3 Mar 2014 6623562 5363077 7884048 4695817 8551308
## 4 Apr 2014 5994860 4734375 7255345 4067115 7922606
## 5 May 2014 5946664 4686179 7207149 4018918 7874410
## 6 Jun 2014 8182984 6922499 9443469 6255238 10110729
## 7 Jul 2014 9403939 8143454 10664424 7476193 11331684
## 8 Aug 2014 9922091 8661606 11182576 7994345 11849837
## 9 Sep 2014 8413878 7153393 9674363 6486133 10341624
## 10 Oct 2014 5860254 4599769 7120739 3932508 7788000
## 11 Nov 2014 6116389 4855904 7376874 4188644 8044135
## 12 Dec 2014 8366451 7105966 9626936 6438705 10294196