See the docx file for summary report

library(tidyverse)
library(readxl)
library(forecast)
library(tseries)
library(lubridate)
url <- "https://github.com/vincent-usny/624_pro1/raw/refs/heads/main/ResidentialCustomerForecastLoad-624.xlsx"
tmp <- tempfile(fileext = ".xlsx")
download.file(url, tmp, mode = "wb")

df <- read_excel(tmp) %>%
  rename(Period = `YYYY-MMM`) %>%
  mutate(Date = parse_date_time(Period, orders = "Y-b")) %>%
  arrange(Date) %>%
  filter(!is.na(KWH))
df %>%
  summarise(
    Start   = min(Date),
    End     = max(Date),
    N       = n(),
    Mean    = round(mean(KWH), 0),
    SD      = round(sd(KWH), 0),
    Min     = round(min(KWH), 0),
    Max     = round(max(KWH), 0),
    Missing = sum(is.na(KWH))
  )
## # A tibble: 1 × 8
##   Start               End                     N    Mean      SD    Min      Max
##   <dttm>              <dttm>              <int>   <dbl>   <dbl>  <dbl>    <dbl>
## 1 1998-01-01 00:00:00 2013-12-01 00:00:00   191 6502475 1447571 770523 10655730
## # ℹ 1 more variable: Missing <int>
ggplot(df, aes(Date, KWH)) +
  geom_line(color = "#2c7bb6") +
  geom_smooth(se = FALSE, color = "orange", linetype = "dashed") +
  scale_y_continuous(labels = scales::comma) +
  labs(
    title = "Residential Power Usage: Jan 1998 - Dec 2013",
    x     = NULL,
    y     = "KWH"
  ) +
  theme_minimal()

The time series plot shows a stable repeating pattern from 1998 to 2013 with no clear upward or downward trend

df %>%
  mutate(Month = month(Date, label = TRUE),
         Year  = factor(year(Date))) %>%
  ggplot(aes(Month, KWH, group = Year, color = Year)) +
  geom_line(alpha = 0.6) +
  scale_y_continuous(labels = scales::comma) +
  labs(title = "Seasonal Pattern by Year", x = NULL, y = "KWH") +
  theme_minimal()

All years follow the same U-shaped curve with troughs in spring (April-May) and twin peaks in summer (July-August) and winter (December-January), confirming a robust and stable annual seasonal cycle.

kwh_ts <- ts(df$KWH, start = c(1998, 1), frequency = 12)
adf.test(kwh_ts)
## 
##  Augmented Dickey-Fuller Test
## 
## data:  kwh_ts
## Dickey-Fuller = -4.8347, Lag order = 5, p-value = 0.01
## alternative hypothesis: stationary
par(mfrow = c(1, 2))
acf(kwh_ts,  main = "ACF",  lag.max = 48)
pacf(kwh_ts, main = "PACF", lag.max = 48)

par(mfrow = c(1, 1))

The ACF shows strong, slowly decaying spikes at lags 12, 24, and 36, confirming dominant annual seasonality

kwh_ts %>%
  decompose() %>%
  autoplot() +
  labs(title = "Classical Decomposition of KWH Series") +
  theme_minimal()

The decomposistion trend component is essentially flat across the 16 years, the seasonal component is large and consistent

# SARIMA - auto selected
fit_arima <- auto.arima(kwh_ts, seasonal = TRUE, stepwise = FALSE, approximation = FALSE)

# ETS
fit_ets <- ets(kwh_ts)

summary(fit_arima)
## Series: kwh_ts 
## ARIMA(0,0,1)(2,1,0)[12] with drift 
## 
## Coefficients:
##          ma1     sar1     sar2     drift
##       0.2410  -0.6912  -0.3694  8777.733
## s.e.  0.0763   0.0708   0.0695  3720.505
## 
## sigma^2 = 9.143e+11:  log likelihood = -2720.48
## AIC=5450.96   AICc=5451.3   BIC=5466.89
## 
## Training set error measures:
##                     ME     RMSE      MAE       MPE     MAPE      MASE
## Training set -9931.666 915257.7 570670.9 -4.872097 12.23851 0.7669296
##                     ACF1
## Training set -0.00412785
summary(fit_ets)
## ETS(M,N,M) 
## 
## Call:
## ets(y = kwh_ts)
## 
##   Smoothing parameters:
##     alpha = 0.1467 
##     gamma = 0.2628 
## 
##   Initial states:
##     l = 6039826.7914 
##     s = 0.9173 0.7963 0.9097 1.2039 1.2848 1.2511
##            0.9907 0.8011 0.8409 0.8827 0.9655 1.156
## 
##   sigma:  0.1601
## 
##      AIC     AICc      BIC 
## 6302.917 6305.659 6351.701 
## 
## Training set error measures:
##                    ME    RMSE      MAE       MPE     MAPE      MASE      ACF1
## Training set 29765.89 1006104 660162.2 -4.650549 13.79142 0.8871978 0.1563559
checkresiduals(fit_arima)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(0,0,1)(2,1,0)[12] with drift
## Q* = 16.628, df = 21, p-value = 0.7334
## 
## Model df: 3.   Total lags used: 24
checkresiduals(fit_ets)

## 
##  Ljung-Box test
## 
## data:  Residuals from ETS(M,N,M)
## Q* = 54.91, df = 24, p-value = 0.0003219
## 
## Model df: 0.   Total lags used: 24
tibble(
  Model = c("SARIMA", "ETS"),
  AIC   = c(AIC(fit_arima), AIC(fit_ets)),
  RMSE  = c(
    accuracy(fit_arima)[, "RMSE"],
    accuracy(fit_ets)[, "RMSE"]
  )
) %>%
  mutate(across(where(is.numeric), round, 2))
## # A tibble: 2 × 3
##   Model    AIC     RMSE
##   <chr>  <dbl>    <dbl>
## 1 SARIMA 5451.  915258.
## 2 ETS    6303. 1006104.
h <- 12

fc_arima <- forecast(fit_arima, h = h)
fc_ets   <- forecast(fit_ets,   h = h)

# Pick best model by AIC
best_fc <- if (AIC(fit_arima) <= AIC(fit_ets)) fc_arima else fc_ets
best_model_name <- if (AIC(fit_arima) <= AIC(fit_ets)) "SARIMA" else "ETS"
cat("Selected model:", best_model_name, "\n")
## Selected model: SARIMA
autoplot(best_fc) +
  scale_y_continuous(labels = scales::comma) +
  labs(
    title    = paste("2014 Monthly KWH Forecast --", best_model_name),
    subtitle = "Shaded bands show 80% and 95% prediction intervals",
    x        = NULL,
    y        = "KWH"
  ) +
  theme_minimal()

The 2014 forecast closely mirrors the historical seasonal pattern with summer and winter peaks, and the prediction intervals widen modestly through the year

tibble(
  Month    = month.abb,
  Year     = 2014,
  Forecast = round(as.numeric(best_fc$mean), 0),
  Lo80     = round(as.numeric(best_fc$lower[, 1]), 0),
  Hi80     = round(as.numeric(best_fc$upper[, 1]), 0),
  Lo95     = round(as.numeric(best_fc$lower[, 2]), 0),
  Hi95     = round(as.numeric(best_fc$upper[, 2]), 0)
)
## # A tibble: 12 × 7
##    Month  Year Forecast    Lo80     Hi80    Lo95     Hi95
##    <chr> <dbl>    <dbl>   <dbl>    <dbl>   <dbl>    <dbl>
##  1 Jan    2014 10080231 8854833 11305629 8206147 11954315
##  2 Feb    2014  8435125 7174640  9695610 6507379 10362871
##  3 Mar    2014  6623562 5363077  7884048 4695817  8551308
##  4 Apr    2014  5994860 4734375  7255345 4067115  7922606
##  5 May    2014  5946664 4686179  7207149 4018918  7874410
##  6 Jun    2014  8182984 6922499  9443469 6255238 10110729
##  7 Jul    2014  9403939 8143454 10664424 7476193 11331684
##  8 Aug    2014  9922091 8661606 11182576 7994345 11849837
##  9 Sep    2014  8413878 7153393  9674363 6486133 10341624
## 10 Oct    2014  5860254 4599769  7120739 3932508  7788000
## 11 Nov    2014  6116389 4855904  7376874 4188644  8044135
## 12 Dec    2014  8366451 7105966  9626936 6438705 10294196