Part B - Residential Power Usage

Part B consists of a simple dataset of residential power usage for January 1998 until December 2013. Your assignment is to model these data and a monthly forecast for 2014. The data is given in a single file. The variable ‘KWH’ is power consumption in Kilowatt hours, the rest is straight forward. Add this to your existing files above.

I converted the file to a csv, loaded it to github, and load it into R here:

library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.5.2
## Warning: package 'ggplot2' was built under R version 4.5.2
## Warning: package 'tibble' was built under R version 4.5.2
## Warning: package 'readr' was built under R version 4.5.2
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.6
## ✔ forcats   1.0.1     ✔ stringr   1.5.2
## ✔ ggplot2   4.0.1     ✔ tibble    3.3.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.1.0     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(fpp3)
## Warning: package 'fpp3' was built under R version 4.5.2
## Registered S3 method overwritten by 'tsibble':
##   method               from 
##   as_tibble.grouped_df dplyr
## ── Attaching packages ──────────────────────────────────────────── fpp3 1.0.2 ──
## ✔ tsibble     1.1.6     ✔ feasts      0.4.2
## ✔ tsibbledata 0.4.1     ✔ fable       0.5.0
## Warning: package 'tsibble' was built under R version 4.5.2
## Warning: package 'tsibbledata' was built under R version 4.5.2
## Warning: package 'feasts' was built under R version 4.5.2
## Warning: package 'fabletools' was built under R version 4.5.2
## Warning: package 'fable' was built under R version 4.5.2
## ── Conflicts ───────────────────────────────────────────────── fpp3_conflicts ──
## ✖ lubridate::date()    masks base::date()
## ✖ dplyr::filter()      masks stats::filter()
## ✖ tsibble::intersect() masks base::intersect()
## ✖ tsibble::interval()  masks lubridate::interval()
## ✖ dplyr::lag()         masks stats::lag()
## ✖ tsibble::setdiff()   masks base::setdiff()
## ✖ tsibble::union()     masks base::union()
power_data <- read.csv('https://raw.githubusercontent.com/samanthabarbaro/data624/refs/heads/main/ResidentialCustomerForecastLoad-624.csv', header = TRUE)

glimpse(power_data)
## Rows: 192
## Columns: 3
## $ CaseSequence <int> 733, 734, 735, 736, 737, 738, 739, 740, 741, 742, 743, 74…
## $ YYYY.MMM     <chr> "1998-Jan", "1998-Feb", "1998-Mar", "1998-Apr", "1998-May…
## $ KWH          <int> 6862583, 5838198, 5420658, 5010364, 4665377, 6467147, 891…

The dates imported as characters and need to be converted to the year-month data format.

Check for missing values

#one missing value here
power_data |> filter(is.na(KWH))
##   CaseSequence YYYY.MMM KWH
## 1          861 2008-Sep  NA
power_data |> filter(is.na(CaseSequence))
## [1] CaseSequence YYYY.MMM     KWH         
## <0 rows> (or 0-length row.names)
power_data |> filter(is.na(YYYY.MMM))
## [1] CaseSequence YYYY.MMM     KWH         
## <0 rows> (or 0-length row.names)

I will look at the time plot first, but it’s possible that the best indicator for power use on one day is the previous month or the same month, previous year.

Converting dates

I tried a few methods here, and eventually ended up separating the years/months and pasting them back together.

power_data <- power_data |> 
  separate(2, into = c("year", "month"), sep = "-", remove = FALSE)

#format as date column, convert df to a tsibble
power_data <- power_data |> 
  mutate(year_month = yearmonth(paste(year, month, sep = "-"))) |> 
  as_tsibble(index = year_month)

#plot the tsibble
autoplot(power_data, KWH)

#There's an outlier in 2010

values_2010 <- power_data |>
  filter(year == 2010)

values_2010
## # A tsibble: 12 x 6 [1M]
##    CaseSequence YYYY.MMM year  month     KWH year_month
##           <int> <chr>    <chr> <chr>   <int>      <mth>
##  1          877 2010-Jan 2010  Jan   9397357   2010 Jan
##  2          878 2010-Feb 2010  Feb   8390677   2010 Feb
##  3          879 2010-Mar 2010  Mar   7347915   2010 Mar
##  4          880 2010-Apr 2010  Apr   5776131   2010 Apr
##  5          881 2010-May 2010  May   4919289   2010 May
##  6          882 2010-Jun 2010  Jun   6696292   2010 Jun
##  7          883 2010-Jul 2010  Jul    770523   2010 Jul
##  8          884 2010-Aug 2010  Aug   7922701   2010 Aug
##  9          885 2010-Sep 2010  Sep   7819472   2010 Sep
## 10          886 2010-Oct 2010  Oct   5875917   2010 Oct
## 11          887 2010-Nov 2010  Nov   4800733   2010 Nov
## 12          888 2010-Dec 2010  Dec   6152583   2010 Dec

It looks like July 2010 is off by one digit, at 1/10 of the typical value. I will replace 770523 with 7705230.

Fixing missing values and outliers

#changing the outlier
power_data <- power_data |> 
  mutate(KWH = case_when(
    KWH == 770523 ~ 7705230, 
    TRUE ~ KWH
  ))

#The time plot makes more sense now
#We can also more clearly see the upward trend
autoplot(power_data, KWH)

group_month <- power_data |> 
    as_tibble() |> 
    group_by(month) |> 
    summarise(avg_kwh = mean(KWH, na.rm = TRUE))

group_month
## # A tibble: 12 × 2
##    month  avg_kwh
##    <chr>    <dbl>
##  1 Apr   5299734.
##  2 Aug   8298211.
##  3 Dec   6283175.
##  4 Feb   6946556.
##  5 Jan   8007422.
##  6 Jul   7843315.
##  7 Jun   6536092.
##  8 Mar   5971450.
##  9 May   5039034.
## 10 Nov   4953619.
## 11 Oct   5657164.
## 12 Sep   7702333.
sept_values <- power_data |>
  filter(month == "Sep")

sept_values
## # A tsibble: 16 x 6 [1M]
##    CaseSequence YYYY.MMM year  month     KWH year_month
##           <int> <chr>    <chr> <chr>   <dbl>      <mth>
##  1          741 1998-Sep 1998  Sep   6989888   1998 Sep
##  2          753 1999-Sep 1999  Sep   7899368   1999 Sep
##  3          765 2000-Sep 2000  Sep   8912169   2000 Sep
##  4          777 2001-Sep 2001  Sep   7112069   2001 Sep
##  5          789 2002-Sep 2002  Sep   8245227   2002 Sep
##  6          801 2003-Sep 2003  Sep   7791791   2003 Sep
##  7          813 2004-Sep 2004  Sep   6690366   2004 Sep
##  8          825 2005-Sep 2005  Sep   7057213   2005 Sep
##  9          837 2006-Sep 2006  Sep   7296355   2006 Sep
## 10          849 2007-Sep 2007  Sep   7666970   2007 Sep
## 11          861 2008-Sep 2008  Sep        NA   2008 Sep
## 12          873 2009-Sep 2009  Sep   7583146   2009 Sep
## 13          885 2010-Sep 2010  Sep   7819472   2010 Sep
## 14          897 2011-Sep 2011  Sep   8943599   2011 Sep
## 15          909 2012-Sep 2012  Sep   7559148   2012 Sep
## 16          921 2013-Sep 2013  Sep   7968220   2013 Sep

I’m going to take the mean of Sept 2007 and Sept 2009 to impute this value .

(7666970 + 7583146) /2
## [1] 7625058
# 7625058

power_data_i <- power_data |> 
  mutate(KWH = case_when(
    is.na(KWH)  ~ 7625058, 
    TRUE ~ KWH
  ))

autoplot(power_data_i, KWH)

Decomposition

Seasonal (Yearly), trend is generally going upward (though it’s a little wobbly and unpredictable). There’s a clear seasonality with two phases in the year (a small spike early on for winter, a bigger spike later for summer), which, as the years go by, changes somewhat – sometimes the winter spike is bigger, sometime summer spike is bigger.

dcmp <- power_data_i |>
  model(stl = STL(KWH)) |>
  components()

autoplot(dcmp)

Selecting methods

  • Seasonal naive - Given the somewhat stable demand for the past two years, this seems like a relatively solid option

  • Multiplicative Holt-Winters because the variations do seem to get bigger as the mean gets larger. I will also use whatever method ETS auto-selects for comparison.

  • ARIMA - I will select models based on the ACF, PACF, and auto-ARIMA

Seasonal naive

#I'll use 80% (~12 years) of data for training.
#data set goes from Jan 1998 to Dec 2013
myseries_train <- power_data_i |>
  filter(year_month < yearmonth("2010 Dec"))

fit_snaive <- myseries_train |>
  model(snaive = SNAIVE(KWH))

fc_snaive <- fit_snaive |>
  forecast(h = "3 years")

#confidence interval
fc_snaive |>
  autoplot(power_data_i)

#no confidence interval
fc_snaive |>
  autoplot(power_data_i, level = NULL)

#it's underestimating the new (bigger) spikes for winter 

fit_snaive |>
  accuracy()
## # A tibble: 1 × 10
##   .model .type        ME    RMSE     MAE   MPE  MAPE  MASE RMSSE  ACF1
##   <chr>  <chr>     <dbl>   <dbl>   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 snaive Training 64935. 713639. 556230. 0.399  8.81     1     1 0.264
#the RMSE is really high at 713639, but a forecast that takes into account the last #3 years (and the trend) may still be valuable

ETS

fit <- power_data_i |>
    model(ets = ETS(KWH))

report(fit)
## Series: KWH 
## Model: ETS(M,A,M) 
##   Smoothing parameters:
##     alpha = 0.07246246 
##     beta  = 0.00152623 
##     gamma = 0.0001028814 
## 
##   Initial states:
##     l[0]     b[0]      s[0]     s[-1]    s[-2]   s[-3]    s[-4]    s[-5]
##  6211508 6871.104 0.9594555 0.7445171 0.870041 1.17663 1.260443 1.199424
##      s[-6]     s[-7]    s[-8]     s[-9]   s[-10]   s[-11]
##  0.9978941 0.7670837 0.809661 0.9205826 1.069169 1.225099
## 
##   sigma^2:  0.009
## 
##      AIC     AICc      BIC 
## 6140.547 6144.064 6195.924

MAM - this is suggesting Multiplicitve Holt-Winters (which was my guess for the best model).

I’ll also try it with a damped trend because, while the upward trend is visible, it’s not super strong (M, Ad, M). This may produce a more stable forecast.

fit_ets <- myseries_train |>
  model( 
        hwmd_ets = ETS(KWH ~ error("M") + trend("Ad") + season("M")),
        hwm_ets = ETS(KWH ~ error("M") + trend("A") + season("M")),
        
        )

fit_ets |> 
  forecast(h = "3 years") |> 
  autoplot(power_data_i)

#all plots separately 
fit_ets |> 
    forecast(h = "3 years") |> 
    autoplot(power_data_i) +
    facet_wrap(~ .model, ncol = 1, scales = "free_y") 

#check the accurcy
fit_ets |> accuracy()
## # A tibble: 2 × 10
##   .model   .type         ME    RMSE     MAE    MPE  MAPE  MASE RMSSE  ACF1
##   <chr>    <chr>      <dbl>   <dbl>   <dbl>  <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 hwmd_ets Training  29052. 536564. 410177. -0.235  6.49 0.737 0.752 0.324
## 2 hwm_ets  Training -40889. 536888. 412741. -1.30   6.59 0.742 0.752 0.346

The damped trend is a slightly better fit. It outperforms on RMSE (536,564 vs 536,588) as well as other metrics.

ARIMA

power_data_i |> features(KWH, features = guerrero)
## # A tibble: 1 × 1
##   lambda_guerrero
##             <dbl>
## 1          -0.205
#-.205 is relatively low, so we should use it

#We can tell by looking that the data is not stationary and probably needs a 
#difference of 12, but let's check
power_data_i |>
  features(box_cox(KWH, -0.205), unitroot_kpss)
## # A tibble: 1 × 2
##   kpss_stat kpss_pvalue
##       <dbl>       <dbl>
## 1      1.66        0.01
#calling for one difference
power_data_i |>
  features(box_cox(KWH, -0.205), unitroot_nsdiffs)
## # A tibble: 1 × 1
##   nsdiffs
##     <int>
## 1       1
#after one difference 
power_data_i |>
    features(box_cox(KWH, -0.205) |> 
               difference(12), unitroot_nsdiffs)
## # A tibble: 1 × 1
##   nsdiffs
##     <int>
## 1       0
#data is stationary
power_data_i |>
    features(box_cox(KWH, -0.205) |> 
               difference(12), unitroot_kpss)
## # A tibble: 1 × 2
##   kpss_stat kpss_pvalue
##       <dbl>       <dbl>
## 1     0.138         0.1
power_data_i |> autoplot((box_cox(KWH, -0.205)) |>
                           difference(12))
## Warning: Removed 12 rows containing missing values or values outside the scale range
## (`geom_line()`).

#auto-ARIMA - not stepwise (takes a while)
power_data_i |>
  model(auto_arima = ARIMA(box_cox(KWH, -0.205), stepwise = FALSE)) |>
  report()
## Series: KWH 
## Model: ARIMA(4,0,0)(2,1,0)[12] w/ drift 
## Transformation: box_cox(KWH, `-0.205`) 
## 
## Coefficients:
##          ar1      ar2     ar3      ar4     sar1     sar2  constant
##       0.2532  -0.0148  0.2361  -0.1525  -0.7264  -0.3795     9e-04
## s.e.  0.0769   0.0758  0.0761   0.0760   0.0751   0.0769     3e-04
## 
## sigma^2 estimated as 1.439e-05:  log likelihood=757.69
## AIC=-1499.39   AICc=-1498.55   BIC=-1473.85
#4,0,0, 2,1,0 with drift

#let's try stepwise
power_data_i |>
    model(auto_arima = ARIMA(box_cox(KWH, -0.205), stepwise = TRUE)) |>
    report()
## Series: KWH 
## Model: ARIMA(0,0,1)(2,1,0)[12] w/ drift 
## Transformation: box_cox(KWH, `-0.205`) 
## 
## Coefficients:
##          ma1     sar1     sar2  constant
##       0.2458  -0.7207  -0.3550    0.0013
## s.e.  0.0833   0.0751   0.0769    0.0004
## 
## sigma^2 estimated as 1.501e-05:  log likelihood=751.99
## AIC=-1493.98   AICc=-1493.63   BIC=-1478.01
#001 201 with drift, so we have two different models to pull from


#ACF and PACF to check any other options
power_data_i |>
  gg_tsdisplay(difference(box_cox(KWH, -0.205),12), plot_type = 'partial')
## Warning: `gg_tsdisplay()` was deprecated in feasts 0.4.2.
## ℹ Please use `ggtime::gg_tsdisplay()` instead.
## Removed 12 rows containing missing values or values outside the scale range
## (`geom_line()`).
## This warning is displayed once per session.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: Removed 12 rows containing missing values or values outside the scale range
## (`geom_point()`).

#given that the ACF and PACF are very similar
#we can try a seaonal component of 110 and 011 
# and non-seasonal components of 100 or 001

fit_power_5 <-  myseries_train |>
  model(
    stepwise = ARIMA(box_cox(KWH, -0.205) ~ 1 + pdq(4,0,0) + PDQ(2,1,0)),
    auto_arima = ARIMA(box_cox(KWH, -0.205) ~ 1 + pdq(0,0,1) + PDQ(2,1,0)),
    m100_011 = ARIMA(box_cox(KWH, -0.205) ~ pdq(1,0,0) + PDQ(0,1,1)),
    m001_011 = ARIMA(box_cox(KWH, -0.205) ~ pdq(0,0,1) + PDQ(0,1,1)),
    m100_110 = ARIMA(box_cox(KWH, -0.205) ~ pdq(1,0,0) + PDQ(1,1,0)),
    m001_110 = ARIMA(box_cox(KWH, -0.205) ~ pdq(0,0,1) + PDQ(1,1,0))
  )


fc <- fit_power_5 |> 
  forecast(h = "3 years")

fc |> autoplot(power_data_i)

#facet-wrap
fc |> autoplot(power_data_i) +
     facet_wrap(~ .model, ncol = 1) 

#remove confidence intervals
fc |> autoplot(power_data_i, level = NULL) +
     facet_wrap(~ .model, ncol = 1) 

#check the accuracy
fc |> 
  accuracy(power_data_i)
## # A tibble: 6 × 10
##   .model     .type      ME    RMSE     MAE   MPE  MAPE  MASE RMSSE  ACF1
##   <chr>      <chr>   <dbl>   <dbl>   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 auto_arima Test  493001. 890698. 684619.  5.82  8.69  1.23  1.25 0.308
## 2 m001_011   Test  596898. 952443. 713909.  7.17  9.00  1.28  1.33 0.237
## 3 m001_110   Test  381962. 892635. 729972.  4.20  9.38  1.31  1.25 0.322
## 4 m100_011   Test  598210. 953031. 714275.  7.18  9.00  1.28  1.34 0.236
## 5 m100_110   Test  381445. 893204. 730464.  4.18  9.39  1.31  1.25 0.320
## 6 stepwise   Test  493928. 892908. 689356.  5.84  8.76  1.24  1.25 0.299
#residuals on the two best performers

#distribuiton looks more normal here
fit_power_5 |> 
  select(auto_arima) |> 
  gg_tsresiduals()
## Warning: `gg_tsresiduals()` was deprecated in feasts 0.4.2.
## ℹ Please use `ggtime::gg_tsresiduals()` instead.
## This warning is displayed once per session.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

fit_power_5 |> 
  select(stepwise) |> 
  gg_tsresiduals()

Based on the RMSE, the closest fit was auto-ARIMA, followed by stepwise, then the 100,110 model. Since there are six models here, when I create my final forecast, I will limit it to the first two. The best fit overall was the Holt-Winters with a damped trend.


and make sure I included the box-cox –

Forecasting

fit_all <- power_data_i |>
  model(
    hwmd_ets = ETS(KWH ~ error("M") + trend("Ad") + season("M")),
    hwm_ets = ETS(KWH ~ error("M") + trend("A") + season("M")),
    snaive = SNAIVE(KWH),
    stepwise = ARIMA(box_cox(KWH, -0.205) ~ 1 + pdq(4,0,0) + PDQ(2,1,0)),
    auto_arima = ARIMA(box_cox(KWH, -0.205) ~ 1 + pdq(0,0,1) + PDQ(2,1,0))
  )

#Forecasts for one year
fc_all <- fit_all |> 
  forecast(h = "1 year")

#all forecasts together 
fc_all |> 
  autoplot(power_data_i) + 
  labs(title = "All forecasts, Power Usage")

fc_all |> 
  autoplot(power_data_i, level = NULL) + 
  labs(title = "All forecasts, Power Usage (no CI)")

fc_all |> 
  autoplot(power_data_i) +
  facet_wrap(~ .model, ncol = 1)

#breaking these up 

fc_all |> 
  filter(.model %in% c("hwmd_ets", "hwm_ets")) |> 
  autoplot(power_data_i) +
  facet_wrap(~ .model, ncol = 1) + 
  labs(title = "ETS")

fc_all |> 
  filter(.model %in% c("auto_arima", "stepwise")) |> 
  autoplot(power_data_i) +
  facet_wrap(~ .model, ncol = 1)+ 
  labs(title = "ARIMA")

fc_all |> 
  filter(.model == "snaive") |> 
  autoplot(power_data_i) + 
  labs(title = "SNAIVE")

#create a data frame for all forecasts
power_forecast <- as.data.frame(fc_all)

#write.csv(power_forecast, file="powerforecast.csv", row.names=TRUE)

#for the forecast I chose
hwms_forecast <- power_forecast |> filter(.model == "hwmd_ets")

#write.csv(hwms_forecast, file="powerforecast_hwmd.csv", row.names=TRUE)

I would go with the Holt-Winters damped as my main choice, but these all capture the more recent upward trend and the change in trend where winter requires more power.