pacman::p_load(tidyverse,DataExplorer,fpp3,tsibble,xts,imputeTS,cowplot)
power_data <- readxl::read_excel("Data/ResidentialCustomerForecastLoad-624.xlsx")

1 Introduction

  This part B consists of a simple dataset of residential power usage for January 1998 until December 2013. Our assignment is to model these data and create a monthly forecast for 2014.The variable ‘KWH’ is power consumption in Kilowatt hours. We will inspect the variable, look for and fix any potential issues. From there, the data will be modeled and the best model will be used to produce future forecasts.

2 Exploration

power_ts <- power_data %>% mutate(`YYYY-MMM` = yearmonth(`YYYY-MMM`)) %>% select(!CaseSequence) %>% tsibble(index=`YYYY-MMM`) 
  There is a visible outlier in the initial time series plot shown below.
autoplot(power_ts)

plot_histogram(power_ts)

  This value is less than twice the second smallest value and only occurs once. Regardless if this is a real reading or not, it is apparent that it does not represent the majority of the data and will not add any meaningful info to future predictions. With that in mind, we will treat it as a missing value.
power_ts %>% arrange(KWH)
# A tsibble: 192 x 2 [1M]
   `YYYY-MMM`     KWH
        <mth>   <dbl>
 1   2010 Jul  770523
 2   2005 Nov 4313019
 3   1999 Dec 4419229
 4   1999 May 4426794
 5   1999 Nov 4436269
 6   2005 May 4453983
 7   2006 Nov 4458429
 8   2001 Nov 4461979
 9   2008 Nov 4555602
10   2008 May 4608528
# ... with 182 more rows
power_ts <- power_ts  %>% 
  mutate(KWH=ifelse(KWH>4000000,KWH,NA))
  Due to how few missing values there are, it seems that the NA imputation method used here does not really change much.
ggplot_na_imputations(power_ts,na_kalman (power_ts))

ggplot_na_imputations(power_ts,na_interpolation(power_ts))

 power_ts <- power_ts %>%
        mutate( KWH =na_kalman(KWH)) 

3 Modeling

3.1 Data Preparation

  The Box-Cox transformation does not appear to have much of an affect on this data. With this in mind, we will not use it.
plot_grid(autoplot(power_ts,  box_cox(KWH,   features(power_ts,KWH, features = guerrero)$lambda_guerrero))+labs(y= "Lambda KWH"  
                                                                                                     ,title = "Lambda Transform")
                ,autoplot(power_ts),ncol=1 ) 

3.2 ETS Modeing

  The first step for find the best model is to use the automatically generated one as a baseline.
auto_ETS <- power_ts %>%
  
  model(
    
    ETS(KWH)
  )
auto_ETS %>% 
  report()
Series: KWH 
Model: ETS(M,N,M) 
  Smoothing parameters:
    alpha = 0.1152884 
    gamma = 0.1983171 

  Initial states:
    l[0]      s[0]     s[-1]     s[-2]    s[-3]    s[-4]    s[-5]    s[-6]
 6188218 0.9083241 0.7515655 0.9314405 1.225896 1.276768 1.233079 1.016933
     s[-7]    s[-8]     s[-9]   s[-10]  s[-11]
 0.7623912 0.804406 0.8855059 1.011111 1.19258

  sigma^2:  0.0094

     AIC     AICc      BIC 
6145.320 6148.047 6194.182 
compare_ETS <- power_ts %>%
  
  model(
    
    `Multiplicative Trend` = ETS(KWH ~ error("M") + trend("A") +
                                                season("M")),
                
              
             `Damped Multiplicative` = ETS(KWH ~ error("M") + 
                                    trend("Ad") + season("M"))
  )
  Using a few other models for comparison, it would appear that the AIC and AICc scores of the alternative models are not far off of the automatic model. We will therefore choose the model with the lowest RMSE which would be the Damped Multiplicative model. It would seem below that the forecasts produced by each model are relatively close, with no abnormalities present.
bind_rows(glance(auto_ETS),report(compare_ETS)) %>% arrange(AICc)
# A tibble: 3 x 9
  .model                 sigma2 log_lik   AIC  AICc   BIC     MSE    AMSE    MAE
  <chr>                   <dbl>   <dbl> <dbl> <dbl> <dbl>   <dbl>   <dbl>  <dbl>
1 ETS(KWH)              0.00943  -3058. 6145. 6148. 6194. 3.98e11 4.20e11 0.0738
2 Multiplicative Trend  0.00929  -3056. 6146. 6149. 6201. 3.89e11 4.01e11 0.0705
3 Damped Multiplicative 0.00937  -3056. 6147. 6151. 6206. 3.86e11 4.05e11 0.0703
bind_rows(accuracy(auto_ETS),accuracy(compare_ETS)) %>% arrange(RMSE)
# A tibble: 3 x 10
  .model              .type     ME   RMSE    MAE     MPE  MAPE  MASE RMSSE  ACF1
  <chr>               <chr>  <dbl>  <dbl>  <dbl>   <dbl> <dbl> <dbl> <dbl> <dbl>
1 Damped Multiplicat~ Trai~ 49856. 6.22e5 4.61e5 -0.0664  6.94 0.738 0.753 0.277
2 Multiplicative Tre~ Trai~ 26430. 6.24e5 4.64e5 -0.376   7.00 0.744 0.756 0.313
3 ETS(KWH)            Trai~ 52868. 6.31e5 4.82e5  0.0715  7.27 0.772 0.765 0.220
bind_cols(compare_ETS,auto_ETS) %>%
    forecast(h=10) %>% autoplot(power_ts,level=NULL)

3.3 ARIMA Modeling

3.3.1 Differencing

  Using the initial ACF and PACF plots, it is evident that the data is not stationary.
gg_tsdisplay(power_ts,plot_type='partial')

It would appear that a single seasonal difference is required to make the data stationary.
power_ts %>%
  features(KWH, unitroot_nsdiffs)
# A tibble: 1 x 1
  nsdiffs
    <int>
1       1
power_ts %>%
  mutate(dif =difference(KWH,12)) %>% 
    features(dif, unitroot_ndiffs)
# A tibble: 1 x 1
  ndiffs
   <int>
1      0

3.3.2 Model Building

| The first step begins with generating a baseline model.

auto_ARIMA <- power_ts %>%
          model(  
            ARIMA(KWH)
            )
auto_ARIMA %>%  report()
Series: KWH 
Model: ARIMA(0,0,4)(2,1,0)[12] w/ drift 

Coefficients:
         ma1     ma2     ma3      ma4     sar1     sar2   constant
      0.3459  0.0665  0.2109  -0.0772  -0.7368  -0.4342  233796.06
s.e.  0.0772  0.0838  0.0737   0.0819   0.0763   0.0784   73852.84

sigma^2 estimated as 3.866e+11:  log likelihood=-2657.65
AIC=5331.31   AICc=5332.15   BIC=5356.85
      compare_ARIMA <- power_ts %>%

        model(
           arima_000_110 = ARIMA(KWH ~ pdq(0,0,0) + PDQ(1,1,0)),
           arima_001_110 = ARIMA(KWH ~ pdq(0,0,1) + PDQ(1,1,0)),
           arima_001_111 = ARIMA(KWH ~ pdq(0,0,1) + PDQ(1,1,1)),
           
           arima_000_210 = ARIMA(KWH ~ pdq(0,0,0) + PDQ(2,1,0)),
           arima_100_012 = ARIMA(KWH ~ pdq(1,0,0) + PDQ(0,1,2)),
           
           arima_001_012 = ARIMA(KWH ~ pdq(0,0,1) + PDQ(1,1,2)),
           arima_002_410 = ARIMA(KWH ~ pdq(0,0,2) + PDQ(4,1,0)),
           arima_4002_210 = ARIMA(KWH ~ pdq(4,0,0) + PDQ(2,1,0))
           )
  For the ARIMA models, it would appear the automatic model performs the best for both AIC and RMSE. It will be chosen as our best ARIMA model.
 bind_rows(glance(compare_ARIMA),glance(auto_ARIMA))  %>% arrange(AIC)
# A tibble: 9 x 8
  .model                sigma2 log_lik   AIC  AICc   BIC ar_roots   ma_roots  
  <chr>                  <dbl>   <dbl> <dbl> <dbl> <dbl> <list>     <list>    
1 ARIMA(KWH)     386602799008.  -2658. 5331. 5332. 5357. <cpl [24]> <cpl [4]> 
2 arima_4002_210 389634639624.  -2658. 5333. 5333. 5358. <cpl [28]> <cpl [0]> 
3 arima_100_012  404924310601.  -2663. 5336. 5337. 5352. <cpl [1]>  <cpl [24]>
4 arima_001_111  406833491598.  -2663. 5337. 5337. 5353. <cpl [12]> <cpl [13]>
5 arima_001_012  408948678419.  -2663. 5339. 5339. 5358. <cpl [12]> <cpl [25]>
6 arima_002_410  408675724669.  -2662. 5340. 5341. 5366. <cpl [48]> <cpl [2]> 
7 arima_000_210  440120515524.  -2671. 5349. 5349. 5362. <cpl [24]> <cpl [0]> 
8 arima_001_110  469469551957.  -2674. 5357. 5357. 5370. <cpl [12]> <cpl [1]> 
9 arima_000_110  501282435339.  -2681. 5368. 5368. 5377. <cpl [12]> <cpl [0]> 
 bind_rows(accuracy(compare_ARIMA),accuracy(auto_ARIMA))  %>% arrange(RMSE)
# A tibble: 9 x 10
  .model         .type        ME   RMSE    MAE    MPE  MAPE  MASE RMSSE     ACF1
  <chr>          <chr>     <dbl>  <dbl>  <dbl>  <dbl> <dbl> <dbl> <dbl>    <dbl>
1 ARIMA(KWH)     Traini~  -8679. 5.90e5 4.30e5 -0.798  6.53 0.690 0.715  1.27e-4
2 arima_4002_210 Traini~  -8451. 5.93e5 4.30e5 -0.817  6.53 0.690 0.718  1.93e-2
3 arima_002_410  Traini~ -10256. 6.07e5 4.37e5 -0.874  6.63 0.700 0.735 -7.41e-4
4 arima_100_012  Traini~ -13655. 6.09e5 4.42e5 -0.933  6.71 0.709 0.738 -3.74e-3
5 arima_001_012  Traini~ -13655. 6.11e5 4.43e5 -0.939  6.74 0.711 0.740 -6.95e-4
6 arima_001_111  Traini~ -16533. 6.11e5 4.44e5 -0.985  6.76 0.712 0.740 -1.28e-3
7 arima_000_210  Traini~ -12501. 6.37e5 4.58e5 -1.01   6.89 0.734 0.772  2.78e-1
8 arima_001_110  Traini~  -7866. 6.58e5 4.78e5 -0.791  7.21 0.766 0.797 -1.37e-2
9 arima_000_110  Traini~ -10304. 6.82e5 4.96e5 -0.915  7.41 0.795 0.826  2.49e-1

4 Comparing Models

  Running he function below performs cross validation on the data. This process is very useful in determining which model better fits the data. As raw AICc score alone is only useful when choosing a model of the same type, we instead need to use the RMSE value when choosing between different kinds of modeling. Using cross validation is valid for this data as it is relatively small. However, it still requires a significant amount of time to load and therefore an image of the result of this process is instead used to show the result. It is found that the ARIMA model performs much better than the ETS model.
compare_models <- power_ts %>% 
  
   slice(-n()) %>%
  stretch_tsibble(.init = 10) %>%
  model(
     `Damped Multiplicative` = ETS(KWH ~ error("M") + 
                                    trend("Ad") + season("M")),
    ARIMA = ARIMA(KWH ~ pdq(0,0,4) + PDQ(2,1,0))
  )
compare_models %>%
  forecast(h = 12) %>%
  accuracy(power_ts) %>%
  select(.model, RMSE:MAPE)
ggdraw() + draw_image("https://i.gyazo.com/fd8b1f4c75166cbf12829b852fc903ca.png")

5 Forecasting

  final_model <- power_ts %>%
                  model(
                     ARIMA(KWH ~ pdq(0,0,4) + PDQ(2,1,0))
                     )
  Using our chooen model, the plot below shows our forecasts for the months of 2014.
final_model %>% forecast(h=12) %>%
  autoplot(power_ts)

  The raw values can viewed below.
final_model %>% forecast(h=12) 
# A fable: 12 x 4 [1M]
# Key:     .model [1]
   .model                                  `YYYY-MMM`                 KWH  .mean
   <chr>                                        <mth>              <dist>  <dbl>
 1 ARIMA(KWH ~ pdq(0, 0, 4) + PDQ(2, 1, 0~   2014 Jan   N(1e+07, 3.9e+11) 1.02e7
 2 ARIMA(KWH ~ pdq(0, 0, 4) + PDQ(2, 1, 0~   2014 Feb N(8720444, 4.3e+11) 8.72e6
 3 ARIMA(KWH ~ pdq(0, 0, 4) + PDQ(2, 1, 0~   2014 Mar N(7132127, 4.3e+11) 7.13e6
 4 ARIMA(KWH ~ pdq(0, 0, 4) + PDQ(2, 1, 0~   2014 Apr N(5807917, 4.5e+11) 5.81e6
 5 ARIMA(KWH ~ pdq(0, 0, 4) + PDQ(2, 1, 0~   2014 May N(5938295, 4.5e+11) 5.94e6
 6 ARIMA(KWH ~ pdq(0, 0, 4) + PDQ(2, 1, 0~   2014 Jun N(8207200, 4.5e+11) 8.21e6
 7 ARIMA(KWH ~ pdq(0, 0, 4) + PDQ(2, 1, 0~   2014 Jul N(9520351, 4.5e+11) 9.52e6
 8 ARIMA(KWH ~ pdq(0, 0, 4) + PDQ(2, 1, 0~   2014 Aug   N(1e+07, 4.5e+11) 1.00e7
 9 ARIMA(KWH ~ pdq(0, 0, 4) + PDQ(2, 1, 0~   2014 Sep N(8501665, 4.5e+11) 8.50e6
10 ARIMA(KWH ~ pdq(0, 0, 4) + PDQ(2, 1, 0~   2014 Oct N(5870434, 4.5e+11) 5.87e6
11 ARIMA(KWH ~ pdq(0, 0, 4) + PDQ(2, 1, 0~   2014 Nov N(6158797, 4.5e+11) 6.16e6
12 ARIMA(KWH ~ pdq(0, 0, 4) + PDQ(2, 1, 0~   2014 Dec N(8354323, 4.5e+11) 8.35e6

6 Conclusion

  The process for part B largely mirrored that of part A however, it was definitely a far more simple data set. After cleaning the extreme value from the data set, the modeling process was fairly simple. Using cross validation from there, I was able to determine the optimal model for this data set and used it to make predictions for 2014. Overall, it would appear that once you have the process down for forecasting data, it can become a quick and easy matter to make forecasts on any given data set.