Part B consists of a simple dataset of residential power usage for January 1998 until December 2013. Your assignment is to model these data and a monthly forecast for 2014. The data is given in a single file. The variable ‘KWH’ is power consumption in Kilowatt hours, the rest is straight forward.
data_raw_b <- read_excel ('./data/ResidentialCustomerForecastLoad-624.xlsx', col_names = TRUE)%>%
rename(DATE = 'YYYY-MMM')%>%
mutate(DATE=as_date(paste(DATE, '01', sep='-'), format='%Y-%b-%d', tz='UTC'))
summary(data_raw_b)
## CaseSequence DATE KWH
## Min. :733.0 Min. :1998-01-01 Min. : 770523
## 1st Qu.:780.8 1st Qu.:2001-12-24 1st Qu.: 5429912
## Median :828.5 Median :2005-12-16 Median : 6283324
## Mean :828.5 Mean :2005-12-15 Mean : 6502475
## 3rd Qu.:876.2 3rd Qu.:2009-12-08 3rd Qu.: 7620524
## Max. :924.0 Max. :2013-12-01 Max. :10655730
## NA's :1
boxplot(data_raw_b$KWH,
ylab = "KWH")
out <- boxplot.stats(data_raw_b$KWH)$out
out
## [1] 770523
data_raw_b <- data_raw_b%>%
mutate(KWH = na_if(KWH, out))
data_raw_b$KWH <- na_ma(data_raw_b$KWH, k = 5, weighting = "simple")
summary(data_raw_b)
## CaseSequence DATE KWH
## Min. :733.0 Min. :1998-01-01 Min. : 4313019
## 1st Qu.:780.8 1st Qu.:2001-12-24 1st Qu.: 5443502
## Median :828.5 Median :2005-12-16 Median : 6339797
## Mean :828.5 Mean :2005-12-15 Mean : 6531803
## 3rd Qu.:876.2 3rd Qu.:2009-12-08 3rd Qu.: 7608792
## Max. :924.0 Max. :2013-12-01 Max. :10655730
pwr <- data_raw_b
tspwr <- ts(pwr$KWH, frequency = 12, start = c(1998,1))
grid.arrange(nrow=2,
ggseasonplot(tspwr),
ggsubseriesplot(tspwr))
ggtsdisplay(tspwr)
pwr_lambda <- BoxCox.lambda(tspwr)
pwrbc <- BoxCox(tspwr, lambda = pwr_lambda)
grid.arrange(nrow=2,
ggseasonplot(pwrbc),
ggsubseriesplot(pwrbc))
ggtsdisplay(pwrbc)
pwrbc %>% ur.kpss() %>% summary
##
## #######################
## # KPSS Unit Root Test #
## #######################
##
## Test is of type: mu with 4 lags.
##
## Value of test-statistic is: 1.595
##
## Critical value for a significance level of:
## 10pct 5pct 2.5pct 1pct
## critical values 0.347 0.463 0.574 0.739
ndiffs(tspwr)
## [1] 1
ndiffs(pwrbc)
## [1] 1
ggtsdisplay(diff(tspwr, 12), points = FALSE)
tspwr %>% diff %>% ur.kpss() %>% summary
##
## #######################
## # KPSS Unit Root Test #
## #######################
##
## Test is of type: mu with 4 lags.
##
## Value of test-statistic is: 0.0471
##
## Critical value for a significance level of:
## 10pct 5pct 2.5pct 1pct
## critical values 0.347 0.463 0.574 0.739
ggtsdisplay(diff(pwrbc, 12), points = FALSE)
pwrbc %>% diff %>% ur.kpss() %>% summary
##
## #######################
## # KPSS Unit Root Test #
## #######################
##
## Test is of type: mu with 4 lags.
##
## Value of test-statistic is: 0.0414
##
## Critical value for a significance level of:
## 10pct 5pct 2.5pct 1pct
## critical values 0.347 0.463 0.574 0.739
pwr_model <- Arima(tspwr, order = c(1,1,2), seasonal=c(1,1,2))
summary(pwr_model)
## Series: tspwr
## ARIMA(1,1,2)(1,1,2)[12]
##
## Coefficients:
## ar1 ma1 ma2 sar1 sma1 sma2
## -0.4594 -0.2116 -0.6683 0.1912 -1.0568 0.3482
## s.e. 0.2201 0.1826 0.1594 0.3456 0.3326 0.2376
##
## sigma^2 estimated as 4.089e+11: log likelihood=-2650.19
## AIC=5314.39 AICc=5315.04 BIC=5336.7
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set 42101.78 607001.7 449961.1 0.04052391 6.793162 0.7087288
## ACF1
## Training set 0.02217025
checkresiduals(pwr_model)
##
## Ljung-Box test
##
## data: Residuals from ARIMA(1,1,2)(1,1,2)[12]
## Q* = 14.027, df = 18, p-value = 0.7273
##
## Model df: 6. Total lags used: 24
auto_model <- auto.arima(tspwr, d=1, approximation = FALSE)
summary(auto_model)
## Series: tspwr
## ARIMA(1,1,2)(2,1,0)[12]
##
## Coefficients:
## ar1 ma1 ma2 sar1 sar2
## -0.5466 -0.1434 -0.7342 -0.8017 -0.4634
## s.e. 0.1837 0.1480 0.1244 0.0753 0.0761
##
## sigma^2 estimated as 4.037e+11: log likelihood=-2649.66
## AIC=5311.32 AICc=5311.81 BIC=5330.44
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set 36857.45 604849.7 445553.1 -0.009880902 6.736162 0.7017858
## ACF1
## Training set 0.02896049
checkresiduals(auto_model)
##
## Ljung-Box test
##
## data: Residuals from ARIMA(1,1,2)(2,1,0)[12]
## Q* = 14.373, df = 19, p-value = 0.7615
##
## Model df: 5. Total lags used: 24
pwr_forecast <- forecast(pwr_model, h = 12)
autoplot(pwr_forecast)
signif(pwr_forecast$mean/1000000,3)
## Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov
## 2014 10.20 8.61 7.26 6.31 6.18 8.15 9.25 9.93 8.66 6.30 6.28
## Dec
## 2014 8.09
export <- pwr_forecast$mean
write.csv(export, "Power Forecasts.csv")