# Load packages once (duplicates removed).
library(readxl)
library(dplyr)
library(tidyr)
library(ggplot2)
library(gridExtra)
library(mlbench)
library(psych)
library(forecast)
library(fpp2)
library(lubridate)

Part A – ATM Forecast

In part A, I want you to forecast how much cash is taken out of 4 different ATM machines for May 2010. The data is given in a single file. The variable ‘Cash’ is provided in hundreds of dollars; other than that it is straightforward. I am being somewhat ambiguous on purpose to make this have a little more business feeling. Explain and demonstrate your process, techniques used and not used, and your actual forecast. I am giving you data via an Excel file. Please provide your written report on your findings, visuals, discussion and your R code via an RPubs link along with the actual .rmd file. Also please submit the forecast, which you will put in an Excel-readable file.

Missing Values

We split the data into four dataframes, one for each ATM machine. We will fill in missing Cash values with the median value for the given ATM, and omit any observations without dates.
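A minimal sketch of that preparation step, assuming the assignment's workbook is named ATM624Data.xlsx with columns DATE, ATM and Cash (the file name and the ATM column are assumptions, not shown in the report):

atm <- read_excel("ATM624Data.xlsx")   # assumed file name
atm1 <- atm %>% filter(ATM == "ATM1")  # one dataframe per machine
atm2 <- atm %>% filter(ATM == "ATM2")
atm3 <- atm %>% filter(ATM == "ATM3")
atm4 <- atm %>% filter(ATM == "ATM4")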

atm1$Cash[is.na(atm1$Cash)] <- median(atm1$Cash, na.rm = TRUE)
atm2$Cash[is.na(atm2$Cash)] <- median(atm2$Cash, na.rm = TRUE)
atm3$Cash[is.na(atm3$Cash)] <- median(atm3$Cash, na.rm = TRUE)
atm4$Cash[is.na(atm4$Cash)] <- median(atm4$Cash, na.rm = TRUE)

atm1 <- na.omit(atm1)
atm2 <- na.omit(atm2)
atm3 <- na.omit(atm3)
atm4 <- na.omit(atm4)


ts1 <- ts(atm1$Cash, start = c(2009, as.numeric(format(as.Date(min(atm1$DATE)), "%j"))), frequency = 365)
ts2 <- ts(atm2$Cash, start = c(2009, as.numeric(format(as.Date(min(atm2$DATE)), "%j"))), frequency = 365)
ts3 <- ts(atm3$Cash, start = c(2009, as.numeric(format(as.Date(min(atm3$DATE)), "%j"))), frequency = 365)
ts4 <- ts(atm4$Cash, start = c(2009, as.numeric(format(as.Date(min(atm4$DATE)), "%j"))), frequency = 365)

ATM 1

The table below displays the summary for ATM 1. The minimum value of 1 looks suspicious.

summary(atm1$Cash)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1.00   73.00   91.00   83.89  108.00  180.00
ts1w = ts(atm1$Cash, start = 1, frequency =  7)
ts1d = ts(atm1$Cash, start = as.Date(min(atm1$DATE)), frequency =  365)

Daily Time Series

Based on the plot below, we do not see any clear trend or seasonality. The plot looks like white noise/stationary data, and ndiffs() returns 0.
The p-value from the Box-Ljung test is also high, implying white noise. However, the ACF plot shows significant spikes at the seasonal lags, so the series is not simply white noise.
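For completeness, the ndiffs() check referenced above (the same check applies to the other ATMs):

ndiffs(ts1d)  # returns 0 per the discussion above, i.e. no differencing suggested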

ggtsdisplay(ts1d, points = FALSE, main = "Withdrawals from ATM1", xlab = "Day", ylab = "Cash in hundreds")

Box.test(ts1d, type="Ljung-Box")
## 
##  Box-Ljung test
## 
## data:  ts1d
## X-squared = 1.4791, df = 1, p-value = 0.2239

Time series with Frequency 7

We will create a time series with frequency seven to see if we can identify any trends or seasonality.

ggtsdisplay(ts1w, points = FALSE, main = "Withdrawals from ATM1", xlab = "Week", ylab = "Cash in hundreds")

Box.test(ts1w, type="Ljung-Box")
## 
##  Box-Ljung test
## 
## data:  ts1w
## X-squared = 1.4791, df = 1, p-value = 0.2239

For the daily time series we cannot use the seasonal or subseries plots because we have only one year of data. Because of this, we will look at the seasonal and subseries plots using the weekly (frequency-7) time series.

The seasonal plot shows that the fewest withdrawals happen on Wednesdays, with the other days being roughly the same.
In some weeks the minimum falls on a Monday or Friday instead.

ggseasonplot(ts1w, year.labels=TRUE, year.labels.left=TRUE) 

The seasonal subseries plot, which plots withdrawals by day of the week along with the per-day average, shows the pattern described above clearly: the fewest withdrawals happen on Wednesdays and the most around Fridays.

ggsubseriesplot(ts1w)

The seasonal component does not change over time, while the trend component drifts downward.

autoplot( stl(ts1w, t.window=13, s.window="periodic", robust=T) )

Test and Train Data

We will split the data into training and test sets. We will use the test data to evaluate model accuracy and for model selection.

atm1_train <- ts(atm1$Cash[1:302], frequency = 7)
atm1_test <- ts(atm1$Cash[303:362], start = c(44, 2), frequency = 7)  # obs 303 falls at week 44, day 2

ETS

atm1_ets_fit <- ets(atm1_train)
summary(atm1_ets_fit)
## ETS(A,N,A) 
## 
## Call:
##  ets(y = atm1_train) 
## 
##   Smoothing parameters:
##     alpha = 0.01 
##     gamma = 0.3379 
## 
##   Initial states:
##     l = 88.8583 
##     s = -42.917 19.2648 24.5604 -39.282 6.6179 38.9107
##            -7.1548
## 
##   sigma:  28.274
## 
##      AIC     AICc      BIC 
## 3753.946 3754.702 3791.051 
## 
## Training set error measures:
##                     ME     RMSE      MAE       MPE    MAPE      MASE      ACF1
## Training set 0.2639688 27.84955 18.47705 -120.4485 138.029 0.9083038 0.1367749
autoplot(atm1_ets_fit)

atm1_ets_forecast <- forecast(atm1_ets_fit, h=60)
autoplot(atm1_ets_forecast) 

checkresiduals(atm1_ets_fit)

## 
##  Ljung-Box test
## 
## data:  Residuals from ETS(A,N,A)
## Q* = 15.475, df = 5, p-value = 0.008514
## 
## Model df: 9.   Total lags used: 14

The Ljung-Box p-value of 0.0085 above indicates remaining autocorrelation in the ETS residuals.

ARIMA

atm1_arima <- auto.arima(atm1_train)
summary(atm1_arima)
## Series: atm1_train 
## ARIMA(0,0,1)(0,1,1)[7] 
## 
## Coefficients:
##          ma1     sma1
##       0.1768  -0.6461
## s.e.  0.0610   0.0555
## 
## sigma^2 estimated as 750.9:  log likelihood=-1396.14
## AIC=2798.27   AICc=2798.35   BIC=2809.33
## 
## Training set error measures:
##                    ME    RMSE      MAE       MPE     MAPE      MASE        ACF1
## Training set 0.774482 26.9918 17.27918 -113.0299 130.8749 0.8494179 -0.01012743
atm1_arima_forecast <- forecast(atm1_arima, h = 60)
autoplot(atm1_arima_forecast) 

checkresiduals(atm1_arima_forecast)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(0,0,1)(0,1,1)[7]
## Q* = 8.1068, df = 12, p-value = 0.7767
## 
## Model df: 2.   Total lags used: 14

Accuracy and Model Selection

The ARIMA model has lower RMSE than the ETS model on both the training and test data, and its residuals pass the Ljung-Box test while the ETS residuals do not. Because of this we will select the ARIMA model, refit it on the full series, and produce the forecast from that model.

#Train ARIMA
accuracy(atm1_arima_forecast)
##                    ME    RMSE      MAE       MPE     MAPE      MASE        ACF1
## Training set 0.774482 26.9918 17.27918 -113.0299 130.8749 0.8494179 -0.01012743
#Train ETS
accuracy(atm1_ets_forecast)
##                     ME     RMSE      MAE       MPE    MAPE      MASE      ACF1
## Training set 0.2639688 27.84955 18.47705 -120.4485 138.029 0.9083038 0.1367749
# Test ARIMA
accuracy( atm1_arima_forecast$mean, atm1[303:362,]$Cash)
##                 ME     RMSE      MAE       MPE     MAPE
## Test set -14.91057 28.88491 20.06694 -265.5511 270.3319
# Test ETS
accuracy( atm1_ets_forecast$mean, atm1[303:362,]$Cash)
##                 ME     RMSE      MAE       MPE     MAPE
## Test set -15.97052 30.19271 20.98208 -277.9664 282.6238
atm1_arima <- auto.arima(ts1w)
atm1_arima_forecast <- forecast(atm1_arima, h = 31)  # 31 days in May 2010
write.csv(atm1_arima_forecast$mean, "ATM1_Forecast.csv")

ATM2

summary(atm2$Cash)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00   25.50   67.00   62.58   93.00  147.00
ts2w = ts(atm2$Cash, start = 1, frequency =  7)
ts2d = ts(atm2$Cash, start = as.Date(min(atm2$DATE)), frequency =  365)

Based on the plot below, we do not see any clear trend or seasonality. The plot looks like white noise/stationary data, ndiffs() returns 0, and the p-value from the Box-Ljung test is high, implying white noise. However, the ACF plot again shows spikes at the seasonal lags.

ggtsdisplay(ts2d, points = FALSE, main = "Withdrawals from ATM2", xlab = "Day", ylab = "Cash in hundreds")

Box.test(ts2d, type="Ljung-Box")
## 
##  Box-Ljung test
## 
## data:  ts2d
## X-squared = 0.056235, df = 1, p-value = 0.8125

Time series with Frequency 7

ggtsdisplay(ts2w, points = FALSE, main = "Withdrawals from ATM2", xlab = "Week", ylab = "Cash in hundreds")

Box.test(ts2w, type="Ljung-Box")
## 
##  Box-Ljung test
## 
## data:  ts2w
## X-squared = 0.056235, df = 1, p-value = 0.8125

As before, we cannot use the seasonal or subseries plots for the daily time series because we have only one year of data, so we will look at the seasonal and subseries plots using the weekly time series.

The seasonal plot shows that the fewest withdrawals happen on Wednesdays, with the other days being roughly the same; in some weeks the minimum falls on a Monday or Friday instead. The variation decreases as the level of the series increases, which suggests multiplicative seasonality, so we will transform the data.

 ggseasonplot(ts2w, year.labels=TRUE, year.labels.left=TRUE) 

The seasonal subseries plot, which plots withdrawals by day of the week along with the per-day average, shows the pattern described above clearly: the fewest withdrawals happen on Wednesdays and the most around Fridays.

ggsubseriesplot(ts2w)

Test and Train Data

As above, we will use the last 60 days of data for testing the model; a better approach would be cross-validation (a sketch follows).
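A minimal sketch of that alternative using forecast::tsCV() with a seasonal naive benchmark (the wrapper and benchmark choice are ours, not part of the report's method):

e <- tsCV(ts2w, forecastfunction = snaive, h = 1)  # one-step-ahead CV errors
sqrt(mean(e^2, na.rm = TRUE))                      # cross-validated RMSE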

atm2_train <- ts(ts2w[1:302], frequency = 7)  # subsetting with [ ] drops the ts attributes, so rebuild the series
atm2_test <- ts(ts2w[303:362], start = c(44, 2), frequency = 7)
atm2_lambda <- BoxCox.lambda(atm2_train)

ETS

Residual ACF plot shows autocorrelation in the errors, and our forecasts may be inefficient.
Residuals are not normally distributed.

atm2_ets_fit <- ets(atm2_train, lambda =  atm2_lambda)
summary(atm2_ets_fit)
## ETS(A,A,N) 
## 
## Call:
##  ets(y = atm2_train, lambda = atm2_lambda) 
## 
##   Box-Cox transformation: lambda= 0.2076 
## 
##   Smoothing parameters:
##     alpha = 2e-04 
##     beta  = 2e-04 
## 
##   Initial states:
##     l = 6.6658 
##     b = -0.005 
## 
##   sigma:  2.123
## 
##      AIC     AICc      BIC 
## 2185.227 2185.430 2203.780 
## 
## Training set error measures:
##                   ME     RMSE      MAE  MPE MAPE      MASE        ACF1
## Training set 14.6463 40.56853 34.78051 -Inf  Inf 0.8090366 -0.03978792
atm2_ets_forecast <- forecast(atm2_ets_fit, h=60)
autoplot(atm2_ets_forecast)

checkresiduals(atm2_ets_fit)

## 
##  Ljung-Box test
## 
## data:  Residuals from ETS(A,A,N)
## Q* = 195.18, df = 6, p-value < 2.2e-16
## 
## Model df: 4.   Total lags used: 10

ARIMA

The residual ACF plot for the ARIMA model below shows no autocorrelation in the errors, and the residuals are approximately normally distributed.

atm2_arima <- auto.arima(ts2w, lambda =  atm2_lambda)
summary(atm2_arima)
## Series: ts2w 
## ARIMA(0,0,5)(0,1,1)[7] 
## Box Cox transformation: lambda= 0.2076287 
## 
## Coefficients:
##          ma1      ma2     ma3     ma4      ma5     sma1
##       0.0538  -0.1045  0.0373  0.0251  -0.2263  -0.5656
## s.e.  0.0523   0.0526  0.0532  0.0521   0.0523   0.0457
## 
## sigma^2 estimated as 2.35:  log likelihood=-655.81
## AIC=1325.63   AICc=1325.95   BIC=1352.75
## 
## Training set error measures:
##                    ME     RMSE      MAE  MPE MAPE     MASE        ACF1
## Training set 2.550071 29.31029 20.09578 -Inf  Inf 0.935176 -0.05814841
atm2_arima_forecast <- forecast(atm2_arima, h = 60)
autoplot(atm2_arima_forecast)

checkresiduals(atm2_arima_forecast)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(0,0,5)(0,1,1)[7]
## Q* = 2.9887, df = 8, p-value = 0.9351
## 
## Model df: 6.   Total lags used: 14

Accuracy

The ARIMA model fits the training data better than the ETS model, but the ETS model produces more accurate forecasts on the test set. (Note that the ARIMA model was fit on the full series, so its test-set accuracy is not truly out-of-sample.) We will still use the ARIMA model for forecasting, since the ETS residuals are not normally distributed and their ACF shows autocorrelation.

#Train ARIMA
accuracy(atm2_arima_forecast)
##                    ME     RMSE      MAE  MPE MAPE     MASE        ACF1
## Training set 2.550071 29.31029 20.09578 -Inf  Inf 0.935176 -0.05814841
#Train ETS
accuracy(atm2_ets_forecast)
##                   ME     RMSE      MAE  MPE MAPE      MASE        ACF1
## Training set 14.6463 40.56853 34.78051 -Inf  Inf 0.8090366 -0.03978792
# Test ARIMA
accuracy( atm2_arima_forecast$mean, atm2[303:362,]$Cash)
##                 ME     RMSE      MAE  MPE MAPE
## Test set -0.463026 64.76235 57.78748 -Inf  Inf
# Test ETS
accuracy( atm2_ets_forecast$mean, atm2[303:362,]$Cash)
##               ME     RMSE      MAE  MPE MAPE
## Test set 20.9725 44.91564 40.35588 -Inf  Inf
atm2_arima <- auto.arima(ts2w)
atm2_arima_forecast <- forecast(atm2_arima, h = 31)
write.csv(atm2_arima_forecast$mean, "ATM2_Forecast.csv")

ATM3

ATM3 has only 3 non-zero observations, so we will use the average method to forecast.

summary(atm3$Cash)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.0000  0.7206  0.0000 96.0000
ts3 <- ts(atm3$Cash, start = 1, frequency = 365)
write.csv(meanf(ts3, 31)$mean, "ATM3_Forecast.csv")  # average method, 31 days for May
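An alternative worth noting, under the assumption (ours, not the report's) that the zeros mean the ATM was not yet in service: average only the non-zero observations.

nonzero <- atm3$Cash[atm3$Cash > 0]          # the 3 non-zero days
atm3_alt_forecast <- rep(mean(nonzero), 31)  # flat forecast at their mean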

ATM4

From the table and plot below we see that there is an outlier. We will replace it with the median before further analysis.
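A quick way to locate that outlier, assuming a single extreme value (the code further down then replaces it with the median):

which.max(atm4$Cash)          # row index of the maximum withdrawal
atm4[which.max(atm4$Cash), ]  # the offending observation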

summary(atm4$Cash)
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
##     1.563   124.334   403.839   474.043   704.507 10919.762
ts4d <- ts(atm4$Cash, start = as.Date(min(atm4$DATE)), frequency =  365)
autoplot(ts4d, main = "Withdrawals from ATM4", xlab = "Day", ylab = "Cash in hundreds")

atm4$Cash[which.max(atm4$Cash)] <- median(atm4$Cash)  # replace the outlier (row 285) with the median
ts4d <- ts(atm4$Cash, start = as.Date(min(atm4$DATE)), frequency =  365)

autoplot(ts4d, main = "Withdrawals from ATM4", xlab = "Day", ylab = "Cash in hundreds")

ts4w <- ts(atm4$Cash, start = 1, frequency =  7)
ts4d  <- ts(atm4$Cash, start = as.Date(min(atm4$DATE)), frequency =  365)

Based on the plot below, we do not see any clear trend or seasonality. The plot looks like white noise/stationary data, ndiffs() returns 0, and the p-value from the Box-Ljung test is high, implying white noise. However, the ACF plot again shows spikes at the seasonal lags.

ggtsdisplay(ts4d, points = FALSE, main = "Withdrawals from ATM4", xlab = "Day", ylab = "Cash in hundreds")

Box.test(ts4d, type="Ljung-Box")
## 
##  Box-Ljung test
## 
## data:  ts4d
## X-squared = 2.0838, df = 1, p-value = 0.1489

Time series with Frequency 7

ggtsdisplay(ts4w, points = FALSE, main = "Withdrawals from ATM4", xlab = "Week", ylab = "Cash in hundreds")

Box.test(ts4w, type="Ljung-Box")
## 
##  Box-Ljung test
## 
## data:  ts4w
## X-squared = 2.0838, df = 1, p-value = 0.1489

As before, we cannot use the seasonal or subseries plots for the daily time series because we have only one year of data, so we will look at the seasonal and subseries plots using the weekly time series.

The seasonal plot does not show anything obvious.

 ggseasonplot(ts4w, year.labels=TRUE, year.labels.left=TRUE) 

The seasonal subseries plot, which plots withdrawals by day of the week along with the per-day average, shows a downward pattern across the week: most withdrawals happen on Sundays, with fewer on the other days.

ggsubseriesplot(ts4w)

Test and Train Data

We will use the last 60 days of data for testing.

atm4_train <- ts(ts4w[1:302], frequency = 7)  # keep the ts attributes when subsetting
atm4_test <- ts(ts4w[303:362], start = c(44, 2), frequency = 7)

ETS

atm4_ets_fit <- ets(atm4_train)
summary(atm4_ets_fit)
## ETS(A,N,N) 
## 
## Call:
##  ets(y = atm4_train) 
## 
##   Smoothing parameters:
##     alpha = 1e-04 
## 
##   Initial states:
##     l = 449.2714 
## 
##   sigma:  360.9104
## 
##      AIC     AICc      BIC 
## 5285.275 5285.355 5296.406 
## 
## Training set error measures:
##                      ME     RMSE     MAE       MPE     MAPE      MASE
## Training set -0.1600697 359.7133 300.548 -535.1853 568.8359 0.7703108
##                    ACF1
## Training set 0.08650058
autoplot(atm4_ets_fit)

atm4_ets_forecast <- forecast(atm4_ets_fit, h=60)
autoplot(atm4_ets_forecast)

checkresiduals(atm4_ets_fit)

## 
##  Ljung-Box test
## 
## data:  Residuals from ETS(A,N,N)
## Q* = 23.195, df = 8, p-value = 0.003123
## 
## Model df: 2.   Total lags used: 10

The Ljung-Box p-value of 0.0031 above indicates remaining autocorrelation in the ETS residuals.

ARIMA

atm4_arima <- auto.arima(ts4w)
summary(atm4_arima)
## Series: ts4w 
## ARIMA(0,0,3)(1,0,0)[7] with non-zero mean 
## 
## Coefficients:
##          ma1     ma2     ma3    sar1      mean
##       0.0874  0.0161  0.0886  0.1838  444.6109
## s.e.  0.0532  0.0583  0.0516  0.0526   26.0984
## 
## sigma^2 estimated as 119377:  log likelihood=-2648.96
## AIC=5309.93   AICc=5310.16   BIC=5333.33
## 
## Training set error measures:
##                      ME     RMSE      MAE       MPE    MAPE      MASE
## Training set -0.2167449 343.1347 287.2529 -519.3411 552.585 0.8292132
##                      ACF1
## Training set -0.008268238
atm4_arima_forecast <- forecast(atm4_arima, h = 60)
autoplot(atm4_arima_forecast)

checkresiduals(atm4_arima_forecast)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(0,0,3)(1,0,0)[7] with non-zero mean
## Q* = 16.218, df = 9, p-value = 0.06247
## 
## Model df: 5.   Total lags used: 14

Accuracy

The ARIMA model fits the training data slightly better than the ETS model and produces slightly lower errors on the test set, so we will use it for forecasting. (The AIC values are not comparable here: the ETS model was fit on the training set only, while the ARIMA model was fit on the full series, which also makes the ARIMA test-set accuracy optimistic.)

#Train ARIMA
accuracy(atm4_arima_forecast)
##                      ME     RMSE      MAE       MPE    MAPE      MASE
## Training set -0.2167449 343.1347 287.2529 -519.3411 552.585 0.8292132
##                      ACF1
## Training set -0.008268238
#Train ETS
accuracy(atm4_ets_forecast)
##                      ME     RMSE     MAE       MPE     MAPE      MASE
## Training set -0.1600697 359.7133 300.548 -535.1853 568.8359 0.7703108
##                    ACF1
## Training set 0.08650058
# Test ARIMA
accuracy( atm4_arima_forecast$mean, atm4[303:362,]$Cash)
##                 ME     RMSE      MAE       MPE     MAPE
## Test set -6.498257 303.3209 252.9984 -799.5391 830.9191
# Test ETS
accuracy( atm4_ets_forecast$mean, atm4[303:362,]$Cash)
##                 ME     RMSE      MAE      MPE     MAPE
## Test set -16.31642 304.9523 256.0181 -816.253 846.5153
atm4_arima <- auto.arima(ts4w)
atm4_arima_forecast <- forecast(atm4_arima, h = 31)
write.csv(atm4_arima_forecast$mean, "ATM4_Forecast.csv")

Part B – Forecasting Power

Part B consists of a simple dataset of residential power usage for January 1998 until December 2013. Your assignment is to model these data and produce a monthly forecast for 2014. The data is given in a single file. The variable ‘KWH’ is power consumption in kilowatt hours; the rest is straightforward. Add this to your existing files above.

The table below displays the summary. There is one NA, which we will replace with the median.

The plot below shows some seasonality. The variation increases as the level of the series increases, indicating multiplicative seasonality. The series is non-stationary, and the Ljung-Box test on the differenced series has a small p-value, confirming strong remaining autocorrelation.

summary(power$KWH)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max.     NA's 
##   770523  5429912  6283324  6502475  7620524 10655730        1
power$KWH[is.na(power$KWH)] <- median(power$KWH, na.rm = TRUE)
power_ts <- ts(power$KWH, start = c(1998, 1), frequency = 12)
Box.test(diff(power_ts), lag = 10, type = "Ljung-Box")
## 
##  Box-Ljung test
## 
## data:  diff(power_ts)
## X-squared = 154.15, df = 10, p-value < 2.2e-16
autoplot(power_ts)

The seasonal plot shows a power usage pattern tied to the season: usage goes up in summer and winter and down in fall and spring. The subseries plot also shows this pattern. We will use a Box-Cox transformation on the data (a sketch follows the plots below).

ggseasonplot(power_ts, year.labels=TRUE, year.labels.left=TRUE) 

ggsubseriesplot(power_ts)
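A minimal sketch of the transformation step, estimating lambda on the full series (lambda_power is a name introduced here; the modelling below estimates lambda on the training set only):

lambda_power <- BoxCox.lambda(power_ts)    # estimate the transformation parameter
autoplot(BoxCox(power_ts, lambda_power)) +
  ggtitle("Box-Cox transformed KWH")       # variance should look more stable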

The seasonal decomposition below shows a positive trend and strong seasonality as well.

autoplot( stl(power_ts, t.window=13, s.window="periodic", robust=T) )

Test and Train Data

power_ts <- ts(power$KWH, start = c(1998, 1), end = c(2013, 12), frequency = 12)
power_train <- ts(power$KWH[1:159], start = c(1998, 1), end = c(2011, 3), frequency = 12)
power_test <- ts(power$KWH[160:192], start = c(2011, 4), end = c(2013, 12), frequency = 12)

train_lambda <- BoxCox.lambda(power_train)

ETS

The residuals are almost normally distributed, but the ACF plot shows some autocorrelations outside the threshold limits.

power_ets <- ets(power_train, lambda =  train_lambda)
summary(power_ets)
## ETS(A,N,A) 
## 
## Call:
##  ets(y = power_train, lambda = train_lambda) 
## 
##   Box-Cox transformation: lambda= 1.4121 
## 
##   Smoothing parameters:
##     alpha = 0.0249 
##     gamma = 1e-04 
## 
##   Initial states:
##     l = 2775131471.0161 
##     s = -328808905 -978884223 -426497532 755507280 1097643981 532617828
##            -79599874 -857334102 -679719862 -247046195 279161767 932959836
## 
##   sigma:  489147352
## 
##      AIC     AICc      BIC 
## 7183.900 7187.257 7229.934 
## 
## Training set error measures:
##                    ME     RMSE      MAE       MPE     MAPE      MASE      ACF1
## Training set 52092.62 782216.2 478163.8 -4.702466 12.14671 0.7597811 0.1850929
autoplot(power_ets)

power_ets_forecast <- forecast(power_ets, h = 33)
autoplot(power_ets_forecast) + autolayer(power_test, series = "Test Data")

checkresiduals(power_ets)

## 
##  Ljung-Box test
## 
## data:  Residuals from ETS(A,N,A)
## Q* = 58.029, df = 10, p-value = 8.537e-09
## 
## Model df: 14.   Total lags used: 24

ARIMA

The residuals are not normally distributed, but the ACF plot shows all autocorrelations within the threshold limits.

power_arima <- auto.arima(power_train, lambda =  train_lambda)
summary(power_arima)
## Series: power_train 
## ARIMA(0,0,1)(2,1,0)[12] with drift 
## Box Cox transformation: lambda= 1.412108 
## 
## Coefficients:
##          ma1     sar1     sar2    drift
##       0.0624  -0.7635  -0.5350  2048386
## s.e.  0.0871   0.0960   0.0906  1547173
## 
## sigma^2 estimated as 2.087e+17:  log likelihood=-3143.46
## AIC=6296.93   AICc=6297.35   BIC=6311.88
## 
## Training set error measures:
##                    ME     RMSE      MAE       MPE     MAPE      MASE
## Training set -22359.4 737607.3 437352.2 -5.470091 11.49913 0.6949333
##                     ACF1
## Training set -0.02519811
power_arima_forecast <- forecast(power_arima, h = 33)
autoplot(power_arima_forecast) + autolayer(power_test, series = "Test Data")

checkresiduals(power_arima_forecast)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(0,0,1)(2,1,0)[12] with drift
## Q* = 15.925, df = 20, p-value = 0.7213
## 
## Model df: 4.   Total lags used: 24

STL

The residuals are not normally distributed, and the Ljung-Box test below indicates some remaining autocorrelation.

power_stl <- stl(power_train, t.window=13, s.window="periodic",robust=TRUE )
summary(power_stl)
##  Call:
##  stl(x = power_train, s.window = "periodic", t.window = 13, robust = TRUE)
## 
##  Time.series components:
##     seasonal              trend           remainder       
##  Min.   :-1578886.7   Min.   :5494380   Min.   :-7065596  
##  1st Qu.: -742271.0   1st Qu.:6154511   1st Qu.: -262631  
##  Median : -113779.7   Median :6277503   Median :   -1969  
##  Mean   :    8702.3   Mean   :6282878   Mean   :    8248  
##  3rd Qu.: 1119505.2   3rd Qu.:6390628   3rd Qu.:  288528  
##  Max.   : 1679560.7   Max.   :7233813   Max.   : 1875480  
##  IQR:
##      STL.seasonal STL.trend STL.remainder data   
##      1861776       236117    551159       2048577
##    %  90.9         11.5      26.9         100.0  
## 
##  Weights:
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.8316  0.9452  0.8583  0.9876  1.0000 
## 
##  Other components: List of 5
##  $ win  : Named num [1:3] 1591 13 13
##  $ deg  : Named int [1:3] 0 1 1
##  $ jump : Named num [1:3] 160 2 2
##  $ inner: int 1
##  $ outer: int 15
power_stl_forecast <- forecast(power_stl, h = 33)
autoplot(power_stl_forecast) + autolayer(power_test, series = "Test Data")

checkresiduals(power_stl_forecast)
## Warning in checkresiduals(power_stl_forecast): The fitted degrees of freedom is
## based on the model used for the seasonally adjusted data.

## 
##  Ljung-Box test
## 
## data:  Residuals from STL +  ETS(A,N,N)
## Q* = 43.838, df = 22, p-value = 0.003716
## 
## Model df: 2.   Total lags used: 24

Accuracy and Model Selection

The results below suggest the ARIMA model as the best of the three methods for this dataset: it has the lowest test-set MAE and MAPE (the ETS model has a slightly lower test RMSE), and it is the only model whose residuals pass the Ljung-Box test. We will refit the ARIMA model using the whole dataset and use it to forecast 2014.

# ARIMA (training and test accuracy)
accuracy( power_arima_forecast, power_test)
##                    ME      RMSE       MAE       MPE     MAPE      MASE
## Training set -22359.4  737607.3  437352.2 -5.470091 11.49913 0.6949333
## Test set     966977.6 1532608.8 1079568.4 11.427728 13.08846 1.7153863
##                     ACF1 Theil's U
## Training set -0.02519811        NA
## Test set      0.29301787 0.8512497
# ETS (training and test accuracy)
accuracy( power_ets_forecast, power_test)
##                      ME      RMSE       MAE       MPE     MAPE      MASE
## Training set   52092.62  782216.2  478163.8 -4.702466 12.14671 0.7597811
## Test set     1053165.52 1384412.2 1089833.8 12.803169 13.43492 1.7316976
##                   ACF1 Theil's U
## Training set 0.1850929        NA
## Test set     0.2113514 0.8283447
# STL (training and test accuracy)
accuracy( power_stl_forecast, power_test)
##                      ME      RMSE     MAE       MPE     MAPE      MASE
## Training set  -21115.59  796442.8  488277 -6.019777 12.48869 0.7758506
## Test set     1201430.88 1469674.7 1201431 15.006405 15.00641 1.9090204
##                   ACF1 Theil's U
## Training set 0.1969063        NA
## Test set     0.1730841 0.8793043
power_arima <- auto.arima(power_ts)
power_arima_forecast <- forecast(power_arima, h = 12)  # 12 monthly values for 2014
write.csv(power_arima_forecast$mean, "ResidentialCustomerForecast.csv")