library(readxl, quietly = TRUE, warn.conflicts = FALSE)
library(fpp2, quietly = TRUE, warn.conflicts = FALSE)
library(ggplot2)
library(gridExtra)
library(mlbench)
library(psych)
library(dplyr)
library(tidyr)
library(forecast)
library(lubridate)
In Part A, I want you to forecast how much cash is taken out of 4 different ATM machines for May 2010. The data is given in a single file. The variable ‘Cash’ is provided in hundreds of dollars; other than that it is straightforward. I am being somewhat ambiguous on purpose to make this have a little more business feeling. Explain and demonstrate your process, techniques used and not used, and your actual forecast. I am giving you data via an Excel file; please provide your written report on your findings, visuals, discussion and your R code via an RPubs link along with the actual .rmd file. Also please submit the forecast, which you will put in an Excel-readable file.
We split the data into four dataframes, one for each ATM machine. We fill any missing Cash values with the median value for the given ATM and omit any observations without dates.
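For reference, a minimal sketch of that split, assuming the workbook (the file name here is a placeholder) has columns DATE, ATM, and Cash:
atm <- read_excel("ATM624Data.xlsx")  # placeholder file name
atm1 <- atm %>% filter(ATM == "ATM1") %>% select(DATE, Cash)
atm2 <- atm %>% filter(ATM == "ATM2") %>% select(DATE, Cash)
atm3 <- atm %>% filter(ATM == "ATM3") %>% select(DATE, Cash)
atm4 <- atm %>% filter(ATM == "ATM4") %>% select(DATE, Cash)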
# fill missing Cash values with each ATM's median (na.rm avoids propagating NA)
atm1$Cash[is.na(atm1$Cash)] <- median(atm1$Cash, na.rm = TRUE)
atm2$Cash[is.na(atm2$Cash)] <- median(atm2$Cash, na.rm = TRUE)
atm3$Cash[is.na(atm3$Cash)] <- median(atm3$Cash, na.rm = TRUE)
atm4$Cash[is.na(atm4$Cash)] <- median(atm4$Cash, na.rm = TRUE)
# drop any remaining rows with missing values (e.g. missing dates)
atm1 <- na.omit(atm1)
atm2 <- na.omit(atm2)
atm3 <- na.omit(atm3)
atm4 <- na.omit(atm4)
ts1 <- ts(atm1$Cash, start = c(2009, as.numeric(format(as.Date(min(atm1$DATE)), "%j"))), frequency = 365)
ts2 <- ts(atm2$Cash, start = c(2009, as.numeric(format(as.Date(min(atm2$DATE)), "%j"))), frequency = 365)
ts3 <- ts(atm3$Cash, start = c(2009, as.numeric(format(as.Date(min(atm3$DATE)), "%j"))), frequency = 365)
ts4 <- ts(atm4$Cash, start = c(2009, as.numeric(format(as.Date(min(atm4$DATE)), "%j"))), frequency = 365)
The table below displays the summary for ATM1. The minimum value of 1 looks suspicious.
summary(atm1$Cash)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.00 73.00 91.00 83.89 108.00 180.00
ts1w <- ts(atm1$Cash, start = 1, frequency = 7)
ts1d <- ts(atm1$Cash, start = as.Date(min(atm1$DATE)), frequency = 365)
Based on the plot below, we do not see any clear trend or seasonality. The data look like white noise (stationary) and ndiffs() returns 0.
The p-value from the Box-Ljung test is also high, consistent with white noise. However, the ACF plot shows significant autocorrelation at the weekly lags, so the series is not simply white noise.
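For reference, the differencing check behind that statement:
ndiffs(ts1d)
## [1] 0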
ggtsdisplay(ts1d, points = FALSE, main = "Withdrawals from ATM1", xlab = "Week", ylab = "Cash in hundreds")
Box.test(ts1d, type="Ljung-Box")
##
## Box-Ljung test
##
## data: ts1d
## X-squared = 1.4791, df = 1, p-value = 0.2239
We will create a time series with frequency seven to see if we can identify any trends or seasonality.
ggtsdisplay(ts1w, points = FALSE, main = "Withdrawals from ATM1", xlab = "Week", ylab = "Cash in hundreds")
Box.test(ts1w, type="Ljung-Box")
##
## Box-Ljung test
##
## data: ts1w
## X-squared = 1.4791, df = 1, p-value = 0.2239
For the daily time series we cannot use the seasonal plot or the subseries plot because we have only a year's worth of data. Because of this, we will look at the seasonal and subseries plots using the weekly time series.
The seasonal plot shows that the fewest withdrawals happen on Wednesdays, with the other days being roughly the same. In some weeks the minimum occurs on Mondays or Fridays instead.
ggseasonplot(ts1w, year.labels=TRUE, year.labels.left=TRUE)
The seasonal subseries plot, which plots withdrawals by day of the week with the per-day mean overlaid, shows the pattern described above clearly: the fewest withdrawals happen on Wednesdays and the most around Fridays.
ggsubseriesplot(ts1w)
The seasonal component is stable over time, while the trend component drifts slightly downward.
autoplot( stl(ts1w, t.window=13, s.window="periodic", robust=T) )
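To quantify how pronounced the weekly pattern is, we can compute the seasonal-strength measure from FPP, F_s = max(0, 1 - Var(remainder) / Var(seasonal + remainder)), where values near 1 indicate strong seasonality. A small sketch using the STL components:
comp <- stl(ts1w, s.window = "periodic", robust = TRUE)$time.series
# F_s = max(0, 1 - Var(remainder) / Var(seasonal + remainder))
max(0, 1 - var(comp[, "remainder"]) / var(comp[, "seasonal"] + comp[, "remainder"]))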
We will split the data into train and test sets and use the test data to evaluate model accuracy and select a model.
# first 302 days for training, the remaining 60 for testing
# (observation 303 falls on week 44, day 2 of the weekly series)
atm1_train <- ts(atm1$Cash[1:302], frequency = 7)
atm1_test <- ts(atm1$Cash[303:362], start = c(44, 2), frequency = 7)
atm1_ets_fit <- ets(atm1_train)
summary(atm1_ets_fit)
## ETS(A,N,A)
##
## Call:
## ets(y = atm1_train)
##
## Smoothing parameters:
## alpha = 0.01
## gamma = 0.3379
##
## Initial states:
## l = 88.8583
## s = -42.917 19.2648 24.5604 -39.282 6.6179 38.9107
## -7.1548
##
## sigma: 28.274
##
## AIC AICc BIC
## 3753.946 3754.702 3791.051
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 0.2639688 27.84955 18.47705 -120.4485 138.029 0.9083038 0.1367749
autoplot(atm1_ets_fit)
atm1_ets_forecast <- forecast(atm1_ets_fit, h=60)
autoplot(atm1_ets_forecast)
checkresiduals(atm1_ets_fit)
##
## Ljung-Box test
##
## data: Residuals from ETS(A,N,A)
## Q* = 15.475, df = 5, p-value = 0.008514
##
## Model df: 9. Total lags used: 14
atm1_arima <- auto.arima(atm1_train)
summary(atm1_arima)
## Series: atm1_train
## ARIMA(0,0,1)(0,1,1)[7]
##
## Coefficients:
## ma1 sma1
## 0.1768 -0.6461
## s.e. 0.0610 0.0555
##
## sigma^2 estimated as 750.9: log likelihood=-1396.14
## AIC=2798.27 AICc=2798.35 BIC=2809.33
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 0.774482 26.9918 17.27918 -113.0299 130.8749 0.8494179 -0.01012743
atm1_arima_forecast <- forecast(atm1_arima, h = 60)
autoplot(atm1_arima_forecast)
checkresiduals(atm1_arima_forecast)
##
## Ljung-Box test
##
## data: Residuals from ARIMA(0,0,1)(0,1,1)[7]
## Q* = 8.1068, df = 12, p-value = 0.7767
##
## Model df: 2. Total lags used: 14
The ARIMA model has a lower AIC and a lower RMSE than the ETS model on both the training and test data. Because of this we will select the ARIMA model, refit it on the full series, and produce the forecast from that model.
#Train ARIMA
accuracy(atm1_arima_forecast)
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 0.774482 26.9918 17.27918 -113.0299 130.8749 0.8494179 -0.01012743
#Train ETS
accuracy(atm1_ets_forecast)
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 0.2639688 27.84955 18.47705 -120.4485 138.029 0.9083038 0.1367749
# Test ARIMA
accuracy( atm1_arima_forecast$mean, atm1[303:362,]$Cash)
## ME RMSE MAE MPE MAPE
## Test set -14.91057 28.88491 20.06694 -265.5511 270.3319
# Test ETS
accuracy( atm1_ets_forecast$mean, atm1[303:362,]$Cash)
## ME RMSE MAE MPE MAPE
## Test set -15.97052 30.19271 20.98208 -277.9664 282.6238
# refit on the full weekly series and forecast the 31 days of May 2010
atm1_arima <- auto.arima(ts1w)
atm1_arima_forecast <- forecast(atm1_arima, h = 31)
write.csv(atm1_arima_forecast$mean, "ATM1_Forecast.csv")
summary(atm2$Cash)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 25.50 67.00 62.58 93.00 147.00
ts2w <- ts(atm2$Cash, start = 1, frequency = 7)
ts2d <- ts(atm2$Cash, start = as.Date(min(atm2$DATE)), frequency = 365)
Based on the plot below, we do not see any clear trend or seasonality. The data look like white noise (stationary) and ndiffs() returns 0.
The p-value from the Box-Ljung test is also high, consistent with white noise. However, the ACF plot shows significant autocorrelation at the weekly lags, so the series is not simply white noise.
ggtsdisplay(ts2d, points = FALSE, main = "Withdrawals from ATM2", xlab = "Week", ylab = "Cash in hundreds")
Box.test(ts2d, type="Ljung-Box")
##
## Box-Ljung test
##
## data: ts2d
## X-squared = 0.056235, df = 1, p-value = 0.8125
ggtsdisplay(ts2w, points = FALSE, main = "Withdrawals from ATM2", xlab = "Week", ylab = "Cash in hundreds")
Box.test(ts2w, type="Ljung-Box")
##
## Box-Ljung test
##
## data: ts2w
## X-squared = 0.056235, df = 1, p-value = 0.8125
For the daily time series we cannot use the seasonal plot or the subseries plot because we have only a year's worth of data. Because of this, we will look at the seasonal and subseries plots using the weekly time series.
The seasonal plot shows that the fewest withdrawals happen on Wednesdays, with the other days being roughly the same; in some weeks the minimum occurs on Mondays or Fridays instead. The variation changes with the level of the series, which suggests multiplicative seasonality, so we will transform the data.
ggseasonplot(ts2w, year.labels=TRUE, year.labels.left=TRUE)
The seasonal subseries plot, which plots withdrawals by day of the week with the per-day mean overlaid, shows the pattern described above clearly: the fewest withdrawals happen on Wednesdays and the most around Fridays.
ggsubseriesplot(ts2w)
### Test and Train Data
As above, we will hold out the last 60 days of data for testing; a more robust approach would be time-series cross-validation, sketched below.
# note: subsetting a ts with [ ] returns a plain numeric vector,
# so the models below are fit without the weekly seasonal structure
atm2_train <- ts2w[1:302]
atm2_test <- ts2w[303:362]
atm2_lambda <- BoxCox.lambda(atm2_train)
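As mentioned above, time-series cross-validation is the more robust alternative to a single split; a minimal sketch using tsCV() from the forecast package with a seasonal naive benchmark:
# one-step-ahead cross-validated errors for a seasonal naive benchmark
e <- tsCV(ts2w, snaive, h = 1)
sqrt(mean(e^2, na.rm = TRUE))  # cross-validated RMSE
The same call with the forecast function swapped for an ETS or ARIMA wrapper would give comparable cross-validated errors for the candidate models.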
For the ETS model below, the residual ACF plot shows autocorrelation in the errors, so the forecasts may be inefficient, and the residuals are not normally distributed.
atm2_ets_fit <- ets(atm2_train, lambda = atm2_lambda)
summary(atm2_ets_fit)
## ETS(A,A,N)
##
## Call:
## ets(y = atm2_train, lambda = atm2_lambda)
##
## Box-Cox transformation: lambda= 0.2076
##
## Smoothing parameters:
## alpha = 2e-04
## beta = 2e-04
##
## Initial states:
## l = 6.6658
## b = -0.005
##
## sigma: 2.123
##
## AIC AICc BIC
## 2185.227 2185.430 2203.780
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 14.6463 40.56853 34.78051 -Inf Inf 0.8090366 -0.03978792
atm2_ets_forecast <- forecast(atm2_ets_fit, h=60)
autoplot(atm2_ets_forecast)
checkresiduals(atm2_ets_fit)
##
## Ljung-Box test
##
## data: Residuals from ETS(A,A,N)
## Q* = 195.18, df = 6, p-value < 2.2e-16
##
## Model df: 4. Total lags used: 10
### ARIMA
The residual ACF plot shows no autocorrelation in the errors, and the residuals are roughly normally distributed.
# note: fit on the full series ts2w; fitting on atm2_train only would avoid test-set leakage
atm2_arima <- auto.arima(ts2w, lambda = atm2_lambda)
summary(atm2_arima)
## Series: ts2w
## ARIMA(0,0,5)(0,1,1)[7]
## Box Cox transformation: lambda= 0.2076287
##
## Coefficients:
## ma1 ma2 ma3 ma4 ma5 sma1
## 0.0538 -0.1045 0.0373 0.0251 -0.2263 -0.5656
## s.e. 0.0523 0.0526 0.0532 0.0521 0.0523 0.0457
##
## sigma^2 estimated as 2.35: log likelihood=-655.81
## AIC=1325.63 AICc=1325.95 BIC=1352.75
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 2.550071 29.31029 20.09578 -Inf Inf 0.935176 -0.05814841
atm2_arima_forecast <- forecast(atm2_arima, h = 60)
autoplot(atm2_arima_forecast)
checkresiduals(atm2_arima_forecast)
##
## Ljung-Box test
##
## data: Residuals from ARIMA(0,0,5)(0,1,1)[7]
## Q* = 2.9887, df = 8, p-value = 0.9351
##
## Model df: 6. Total lags used: 14
The ARIMA model fits the training data better than the ETS model, but the ETS model produces more accurate forecasts on the test set. We will nevertheless use the ARIMA model for forecasting, since the ETS residuals are not normally distributed and their ACF shows autocorrelation.
#Train ARIMA
accuracy(atm2_arima_forecast)
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 2.550071 29.31029 20.09578 -Inf Inf 0.935176 -0.05814841
#Train ETS
accuracy(atm2_ets_forecast)
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 14.6463 40.56853 34.78051 -Inf Inf 0.8090366 -0.03978792
# Test ARIMA
accuracy( atm2_arima_forecast$mean, atm2[303:362,]$Cash)
## ME RMSE MAE MPE MAPE
## Test set -0.463026 64.76235 57.78748 -Inf Inf
# Test ETS
accuracy( atm2_ets_forecast$mean, atm2[303:362,]$Cash)
## ME RMSE MAE MPE MAPE
## Test set 20.9725 44.91564 40.35588 -Inf Inf
# refit on the full weekly series (untransformed) and forecast the 31 days of May 2010
atm2_arima <- auto.arima(ts2w)
atm2_arima_forecast <- forecast(atm2_arima, h = 31)
write.csv(atm2_arima_forecast$mean, "ATM2_Forecast.csv")
ATM3 has only three non-zero observations, so we will use the average (mean) method to forecast.
summary(atm3$Cash)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.7206 0.0000 96.0000
ts3 <- ts(atm3$Cash, start = 1, frequency = 365)
# forecast the 31 days of May 2010 with the mean (average) method
write.csv(meanf(ts3, h = 31)$mean, "ATM3_Forecast.csv")
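Since only the last three observations are non-zero (the machine appears to have only recently become active), averaging just those values is arguably more representative; a hedged alternative:
# alternative: average only the non-zero observations
mean(atm3$Cash[atm3$Cash > 0])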
From the table and plot below we see that there is an extreme outlier. We will replace it with the median before further analysis.
summary(atm4$Cash)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.563 124.334 403.839 474.043 704.507 10919.762
ts4d <- ts(atm4$Cash, start = as.Date(min(atm4$DATE)), frequency = 365)
autoplot(ts4d, main = "Withdrawals from ATM4", xlab = "Day", ylab = "Cash in hundreds")
# replace the outlier (row 285) with the median
atm4[285,]$Cash <- median(atm4$Cash)
ts4d <- ts(atm4$Cash, start = as.Date(min(atm4$DATE)), frequency = 365)
autoplot(ts4d, main = "Withdrawals from ATM4", xlab = "Day", ylab = "Cash in hundreds")
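As an alternative to hand-picking row 285, forecast::tsclean() identifies and replaces outliers (and any missing values) using an STL-based decomposition; a sketch:
# systematic outlier replacement on the daily series
atm4_clean <- tsclean(ts4d)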
ts4w <- ts(atm4$Cash, start = 1, frequency = 7)
Based on the plot below, we do not see any clear trend or seasonality. The data look like white noise (stationary) and ndiffs() returns 0.
The p-value from the Box-Ljung test is also high, consistent with white noise. However, the ACF plot shows significant autocorrelation at the weekly lags, so the series is not simply white noise.
ggtsdisplay(ts4d, points = FALSE, main = "Withdrawals from ATM4", xlab = "Week", ylab = "Cash in hundreds")
Box.test(ts4d, type="Ljung-Box")
##
## Box-Ljung test
##
## data: ts4d
## X-squared = 2.0838, df = 1, p-value = 0.1489
ggtsdisplay(ts4w, points = FALSE, main = "Withdrawals from ATM4", xlab = "Week", ylab = "Cash in hundreds")
Box.test(ts4w, type="Ljung-Box")
##
## Box-Ljung test
##
## data: ts4w
## X-squared = 2.0838, df = 1, p-value = 0.1489
For the daily time series we cannot use the seasonal plot or the subseries plot because we have only a year's worth of data. Because of this, we will look at the seasonal and subseries plots using the weekly time series.
The seasonal plot does not show anything obvious.
ggseasonplot(ts4w, year.labels=TRUE, year.labels.left=TRUE)
The seasonal subseries plot, which shows average withdrawals per day of the week, reveals a downward pattern across the week: the most withdrawals happen on Sundays, with fewer on the other days.
ggsubseriesplot(ts4w)
### Test and Train Data
We will use the last 60 days of data for testing.
# as with ATM2, subsetting the ts returns a plain numeric vector
atm4_train <- ts4w[1:302]
atm4_test <- ts4w[303:362]
atm4_ets_fit <- ets(atm4_train)
summary(atm4_ets_fit)
## ETS(A,N,N)
##
## Call:
## ets(y = atm4_train)
##
## Smoothing parameters:
## alpha = 1e-04
##
## Initial states:
## l = 449.2714
##
## sigma: 360.9104
##
## AIC AICc BIC
## 5285.275 5285.355 5296.406
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set -0.1600697 359.7133 300.548 -535.1853 568.8359 0.7703108
## ACF1
## Training set 0.08650058
autoplot(atm4_ets_fit)
atm4_ets_forecast <- forecast(atm4_ets_fit, h=60)
autoplot(atm4_ets_forecast)
checkresiduals(atm4_ets_fit)
##
## Ljung-Box test
##
## data: Residuals from ETS(A,N,N)
## Q* = 23.195, df = 8, p-value = 0.003123
##
## Model df: 2. Total lags used: 10
atm4_arima <- auto.arima(ts4w)
summary(atm4_arima)
## Series: ts4w
## ARIMA(0,0,3)(1,0,0)[7] with non-zero mean
##
## Coefficients:
## ma1 ma2 ma3 sar1 mean
## 0.0874 0.0161 0.0886 0.1838 444.6109
## s.e. 0.0532 0.0583 0.0516 0.0526 26.0984
##
## sigma^2 estimated as 119377: log likelihood=-2648.96
## AIC=5309.93 AICc=5310.16 BIC=5333.33
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set -0.2167449 343.1347 287.2529 -519.3411 552.585 0.8292132
## ACF1
## Training set -0.008268238
atm4_arima_forecast <- forecast(atm4_arima, h = 60)
autoplot(atm4_arima_forecast)
checkresiduals(atm4_arima_forecast)
##
## Ljung-Box test
##
## data: Residuals from ARIMA(0,0,3)(1,0,0)[7] with non-zero mean
## Q* = 16.218, df = 9, p-value = 0.06247
##
## Model df: 5. Total lags used: 14
The ARIMA model fits the training data slightly better than the ETS model and also has the lower RMSE on the test set, so we will use it for forecasting. (The AIC values are not directly comparable here, since the ETS model was fit on the training subset while the ARIMA model was fit on the full series.)
#Train ARIMA
accuracy(atm4_arima_forecast)
## ME RMSE MAE MPE MAPE MASE
## Training set -0.2167449 343.1347 287.2529 -519.3411 552.585 0.8292132
## ACF1
## Training set -0.008268238
#Train ETS
accuracy(atm4_ets_forecast)
## ME RMSE MAE MPE MAPE MASE
## Training set -0.1600697 359.7133 300.548 -535.1853 568.8359 0.7703108
## ACF1
## Training set 0.08650058
# Test ARIMA
accuracy( atm4_arima_forecast$mean, atm4[303:362,]$Cash)
## ME RMSE MAE MPE MAPE
## Test set -6.498257 303.3209 252.9984 -799.5391 830.9191
# Test ETS
accuracy( atm4_ets_forecast$mean, atm4[303:362,]$Cash)
## ME RMSE MAE MPE MAPE
## Test set -16.31642 304.9523 256.0181 -816.253 846.5153
# refit on the full weekly series and forecast the 31 days of May 2010
atm4_arima <- auto.arima(ts4w)
atm4_arima_forecast <- forecast(atm4_arima, h = 31)
write.csv(atm4_arima_forecast$mean, "ATM4_Forecast.csv")
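The four forecasts can also be gathered into one Excel-readable file; a sketch, assuming the refit forecasts above each hold 31 daily means for May 2010 (the combined file name is our choice):
may_dates <- seq(as.Date("2010-05-01"), as.Date("2010-05-31"), by = "day")
atm_all <- data.frame(DATE = may_dates,
                      ATM1 = as.numeric(atm1_arima_forecast$mean),
                      ATM2 = as.numeric(atm2_arima_forecast$mean),
                      ATM3 = as.numeric(meanf(ts3, h = 31)$mean),
                      ATM4 = as.numeric(atm4_arima_forecast$mean))
write.csv(atm_all, "ATM_Forecasts_May2010.csv", row.names = FALSE)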
Part B consists of a simple dataset of residential power usage for January 1998 until December 2013. Your assignment is to model these data and produce a monthly forecast for 2014. The data is given in a single file. The variable ‘KWH’ is power consumption in Kilowatt hours; the rest is straightforward. Add this to your existing files above.
The table below displays the summary statistics. There is one NA, which we will replace with the median.
The plot below shows some seasonality. The variation increases as the level of the series increases, indicating multiplicative seasonality. This time series is non-stationary; the Ljung-Box test on the differenced series has a small p-value.
summary(power$KWH)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 770523 5429912 6283324 6502475 7620524 10655730 1
# replace the single missing value with the median (na.rm avoids propagating NA)
power$KWH[is.na(power$KWH)] <- median(power$KWH, na.rm = TRUE)
power_ts <- ts(power$KWH, start = c(1998, 1), frequency = 12)
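A seasonal alternative worth noting: forecast::na.interp() imputes missing values by interpolation that respects the monthly pattern, which may suit this series better than a global median; a sketch (it only differs from the above if run before the median fill):
power_ts_interp <- na.interp(ts(power$KWH, start = c(1998, 1), frequency = 12))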
Box.test(diff(power_ts), lag=10, type="Ljung-Box")
##
## Box-Ljung test
##
## data: diff(power_ts)
## X-squared = 154.15, df = 10, p-value < 2.2e-16
autoplot(power_ts)
The seasonal plot shows a power-usage pattern tied to the seasons: usage goes up in summer and winter and down in fall and spring. The subseries plot shows the same pattern. We will use a Box-Cox transformation on the data.
ggseasonplot(power_ts, year.labels=TRUE, year.labels.left=TRUE)
ggsubseriesplot(power_ts)
The seasonal decomposition below shows a positive trend and strong seasonality as well.
autoplot( stl(power_ts, t.window=13, s.window="periodic", robust=T) )
### Test and Train Data
power_ts <- ts(power$KWH, start = c(1998, 1), end = c(2013, 12), frequency = 12)
power_train <- ts(power[1:159,]$KWH, start = c(1998, 1), end = c(2011, 3), frequency = 12)
power_test <- ts(power[160:192,]$KWH, start = c(2011, 4), end = c(2013, 12), frequency = 12)
train_lambda <- BoxCox.lambda(power_train)
The residuals are almost normally distributed, but the ACF plot shows some autocorrelations outside the threshold limits.
power_ets <- ets(power_train, lambda = train_lambda)
summary(power_ets)
## ETS(A,N,A)
##
## Call:
## ets(y = power_train, lambda = train_lambda)
##
## Box-Cox transformation: lambda= 1.4121
##
## Smoothing parameters:
## alpha = 0.0249
## gamma = 1e-04
##
## Initial states:
## l = 2775131471.0161
## s = -328808905 -978884223 -426497532 755507280 1097643981 532617828
## -79599874 -857334102 -679719862 -247046195 279161767 932959836
##
## sigma: 489147352
##
## AIC AICc BIC
## 7183.900 7187.257 7229.934
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 52092.62 782216.2 478163.8 -4.702466 12.14671 0.7597811 0.1850929
autoplot(power_ets)
power_ets_forecast <- forecast(power_ets, h = 33)
autoplot(power_ets_forecast) + autolayer(power_test, series = "Test Data")
checkresiduals(power_ets)
##
## Ljung-Box test
##
## data: Residuals from ETS(A,N,A)
## Q* = 58.029, df = 10, p-value = 8.537e-09
##
## Model df: 14. Total lags used: 24
The residuals are not normally distributed, but the ACF plot shows all autocorrelations within the threshold limits.
power_arima <- auto.arima(power_train, lambda = train_lambda)
summary(power_arima)
## Series: power_train
## ARIMA(0,0,1)(2,1,0)[12] with drift
## Box Cox transformation: lambda= 1.412108
##
## Coefficients:
## ma1 sar1 sar2 drift
## 0.0624 -0.7635 -0.5350 2048386
## s.e. 0.0871 0.0960 0.0906 1547173
##
## sigma^2 estimated as 2.087e+17: log likelihood=-3143.46
## AIC=6296.93 AICc=6297.35 BIC=6311.88
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set -22359.4 737607.3 437352.2 -5.470091 11.49913 0.6949333
## ACF1
## Training set -0.02519811
power_arima_forecast <- forecast(power_arima, h = 33)
autoplot(power_arima_forecast) + autolayer(power_test, series = "Test Data")
checkresiduals(power_arima_forecast)
##
## Ljung-Box test
##
## data: Residuals from ARIMA(0,0,1)(2,1,0)[12] with drift
## Q* = 15.925, df = 20, p-value = 0.7213
##
## Model df: 4. Total lags used: 24
For the STL-based forecast below, the residuals are not normally distributed, and the Ljung-Box test indicates some remaining autocorrelation.
power_stl <- stl(power_train, t.window=13, s.window="periodic",robust=TRUE )
summary(power_stl)
## Call:
## stl(x = power_train, s.window = "periodic", t.window = 13, robust = TRUE)
##
## Time.series components:
## seasonal trend remainder
## Min. :-1578886.7 Min. :5494380 Min. :-7065596
## 1st Qu.: -742271.0 1st Qu.:6154511 1st Qu.: -262631
## Median : -113779.7 Median :6277503 Median : -1969
## Mean : 8702.3 Mean :6282878 Mean : 8248
## 3rd Qu.: 1119505.2 3rd Qu.:6390628 3rd Qu.: 288528
## Max. : 1679560.7 Max. :7233813 Max. : 1875480
## IQR:
## STL.seasonal STL.trend STL.remainder data
## 1861776 236117 551159 2048577
## % 90.9 11.5 26.9 100.0
##
## Weights:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.8316 0.9452 0.8583 0.9876 1.0000
##
## Other components: List of 5
## $ win : Named num [1:3] 1591 13 13
## $ deg : Named int [1:3] 0 1 1
## $ jump : Named num [1:3] 160 2 2
## $ inner: int 1
## $ outer: int 15
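Note that forecast() applied to an stl object forecasts the seasonally adjusted series (with ETS by default, hence the "STL + ETS(A,N,N)" label in the residual check below) and then re-seasonalizes the result; other methods can be swapped in, for example:
# forecast the seasonally adjusted series with ARIMA instead of the default ETS
power_stl_arima_fc <- forecast(power_stl, method = "arima", h = 33)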
power_stl_forecast <- forecast(power_stl, h = 33)
autoplot(power_stl_forecast) + autolayer(power_test, series = "Test Data")
checkresiduals(power_stl_forecast)
## Warning in checkresiduals(power_stl_forecast): The fitted degrees of freedom is
## based on the model used for the seasonally adjusted data.
##
## Ljung-Box test
##
## data: Residuals from STL + ETS(A,N,N)
## Q* = 43.838, df = 22, p-value = 0.003716
##
## Model df: 2. Total lags used: 24
The results below suggest ARIMA as the best of these three methods for this dataset: it has the lowest training RMSE and the lowest test-set MAE, and its AIC is lower than the ETS AIC. We will refit the model on the whole dataset and use it to forecast.
# ARIMA accuracy (train and test)
accuracy( power_arima_forecast, power_test)
## ME RMSE MAE MPE MAPE MASE
## Training set -22359.4 737607.3 437352.2 -5.470091 11.49913 0.6949333
## Test set 966977.6 1532608.8 1079568.4 11.427728 13.08846 1.7153863
## ACF1 Theil's U
## Training set -0.02519811 NA
## Test set 0.29301787 0.8512497
# ETS accuracy (train and test)
accuracy( power_ets_forecast, power_test)
## ME RMSE MAE MPE MAPE MASE
## Training set 52092.62 782216.2 478163.8 -4.702466 12.14671 0.7597811
## Test set 1053165.52 1384412.2 1089833.8 12.803169 13.43492 1.7316976
## ACF1 Theil's U
## Training set 0.1850929 NA
## Test set 0.2113514 0.8283447
# STL accuracy (train and test)
accuracy( power_stl_forecast, power_test)
## ME RMSE MAE MPE MAPE MASE
## Training set -21115.59 796442.8 488277 -6.019777 12.48869 0.7758506
## Test set 1201430.88 1469674.7 1201431 15.006405 15.00641 1.9090204
## ACF1 Theil's U
## Training set 0.1969063 NA
## Test set 0.1730841 0.8793043
# refit on the full series and forecast the 12 months of 2014
power_arima <- auto.arima(power_ts)
power_arima_forecast <- forecast(power_arima, h = 12)
write.csv(power_arima_forecast$mean, "ResidentialCustomerForecast.csv")
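For readability, the final file can also carry month labels; a small sketch using the refit forecast above:
out <- data.frame(Month = format(seq(as.Date("2014-01-01"), by = "month", length.out = 12), "%Y-%m"),
                  KWH = as.numeric(power_arima_forecast$mean))
write.csv(out, "ResidentialCustomerForecast.csv", row.names = FALSE)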