In Part A, I want you to forecast how much cash is taken out of four different ATM machines for May 2010. The data is given in a single file. The variable 'Cash' is provided in hundreds of dollars; other than that it is straightforward. I am being somewhat ambiguous on purpose to give this a little more of a business feel. Explain and demonstrate your process, techniques used and not used, and your actual forecast. I am giving you data via an Excel file; please provide your written report on your findings, visuals, and discussion, along with your R code via an RPubs link and the actual .rmd file. Also please submit the forecast in an Excel-readable file.
library(fpp2)
library(xts)
atm_raw <- readxl::read_excel("ATM624Data.xlsx")
atm_data <- as.data.frame(atm_raw[complete.cases(atm_raw), ])
# convert Excel serial dates to Date; note that Excel's day-1 origin is 1899-12-30, so the origin used here shifts each date forward by two days
atm_data$DATE <- as.Date(atm_data$DATE, origin = "1900-01-01")
atm1 <- atm_data[atm_data$ATM=="ATM1",]
periodicity(unique(atm1$DATE))
## Daily periodicity from 2009-05-03 to 2010-05-02
## An 'xts' object on 2009-05-03/2010-05-02 containing:
## Data: num [1:362, 1] 96 82 85 90 99 88 8 104 87 93 ...
## Indexed by objects of class: [Date] TZ: UTC
## xts Attributes:
## NULL
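The xts listing above corresponds to an ATM1 series whose construction chunk is not shown in this excerpt; it was presumably built along these lines, mirroring the ATM2-ATM4 chunks that follow (the object name is assumed):
atm1_ts <- xts(atm1$Cash, order.by=atm1$DATE)
str(atm1_ts)
autoplot(atm1_ts) + ylab("Cash")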
atm2 <- atm_data[atm_data$ATM=="ATM2",]
atm2_ts <- xts(atm2$Cash, order.by=atm2$DATE)
autoplot(atm2_ts) + ylab("Cash")
atm3 <- atm_data[atm_data$ATM=="ATM3",]
atm3_ts <- xts(atm3$Cash, order.by=atm3$DATE)
autoplot(atm3_ts) + ylab("Cash")
atm4 <- atm_data[atm_data$ATM=="ATM4",]
atm4_ts <- xts(atm4$Cash, order.by=atm4$DATE)
autoplot(atm4_ts) + ylab("Cash")
The time plots for ATM1 and ATM2 look stationary with an apparent repeating weekly pattern; ATM2 may have a slight downward trend (to be confirmed). The ATM3 plot is almost entirely zeros, with only a handful of transactions at the very end of the series, so it will not be modeled directly. The ATM4 plot shows an extreme outlier about three-quarters of the way through the series.
The ACF and PACF plots for ATM1, ATM2, and ATM3 show some autocorrelation between each observation and its immediate predecessor, as well as stronger autocorrelation at other individual lags. ATM4 shows no significant autocorrelation in either plot (consistent with white noise); we will see how the modeling goes.
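The ACF and PACF plots referred to above are not reproduced by a chunk in this excerpt; one convenient way to generate them is with forecast's ggAcf()/ggPacf(), shown here for ATM1 only (the other series follow the same pattern):
# ACF and PACF for ATM1 (repeat for ATM2-ATM4)
ggAcf(as.numeric(atm1_ts))
ggPacf(as.numeric(atm1_ts))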
library(tseries)
atm1_ts <- ts(atm1$Cash, start=c(start(atm1$DATE)), frequency = 7)
plot(decompose(atm1_ts))
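The unit-root output below names adf.test() directly; the chunk itself is omitted, but it presumably consisted of adf.test() and ndiffs() calls such as these (the weekly boxplot line is an illustrative assumption):
# Augmented Dickey-Fuller test and required number of differences, per series
adf.test(atm1_ts)
ndiffs(atm1_ts)
adf.test(atm2_ts)
ndiffs(atm2_ts)
adf.test(atm3_ts)
ndiffs(atm3_ts)
adf.test(atm4_ts)
ndiffs(atm4_ts)
# one way to draw the weekly boxplots discussed below (ATM1 shown)
boxplot(as.numeric(atm1_ts) ~ cycle(atm1_ts), xlab = "Day of week", ylab = "Cash")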
## Warning in adf.test(atm1_ts): p-value smaller than printed p-value
##
## Augmented Dickey-Fuller Test
##
## data: atm1_ts
## Dickey-Fuller = -4.6043, Lag order = 7, p-value = 0.01
## alternative hypothesis: stationary
## [1] 0
## Warning in adf.test(atm2_ts): p-value smaller than printed p-value
##
## Augmented Dickey-Fuller Test
##
## data: atm2_ts
## Dickey-Fuller = -6.009, Lag order = 7, p-value = 0.01
## alternative hypothesis: stationary
## [1] 1
## Warning in adf.test(atm3_ts): p-value greater than printed p-value
##
## Augmented Dickey-Fuller Test
##
## data: atm3_ts
## Dickey-Fuller = -0.055232, Lag order = 7, p-value = 0.99
## alternative hypothesis: stationary
## [1] 0
## Warning in adf.test(atm4_ts): p-value smaller than printed p-value
##
## Augmented Dickey-Fuller Test
##
## data: atm4_ts
## Dickey-Fuller = -6.3253, Lag order = 7, p-value = 0.01
## alternative hypothesis: stationary
## [1] 0
The time series decomposition plots, weekly boxplots, and Augmented Dickey-Fuller tests (p-values < 0.05) indicate stationarity for ATM1, ATM2, and ATM4: no significant trend and a fairly random remainder in each case. The weekly boxplots show some day-of-week seasonality (roughly days 4 and 5) for ATM1 and ATM2 only. ATM3 has too few non-zero observations for the assessment to be representative. The ndiffs() function nevertheless indicates that ATM2 requires one difference to be made fully stationary.
# Training & validation sets (hold out the last 32 observations, slightly more than the one-month horizon to be forecast)
train <- 1:(length(atm1_ts) - 32)
atm1_ts_train <- ts(atm1_ts[train], frequency = 7)
atm1_ts_val <- ts(atm1_ts[-train], frequency = 7)
train <- 1:(length(atm2_ts) - 32)
atm2_ts_train <- ts(atm2_ts[train], frequency = 7)
atm2_ts_val <- ts(atm2_ts[-train], frequency = 7)
train <- 1:(length(atm4_ts) - 32)
atm4_ts_train <- ts(atm4_ts[train], frequency = 7)
atm4_ts_val <- ts(atm4_ts[-train], frequency = 7)
# Calculating the appropriate Box-Cox transformation for each series in order to stabilise the variance
lambda1 <- BoxCox.lambda(atm1_ts_train)
lambda1
## [1] 0.624176
lambda2 <- BoxCox.lambda(atm2_ts_train)
lambda2
## [1] 0.6093865
lambda4 <- BoxCox.lambda(atm4_ts_train)
lambda4
## [1] -0.1182591
# ATM1 ACF and PACF plots suggest a combination of AR and MA models with the following order: p = 2, d = 0, q = 7
fit_atm1 <- Arima(atm1_ts_train, order = c(2, 0, 7), lambda = lambda1)
fit_atm1
## Series: atm1_ts_train
## ARIMA(2,0,7) with non-zero mean
## Box Cox transformation: lambda= 0.624176
##
## Coefficients:
## ar1 ar2 ma1 ma2 ma3 ma4 ma5 ma6
## -0.0341 -0.3214 0.0825 0.1213 -0.0541 -0.1672 -0.0609 -0.0017
## s.e. 0.1078 0.1278 0.0986 0.1175 0.0492 0.0559 0.0563 0.0544
## ma7 mean
## 0.4622 23.0573
## s.e. 0.0435 0.3733
##
## sigma^2 estimated as 45.95: log likelihood=-1095.73
## AIC=2213.46 AICc=2214.29 BIC=2255.25
library(forecast)
autofit_atm1 <- auto.arima(atm1_ts_train, stepwise=F, approximation=F, lambda = lambda1, seasonal = T)
autofit_atm1
## Series: atm1_ts_train
## ARIMA(0,0,2)(0,1,1)[7]
## Box Cox transformation: lambda= 0.624176
##
## Coefficients:
## ma1 ma2 sma1
## 0.1293 -0.1233 -0.5588
## s.e. 0.0560 0.0588 0.0494
##
## sigma^2 estimated as 32.81: log likelihood=-1021.91
## AIC=2051.82 AICc=2051.95 BIC=2066.93
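The two training-set accuracy tables below compare the manual ARIMA(2,0,7) fit with the auto.arima selection; the calls are omitted from this excerpt but presumably were simply the following (the same pattern yields the ATM2 and ATM4 tables further down):
accuracy(fit_atm1)
accuracy(autofit_atm1)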
## ME RMSE MAE MPE MAPE MASE
## Training set 3.779421 30.59308 23.04854 -115.2474 138.4951 1.171837
## ACF1
## Training set 0.02334147
## ME RMSE MAE MPE MAPE MASE
## Training set 1.248273 27.08843 17.72902 -105.2917 123.9401 0.9013812
## ACF1
## Training set 0.002843221
# Auto.arima selected a seasonal model, ARIMA(0,0,2)(0,1,1)[7], which shows better AIC and accuracy metrics than the manually selected non-seasonal ARIMA(2,0,7)
# ATM2 ACF and PACF plots suggest a combination of AR and MA terms with order p = 2, q = 4, and a first difference d = 1
fit_atm2 <- Arima(atm2_ts_train, order = c(2, 1, 4), lambda = lambda2)
fit_atm2
## Series: atm2_ts_train
## ARIMA(2,1,4)
## Box Cox transformation: lambda= 0.6093865
##
## Coefficients:
## ar1 ar2 ma1 ma2 ma3 ma4
## -0.4624 -0.9684 -0.4073 0.2995 -0.8123 -0.0279
## s.e. 0.0224 0.0179 0.0590 0.0488 0.0479 0.0663
##
## sigma^2 estimated as 52.71: log likelihood=-1121.91
## AIC=2257.82 AICc=2258.17 BIC=2284.42
autofit_atm2 <- auto.arima(atm2_ts_train, stepwise=F, approximation=F, lambda = lambda2, seasonal = T)
autofit_atm2
## Series: atm2_ts_train
## ARIMA(2,0,2)(0,1,1)[7]
## Box Cox transformation: lambda= 0.6093865
##
## Coefficients:
## ar1 ar2 ma1 ma2 sma1
## -0.4195 -0.8998 0.4186 0.7726 -0.6378
## s.e. 0.0643 0.0555 0.0983 0.0750 0.0503
##
## sigma^2 estimated as 35.21: log likelihood=-1035.63
## AIC=2083.26 AICc=2083.52 BIC=2105.94
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 0.9871435 31.79138 26.09898 -Inf Inf 1.178218 -0.06817714
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 0.3391646 27.36514 18.70675 -Inf Inf 0.8445014 0.004523928
# Auto.arima selected a seasonal model, ARIMA(2,0,2)(0,1,1)[7], which shows better AIC and accuracy metrics than the manually selected non-seasonal ARIMA(2,1,4)
# ATM4 ACF and PACF plots show no significant autocorrelation; an ARIMA model will still be fitted, with order p = 0, d = 0, q = 0 (i.e. a mean-only model)
fit_atm4 <- Arima(atm4_ts_train, order = c(0, 0, 0), lambda = lambda4)
fit_atm4
## Series: atm4_ts_train
## ARIMA(0,0,0) with non-zero mean
## Box Cox transformation: lambda= -0.1182591
##
## Coefficients:
## mean
## 4.0120
## s.e. 0.0428
##
## sigma^2 estimated as 0.6128: log likelihood=-390.47
## AIC=784.94 AICc=784.97 BIC=792.55
autofit_atm4 <- auto.arima(atm4_ts_train, stepwise=F, approximation=F, lambda = lambda4)
autofit_atm4
## Series: atm4_ts_train
## ARIMA(0,0,0)(2,0,0)[7] with non-zero mean
## Box Cox transformation: lambda= -0.1182591
##
## Coefficients:
## sar1 sar2 mean
## 0.2226 0.1626 4.0127
## s.e. 0.0543 0.0548 0.0651
##
## sigma^2 estimated as 0.5583: log likelihood=-374.41
## AIC=756.82 AICc=756.94 BIC=772.05
## ME RMSE MAE MPE MAPE MASE
## Training set 252.0324 719.9623 354.0198 -221.7176 295.9419 0.8534868
## ACF1
## Training set -0.01069726
## ME RMSE MAE MPE MAPE MASE
## Training set 229.2873 708.0909 329.934 -220.6308 287.8673 0.7954198
## ACF1
## Training set -0.008948548
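The Ljung-Box results below (df = 7 implies lag = 7) were presumably obtained with Box.test() on each model's residuals:
Box.test(residuals(autofit_atm1), lag = 7, type = "Ljung-Box")
Box.test(residuals(autofit_atm2), lag = 7, type = "Ljung-Box")
Box.test(residuals(autofit_atm4), lag = 7, type = "Ljung-Box")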
##
## Box-Ljung test
##
## data: residuals(autofit_atm1)
## X-squared = 2.4967, df = 7, p-value = 0.9273
##
## Box-Ljung test
##
## data: residuals(autofit_atm2)
## X-squared = 15.258, df = 7, p-value = 0.03283
##
## Box-Ljung test
##
## data: residuals(autofit_atm4)
## X-squared = 7.5965, df = 7, p-value = 0.3695
The model residuals for ATM1 and ATM4 approximate white noise very well in the ACF and PACF plots, with only a couple of lags falling outside the significance bounds. The ATM2 residuals are still a reasonable white-noise approximation, although with slightly higher correlations. The Ljung-Box null hypothesis (the model does not show lack of fit) is not rejected for ATM1 and ATM4: their residual autocorrelations are not significantly different from zero at the 0.05 level. For the ATM2 model the null hypothesis is rejected (p = 0.033), indicating some remaining dependence in the residuals.
fc1 <- forecast(autofit_atm1, h=32)
plot(fc1, ylab="Cash ATM1")
lines(lag(atm1_ts_val, -length(atm1_ts_train)), col="green")
fc2 <- forecast(autofit_atm2, h=32)
plot(fc2, ylab="Cash ATM2")
lines(lag(atm2_ts_val, -length(atm2_ts_train)), col="green")
fc4 <- forecast(autofit_atm4, h=32)
plot(fc4, ylab="Cash ATM4")
lines(lag(atm4_ts_val, -length(atm4_ts_train)), col="green")
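The accuracy tables below evaluate each forecast against its 32-point validation set; the calls are not shown, but presumably resembled:
accuracy(fc1, atm1_ts_val)
accuracy(fc2, atm2_ts_val)
accuracy(fc4, atm4_ts_val)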
## ME RMSE MAE MPE MAPE MASE
## Training set 1.248273 27.08843 17.72902 -105.29167 123.94013 0.4706947
## Test set 25.845976 25.84598 25.84598 80.76868 80.76868 0.6861948
## ACF1
## Training set 0.002843221
## Test set NA
## ME RMSE MAE MPE MAPE MASE
## Training set 0.3391646 27.36514 18.70675 -Inf Inf 0.4312117
## Test set 29.1493537 29.14935 29.14935 91.09173 91.09173 0.6719256
## ACF1
## Training set 0.004523928
## Test set NA
## ME RMSE MAE MPE MAPE MASE
## Training set 229.28732 708.09089 329.93404 -220.6308 287.8673 0.72927718
## Test set -41.44728 41.44728 41.44728 -129.5227 129.5227 0.09161393
## ACF1
## Training set -0.008948548
## Test set NA
# Forecast for May 2010
fc1_may2010 <- forecast(Arima(atm1_ts, model=autofit_atm1), h=30)
plot(fc1_may2010)
# ATM3 will be forecast using the ATM1 model: ATM3 has too few non-zero observations to model directly, and among the other series ATM1 has the mean closest to ATM3's non-zero observations
c(mean(atm1_ts), mean(atm2_ts), mean(atm3_ts[atm3_ts!=0]), mean(atm4_ts))
## [1] 83.88674 62.57851 87.66667 474.04334
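Only the ATM1 refit and its plot are shown above. A minimal sketch of how the remaining May 2010 forecasts and the required Excel file could be produced (the atm2_full/atm4_full objects, the 30-day horizon, the output file name, and the use of the writexl package are assumptions; ATM3 reuses the ATM1 point forecasts as noted above):
# Sketch: refit the selected models on the full series, forecast May 2010,
# and write the point forecasts (in hundreds of dollars) to an Excel-readable file
atm2_full <- ts(atm2$Cash, frequency = 7)
atm4_full <- ts(atm4$Cash, frequency = 7)
fc2_may2010 <- forecast(Arima(atm2_full, model = autofit_atm2), h = 30)
fc4_may2010 <- forecast(Arima(atm4_full, model = autofit_atm4), h = 30)
may2010 <- data.frame(
  ATM1 = as.numeric(fc1_may2010$mean),
  ATM2 = as.numeric(fc2_may2010$mean),
  ATM3 = as.numeric(fc1_may2010$mean),  # ATM3 proxied by the ATM1 forecast
  ATM4 = as.numeric(fc4_may2010$mean)
)
writexl::write_xlsx(may2010, "ATM624_Forecasts_May2010.xlsx")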
Part B consists of a simple dataset of residential power usage for January 1998 until December 2013. Your assignment is to model these data and provide a monthly forecast for 2014. The data is given in a single file. The variable 'KWH' is power consumption in kilowatt hours; the rest is straightforward. Add this to your existing files above.
library(fpp2)
library(xts)
pu_raw <- readxl::read_excel("ResidentialCustomerForecastLoad-624.xlsx")
pu_data <- as.data.frame(pu_raw[complete.cases(pu_raw), ])
colnames(pu_data) <- c("Case", "YearMo", "KWH")
pu_data$YearMo <- as.Date(paste0(pu_data$YearMo,"-01"), format = "%Y-%b-%d")
periodicity(unique(pu_data$YearMo))
## Monthly periodicity from 1998-01-01 to 2013-12-01
## An 'xts' object on 1998-01-01/2013-12-01 containing:
## Data: num [1:191, 1] 6862583 5838198 5420658 5010364 4665377 ...
## Indexed by objects of class: [Date] TZ: UTC
## xts Attributes:
## NULL
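As with the ATM series, the xts listing above comes from a chunk that is not shown; it was presumably built along these lines (the object name is assumed):
pu_xts <- xts(pu_data$KWH, order.by = pu_data$YearMo)
str(pu_xts)
autoplot(pu_xts) + ylab("KWH")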
The time plot looks stationary with a slight upward trend towards the end of the series (to be confirmed). The plot also shows an extreme outlier about three-quarters of the way through the series.
The ACF and PACF plots show some autocorrelation between each observation and its immediate predecessor, as well as stronger autocorrelation at other individual lags, primarily the first two.
library(tseries)
pu_ts <- ts(pu_data$KWH, start=c(start(pu_data$YearMo)), frequency = 12)
plot(decompose(pu_ts))
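The unit-root output below was presumably produced the same way as in Part A:
adf.test(pu_ts)
ndiffs(pu_ts)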
## Warning in adf.test(pu_ts): p-value smaller than printed p-value
##
## Augmented Dickey-Fuller Test
##
## data: pu_ts
## Dickey-Fuller = -4.8347, Lag order = 5, p-value = 0.01
## alternative hypothesis: stationary
## [1] 1
The time series decomposition plots, monthly boxplots, and Augmented Dickey-Fuller test (p-value < 0.05) suggest stationarity for the series, even though an upward trend appears evident. The monthly boxplots show some seasonality around months 7 and 8, which makes sense since power consumption rises during the summer (and winter) months. The ndiffs() function indicates that the series requires one difference to be made fully stationary.
# Training & validation sets (hold out the last 6 months for validation)
train <- 1:(length(pu_ts) - 6)
pu_ts_train <- ts(pu_ts[train], frequency = 12)
pu_ts_val <- ts(pu_ts[-train], frequency = 12)
# Calculating the appropriate Box-Cox transformation in order to stabilise the variance in the series
lambda <- BoxCox.lambda(pu_ts_train)
lambda
## [1] -0.05138027
# Using Auto.Arima to determine p,d,q orders
library(forecast)
autofit_pu <- auto.arima(pu_ts_train, stepwise=F, approximation=F, lambda = lambda, seasonal = T)
autofit_pu
## Series: pu_ts_train
## ARIMA(0,0,1)(2,0,2)[12] with non-zero mean
## Box Cox transformation: lambda= -0.05138027
##
## Coefficients:
## ma1 sar1 sar2 sma1 sma2 mean
## 0.1183 1.6304 -0.6840 -1.5532 0.7285 10.7604
## s.e. 0.0748 0.3530 0.3286 0.3356 0.2333 0.0212
##
## sigma^2 estimated as 0.009882: log likelihood=161.94
## AIC=-309.88 AICc=-309.25 BIC=-287.34
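The training-set accuracy table and the Ljung-Box result below (df = 12 implies lag = 12) presumably came from:
accuracy(autofit_pu)
Box.test(residuals(autofit_pu), lag = 12, type = "Ljung-Box")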
## ME RMSE MAE MPE MAPE MASE
## Training set 181458.1 1058843 730021 -2.935441 14.43076 0.9823372
## ACF1
## Training set 0.2184049
##
## Box-Ljung test
##
## data: residuals(autofit_pu)
## X-squared = 2.2548, df = 12, p-value = 0.9989
The model residuals approximate white noise very well in the ACF and PACF plots. The Ljung-Box null hypothesis (the model does not show lack of fit) is not rejected: the residual autocorrelations are not significantly different from zero at the 0.05 significance level.
fc_pu <- forecast(autofit_pu, h=6)
plot(fc_pu, ylab="KWH")
lines(lag(pu_ts_val, -length(pu_ts_train)), col="purple")
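The accuracy table below evaluates the forecast against the six-month validation set; the call is not shown, but presumably resembled:
accuracy(fc_pu, pu_ts_val)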
## ME RMSE MAE MPE MAPE
## Training set 181458.1 1058843 730021 -2.935441e+00 1.443076e+01
## Test set -4770652.6 4770653 4770653 -7.951088e+07 7.951088e+07
## MASE ACF1
## Training set 0.6102161 0.2184049
## Test set 3.9877330 NA
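The assignment also asks for a monthly forecast for 2014. A minimal sketch of how it could be produced from the selected model and exported (the refit on the full series, the 12-month horizon, the month labels, and the writexl-based export are assumptions):
# Sketch: refit the chosen model on the full series and forecast the 12 months of 2014
fc_pu_2014 <- forecast(Arima(pu_ts, model = autofit_pu), h = 12)
plot(fc_pu_2014, ylab = "KWH")
writexl::write_xlsx(
  data.frame(Month = month.name, KWH_Forecast = as.numeric(fc_pu_2014$mean)),
  "ResidentialPower_Forecast_2014.xlsx"
)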