library(fpp3)
library(tidyverse)
library(readxl)
library(kableExtra)
library(openxlsx)
library(forecast)atm_data <- read_csv("ATM624Data.csv")
head(atm_data)## # A tibble: 6 × 3
## DATE ATM Cash
## <chr> <chr> <dbl>
## 1 5/1/2009 12:00:00 AM ATM1 96
## 2 5/1/2009 12:00:00 AM ATM2 107
## 3 5/2/2009 12:00:00 AM ATM1 82
## 4 5/2/2009 12:00:00 AM ATM2 89
## 5 5/3/2009 12:00:00 AM ATM1 85
## 6 5/3/2009 12:00:00 AM ATM2 90
# Change DATE column to a proper date format
atm_data$DATE <- as.Date(mdy_hms(atm_data$DATE, tz="EST"))
# Change column names
names(atm_data) <- c("date","ATM","amount")
head(atm_data)## # A tibble: 6 × 3
## date ATM amount
## <date> <chr> <dbl>
## 1 2009-05-01 ATM1 96
## 2 2009-05-01 ATM2 107
## 3 2009-05-02 ATM1 82
## 4 2009-05-02 ATM2 89
## 5 2009-05-03 ATM1 85
## 6 2009-05-03 ATM2 90
missing <- atm_data %>% filter(is.na(date)) %>% nrow() %>% tibble(date = .)
missing <- atm_data %>% filter(is.na(ATM)) %>% nrow() %>%
tibble(ATM = .) %>% cbind(missing,.)
missing <- atm_data %>% filter(is.na(amount)) %>% nrow() %>%
tibble(amount = .) %>% cbind(missing,.)
rownames(missing) <- "NA Values"
kbl(missing) %>% kable_paper("striped", full_width = F, position="left") %>%
add_header_above(c("Column Name" = 4))|
Column Name
|
|||
|---|---|---|---|
| date | ATM | amount | |
| NA Values | 0 | 14 | 19 |
There are 19 records where no amount is recorded and 14 records where no machine is recorded
atm_data %>% filter(is.na(amount) | is.na(ATM))## # A tibble: 19 × 3
## date ATM amount
## <date> <chr> <dbl>
## 1 2009-06-13 ATM1 NA
## 2 2009-06-16 ATM1 NA
## 3 2009-06-18 ATM2 NA
## 4 2009-06-22 ATM1 NA
## 5 2009-06-24 ATM2 NA
## 6 2010-05-01 <NA> NA
## 7 2010-05-02 <NA> NA
## 8 2010-05-03 <NA> NA
## 9 2010-05-04 <NA> NA
## 10 2010-05-05 <NA> NA
## 11 2010-05-06 <NA> NA
## 12 2010-05-07 <NA> NA
## 13 2010-05-08 <NA> NA
## 14 2010-05-09 <NA> NA
## 15 2010-05-10 <NA> NA
## 16 2010-05-11 <NA> NA
## 17 2010-05-12 <NA> NA
## 18 2010-05-13 <NA> NA
## 19 2010-05-14 <NA> NA
Inspecting the output, we see that some of the records that are missing amount are also missing machine. to forecast, we’ll remove those from the data set.
# Keep only where both amount and machine are not NA
atm_data <- atm_data %>% filter(!(is.na(amount) & is.na(ATM)))
# Re-check
atm_data %>% filter(is.na(amount))## # A tibble: 5 × 3
## date ATM amount
## <date> <chr> <dbl>
## 1 2009-06-13 ATM1 NA
## 2 2009-06-16 ATM1 NA
## 3 2009-06-18 ATM2 NA
## 4 2009-06-22 ATM1 NA
## 5 2009-06-24 ATM2 NA
atm_data$weekday <- atm_data$date %>% wday()# Function to return the value of the amount from 7 days previous for the
# same machine.
lastWeekday <- function(d, m){
d <- d-7
i <- atm_data %>% filter(date == d, ATM == m) %>%
select(amount) %>% as.double()
return(i)
}
lastWeekday <- Vectorize(lastWeekday)
atm_data[is.na(atm_data$amount),"amount"] <- atm_data %>% filter(is.na(amount)) %>%
mutate(imputed = lastWeekday(date, ATM)) %>% select(imputed)we then search for outliers. We’ll start by plotting the data:
atm_data %>% ggplot(aes(x=date,y=amount,color=ATM)) + geom_line() +
facet_wrap(ATM ~ ., scales = "free") +
theme_light() +
scale_x_date(date_breaks = "3 months", date_labels = "%b %y") +
scale_color_discrete(name="Machine") +
labs(title="Cash Withdrawals by ATM", y="Amount (100's of $)",
x="Date")Two issues can be seen in the graphs above. The first is that ATM3 only has a few non-zero data points. Forecasting with those data will be nearly impossible to do accurately.
The second issue is that ATM4 appears to have a single, large outlier at date 2010-02-09. Let’s look more closely at that.
days <- c("Sunday","Monday","Tuesday","Wednesday","Thursday",
"Friday","Saturday")
atm_data %>% filter(ATM == "ATM4") %>% ggplot(aes(x=date, y=amount)) +
geom_line() + theme_light() +
theme(axis.text.x = element_blank()) +
facet_wrap(ordered(weekday,labels=days) ~ .,scales="free_y") +
labs(title="ATM4", subtitle="By Day of Week", x=NULL, y= NULL)We can see that the suspected outlier was recorded on a Tuesday and is well above the other values recorded on a Tuesday (or any other weekday for that matter).
If we look at the data points surrounding it, that outlier value does not look like a part of a localized trend (say, in advance of Valentine’s Day):
atm_data %>% filter(date <= as.Date('2010-03-09'),
date >= as.Date('2010-01-09'),
ATM=="ATM4") %>%
ggplot(aes(x=date,y=amount)) + geom_line() +
theme_light() +
scale_x_date(date_breaks = "1 week", date_labels = "%b %d") +
labs(title="Cash Withdrawals at ATM4",
subtitle="01/09/10 - 03/09/10",
y="Amount (100's of $)",
x="Date")So, we assume this is some sort of data entry issue and will replace this value with the mean of the Tuesday amounts for ATM4.
# Mean value withdrawn on Tuesdays at ATM4
replacement <- atm_data %>% filter(ATM=="ATM4",weekday==3) %>% summarize(mean(amount))
# Replace outlier value
atm_data[which(atm_data$date==as.Date("2010-02-09") & atm_data$ATM=="ATM4"),"amount"] <- replacementNow that we have the data more or less how we want it in terms of data types, we will split each machine into it’s own data set and convert them into time series.
atm1 <- atm_data %>% filter(ATM=="ATM1") %>% select(amount) %>%
ts(frequency = 7, start=c(strftime(as.Date("2009-05-01"),'%w'),
strftime(as.Date("2009-05-01"),'%W')))
atm2 <- atm_data %>% filter(ATM=="ATM2") %>% select(amount) %>%
ts(frequency = 7, start=c(strftime(as.Date("2009-05-01"),'%w'),
strftime(as.Date("2009-05-01"),'%W')))
atm3 <- atm_data %>% filter(ATM=="ATM3") %>% select(amount) %>%
ts(frequency = 7, start=c(strftime(as.Date("2009-05-01"),'%w'),
strftime(as.Date("2009-05-01"),'%W')))
atm4 <- atm_data %>% filter(ATM=="ATM4") %>% select(amount) %>%
ts(frequency = 7, start=c(strftime(as.Date("2009-05-01"),'%w'),
strftime(as.Date("2009-05-01"),'%W')))First, let’s plot the series for each ATM:
atm1 %>% autoplot() + theme_light() +
labs(title="Withdrawals from ATM1", y="Amount (100's of $)",
x="Week")atm2 %>% autoplot() + theme_light() +
labs(title="Withdrawals from ATM2", y="Amount (100's of $)",
x="Week")atm3 %>% autoplot() + theme_light() +
labs(title="Withdrawals from ATM3", y="Amount (100's of $)",
x="Week")atm4 %>% autoplot() + theme_light() +
labs(title="Withdrawals from ATM4", y="Amount (100's of $)",
x="Week")TM1 appears to have some weekly seasonality.
ATM2 also appears to have a similar weekly seasonality.
ATM3, as previously discussed, does not have enough data to really forecast well.
ATM4, unlike the other machines’ data, appears to be rather random - like white noise.
Now we’ll look at models for each ATM, starting with ATM1.
ATM1 First we’ll try to stabilize the series with a Box Cox transform:
l <- BoxCox.lambda(atm1)
atm1.trans <- BoxCox(atm1,lambda=l)
autoplot(cbind(original = atm1, transformed=atm1.trans), facets=T) +
theme_light() +
labs(title="Withdrawals from ATM1", y="Amount (100's of $)",
x="Week")Looking at the plots above, the Box Cox transform seems to stabilize some of the variance in the data using λ= 0.25.
Next we’ll split the data sets into a training set and a testing set, holding out 60 days for testing and the rest for training the models:
atm1.train <- subset(atm1.trans,end=length(atm1.trans)-60)
atm1.test <- subset(atm1.trans, start=length(atm1.trans)-59)Next, we’ll look at various models to see how they perform on the training data set.
We’ll begin using a seasonal naive forecast:
atm1.snaive <- snaive(atm1.train,h=31, biasadj=T)
accuracy(atm1.snaive)## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 0.004699424 1.632284 0.8453411 -Inf Inf 1 0.1045403
There appears to be some numerical instability in some of the error rates, but the RMSE and MASE are interpretable. The MASE shows us that this forecast is equal to a naive forecast (which this is a seasonal version of). Let’s look at the plot:
autoplot(atm1.snaive, PI=F) + theme_light() +
labs(title="Withdrawals from ATM1",
subtitle = "Seasonal Naive Model",
y="Amount (100's of $)",
x="Week")As expected, the snaive() model is really just adding in the seasonal component to the last value.
Next, we’ll try the Holt-Winters method using additive seasonality:**
# Use Holt-Winters method
atm1.hw <- hw(atm1.train, seasonal = "additive", h=31, biasadj = T)
accuracy(atm1.hw)## ME RMSE MAE MPE MAPE MASE ACF1
## Training set -0.000598947 1.308741 0.6893755 -Inf Inf 0.8154998 0.1040176
Our RMSE and MASE accuracy measures both decreased from the seasonal naive method above.
autoplot(atm1.hw, PI=F, series = "Holt-Winters") + theme_light() +
labs(title="Withdrawals from ATM1",
subtitle = "Holt-Winters Seasonal Model",
y="Amount (100's of $)",
x="Week")The predictions from the Holt-Winters are flatter than the Seasonal Naive forecast, and don’t seem to take into account some of the positive spikes seen in the past data. This may be ok, though, as it may generalize better on unseen data.
The error measures RMSE and MASE are an improvement over the Seasonal Naive model, though.
Finally, we’ll try a seasonal ARIMA model:
atm1.arima <- auto.arima(atm1.train)
atm1.arima## Series: atm1.train
## ARIMA(0,0,2)(0,1,1)[7]
##
## Coefficients:
## ma1 ma2 sma1
## 0.1282 -0.0945 -0.8596
## s.e. 0.0579 0.0570 0.0493
##
## sigma^2 = 1.726: log likelihood = -507.35
## AIC=1022.7 AICc=1022.84 BIC=1037.49
The auto.arima() function chose an ARIMA(0,0,2)(0,1,1)[7] model. This makes sense, as it there should be little correlation with previous withdrawal amounts, yet some change in seasonality variance over time.
accuracy(atm1.arima)## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 0.02862017 1.291929 0.6996114 -Inf Inf 0.8276084 -0.001612681
The RMSE is only slightly lower with this ARIMA model and the MASE is actually a bit worse, when compared to the Holt-Winters model.
atm1.pred <- forecast(atm1.arima,h=31)
autoplot(atm1.pred, PI=F) + theme_light() +
labs(title="Withdrawals from ATM1",
subtitle = "ARIMA(0,0,1)(0,1,2)[7] Model",
y="Amount (100's of $)",
x="Week")The ARIMA model predictions strongly resemble the Holt-Winters model above. Let’s look at the residuals:
checkresiduals(atm1.arima)##
## Ljung-Box test
##
## data: Residuals from ARIMA(0,0,2)(0,1,1)[7]
## Q* = 7.7234, df = 11, p-value = 0.7378
##
## Model df: 3. Total lags used: 14
The residuals are not really normal, though the Ljung-Box test confirms there is a lack of significant correlation.
Now, we will evaluate the accuracy measures with the test set:
# Check test set accuracy measures
testMeasures <- cbind(method = "Seasonal Naive",
data.frame(
as.list(accuracy(atm1.snaive, atm1.test)["Test set",])
))
testMeasures <- rbind(testMeasures,
cbind(method = "Holt-Winters",
data.frame(
as.list(accuracy(atm1.hw, atm1.test)["Test set",])
))
)
atm1.pred <- Arima(atm1.test,model=atm1.arima,h=31, biasadj=T)
testMeasures <- rbind(testMeasures,
cbind(method = "ARIMA",
data.frame(
as.list(accuracy(atm1.pred)["Training set",])
), Theil.s.U = NA))
testMeasures %>% select(method, RMSE, MASE) %>%
kable() %>% kable_styling(bootstrap_options = "striped", full_width = F)| method | RMSE | MASE |
|---|---|---|
| Seasonal Naive | 1.2043282 | 1.1263666 |
| Holt-Winters | 2.8380034 | 2.3113934 |
| ARIMA | 0.5211845 | 0.7523489 |
The ARIMA model seems to generalize very well, as it has the best error metrics using the testing data set. So we will use that model for our predicted values for May 2010:
atm1.forecast <- forecast(atm1, model = atm1.arima, h=31, biasadj = T)
atm1.forecast %>%
kable() %>% kable_styling(bootstrap_options = "striped") %>%
scroll_box(width = "100%", height = "200px")| Point Forecast | Lo 80 | Hi 80 | Lo 95 | Hi 95 | |
|---|---|---|---|---|---|
| 59.42857 | 88.54968 | 86.86618 | 90.23319 | 85.97499 | 91.12438 |
| 59.57143 | 104.26506 | 102.56778 | 105.96234 | 101.66929 | 106.86083 |
| 59.71429 | 74.26779 | 72.56307 | 75.97250 | 71.66064 | 76.87493 |
| 59.85714 | 27.14883 | 25.44411 | 28.85355 | 24.54169 | 29.75597 |
| 60.00000 | 96.76130 | 95.05659 | 98.46602 | 94.15416 | 99.36844 |
| 60.14286 | 73.60436 | 71.89964 | 75.30908 | 70.99722 | 76.21150 |
| 60.28571 | 91.53222 | 89.82750 | 93.23693 | 88.92507 | 94.13936 |
| 60.42857 | 90.03139 | 88.31036 | 91.75242 | 87.39930 | 92.66348 |
| 60.57143 | 103.59256 | 101.87126 | 105.31385 | 100.96006 | 106.22505 |
| 60.71429 | 74.26779 | 72.54634 | 75.98923 | 71.63507 | 76.90050 |
| 60.85714 | 27.14883 | 25.42739 | 28.87027 | 24.51611 | 29.78155 |
| 61.00000 | 96.76130 | 95.03986 | 98.48275 | 94.12858 | 99.39402 |
| 61.14286 | 73.60436 | 71.88292 | 75.32580 | 70.97164 | 76.23708 |
| 61.28571 | 91.53222 | 89.81077 | 93.25366 | 88.89950 | 94.16493 |
| 61.42857 | 90.03139 | 88.29379 | 91.76899 | 87.37397 | 92.68882 |
| 61.57143 | 103.59256 | 101.85469 | 105.33042 | 100.93472 | 106.25039 |
| 61.71429 | 74.26779 | 72.52978 | 76.00579 | 71.60974 | 76.92584 |
| 61.85714 | 27.14883 | 25.41082 | 28.88684 | 24.49078 | 29.80688 |
| 62.00000 | 96.76130 | 95.02330 | 98.49931 | 94.10325 | 99.41935 |
| 62.14286 | 73.60436 | 71.86635 | 75.34236 | 70.94631 | 76.26241 |
| 62.28571 | 91.53222 | 89.79421 | 93.27022 | 88.87417 | 94.19027 |
| 62.42857 | 90.03139 | 88.27738 | 91.78540 | 87.34887 | 92.71392 |
| 62.57143 | 103.59256 | 101.83828 | 105.34683 | 100.90963 | 106.27548 |
| 62.71429 | 74.26779 | 72.51337 | 76.02220 | 71.58464 | 76.95093 |
| 62.85714 | 27.14883 | 25.39442 | 28.90324 | 24.46569 | 29.83197 |
| 63.00000 | 96.76130 | 95.00689 | 98.51572 | 94.07816 | 99.44445 |
| 63.14286 | 73.60436 | 71.84995 | 75.35877 | 70.92122 | 76.28750 |
| 63.28571 | 91.53222 | 89.77780 | 93.28663 | 88.84907 | 94.21536 |
| 63.42857 | 90.03139 | 88.26113 | 91.80166 | 87.32400 | 92.73878 |
| 63.57143 | 103.59256 | 101.82203 | 105.36308 | 100.88477 | 106.30034 |
| 63.71429 | 74.26779 | 72.49712 | 76.03845 | 71.55978 | 76.97579 |
# Output point forecasts
output <- cbind(date = seq(from=as.Date("2010-05-01"),
to=as.Date("2010-05-31"),by=1),
point_forecast = as.data.frame(atm1.forecast$mean))
write.xlsx(output,"forecast_ATM1.xlsx")
autoplot(atm1.forecast, PI=T) +
theme_light() +
labs(title="Withdrawals from ATM1",
subtitle = "ARIMA(0,0,1)(0,1,2)[7] Forecasts",
y="Amount (100's of $)",
x="Week")ATM2 As we did for ATM1 above, we’ll try to stabilize the series with a Box Cox transform:
l <- BoxCox.lambda(atm2)## Warning in guerrero(x, lower, upper): Guerrero's method for selecting a Box-Cox
## parameter (lambda) is given for strictly positive data.
atm2.trans <- BoxCox(atm2,lambda=l)
autoplot(cbind(original = atm2, transformed=atm2.trans), facets=T) +
theme_light() +
labs(title="Withdrawals from ATM2", y="Amount (100's of $)",
x="Week")atm2.train <- subset(atm2.trans,end=length(atm2.trans)-60)
atm2.test <- subset(atm2.trans, start=length(atm2.trans)-59)We’ll again begin using a seasonal naive forecast:
atm2.snaive <- snaive(atm2.train,h=31, biasadj=T)
accuracy(atm2.snaive)## ME RMSE MAE MPE MAPE MASE ACF1
## Training set -0.05817628 8.21813 5.634022 -Inf Inf 1 0.09936693
autoplot(atm2.snaive, PI = F) +
theme_light() +
labs(title="Withdrawals from ATM2",
subtitle = "Seasonal Naive Model",
y="Amount (100's of $)",
x="Week")Visually, there appears to be a bit of a downward trend in the data, but this Seasonal Naive model doesn’t take that into account. This might not be a good model, so we’ll keep looking.
We’ll try Holt-Winters as our second model:
# Use Holt-Winters method
atm2.hw <- hw(atm2.train, seasonal = "additive", h=31, biasadj = T)
accuracy(atm2.hw)## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 0.02198949 6.697468 4.46445 -Inf Inf 0.7924091 0.02699244
Our chosen error metrics: RMSE and MASE, both decrease compared to the Seasonal Naive model above.
autoplot(atm2.hw, PI = F) +
theme_light() +
labs(title="Withdrawals from ATM2",
subtitle = "Holt-Winters Seasonal Model",
y="Amount (100's of $)",
x="Week")We see that this model does use some of the downward trend we saw, though not much.
Finally, we’ll try ARIMA on these data:
atm2.arima <- auto.arima(atm2.train)
atm2.arima## Series: atm2.train
## ARIMA(0,0,0)(0,1,2)[7]
##
## Coefficients:
## sma1 sma2
## -0.7271 -0.0828
## s.e. 0.0601 0.0594
##
## sigma^2 = 46.04: log likelihood = -995.94
## AIC=1997.88 AICc=1997.97 BIC=2008.98
Interestingly, the auto.arima() function chose a different model than it did for the ATM1 data. Let’s check the accuracy measures:
accuracy(atm2.arima)## ME RMSE MAE MPE MAPE MASE ACF1
## Training set -0.4826825 6.684235 4.445472 -Inf Inf 0.7890405 0.04564369
The ARIMA model has very nearly the same RMSE and MASE that the Holt-Winter model did.
atm2.pred <- forecast(atm2.arima,h=31)
autoplot(atm2.pred, PI=T) + theme_light() +
labs(title="Withdrawals from ATM2",
subtitle = "ARIMA(0,0,0)(0,1,2)[7] Model",
y="Amount (100's of $)",
x="Week")We’ll check the accuracy over the testing data set to see how well the models generalize:
# Check test set accuracy measures
testMeasures <- cbind(method = "Seasonal Naive",
data.frame(
as.list(accuracy(atm2.snaive, atm1.test)["Test set",])
))
testMeasures <- rbind(testMeasures,
cbind(method = "Holt-Winters",
data.frame(
as.list(accuracy(atm2.hw, atm1.test)["Test set",])
))
)
atm2.pred <- Arima(atm2.test,model=atm2.arima,h=31, biasadj=T)
testMeasures <- rbind(testMeasures,
cbind(method = "ARIMA",
data.frame(
as.list(accuracy(atm2.pred)["Training set",])
), Theil.s.U = NA))
testMeasures %>% select(method, RMSE, MASE) %>%
kable() %>% kable_styling(bootstrap_options = "striped", full_width = F)| method | RMSE | MASE |
|---|---|---|
| Seasonal Naive | 14.703665 | 2.0390192 |
| Holt-Winters | 12.178901 | 1.8777154 |
| ARIMA | 4.629508 | 0.7121923 |
As before, ARIMA does better with data it has not seen. So, we’ll use this model to make our forecasts:
atm2.forecast <- forecast(atm2, model = atm2.arima, h=31, biasadj = T)
atm2.forecast %>%
kable() %>% kable_styling(bootstrap_options = "striped") %>%
scroll_box(width = "100%", height = "200px")| Point Forecast | Lo 80 | Hi 80 | Lo 95 | Hi 95 | |
|---|---|---|---|---|---|
| 59.42857 | 64.80428 | 56.108830 | 73.49972 | 51.505739 | 78.10281 |
| 59.57143 | 73.03464 | 64.339191 | 81.73008 | 59.736100 | 86.33318 |
| 59.71429 | 17.74685 | 9.051407 | 26.44230 | 4.448316 | 31.04539 |
| 59.85714 | 10.85213 | 2.156684 | 19.54758 | -2.446407 | 24.15067 |
| 60.00000 | 93.34042 | 84.644969 | 102.03586 | 80.041878 | 106.63895 |
| 60.14286 | 81.31553 | 72.620085 | 90.01098 | 68.016994 | 94.61407 |
| 60.28571 | 67.14308 | 58.447630 | 75.83852 | 53.844538 | 80.44161 |
| 60.42857 | 65.44402 | 56.430659 | 74.45739 | 51.659272 | 79.22878 |
| 60.57143 | 71.08524 | 62.071880 | 80.09861 | 57.300492 | 84.87000 |
| 60.71429 | 18.44272 | 9.429351 | 27.45608 | 4.657964 | 32.22747 |
| 60.85714 | 11.72797 | 2.714601 | 20.74133 | -2.056786 | 25.51272 |
| 61.00000 | 91.90718 | 82.893816 | 100.92055 | 78.122429 | 105.69193 |
| 61.14286 | 80.67954 | 71.666178 | 89.69291 | 66.894791 | 94.46430 |
| 61.28571 | 64.36705 | 55.353684 | 73.38041 | 50.582296 | 78.15180 |
| 61.42857 | 65.44402 | 56.280326 | 74.60772 | 51.429358 | 79.45869 |
| 61.57143 | 71.08524 | 61.921546 | 80.24894 | 57.070578 | 85.09991 |
| 61.71429 | 18.44272 | 9.279018 | 27.60641 | 4.428050 | 32.45738 |
| 61.85714 | 11.72797 | 2.564268 | 20.89166 | -2.286701 | 25.74263 |
| 62.00000 | 91.90718 | 82.743483 | 101.07088 | 77.892514 | 105.92185 |
| 62.14286 | 80.67954 | 71.515845 | 89.84324 | 66.664877 | 94.69421 |
| 62.28571 | 64.36705 | 55.203350 | 73.53075 | 50.352382 | 78.38172 |
| 62.42857 | 65.44402 | 56.132420 | 74.75563 | 51.203154 | 79.68489 |
| 62.57143 | 71.08524 | 61.773640 | 80.39685 | 56.844375 | 85.32611 |
| 62.71429 | 18.44272 | 9.131112 | 27.75432 | 4.201846 | 32.68359 |
| 62.85714 | 11.72797 | 2.416362 | 21.03957 | -2.512904 | 25.96884 |
| 63.00000 | 91.90718 | 82.595576 | 101.21878 | 77.666311 | 106.14805 |
| 63.14286 | 80.67954 | 71.367939 | 89.99115 | 66.438673 | 94.92041 |
| 63.28571 | 64.36705 | 55.055444 | 73.67865 | 50.126179 | 78.60792 |
| 63.42857 | 65.44402 | 55.986827 | 74.90122 | 50.980488 | 79.90756 |
| 63.57143 | 71.08524 | 61.628047 | 80.54244 | 56.621709 | 85.54878 |
| 63.71429 | 18.44272 | 8.985518 | 27.89991 | 3.979180 | 32.90625 |
# Output point forecasts
output <- cbind(date = seq(from=as.Date("2010-05-01"),
to=as.Date("2010-05-31"),by=1),
point_forecast = as.data.frame(atm2.forecast$mean))
write.xlsx(output,"forecast_ATM2.xlsx")
autoplot(atm2.forecast, PI=T) +
theme_light() +
labs(title="Withdrawals from ATM2",
subtitle = "ARIMA(0,0,0)(0,1,2)[7] Forecasts",
y="Amount (100's of $)",
x="Week")Given the fact that there are only 3 data points for ATM3, there isn’t a good model choice. The best we could likely do is a naive forecast, but even that I would not be comfortable using in practice:
autoplot(atm3) + autolayer(naive(atm3)) The data for ATM4, as mentioned above, seemed to have no real seasonal pattern and almost resembled white noise. So, we’ll take a slightly different approach.
First, though, we will check to see if a Box-Cox transformation is useful:
l <- BoxCox.lambda(atm4)
atm4.trans <- BoxCox(atm4,lambda=l)
autoplot(cbind(original = atm4, transformed=atm4.trans), facets=T) +
theme_light() +
labs(title="Withdrawals from ATM4", y="Amount (100's of $)",
x="Week")The transformation did reduce some of the variance, so we’ll keep it.
Next, we’ll see if there is any evidence of seasonality that we did not see in the overall plot.
ggAcf(atm4.trans)Surprisingly, there does appear to be some weekly seasonality like ATM1 and ATM2. We’ll split the data and use an STL decomposition to see what that looks like:
atm4.train <- subset(atm4.trans,end=length(atm2.trans)-60)
atm4.test <- subset(atm4.trans, start=length(atm2.trans)-59)atm4.decomp <- stl(atm4.train, s.window="periodic")
autoplot(atm4.decomp)The stl() function seemed to tease out the seasonality pretty well. We’ll try to use this decomposition to forecast, and see how well it does:
atm4.stl <- seasadj(atm4.decomp) %>% snaive(h=31)
accuracy(atm4.stl)## ME RMSE MAE MPE MAPE MASE ACF1
## Training set 0.1070168 17.97907 14.21384 -34.43799 174.691 1 0.04499377
atm4.stl %>% autoplot() +
labs(title="Withdrawals from ATM4",
subtitle="STL Seasonal Naive Model",
y="Amount (100's of $)",
x="Week")
The STL method does ok, but we can probably do better. We’ll try a
Holt-Winter model:
atm4.hw <- hw(atm4.train, h = 31, seasonal = "additive", biasadj = T,
lambda=l)
accuracy(atm4.hw)## ME RMSE MAE MPE MAPE MASE ACF1
## Training set -0.337626 13.1643 10.47503 -59.01754 82.22031 0.7369594 0.0649929
The Holt-Winter model does well in both RMSE and MASE. Let’s plot the results and see what the model predictions look like based on the training data:
autoplot(atm4.hw, PI = T) +
theme_light() +
labs(title="Withdrawals from ATM4",
subtitle = "Holt-Winters Seasonal Model",
y="Amount (100's of $)",
x="Week")The predictions look ok, though they have a large prediction interval.
Finally, we try an ARIMA model next to see if it works better:
atm4.arima <- auto.arima(atm4.train, biasadj = T)
atm4.arima## Series: atm4.train
## ARIMA(0,0,1)(2,0,0)[7] with non-zero mean
##
## Coefficients:
## ma1 sar1 sar2 mean
## 0.0883 0.2092 0.1443 30.0424
## s.e. 0.0576 0.0577 0.0586 1.3337
##
## sigma^2 = 200.8: log likelihood = -1239.71
## AIC=2489.43 AICc=2489.63 BIC=2508.03
The auto.arima() function chose an ARIMA(0,0,1)(2,0,0)[7] model. We’ll first check the accuracy measures on the training set and then plot them.
accuracy(atm2.arima)## ME RMSE MAE MPE MAPE MASE ACF1
## Training set -0.4826825 6.684235 4.445472 -Inf Inf 0.7890405 0.04564369
atm4.pred <- forecast(atm4.arima,h=31)
autoplot(atm4.pred, PI=T) + theme_light() +
labs(title="Withdrawals from ATM4",
subtitle = "ARIMA(0,0,1)(2,0,0)[7] Model",
y="Amount (100's of $)",
x="Week")The RMSE accuracy measure appears better than the STL Seasonal Naive model, but the MASE is actually a bit worse. Also, looking at the graph above, the predictions don’t look very stable.
We’ll check the residuals next:
checkresiduals(atm4.arima)##
## Ljung-Box test
##
## data: Residuals from ARIMA(0,0,1)(2,0,0)[7] with non-zero mean
## Q* = 11.319, df = 11, p-value = 0.4169
##
## Model df: 3. Total lags used: 14
The residuals aren’t exactly normal, though there appear to be no significant correlations between them and the Ljung-Box confirms the same.
Finally, let’s look at all of the models’ performance with the test data:
# Check test set accuracy measures
testMeasures <- cbind(method = "STL",
data.frame(
as.list(accuracy(atm4.stl, atm4.test)["Test set",])
))
testMeasures <- rbind(testMeasures,
cbind(method = "Holt-Winters",
data.frame(
as.list(accuracy(atm4.hw, atm4.test)["Test set",])
))
)
atm4.pred <- Arima(atm4.test,model=atm4.arima,biasadj=T)
testMeasures <- rbind(testMeasures,
cbind(method = "ARIMA",
data.frame(
as.list(accuracy(atm4.pred)["Training set",])
), Theil.s.U = NA))
testMeasures %>% select(method, RMSE, MASE) %>%
kable() %>% kable_styling(bootstrap_options = "striped", full_width = F)| method | RMSE | MASE |
|---|---|---|
| STL | 21.28007 | 1.1667665 |
| Holt-Winters | 17.59295 | 0.9856173 |
| ARIMA | 12.33961 | 0.8749154 |
Surprisingly, the ARIMA model does better than all the others in terms of our chosen accuracy measures on the test data set. However, the instability of the model predictions makes me hesitant to use it.
In conclusion, I would choose the Holt-Winter model here, despite it’s worse performance in accuracy measures, because I feel it provides more stable predictions of the three models.
atm4.forecast <- forecast(atm4.trans, model=atm4.hw, h=31,
use.initial.values=TRUE,
biasadj = T)
atm4.forecast %>%
kable() %>% kable_styling(bootstrap_options = "striped") %>%
scroll_box(width = "100%", height = "200px")| Point Forecast | Lo 80 | Hi 80 | Lo 95 | Hi 95 | |
|---|---|---|---|---|---|
| 59.42857 | 26.05965 | 10.470474 | 49.50389 | 5.2010605 | 65.25192 |
| 59.57143 | 27.51257 | 11.324439 | 51.64204 | 5.7757149 | 67.78627 |
| 59.71429 | 28.18119 | 11.699643 | 52.66995 | 6.0224496 | 69.03117 |
| 59.85714 | 27.16074 | 11.031793 | 51.32039 | 5.5456069 | 67.52032 |
| 60.00000 | 23.22162 | 8.635555 | 45.77789 | 3.9248643 | 61.11479 |
| 60.14286 | 10.16172 | 1.927026 | 25.85077 | 0.2560098 | 37.39999 |
| 60.28571 | 28.53184 | 11.760629 | 53.51730 | 6.0073660 | 70.23031 |
| 60.42857 | 26.05043 | 10.209555 | 50.09025 | 4.9321807 | 66.30303 |
| 60.57143 | 27.50308 | 11.052246 | 52.24166 | 5.4910460 | 68.85870 |
| 60.71429 | 28.17157 | 11.422702 | 53.27568 | 5.7313119 | 70.11364 |
| 60.85714 | 27.15131 | 10.763731 | 51.91736 | 5.2675011 | 68.58935 |
| 61.00000 | 23.21296 | 8.401166 | 46.33875 | 3.6950086 | 62.12720 |
| 61.14286 | 10.15619 | 1.824018 | 26.26222 | 0.2059312 | 38.17594 |
| 61.28571 | 28.52215 | 11.483438 | 54.12705 | 5.7171902 | 71.32070 |
| 61.42857 | 26.04121 | 9.953083 | 50.67825 | 4.6719199 | 67.35948 |
| 61.57143 | 27.49359 | 10.784547 | 52.84291 | 5.2150999 | 69.93647 |
| 61.71429 | 28.16195 | 11.150264 | 53.88306 | 5.4489254 | 71.20148 |
| 61.85714 | 27.14189 | 10.500105 | 52.51602 | 4.9980256 | 69.66383 |
| 62.00000 | 23.20431 | 8.171013 | 46.90141 | 3.4733746 | 63.14521 |
| 62.14286 | 10.15067 | 1.724389 | 26.67582 | 0.1618406 | 38.95794 |
| 62.28571 | 28.51247 | 11.210683 | 54.73854 | 5.4356753 | 72.41664 |
| 62.42857 | 26.03200 | 9.700924 | 51.26806 | 4.4200735 | 68.42159 |
| 62.57143 | 27.48410 | 10.521203 | 53.44597 | 4.9476692 | 71.01990 |
| 62.71429 | 28.15234 | 10.882190 | 54.49225 | 5.1750833 | 72.29501 |
| 62.85714 | 27.13246 | 10.240785 | 53.11654 | 4.7369799 | 70.74407 |
| 63.00000 | 23.19565 | 7.944979 | 47.46604 | 3.2597776 | 64.16911 |
| 63.14286 | 10.14514 | 1.628061 | 27.09168 | 0.1235512 | 39.74622 |
| 63.28571 | 28.50279 | 10.942235 | 55.35195 | 5.1626242 | 73.51843 |
| 63.42857 | 26.02278 | 9.452956 | 51.85985 | 4.1764548 | 69.48964 |
| 63.57143 | 27.47461 | 10.262092 | 54.05102 | 4.6885653 | 72.10929 |
| 63.71429 | 28.14273 | 10.618357 | 55.10345 | 4.9095971 | 73.39452 |
# Output point forecasts
output <- cbind(date = seq(from=as.Date("2010-05-01"),
to=as.Date("2010-05-31"),by=1),
point_forecast = as.data.frame(InvBoxCox(atm4.forecast$mean,lambda=l)))
write.xlsx(output,"forecast_ATM4.xlsx")
autoplot(atm4.forecast, PI=T) +
theme_light() +
labs(title="Withdrawals from ATM4",
subtitle = "Holt-Winters Forecasts",
y="Amount (100's of $)",
x="Week")The date column needs to be adjusted to a yearmonth date rather than
a character. There is also a missing value in the KWH
column. I will replace the missing value with the prior year value of
the same month. For example, the missing value for September, 2008 will
be filled with the September, 2007 KWH value.
b <- read_excel("ResidentialCustomerForecastLoad-624.xlsx")
head(b)## # A tibble: 6 × 3
## CaseSequence `YYYY-MMM` KWH
## <dbl> <chr> <dbl>
## 1 733 1998-Jan 6862583
## 2 734 1998-Feb 5838198
## 3 735 1998-Mar 5420658
## 4 736 1998-Apr 5010364
## 5 737 1998-May 4665377
## 6 738 1998-Jun 6467147
b <- b %>%
rename(YearMonth = "YYYY-MMM") %>%
mutate(YearMonth = yearmonth(YearMonth))
b.data <- as_tsibble(b, index = YearMonth)
head(b.data)## # A tsibble: 6 x 3 [1M]
## CaseSequence YearMonth KWH
## <dbl> <mth> <dbl>
## 1 733 1998 Jan 6862583
## 2 734 1998 Feb 5838198
## 3 735 1998 Mar 5420658
## 4 736 1998 Apr 5010364
## 5 737 1998 May 4665377
## 6 738 1998 Jun 6467147
summary(b.data$KWH)## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 770523 5429912 6283324 6502475 7620524 10655730 1
paste0("Num of missing values: ", sum(is.na(b.data)))## [1] "Num of missing values: 1"
paste0("Row index of NA value: " ,which(is.na(b.data$KWH)))## [1] "Row index of NA value: 129"
b.data[129,]## # A tsibble: 1 x 3 [1M]
## CaseSequence YearMonth KWH
## <dbl> <mth> <dbl>
## 1 861 2008 Sep NA
b.data[129,3] <- b.data[117,3]paste0("Num of missing values: ", sum(is.na(b.data)))## [1] "Num of missing values: 0"
The data is seasonal and contains and outlier for July 2010 with a much lower power usage than any other month. There also appears to be a slight upward trend in energy usage which could be due to a rise / fall in local temperature month over month especially during summer and winter months. The outlier will be replaced with the previous year’s value for that month (Ex July 2010 will be replaced with July 2009).
b.data %>%
autoplot(KWH) +
ggtitle("Residential Customer Power Consumption w/ Outlier")paste0("Row index of outlier: " ,which.min(b.data$KWH))## [1] "Row index of outlier: 151"
b.data[151,]## # A tsibble: 1 x 3 [1M]
## CaseSequence YearMonth KWH
## <dbl> <mth> <dbl>
## 1 883 2010 Jul 770523
b.data[151,3] <- b.data[139,3]b.data %>%
autoplot(KWH) +
ggtitle("Residential Customer Power Consumption w/ Adjusted Outlier")Like in Part A, checking the stationarity of the time series is key to using ARIMA models. The data appears to have strong seasonal autocorrelation every 12 months. After taking 1 seasonal difference the data appears to be stationary.
gg_tsdisplay(b.data, y = KWH, plot_type='partial') +
ggtitle("Before Differencing")b.data %>%
features(KWH, unitroot_kpss, lag = 12)## # A tibble: 1 × 2
## kpss_stat kpss_pvalue
## <dbl> <dbl>
## 1 1.67 0.01
nsd <- b.data %>%
features(KWH, unitroot_nsdiffs, lag = 12)
paste0("Number of Seasonal differences needed for stationarity: ", nsd$nsdiffs)## [1] "Number of Seasonal differences needed for stationarity: 1"
b.data %>%
features(difference(KWH, lag = 12), unitroot_kpss)## # A tibble: 1 × 2
## kpss_stat kpss_pvalue
## <dbl> <dbl>
## 1 0.162 0.1
gg_tsdisplay(b.data, difference(KWH, lag = 12), plot_type='partial') +
ggtitle("ATM1 After Differencing")## Warning: Removed 12 rows containing missing values (`geom_line()`).
## Warning: Removed 12 rows containing missing values (`geom_point()`).
b.data <- b.data %>%
mutate(diff_KWH = difference(KWH, lag = 12))A few different models with varying degrees of complexity were created to attempt to find a model that could be effective in forecasting future consumer power usage. Basic models like MEAN and SNAIVE were used as well as more complicated ETS and ARIMA models. Each model will be compared to find the best fitting model to use for forecasting.
fit5 <- b.data %>%
model(
MEAN = MEAN(KWH),
SNAIVE = SNAIVE(KWH),
ETS_Additive = ETS(KWH ~ error("A") + trend("A") + season("A")),
ETS_Multiplicative = ETS(KWH ~ error("M") + trend("A") + season("M")),
#ETS_Additive_Boxcox = ETS(box_cox(KWH, lambda) ~ error("A") + trend("N") + season("A")),
#ETS_Multiplicative_Boxcox = ETS(box_cox(KWH, lambda) ~ error("M") + trend("N") + season("M"))
ARIMA = ARIMA(KWH)
)The ARIMA(0,0,4)(2,1,0)[12] w/ drift model outperformed all of the other models in terms of RMSE and MAE. This model will be used for forecasting.
# acc.met <- fit1 %>%
# glance() %>%
# select(.model:BIC)
# acc.met2 <- fit1.arima %>%
# glance() %>%
# select(.model:BIC)
#acc <- rbind(acc.met, acc.met2)
#acc.met10 <- fit5 %>%
# accuracy() %>%
# select(.model, RMSE, MAE)
# RMSE2 <- fit1.arima %>%
# accuracy() %>%
# select(RMSE)
#rmse <- rbind(RMSE, RMSE2)
#kableExtra::kable(cbind(acc.met, acc.met2))
# kableExtra::kable(acc.met10)The ARIMA(0,0,4)(2,1,0)[12] w/ drift meet most of the expectations for model forecasting. The AFC plot shows that innovation residuals are likely from white noise. The mean of the innovation residuals are close to zero given the scale of the innovation residuals is in the millions (mean = -8653). The homoscedascity property is not met, however, as the variance changes throughout the series. Finally the data is not exactly from a normal distribution. The homoscedascity and normality properties are helpful but not necessary for forecasting.
fit5.forecast <- b.data %>%
model(
ARIMA = ARIMA(KWH)
)
fit5.forecast %>%
gg_tsresiduals()resid <- residuals(fit5.forecast, type = "innovation")
mean(resid$.resid) ## [1] -8653.906
The forecast for the Electricity Consumption is provided below. The forecasts have been stored in a .csv file that will be up on my Github and attached to the Project submission.
fc5.final <- fit5.forecast %>%
forecast(h = "1 year")
fc5.final %>%
autoplot(b.data) +
ggtitle("Residential Power Consumption: ARIMA(0,0,4)(2,1,0)[12] w/ drift Forecast")