Name: Charles Ugiagbe.

Date: 11/24/23

library(fpp3)
library(tidyverse)
library(readxl)
library(kableExtra)
library(openxlsx)
library(forecast)

Part A: ATM Withdrawal Forcast.

Read the data into R

atm_data <- read_csv("ATM624Data.csv")

head(atm_data)
## # A tibble: 6 × 3
##   DATE                 ATM    Cash
##   <chr>                <chr> <dbl>
## 1 5/1/2009 12:00:00 AM ATM1     96
## 2 5/1/2009 12:00:00 AM ATM2    107
## 3 5/2/2009 12:00:00 AM ATM1     82
## 4 5/2/2009 12:00:00 AM ATM2     89
## 5 5/3/2009 12:00:00 AM ATM1     85
## 6 5/3/2009 12:00:00 AM ATM2     90

Data Cleaning and Renaming

# Change DATE column to a proper date format
atm_data$DATE <- as.Date(mdy_hms(atm_data$DATE, tz="EST"))

# Change column names
names(atm_data) <- c("date","ATM","amount")

head(atm_data)
## # A tibble: 6 × 3
##   date       ATM   amount
##   <date>     <chr>  <dbl>
## 1 2009-05-01 ATM1      96
## 2 2009-05-01 ATM2     107
## 3 2009-05-02 ATM1      82
## 4 2009-05-02 ATM2      89
## 5 2009-05-03 ATM1      85
## 6 2009-05-03 ATM2      90

Check for missing values

missing <- atm_data %>% filter(is.na(date)) %>% nrow() %>% tibble(date = .)
missing <- atm_data %>% filter(is.na(ATM)) %>% nrow() %>%
  tibble(ATM = .) %>% cbind(missing,.)
missing <- atm_data %>% filter(is.na(amount)) %>% nrow() %>%
  tibble(amount = .) %>% cbind(missing,.)

rownames(missing) <- "NA Values"

kbl(missing) %>% kable_paper("striped", full_width = F, position="left") %>%
  add_header_above(c("Column Name" = 4))
Column Name
date ATM amount
NA Values 0 14 19

There are 19 records where no amount is recorded and 14 records where no machine is recorded

atm_data %>% filter(is.na(amount) | is.na(ATM))
## # A tibble: 19 × 3
##    date       ATM   amount
##    <date>     <chr>  <dbl>
##  1 2009-06-13 ATM1      NA
##  2 2009-06-16 ATM1      NA
##  3 2009-06-18 ATM2      NA
##  4 2009-06-22 ATM1      NA
##  5 2009-06-24 ATM2      NA
##  6 2010-05-01 <NA>      NA
##  7 2010-05-02 <NA>      NA
##  8 2010-05-03 <NA>      NA
##  9 2010-05-04 <NA>      NA
## 10 2010-05-05 <NA>      NA
## 11 2010-05-06 <NA>      NA
## 12 2010-05-07 <NA>      NA
## 13 2010-05-08 <NA>      NA
## 14 2010-05-09 <NA>      NA
## 15 2010-05-10 <NA>      NA
## 16 2010-05-11 <NA>      NA
## 17 2010-05-12 <NA>      NA
## 18 2010-05-13 <NA>      NA
## 19 2010-05-14 <NA>      NA

Inspecting the output, we see that some of the records that are missing amount are also missing machine. to forecast, we’ll remove those from the data set.

# Keep only where both amount and machine are not NA
atm_data <- atm_data %>% filter(!(is.na(amount) & is.na(ATM)))

# Re-check
atm_data %>% filter(is.na(amount))
## # A tibble: 5 × 3
##   date       ATM   amount
##   <date>     <chr>  <dbl>
## 1 2009-06-13 ATM1      NA
## 2 2009-06-16 ATM1      NA
## 3 2009-06-18 ATM2      NA
## 4 2009-06-22 ATM1      NA
## 5 2009-06-24 ATM2      NA
atm_data$weekday <- atm_data$date %>% wday()
# Function to return the value of the amount from 7 days previous for the
#   same machine.
lastWeekday <- function(d, m){
  d <- d-7
  i <- atm_data %>% filter(date == d, ATM == m) %>%
    select(amount) %>% as.double()
  return(i)
}

lastWeekday <- Vectorize(lastWeekday)

atm_data[is.na(atm_data$amount),"amount"] <- atm_data %>% filter(is.na(amount)) %>%
  mutate(imputed = lastWeekday(date, ATM)) %>% select(imputed)

we then search for outliers. We’ll start by plotting the data:

atm_data %>% ggplot(aes(x=date,y=amount,color=ATM)) + geom_line() +
  facet_wrap(ATM ~ ., scales = "free") +
  theme_light() +
  scale_x_date(date_breaks = "3 months", date_labels = "%b %y") +
  scale_color_discrete(name="Machine") +
  labs(title="Cash Withdrawals by ATM", y="Amount (100's of $)",
       x="Date")

Two issues can be seen in the graphs above. The first is that ATM3 only has a few non-zero data points. Forecasting with those data will be nearly impossible to do accurately.

The second issue is that ATM4 appears to have a single, large outlier at date 2010-02-09. Let’s look more closely at that.

days <- c("Sunday","Monday","Tuesday","Wednesday","Thursday",
          "Friday","Saturday")

atm_data %>% filter(ATM == "ATM4") %>% ggplot(aes(x=date, y=amount)) +
  geom_line() + theme_light() +
  theme(axis.text.x = element_blank()) + 
  facet_wrap(ordered(weekday,labels=days) ~ .,scales="free_y") +
  labs(title="ATM4", subtitle="By Day of Week", x=NULL, y= NULL)

We can see that the suspected outlier was recorded on a Tuesday and is well above the other values recorded on a Tuesday (or any other weekday for that matter).

If we look at the data points surrounding it, that outlier value does not look like a part of a localized trend (say, in advance of Valentine’s Day):

atm_data %>% filter(date <= as.Date('2010-03-09'),
               date >= as.Date('2010-01-09'),
               ATM=="ATM4") %>%
  ggplot(aes(x=date,y=amount)) + geom_line() +
  theme_light() +
  scale_x_date(date_breaks = "1 week", date_labels = "%b %d") +
  labs(title="Cash Withdrawals at ATM4", 
       subtitle="01/09/10 - 03/09/10",
       y="Amount (100's of $)",
       x="Date")

So, we assume this is some sort of data entry issue and will replace this value with the mean of the Tuesday amounts for ATM4.

# Mean value withdrawn on Tuesdays at ATM4
replacement <- atm_data %>% filter(ATM=="ATM4",weekday==3) %>% summarize(mean(amount))

# Replace outlier value
atm_data[which(atm_data$date==as.Date("2010-02-09") & atm_data$ATM=="ATM4"),"amount"] <- replacement

Splitting

Now that we have the data more or less how we want it in terms of data types, we will split each machine into it’s own data set and convert them into time series.

atm1 <- atm_data %>% filter(ATM=="ATM1") %>% select(amount) %>%
  ts(frequency = 7, start=c(strftime(as.Date("2009-05-01"),'%w'),
                            strftime(as.Date("2009-05-01"),'%W')))

atm2 <- atm_data %>% filter(ATM=="ATM2") %>% select(amount) %>%
  ts(frequency = 7, start=c(strftime(as.Date("2009-05-01"),'%w'),
                            strftime(as.Date("2009-05-01"),'%W')))

atm3 <- atm_data %>% filter(ATM=="ATM3") %>% select(amount) %>%
  ts(frequency = 7, start=c(strftime(as.Date("2009-05-01"),'%w'),
                            strftime(as.Date("2009-05-01"),'%W')))

atm4 <- atm_data %>% filter(ATM=="ATM4") %>% select(amount) %>%
  ts(frequency = 7, start=c(strftime(as.Date("2009-05-01"),'%w'),
                            strftime(as.Date("2009-05-01"),'%W')))

Exploratory Analysis

First, let’s plot the series for each ATM:

atm1 %>% autoplot() + theme_light() +
  labs(title="Withdrawals from ATM1", y="Amount (100's of $)",
       x="Week")

atm2 %>% autoplot() + theme_light() +
  labs(title="Withdrawals from ATM2", y="Amount (100's of $)",
       x="Week")

atm3 %>% autoplot() + theme_light() +
  labs(title="Withdrawals from ATM3", y="Amount (100's of $)",
       x="Week")

atm4 %>% autoplot() + theme_light() +
  labs(title="Withdrawals from ATM4", y="Amount (100's of $)",
       x="Week")

TM1 appears to have some weekly seasonality.

ATM2 also appears to have a similar weekly seasonality.

ATM3, as previously discussed, does not have enough data to really forecast well.

ATM4, unlike the other machines’ data, appears to be rather random - like white noise.

Modeling

Now we’ll look at models for each ATM, starting with ATM1.

ATM1 First we’ll try to stabilize the series with a Box Cox transform:

l <- BoxCox.lambda(atm1)
atm1.trans <- BoxCox(atm1,lambda=l)

autoplot(cbind(original = atm1, transformed=atm1.trans), facets=T) +
  theme_light() +
  labs(title="Withdrawals from ATM1", y="Amount (100's of $)",
       x="Week")

Looking at the plots above, the Box Cox transform seems to stabilize some of the variance in the data using λ= 0.25.

Next we’ll split the data sets into a training set and a testing set, holding out 60 days for testing and the rest for training the models:

atm1.train <- subset(atm1.trans,end=length(atm1.trans)-60)
atm1.test <- subset(atm1.trans, start=length(atm1.trans)-59)

Next, we’ll look at various models to see how they perform on the training data set.

We’ll begin using a seasonal naive forecast:

atm1.snaive <- snaive(atm1.train,h=31, biasadj=T)

accuracy(atm1.snaive)
##                       ME     RMSE       MAE  MPE MAPE MASE      ACF1
## Training set 0.004699424 1.632284 0.8453411 -Inf  Inf    1 0.1045403

There appears to be some numerical instability in some of the error rates, but the RMSE and MASE are interpretable. The MASE shows us that this forecast is equal to a naive forecast (which this is a seasonal version of). Let’s look at the plot:

autoplot(atm1.snaive, PI=F) + theme_light() +
  labs(title="Withdrawals from ATM1",
       subtitle = "Seasonal Naive Model",
       y="Amount (100's of $)",
       x="Week")

As expected, the snaive() model is really just adding in the seasonal component to the last value.

Next, we’ll try the Holt-Winters method using additive seasonality:**

# Use Holt-Winters method
atm1.hw <- hw(atm1.train, seasonal = "additive", h=31, biasadj = T)

accuracy(atm1.hw)
##                        ME     RMSE       MAE  MPE MAPE      MASE      ACF1
## Training set -0.000598947 1.308741 0.6893755 -Inf  Inf 0.8154998 0.1040176

Our RMSE and MASE accuracy measures both decreased from the seasonal naive method above.

autoplot(atm1.hw, PI=F, series = "Holt-Winters") + theme_light() +
  labs(title="Withdrawals from ATM1",
       subtitle = "Holt-Winters Seasonal Model",
       y="Amount (100's of $)",
       x="Week")

The predictions from the Holt-Winters are flatter than the Seasonal Naive forecast, and don’t seem to take into account some of the positive spikes seen in the past data. This may be ok, though, as it may generalize better on unseen data.

The error measures RMSE and MASE are an improvement over the Seasonal Naive model, though.

Finally, we’ll try a seasonal ARIMA model:

atm1.arima <- auto.arima(atm1.train)

atm1.arima
## Series: atm1.train 
## ARIMA(0,0,2)(0,1,1)[7] 
## 
## Coefficients:
##          ma1      ma2     sma1
##       0.1282  -0.0945  -0.8596
## s.e.  0.0579   0.0570   0.0493
## 
## sigma^2 = 1.726:  log likelihood = -507.35
## AIC=1022.7   AICc=1022.84   BIC=1037.49

The auto.arima() function chose an ARIMA(0,0,2)(0,1,1)[7] model. This makes sense, as it there should be little correlation with previous withdrawal amounts, yet some change in seasonality variance over time.

accuracy(atm1.arima)
##                      ME     RMSE       MAE  MPE MAPE      MASE         ACF1
## Training set 0.02862017 1.291929 0.6996114 -Inf  Inf 0.8276084 -0.001612681

The RMSE is only slightly lower with this ARIMA model and the MASE is actually a bit worse, when compared to the Holt-Winters model.

atm1.pred <- forecast(atm1.arima,h=31)

autoplot(atm1.pred, PI=F) + theme_light() +
  labs(title="Withdrawals from ATM1",
       subtitle = "ARIMA(0,0,1)(0,1,2)[7] Model",
       y="Amount (100's of $)",
       x="Week")

The ARIMA model predictions strongly resemble the Holt-Winters model above. Let’s look at the residuals:

checkresiduals(atm1.arima)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(0,0,2)(0,1,1)[7]
## Q* = 7.7234, df = 11, p-value = 0.7378
## 
## Model df: 3.   Total lags used: 14

The residuals are not really normal, though the Ljung-Box test confirms there is a lack of significant correlation.

Now, we will evaluate the accuracy measures with the test set:

# Check test set accuracy measures
testMeasures <- cbind(method = "Seasonal Naive", 
                      data.frame(
                        as.list(accuracy(atm1.snaive, atm1.test)["Test set",])
                      ))

testMeasures <- rbind(testMeasures,
  cbind(method = "Holt-Winters", 
                      data.frame(
                        as.list(accuracy(atm1.hw, atm1.test)["Test set",])
                      ))
)

atm1.pred <- Arima(atm1.test,model=atm1.arima,h=31, biasadj=T)

testMeasures <- rbind(testMeasures,
  cbind(method = "ARIMA", 
                      data.frame(
                        as.list(accuracy(atm1.pred)["Training set",])
                      ), Theil.s.U = NA))

testMeasures %>% select(method, RMSE, MASE) %>%
  kable() %>% kable_styling(bootstrap_options = "striped", full_width = F)
method RMSE MASE
Seasonal Naive 1.2043282 1.1263666
Holt-Winters 2.8380034 2.3113934
ARIMA 0.5211845 0.7523489

The ARIMA model seems to generalize very well, as it has the best error metrics using the testing data set. So we will use that model for our predicted values for May 2010:

atm1.forecast <- forecast(atm1, model = atm1.arima, h=31, biasadj = T)

atm1.forecast %>%
  kable() %>% kable_styling(bootstrap_options = "striped") %>%
  scroll_box(width = "100%", height = "200px")
Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
59.42857 88.54968 86.86618 90.23319 85.97499 91.12438
59.57143 104.26506 102.56778 105.96234 101.66929 106.86083
59.71429 74.26779 72.56307 75.97250 71.66064 76.87493
59.85714 27.14883 25.44411 28.85355 24.54169 29.75597
60.00000 96.76130 95.05659 98.46602 94.15416 99.36844
60.14286 73.60436 71.89964 75.30908 70.99722 76.21150
60.28571 91.53222 89.82750 93.23693 88.92507 94.13936
60.42857 90.03139 88.31036 91.75242 87.39930 92.66348
60.57143 103.59256 101.87126 105.31385 100.96006 106.22505
60.71429 74.26779 72.54634 75.98923 71.63507 76.90050
60.85714 27.14883 25.42739 28.87027 24.51611 29.78155
61.00000 96.76130 95.03986 98.48275 94.12858 99.39402
61.14286 73.60436 71.88292 75.32580 70.97164 76.23708
61.28571 91.53222 89.81077 93.25366 88.89950 94.16493
61.42857 90.03139 88.29379 91.76899 87.37397 92.68882
61.57143 103.59256 101.85469 105.33042 100.93472 106.25039
61.71429 74.26779 72.52978 76.00579 71.60974 76.92584
61.85714 27.14883 25.41082 28.88684 24.49078 29.80688
62.00000 96.76130 95.02330 98.49931 94.10325 99.41935
62.14286 73.60436 71.86635 75.34236 70.94631 76.26241
62.28571 91.53222 89.79421 93.27022 88.87417 94.19027
62.42857 90.03139 88.27738 91.78540 87.34887 92.71392
62.57143 103.59256 101.83828 105.34683 100.90963 106.27548
62.71429 74.26779 72.51337 76.02220 71.58464 76.95093
62.85714 27.14883 25.39442 28.90324 24.46569 29.83197
63.00000 96.76130 95.00689 98.51572 94.07816 99.44445
63.14286 73.60436 71.84995 75.35877 70.92122 76.28750
63.28571 91.53222 89.77780 93.28663 88.84907 94.21536
63.42857 90.03139 88.26113 91.80166 87.32400 92.73878
63.57143 103.59256 101.82203 105.36308 100.88477 106.30034
63.71429 74.26779 72.49712 76.03845 71.55978 76.97579
# Output point forecasts
output <- cbind(date = seq(from=as.Date("2010-05-01"),
                           to=as.Date("2010-05-31"),by=1),
      point_forecast = as.data.frame(atm1.forecast$mean))

write.xlsx(output,"forecast_ATM1.xlsx")

autoplot(atm1.forecast, PI=T) +
  theme_light() +
  labs(title="Withdrawals from ATM1",
       subtitle = "ARIMA(0,0,1)(0,1,2)[7] Forecasts",
       y="Amount (100's of $)",
       x="Week")

ATM2 As we did for ATM1 above, we’ll try to stabilize the series with a Box Cox transform:

l <- BoxCox.lambda(atm2)
## Warning in guerrero(x, lower, upper): Guerrero's method for selecting a Box-Cox
## parameter (lambda) is given for strictly positive data.
atm2.trans <- BoxCox(atm2,lambda=l)

autoplot(cbind(original = atm2, transformed=atm2.trans), facets=T) +
  theme_light() +
  labs(title="Withdrawals from ATM2", y="Amount (100's of $)",
       x="Week")

atm2.train <- subset(atm2.trans,end=length(atm2.trans)-60)
atm2.test <- subset(atm2.trans, start=length(atm2.trans)-59)

We’ll again begin using a seasonal naive forecast:

atm2.snaive <- snaive(atm2.train,h=31, biasadj=T)

accuracy(atm2.snaive)
##                       ME    RMSE      MAE  MPE MAPE MASE       ACF1
## Training set -0.05817628 8.21813 5.634022 -Inf  Inf    1 0.09936693
autoplot(atm2.snaive, PI = F) +
  theme_light() +
  labs(title="Withdrawals from ATM2",
       subtitle = "Seasonal Naive Model",
       y="Amount (100's of $)",
       x="Week")

Visually, there appears to be a bit of a downward trend in the data, but this Seasonal Naive model doesn’t take that into account. This might not be a good model, so we’ll keep looking.

We’ll try Holt-Winters as our second model:

# Use Holt-Winters method
atm2.hw <- hw(atm2.train, seasonal = "additive", h=31, biasadj = T)

accuracy(atm2.hw)
##                      ME     RMSE     MAE  MPE MAPE      MASE       ACF1
## Training set 0.02198949 6.697468 4.46445 -Inf  Inf 0.7924091 0.02699244

Our chosen error metrics: RMSE and MASE, both decrease compared to the Seasonal Naive model above.

autoplot(atm2.hw, PI = F) +
  theme_light() +
  labs(title="Withdrawals from ATM2",
       subtitle = "Holt-Winters Seasonal Model",
       y="Amount (100's of $)",
       x="Week")

We see that this model does use some of the downward trend we saw, though not much.

Finally, we’ll try ARIMA on these data:

atm2.arima <- auto.arima(atm2.train)

atm2.arima
## Series: atm2.train 
## ARIMA(0,0,0)(0,1,2)[7] 
## 
## Coefficients:
##          sma1     sma2
##       -0.7271  -0.0828
## s.e.   0.0601   0.0594
## 
## sigma^2 = 46.04:  log likelihood = -995.94
## AIC=1997.88   AICc=1997.97   BIC=2008.98

Interestingly, the auto.arima() function chose a different model than it did for the ATM1 data. Let’s check the accuracy measures:

accuracy(atm2.arima)
##                      ME     RMSE      MAE  MPE MAPE      MASE       ACF1
## Training set -0.4826825 6.684235 4.445472 -Inf  Inf 0.7890405 0.04564369

The ARIMA model has very nearly the same RMSE and MASE that the Holt-Winter model did.

atm2.pred <- forecast(atm2.arima,h=31)

autoplot(atm2.pred, PI=T) + theme_light() +
  labs(title="Withdrawals from ATM2",
       subtitle = "ARIMA(0,0,0)(0,1,2)[7] Model",
       y="Amount (100's of $)",
       x="Week")

We’ll check the accuracy over the testing data set to see how well the models generalize:

# Check test set accuracy measures
testMeasures <- cbind(method = "Seasonal Naive", 
                      data.frame(
                        as.list(accuracy(atm2.snaive, atm1.test)["Test set",])
                      ))

testMeasures <- rbind(testMeasures,
  cbind(method = "Holt-Winters", 
                      data.frame(
                        as.list(accuracy(atm2.hw, atm1.test)["Test set",])
                      ))
)

atm2.pred <- Arima(atm2.test,model=atm2.arima,h=31, biasadj=T)

testMeasures <- rbind(testMeasures,
  cbind(method = "ARIMA", 
                      data.frame(
                        as.list(accuracy(atm2.pred)["Training set",])
                      ), Theil.s.U = NA))

testMeasures %>% select(method, RMSE, MASE) %>%
  kable() %>% kable_styling(bootstrap_options = "striped", full_width = F)
method RMSE MASE
Seasonal Naive 14.703665 2.0390192
Holt-Winters 12.178901 1.8777154
ARIMA 4.629508 0.7121923

As before, ARIMA does better with data it has not seen. So, we’ll use this model to make our forecasts:

atm2.forecast <- forecast(atm2, model = atm2.arima, h=31, biasadj = T)

atm2.forecast %>%
  kable() %>% kable_styling(bootstrap_options = "striped") %>%
  scroll_box(width = "100%", height = "200px")
Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
59.42857 64.80428 56.108830 73.49972 51.505739 78.10281
59.57143 73.03464 64.339191 81.73008 59.736100 86.33318
59.71429 17.74685 9.051407 26.44230 4.448316 31.04539
59.85714 10.85213 2.156684 19.54758 -2.446407 24.15067
60.00000 93.34042 84.644969 102.03586 80.041878 106.63895
60.14286 81.31553 72.620085 90.01098 68.016994 94.61407
60.28571 67.14308 58.447630 75.83852 53.844538 80.44161
60.42857 65.44402 56.430659 74.45739 51.659272 79.22878
60.57143 71.08524 62.071880 80.09861 57.300492 84.87000
60.71429 18.44272 9.429351 27.45608 4.657964 32.22747
60.85714 11.72797 2.714601 20.74133 -2.056786 25.51272
61.00000 91.90718 82.893816 100.92055 78.122429 105.69193
61.14286 80.67954 71.666178 89.69291 66.894791 94.46430
61.28571 64.36705 55.353684 73.38041 50.582296 78.15180
61.42857 65.44402 56.280326 74.60772 51.429358 79.45869
61.57143 71.08524 61.921546 80.24894 57.070578 85.09991
61.71429 18.44272 9.279018 27.60641 4.428050 32.45738
61.85714 11.72797 2.564268 20.89166 -2.286701 25.74263
62.00000 91.90718 82.743483 101.07088 77.892514 105.92185
62.14286 80.67954 71.515845 89.84324 66.664877 94.69421
62.28571 64.36705 55.203350 73.53075 50.352382 78.38172
62.42857 65.44402 56.132420 74.75563 51.203154 79.68489
62.57143 71.08524 61.773640 80.39685 56.844375 85.32611
62.71429 18.44272 9.131112 27.75432 4.201846 32.68359
62.85714 11.72797 2.416362 21.03957 -2.512904 25.96884
63.00000 91.90718 82.595576 101.21878 77.666311 106.14805
63.14286 80.67954 71.367939 89.99115 66.438673 94.92041
63.28571 64.36705 55.055444 73.67865 50.126179 78.60792
63.42857 65.44402 55.986827 74.90122 50.980488 79.90756
63.57143 71.08524 61.628047 80.54244 56.621709 85.54878
63.71429 18.44272 8.985518 27.89991 3.979180 32.90625
# Output point forecasts
output <- cbind(date = seq(from=as.Date("2010-05-01"),
                           to=as.Date("2010-05-31"),by=1),
      point_forecast = as.data.frame(atm2.forecast$mean))

write.xlsx(output,"forecast_ATM2.xlsx")

autoplot(atm2.forecast, PI=T) +
  theme_light() +
  labs(title="Withdrawals from ATM2",
       subtitle = "ARIMA(0,0,0)(0,1,2)[7] Forecasts",
       y="Amount (100's of $)",
       x="Week")

ATM3

Given the fact that there are only 3 data points for ATM3, there isn’t a good model choice. The best we could likely do is a naive forecast, but even that I would not be comfortable using in practice:

autoplot(atm3) + autolayer(naive(atm3)) 

ATM4

The data for ATM4, as mentioned above, seemed to have no real seasonal pattern and almost resembled white noise. So, we’ll take a slightly different approach.

First, though, we will check to see if a Box-Cox transformation is useful:

l <- BoxCox.lambda(atm4)
atm4.trans <- BoxCox(atm4,lambda=l)

autoplot(cbind(original = atm4, transformed=atm4.trans), facets=T) +
  theme_light() +
  labs(title="Withdrawals from ATM4", y="Amount (100's of $)",
       x="Week")

The transformation did reduce some of the variance, so we’ll keep it.

Next, we’ll see if there is any evidence of seasonality that we did not see in the overall plot.

ggAcf(atm4.trans)

Surprisingly, there does appear to be some weekly seasonality like ATM1 and ATM2. We’ll split the data and use an STL decomposition to see what that looks like:

atm4.train <- subset(atm4.trans,end=length(atm2.trans)-60)
atm4.test <- subset(atm4.trans, start=length(atm2.trans)-59)
atm4.decomp <- stl(atm4.train, s.window="periodic")
autoplot(atm4.decomp)

The stl() function seemed to tease out the seasonality pretty well. We’ll try to use this decomposition to forecast, and see how well it does:

atm4.stl <- seasadj(atm4.decomp) %>% snaive(h=31)

accuracy(atm4.stl)
##                     ME     RMSE      MAE       MPE    MAPE MASE       ACF1
## Training set 0.1070168 17.97907 14.21384 -34.43799 174.691    1 0.04499377
atm4.stl %>% autoplot() +
  labs(title="Withdrawals from ATM4", 
       subtitle="STL Seasonal Naive Model",
       y="Amount (100's of $)",
       x="Week")

The STL method does ok, but we can probably do better. We’ll try a Holt-Winter model:

atm4.hw <- hw(atm4.train, h = 31, seasonal = "additive", biasadj = T,
              lambda=l)

accuracy(atm4.hw)
##                     ME    RMSE      MAE       MPE     MAPE      MASE      ACF1
## Training set -0.337626 13.1643 10.47503 -59.01754 82.22031 0.7369594 0.0649929

The Holt-Winter model does well in both RMSE and MASE. Let’s plot the results and see what the model predictions look like based on the training data:

autoplot(atm4.hw, PI = T) +
  theme_light() +
  labs(title="Withdrawals from ATM4",
       subtitle = "Holt-Winters Seasonal Model",
       y="Amount (100's of $)",
       x="Week")

The predictions look ok, though they have a large prediction interval.

Finally, we try an ARIMA model next to see if it works better:

atm4.arima <- auto.arima(atm4.train, biasadj = T)

atm4.arima
## Series: atm4.train 
## ARIMA(0,0,1)(2,0,0)[7] with non-zero mean 
## 
## Coefficients:
##          ma1    sar1    sar2     mean
##       0.0883  0.2092  0.1443  30.0424
## s.e.  0.0576  0.0577  0.0586   1.3337
## 
## sigma^2 = 200.8:  log likelihood = -1239.71
## AIC=2489.43   AICc=2489.63   BIC=2508.03

The auto.arima() function chose an ARIMA(0,0,1)(2,0,0)[7] model. We’ll first check the accuracy measures on the training set and then plot them.

accuracy(atm2.arima)
##                      ME     RMSE      MAE  MPE MAPE      MASE       ACF1
## Training set -0.4826825 6.684235 4.445472 -Inf  Inf 0.7890405 0.04564369
atm4.pred <- forecast(atm4.arima,h=31)

autoplot(atm4.pred, PI=T) + theme_light() +
  labs(title="Withdrawals from ATM4",
       subtitle = "ARIMA(0,0,1)(2,0,0)[7] Model",
       y="Amount (100's of $)",
       x="Week")

The RMSE accuracy measure appears better than the STL Seasonal Naive model, but the MASE is actually a bit worse. Also, looking at the graph above, the predictions don’t look very stable.

We’ll check the residuals next:

checkresiduals(atm4.arima)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(0,0,1)(2,0,0)[7] with non-zero mean
## Q* = 11.319, df = 11, p-value = 0.4169
## 
## Model df: 3.   Total lags used: 14

The residuals aren’t exactly normal, though there appear to be no significant correlations between them and the Ljung-Box confirms the same.

Finally, let’s look at all of the models’ performance with the test data:

# Check test set accuracy measures
testMeasures <- cbind(method = "STL", 
                      data.frame(
                        as.list(accuracy(atm4.stl, atm4.test)["Test set",])
                      ))

testMeasures <- rbind(testMeasures,
  cbind(method = "Holt-Winters", 
                      data.frame(
                        as.list(accuracy(atm4.hw, atm4.test)["Test set",])
                      ))
)

atm4.pred <- Arima(atm4.test,model=atm4.arima,biasadj=T)

testMeasures <- rbind(testMeasures,
  cbind(method = "ARIMA", 
                      data.frame(
                        as.list(accuracy(atm4.pred)["Training set",])
                      ), Theil.s.U = NA))

testMeasures %>% select(method, RMSE, MASE) %>%
  kable() %>% kable_styling(bootstrap_options = "striped", full_width = F)
method RMSE MASE
STL 21.28007 1.1667665
Holt-Winters 17.59295 0.9856173
ARIMA 12.33961 0.8749154

Surprisingly, the ARIMA model does better than all the others in terms of our chosen accuracy measures on the test data set. However, the instability of the model predictions makes me hesitant to use it.

In conclusion, I would choose the Holt-Winter model here, despite it’s worse performance in accuracy measures, because I feel it provides more stable predictions of the three models.

atm4.forecast <- forecast(atm4.trans, model=atm4.hw, h=31,
                          use.initial.values=TRUE,
                          biasadj = T)

atm4.forecast %>%
  kable() %>% kable_styling(bootstrap_options = "striped") %>%
  scroll_box(width = "100%", height = "200px")
Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
59.42857 26.05965 10.470474 49.50389 5.2010605 65.25192
59.57143 27.51257 11.324439 51.64204 5.7757149 67.78627
59.71429 28.18119 11.699643 52.66995 6.0224496 69.03117
59.85714 27.16074 11.031793 51.32039 5.5456069 67.52032
60.00000 23.22162 8.635555 45.77789 3.9248643 61.11479
60.14286 10.16172 1.927026 25.85077 0.2560098 37.39999
60.28571 28.53184 11.760629 53.51730 6.0073660 70.23031
60.42857 26.05043 10.209555 50.09025 4.9321807 66.30303
60.57143 27.50308 11.052246 52.24166 5.4910460 68.85870
60.71429 28.17157 11.422702 53.27568 5.7313119 70.11364
60.85714 27.15131 10.763731 51.91736 5.2675011 68.58935
61.00000 23.21296 8.401166 46.33875 3.6950086 62.12720
61.14286 10.15619 1.824018 26.26222 0.2059312 38.17594
61.28571 28.52215 11.483438 54.12705 5.7171902 71.32070
61.42857 26.04121 9.953083 50.67825 4.6719199 67.35948
61.57143 27.49359 10.784547 52.84291 5.2150999 69.93647
61.71429 28.16195 11.150264 53.88306 5.4489254 71.20148
61.85714 27.14189 10.500105 52.51602 4.9980256 69.66383
62.00000 23.20431 8.171013 46.90141 3.4733746 63.14521
62.14286 10.15067 1.724389 26.67582 0.1618406 38.95794
62.28571 28.51247 11.210683 54.73854 5.4356753 72.41664
62.42857 26.03200 9.700924 51.26806 4.4200735 68.42159
62.57143 27.48410 10.521203 53.44597 4.9476692 71.01990
62.71429 28.15234 10.882190 54.49225 5.1750833 72.29501
62.85714 27.13246 10.240785 53.11654 4.7369799 70.74407
63.00000 23.19565 7.944979 47.46604 3.2597776 64.16911
63.14286 10.14514 1.628061 27.09168 0.1235512 39.74622
63.28571 28.50279 10.942235 55.35195 5.1626242 73.51843
63.42857 26.02278 9.452956 51.85985 4.1764548 69.48964
63.57143 27.47461 10.262092 54.05102 4.6885653 72.10929
63.71429 28.14273 10.618357 55.10345 4.9095971 73.39452
# Output point forecasts
output <- cbind(date = seq(from=as.Date("2010-05-01"),
                           to=as.Date("2010-05-31"),by=1),
      point_forecast = as.data.frame(InvBoxCox(atm4.forecast$mean,lambda=l)))

write.xlsx(output,"forecast_ATM4.xlsx")

autoplot(atm4.forecast, PI=T) +
  theme_light() +
  labs(title="Withdrawals from ATM4",
       subtitle = "Holt-Winters Forecasts",
       y="Amount (100's of $)",
       x="Week")

Part B - Forecasting Power

Read in Data

The date column needs to be adjusted to a yearmonth date rather than a character. There is also a missing value in the KWH column. I will replace the missing value with the prior year value of the same month. For example, the missing value for September, 2008 will be filled with the September, 2007 KWH value.

b <- read_excel("ResidentialCustomerForecastLoad-624.xlsx")

head(b)
## # A tibble: 6 × 3
##   CaseSequence `YYYY-MMM`     KWH
##          <dbl> <chr>        <dbl>
## 1          733 1998-Jan   6862583
## 2          734 1998-Feb   5838198
## 3          735 1998-Mar   5420658
## 4          736 1998-Apr   5010364
## 5          737 1998-May   4665377
## 6          738 1998-Jun   6467147
b <- b %>%
        rename(YearMonth = "YYYY-MMM") %>%
        mutate(YearMonth = yearmonth(YearMonth))

b.data <- as_tsibble(b, index = YearMonth) 

head(b.data)
## # A tsibble: 6 x 3 [1M]
##   CaseSequence YearMonth     KWH
##          <dbl>     <mth>   <dbl>
## 1          733  1998 Jan 6862583
## 2          734  1998 Feb 5838198
## 3          735  1998 Mar 5420658
## 4          736  1998 Apr 5010364
## 5          737  1998 May 4665377
## 6          738  1998 Jun 6467147
summary(b.data$KWH)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max.     NA's 
##   770523  5429912  6283324  6502475  7620524 10655730        1
paste0("Num of missing values: ", sum(is.na(b.data)))
## [1] "Num of missing values: 1"
paste0("Row index of NA value: " ,which(is.na(b.data$KWH)))
## [1] "Row index of NA value: 129"
b.data[129,]
## # A tsibble: 1 x 3 [1M]
##   CaseSequence YearMonth   KWH
##          <dbl>     <mth> <dbl>
## 1          861  2008 Sep    NA
b.data[129,3] <- b.data[117,3]
paste0("Num of missing values: ", sum(is.na(b.data)))
## [1] "Num of missing values: 0"

Data Exploration

The data is seasonal and contains and outlier for July 2010 with a much lower power usage than any other month. There also appears to be a slight upward trend in energy usage which could be due to a rise / fall in local temperature month over month especially during summer and winter months. The outlier will be replaced with the previous year’s value for that month (Ex July 2010 will be replaced with July 2009).

b.data %>%
        autoplot(KWH) +
        ggtitle("Residential Customer Power Consumption w/ Outlier")

paste0("Row index of outlier: " ,which.min(b.data$KWH))
## [1] "Row index of outlier: 151"
b.data[151,]
## # A tsibble: 1 x 3 [1M]
##   CaseSequence YearMonth    KWH
##          <dbl>     <mth>  <dbl>
## 1          883  2010 Jul 770523
b.data[151,3] <- b.data[139,3]
b.data %>%
        autoplot(KWH) +
        ggtitle("Residential Customer Power Consumption w/ Adjusted Outlier")

Data Modeling

Check if Data is Stationary

Like in Part A, checking the stationarity of the time series is key to using ARIMA models. The data appears to have strong seasonal autocorrelation every 12 months. After taking 1 seasonal difference the data appears to be stationary.

gg_tsdisplay(b.data, y = KWH, plot_type='partial') +
        ggtitle("Before Differencing")

b.data %>%
  features(KWH, unitroot_kpss, lag = 12)
## # A tibble: 1 × 2
##   kpss_stat kpss_pvalue
##       <dbl>       <dbl>
## 1      1.67        0.01
nsd <- b.data %>%
  features(KWH, unitroot_nsdiffs, lag = 12)

paste0("Number of Seasonal differences needed for stationarity: ", nsd$nsdiffs)
## [1] "Number of Seasonal differences needed for stationarity: 1"
b.data %>%
  features(difference(KWH, lag = 12), unitroot_kpss)
## # A tibble: 1 × 2
##   kpss_stat kpss_pvalue
##       <dbl>       <dbl>
## 1     0.162         0.1
gg_tsdisplay(b.data, difference(KWH, lag = 12), plot_type='partial') +
        ggtitle("ATM1 After Differencing")
## Warning: Removed 12 rows containing missing values (`geom_line()`).
## Warning: Removed 12 rows containing missing values (`geom_point()`).

b.data <- b.data %>%
        mutate(diff_KWH = difference(KWH, lag = 12))

Model Creation

A few different models with varying degrees of complexity were created to attempt to find a model that could be effective in forecasting future consumer power usage. Basic models like MEAN and SNAIVE were used as well as more complicated ETS and ARIMA models. Each model will be compared to find the best fitting model to use for forecasting.

fit5 <- b.data %>%
  model(
        MEAN = MEAN(KWH),
        SNAIVE = SNAIVE(KWH),
        ETS_Additive = ETS(KWH ~ error("A") + trend("A") + season("A")),
        ETS_Multiplicative = ETS(KWH ~ error("M") + trend("A") + season("M")),
        #ETS_Additive_Boxcox = ETS(box_cox(KWH, lambda) ~ error("A") + trend("N") + season("A")),
        #ETS_Multiplicative_Boxcox  = ETS(box_cox(KWH, lambda) ~ error("M") + trend("N") + season("M"))
        ARIMA = ARIMA(KWH)
        )

Model Evaluation

The ARIMA(0,0,4)(2,1,0)[12] w/ drift model outperformed all of the other models in terms of RMSE and MAE. This model will be used for forecasting.

# acc.met <- fit1 %>%
#        glance() %>%
#        select(.model:BIC)

# acc.met2 <- fit1.arima %>%
#        glance() %>%
#        select(.model:BIC)

#acc <- rbind(acc.met, acc.met2)

#acc.met10 <- fit5 %>%
#        accuracy() %>%
#        select(.model, RMSE, MAE)

# RMSE2 <- fit1.arima %>%
#        accuracy() %>%
#        select(RMSE)

#rmse <- rbind(RMSE, RMSE2)

#kableExtra::kable(cbind(acc.met, acc.met2))

# kableExtra::kable(acc.met10)

Model Diagnostics

The ARIMA(0,0,4)(2,1,0)[12] w/ drift meet most of the expectations for model forecasting. The AFC plot shows that innovation residuals are likely from white noise. The mean of the innovation residuals are close to zero given the scale of the innovation residuals is in the millions (mean = -8653). The homoscedascity property is not met, however, as the variance changes throughout the series. Finally the data is not exactly from a normal distribution. The homoscedascity and normality properties are helpful but not necessary for forecasting.

fit5.forecast <- b.data %>%
  model(
        ARIMA = ARIMA(KWH)
        )

fit5.forecast %>%
        gg_tsresiduals()

resid <- residuals(fit5.forecast, type = "innovation")
mean(resid$.resid) 
## [1] -8653.906

Forecast

The forecast for the Electricity Consumption is provided below. The forecasts have been stored in a .csv file that will be up on my Github and attached to the Project submission.

fc5.final <- fit5.forecast %>%
  forecast(h = "1 year")


fc5.final %>%
  autoplot(b.data) +
  ggtitle("Residential Power Consumption: ARIMA(0,0,4)(2,1,0)[12] w/ drift Forecast")