Name: Charles Ugiagbe.

Date: 11/24/23

library(fpp3)
library(tidyverse)
library(readxl)
library(kableExtra)
library(openxlsx)
library(forecast)

Part A: ATM Withdrawal Forcast.

Read the data into R

atm_data <- read_csv("ATM624Data.csv")

head(atm_data)

## # A tibble: 6 × 3
##   DATE                 ATM    Cash
##   <chr>                <chr> <dbl>
## 1 5/1/2009 12:00:00 AM ATM1     96
## 2 5/1/2009 12:00:00 AM ATM2    107
## 3 5/2/2009 12:00:00 AM ATM1     82
## 4 5/2/2009 12:00:00 AM ATM2     89
## 5 5/3/2009 12:00:00 AM ATM1     85
## 6 5/3/2009 12:00:00 AM ATM2     90

Data Cleaning and Renaming

# Change DATE column to a proper date format
atm_data$DATE <- as.Date(mdy_hms(atm_data$DATE, tz="EST"))

# Change column names
names(atm_data) <- c("date","ATM","amount")

head(atm_data)

## # A tibble: 6 × 3
##   date       ATM   amount
##   <date>     <chr>  <dbl>
## 1 2009-05-01 ATM1      96
## 2 2009-05-01 ATM2     107
## 3 2009-05-02 ATM1      82
## 4 2009-05-02 ATM2      89
## 5 2009-05-03 ATM1      85
## 6 2009-05-03 ATM2      90

Check for missing values

missing <- atm_data %>% filter(is.na(date)) %>% nrow() %>% tibble(date = .)
missing <- atm_data %>% filter(is.na(ATM)) %>% nrow() %>%
  tibble(ATM = .) %>% cbind(missing,.)
missing <- atm_data %>% filter(is.na(amount)) %>% nrow() %>%
  tibble(amount = .) %>% cbind(missing,.)

rownames(missing) <- "NA Values"

kbl(missing) %>% kable_paper("striped", full_width = F, position="left") %>%
  add_header_above(c("Column Name" = 4))

Column Name
	date	ATM	amount
NA Values	0	14	19

There are 19 records where no amount is recorded and 14 records where no machine is recorded

atm_data %>% filter(is.na(amount) | is.na(ATM))

## # A tibble: 19 × 3
##    date       ATM   amount
##    <date>     <chr>  <dbl>
##  1 2009-06-13 ATM1      NA
##  2 2009-06-16 ATM1      NA
##  3 2009-06-18 ATM2      NA
##  4 2009-06-22 ATM1      NA
##  5 2009-06-24 ATM2      NA
##  6 2010-05-01 <NA>      NA
##  7 2010-05-02 <NA>      NA
##  8 2010-05-03 <NA>      NA
##  9 2010-05-04 <NA>      NA
## 10 2010-05-05 <NA>      NA
## 11 2010-05-06 <NA>      NA
## 12 2010-05-07 <NA>      NA
## 13 2010-05-08 <NA>      NA
## 14 2010-05-09 <NA>      NA
## 15 2010-05-10 <NA>      NA
## 16 2010-05-11 <NA>      NA
## 17 2010-05-12 <NA>      NA
## 18 2010-05-13 <NA>      NA
## 19 2010-05-14 <NA>      NA

Inspecting the output, we see that some of the records that are missing amount are also missing machine. to forecast, we’ll remove those from the data set.

# Keep only where both amount and machine are not NA
atm_data <- atm_data %>% filter(!(is.na(amount) & is.na(ATM)))

# Re-check
atm_data %>% filter(is.na(amount))

## # A tibble: 5 × 3
##   date       ATM   amount
##   <date>     <chr>  <dbl>
## 1 2009-06-13 ATM1      NA
## 2 2009-06-16 ATM1      NA
## 3 2009-06-18 ATM2      NA
## 4 2009-06-22 ATM1      NA
## 5 2009-06-24 ATM2      NA

atm_data$weekday <- atm_data$date %>% wday()

# Function to return the value of the amount from 7 days previous for the
#   same machine.
lastWeekday <- function(d, m){
  d <- d-7
  i <- atm_data %>% filter(date == d, ATM == m) %>%
    select(amount) %>% as.double()
  return(i)
}

lastWeekday <- Vectorize(lastWeekday)

atm_data[is.na(atm_data$amount),"amount"] <- atm_data %>% filter(is.na(amount)) %>%
  mutate(imputed = lastWeekday(date, ATM)) %>% select(imputed)

we then search for outliers. We’ll start by plotting the data:

atm_data %>% ggplot(aes(x=date,y=amount,color=ATM)) + geom_line() +
  facet_wrap(ATM ~ ., scales = "free") +
  theme_light() +
  scale_x_date(date_breaks = "3 months", date_labels = "%b %y") +
  scale_color_discrete(name="Machine") +
  labs(title="Cash Withdrawals by ATM", y="Amount (100's of $)",
       x="Date")

Two issues can be seen in the graphs above. The first is that ATM3 only has a few non-zero data points. Forecasting with those data will be nearly impossible to do accurately.

The second issue is that ATM4 appears to have a single, large outlier at date 2010-02-09. Let’s look more closely at that.

days <- c("Sunday","Monday","Tuesday","Wednesday","Thursday",
          "Friday","Saturday")

atm_data %>% filter(ATM == "ATM4") %>% ggplot(aes(x=date, y=amount)) +
  geom_line() + theme_light() +
  theme(axis.text.x = element_blank()) + 
  facet_wrap(ordered(weekday,labels=days) ~ .,scales="free_y") +
  labs(title="ATM4", subtitle="By Day of Week", x=NULL, y= NULL)

We can see that the suspected outlier was recorded on a Tuesday and is well above the other values recorded on a Tuesday (or any other weekday for that matter).

If we look at the data points surrounding it, that outlier value does not look like a part of a localized trend (say, in advance of Valentine’s Day):

atm_data %>% filter(date <= as.Date('2010-03-09'),
               date >= as.Date('2010-01-09'),
               ATM=="ATM4") %>%
  ggplot(aes(x=date,y=amount)) + geom_line() +
  theme_light() +
  scale_x_date(date_breaks = "1 week", date_labels = "%b %d") +
  labs(title="Cash Withdrawals at ATM4", 
       subtitle="01/09/10 - 03/09/10",
       y="Amount (100's of $)",
       x="Date")

So, we assume this is some sort of data entry issue and will replace this value with the mean of the Tuesday amounts for ATM4.

# Mean value withdrawn on Tuesdays at ATM4
replacement <- atm_data %>% filter(ATM=="ATM4",weekday==3) %>% summarize(mean(amount))

# Replace outlier value
atm_data[which(atm_data$date==as.Date("2010-02-09") & atm_data$ATM=="ATM4"),"amount"] <- replacement

Splitting

Now that we have the data more or less how we want it in terms of data types, we will split each machine into it’s own data set and convert them into time series.

atm1 <- atm_data %>% filter(ATM=="ATM1") %>% select(amount) %>%
  ts(frequency = 7, start=c(strftime(as.Date("2009-05-01"),'%w'),
                            strftime(as.Date("2009-05-01"),'%W')))

atm2 <- atm_data %>% filter(ATM=="ATM2") %>% select(amount) %>%
  ts(frequency = 7, start=c(strftime(as.Date("2009-05-01"),'%w'),
                            strftime(as.Date("2009-05-01"),'%W')))

atm3 <- atm_data %>% filter(ATM=="ATM3") %>% select(amount) %>%
  ts(frequency = 7, start=c(strftime(as.Date("2009-05-01"),'%w'),
                            strftime(as.Date("2009-05-01"),'%W')))

atm4 <- atm_data %>% filter(ATM=="ATM4") %>% select(amount) %>%
  ts(frequency = 7, start=c(strftime(as.Date("2009-05-01"),'%w'),
                            strftime(as.Date("2009-05-01"),'%W')))

Exploratory Analysis

First, let’s plot the series for each ATM:

atm1 %>% autoplot() + theme_light() +
  labs(title="Withdrawals from ATM1", y="Amount (100's of $)",
       x="Week")

atm2 %>% autoplot() + theme_light() +
  labs(title="Withdrawals from ATM2", y="Amount (100's of $)",
       x="Week")

atm3 %>% autoplot() + theme_light() +
  labs(title="Withdrawals from ATM3", y="Amount (100's of $)",
       x="Week")

atm4 %>% autoplot() + theme_light() +
  labs(title="Withdrawals from ATM4", y="Amount (100's of $)",
       x="Week")

TM1 appears to have some weekly seasonality.

ATM2 also appears to have a similar weekly seasonality.

ATM3, as previously discussed, does not have enough data to really forecast well.

ATM4, unlike the other machines’ data, appears to be rather random - like white noise.

Modeling

Now we’ll look at models for each ATM, starting with ATM1.

ATM1 First we’ll try to stabilize the series with a Box Cox transform:

l <- BoxCox.lambda(atm1)
atm1.trans <- BoxCox(atm1,lambda=l)

autoplot(cbind(original = atm1, transformed=atm1.trans), facets=T) +
  theme_light() +
  labs(title="Withdrawals from ATM1", y="Amount (100's of $)",
       x="Week")

Looking at the plots above, the Box Cox transform seems to stabilize some of the variance in the data using λ= 0.25.

Next we’ll split the data sets into a training set and a testing set, holding out 60 days for testing and the rest for training the models:

atm1.train <- subset(atm1.trans,end=length(atm1.trans)-60)
atm1.test <- subset(atm1.trans, start=length(atm1.trans)-59)

Next, we’ll look at various models to see how they perform on the training data set.

We’ll begin using a seasonal naive forecast:

atm1.snaive <- snaive(atm1.train,h=31, biasadj=T)

accuracy(atm1.snaive)

##                       ME     RMSE       MAE  MPE MAPE MASE      ACF1
## Training set 0.004699424 1.632284 0.8453411 -Inf  Inf    1 0.1045403

There appears to be some numerical instability in some of the error rates, but the RMSE and MASE are interpretable. The MASE shows us that this forecast is equal to a naive forecast (which this is a seasonal version of). Let’s look at the plot:

autoplot(atm1.snaive, PI=F) + theme_light() +
  labs(title="Withdrawals from ATM1",
       subtitle = "Seasonal Naive Model",
       y="Amount (100's of $)",
       x="Week")

As expected, the snaive() model is really just adding in the seasonal component to the last value.

Next, we’ll try the Holt-Winters method using additive seasonality:**

# Use Holt-Winters method
atm1.hw <- hw(atm1.train, seasonal = "additive", h=31, biasadj = T)

accuracy(atm1.hw)

##                        ME     RMSE       MAE  MPE MAPE      MASE      ACF1
## Training set -0.000598947 1.308741 0.6893755 -Inf  Inf 0.8154998 0.1040176

Our RMSE and MASE accuracy measures both decreased from the seasonal naive method above.

autoplot(atm1.hw, PI=F, series = "Holt-Winters") + theme_light() +
  labs(title="Withdrawals from ATM1",
       subtitle = "Holt-Winters Seasonal Model",
       y="Amount (100's of $)",
       x="Week")

The predictions from the Holt-Winters are flatter than the Seasonal Naive forecast, and don’t seem to take into account some of the positive spikes seen in the past data. This may be ok, though, as it may generalize better on unseen data.

The error measures RMSE and MASE are an improvement over the Seasonal Naive model, though.

Finally, we’ll try a seasonal ARIMA model:

atm1.arima <- auto.arima(atm1.train)

atm1.arima

## Series: atm1.train 
## ARIMA(0,0,2)(0,1,1)[7] 
## 
## Coefficients:
##          ma1      ma2     sma1
##       0.1282  -0.0945  -0.8596
## s.e.  0.0579   0.0570   0.0493
## 
## sigma^2 = 1.726:  log likelihood = -507.35
## AIC=1022.7   AICc=1022.84   BIC=1037.49

The auto.arima() function chose an ARIMA(0,0,2)(0,1,1)[7] model. This makes sense, as it there should be little correlation with previous withdrawal amounts, yet some change in seasonality variance over time.

accuracy(atm1.arima)

##                      ME     RMSE       MAE  MPE MAPE      MASE         ACF1
## Training set 0.02862017 1.291929 0.6996114 -Inf  Inf 0.8276084 -0.001612681

The RMSE is only slightly lower with this ARIMA model and the MASE is actually a bit worse, when compared to the Holt-Winters model.

atm1.pred <- forecast(atm1.arima,h=31)

autoplot(atm1.pred, PI=F) + theme_light() +
  labs(title="Withdrawals from ATM1",
       subtitle = "ARIMA(0,0,1)(0,1,2)[7] Model",
       y="Amount (100's of $)",
       x="Week")

The ARIMA model predictions strongly resemble the Holt-Winters model above. Let’s look at the residuals:

checkresiduals(atm1.arima)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(0,0,2)(0,1,1)[7]
## Q* = 7.7234, df = 11, p-value = 0.7378
## 
## Model df: 3.   Total lags used: 14

The residuals are not really normal, though the Ljung-Box test confirms there is a lack of significant correlation.

Now, we will evaluate the accuracy measures with the test set:

# Check test set accuracy measures
testMeasures <- cbind(method = "Seasonal Naive", 
                      data.frame(
                        as.list(accuracy(atm1.snaive, atm1.test)["Test set",])
                      ))

testMeasures <- rbind(testMeasures,
  cbind(method = "Holt-Winters", 
                      data.frame(
                        as.list(accuracy(atm1.hw, atm1.test)["Test set",])
                      ))
)

atm1.pred <- Arima(atm1.test,model=atm1.arima,h=31, biasadj=T)

testMeasures <- rbind(testMeasures,
  cbind(method = "ARIMA", 
                      data.frame(
                        as.list(accuracy(atm1.pred)["Training set",])
                      ), Theil.s.U = NA))

testMeasures %>% select(method, RMSE, MASE) %>%
  kable() %>% kable_styling(bootstrap_options = "striped", full_width = F)

method	RMSE	MASE
Seasonal Naive	1.2043282	1.1263666
Holt-Winters	2.8380034	2.3113934
ARIMA	0.5211845	0.7523489

The ARIMA model seems to generalize very well, as it has the best error metrics using the testing data set. So we will use that model for our predicted values for May 2010:

atm1.forecast <- forecast(atm1, model = atm1.arima, h=31, biasadj = T)

atm1.forecast %>%
  kable() %>% kable_styling(bootstrap_options = "striped") %>%
  scroll_box(width = "100%", height = "200px")

	Point Forecast	Lo 80	Hi 80	Lo 95	Hi 95
59.42857	88.54968	86.86618	90.23319	85.97499	91.12438
59.57143	104.26506	102.56778	105.96234	101.66929	106.86083
59.71429	74.26779	72.56307	75.97250	71.66064	76.87493
59.85714	27.14883	25.44411	28.85355	24.54169	29.75597
60.00000	96.76130	95.05659	98.46602	94.15416	99.36844
60.14286	73.60436	71.89964	75.30908	70.99722	76.21150
60.28571	91.53222	89.82750	93.23693	88.92507	94.13936
60.42857	90.03139	88.31036	91.75242	87.39930	92.66348
60.57143	103.59256	101.87126	105.31385	100.96006	106.22505
60.71429	74.26779	72.54634	75.98923	71.63507	76.90050
60.85714	27.14883	25.42739	28.87027	24.51611	29.78155
61.00000	96.76130	95.03986	98.48275	94.12858	99.39402
61.14286	73.60436	71.88292	75.32580	70.97164	76.23708
61.28571	91.53222	89.81077	93.25366	88.89950	94.16493
61.42857	90.03139	88.29379	91.76899	87.37397	92.68882
61.57143	103.59256	101.85469	105.33042	100.93472	106.25039
61.71429	74.26779	72.52978	76.00579	71.60974	76.92584
61.85714	27.14883	25.41082	28.88684	24.49078	29.80688
62.00000	96.76130	95.02330	98.49931	94.10325	99.41935
62.14286	73.60436	71.86635	75.34236	70.94631	76.26241
62.28571	91.53222	89.79421	93.27022	88.87417	94.19027
62.42857	90.03139	88.27738	91.78540	87.34887	92.71392
62.57143	103.59256	101.83828	105.34683	100.90963	106.27548
62.71429	74.26779	72.51337	76.02220	71.58464	76.95093
62.85714	27.14883	25.39442	28.90324	24.46569	29.83197
63.00000	96.76130	95.00689	98.51572	94.07816	99.44445
63.14286	73.60436	71.84995	75.35877	70.92122	76.28750
63.28571	91.53222	89.77780	93.28663	88.84907	94.21536
63.42857	90.03139	88.26113	91.80166	87.32400	92.73878
63.57143	103.59256	101.82203	105.36308	100.88477	106.30034
63.71429	74.26779	72.49712	76.03845	71.55978	76.97579

# Output point forecasts
output <- cbind(date = seq(from=as.Date("2010-05-01"),
                           to=as.Date("2010-05-31"),by=1),
      point_forecast = as.data.frame(atm1.forecast$mean))

write.xlsx(output,"forecast_ATM1.xlsx")

autoplot(atm1.forecast, PI=T) +
  theme_light() +
  labs(title="Withdrawals from ATM1",
       subtitle = "ARIMA(0,0,1)(0,1,2)[7] Forecasts",
       y="Amount (100's of $)",
       x="Week")

ATM2 As we did for ATM1 above, we’ll try to stabilize the series with a Box Cox transform:

l <- BoxCox.lambda(atm2)

## Warning in guerrero(x, lower, upper): Guerrero's method for selecting a Box-Cox
## parameter (lambda) is given for strictly positive data.

atm2.trans <- BoxCox(atm2,lambda=l)

autoplot(cbind(original = atm2, transformed=atm2.trans), facets=T) +
  theme_light() +
  labs(title="Withdrawals from ATM2", y="Amount (100's of $)",
       x="Week")

atm2.train <- subset(atm2.trans,end=length(atm2.trans)-60)
atm2.test <- subset(atm2.trans, start=length(atm2.trans)-59)

We’ll again begin using a seasonal naive forecast:

atm2.snaive <- snaive(atm2.train,h=31, biasadj=T)

accuracy(atm2.snaive)

##                       ME    RMSE      MAE  MPE MAPE MASE       ACF1
## Training set -0.05817628 8.21813 5.634022 -Inf  Inf    1 0.09936693

autoplot(atm2.snaive, PI = F) +
  theme_light() +
  labs(title="Withdrawals from ATM2",
       subtitle = "Seasonal Naive Model",
       y="Amount (100's of $)",
       x="Week")

Visually, there appears to be a bit of a downward trend in the data, but this Seasonal Naive model doesn’t take that into account. This might not be a good model, so we’ll keep looking.

We’ll try Holt-Winters as our second model:

# Use Holt-Winters method
atm2.hw <- hw(atm2.train, seasonal = "additive", h=31, biasadj = T)

accuracy(atm2.hw)

##                      ME     RMSE     MAE  MPE MAPE      MASE       ACF1
## Training set 0.02198949 6.697468 4.46445 -Inf  Inf 0.7924091 0.02699244

Our chosen error metrics: RMSE and MASE, both decrease compared to the Seasonal Naive model above.

autoplot(atm2.hw, PI = F) +
  theme_light() +
  labs(title="Withdrawals from ATM2",
       subtitle = "Holt-Winters Seasonal Model",
       y="Amount (100's of $)",
       x="Week")

We see that this model does use some of the downward trend we saw, though not much.

Finally, we’ll try ARIMA on these data:

atm2.arima <- auto.arima(atm2.train)

atm2.arima

## Series: atm2.train 
## ARIMA(0,0,0)(0,1,2)[7] 
## 
## Coefficients:
##          sma1     sma2
##       -0.7271  -0.0828
## s.e.   0.0601   0.0594
## 
## sigma^2 = 46.04:  log likelihood = -995.94
## AIC=1997.88   AICc=1997.97   BIC=2008.98

Interestingly, the auto.arima() function chose a different model than it did for the ATM1 data. Let’s check the accuracy measures:

accuracy(atm2.arima)

##                      ME     RMSE      MAE  MPE MAPE      MASE       ACF1
## Training set -0.4826825 6.684235 4.445472 -Inf  Inf 0.7890405 0.04564369

The ARIMA model has very nearly the same RMSE and MASE that the Holt-Winter model did.

atm2.pred <- forecast(atm2.arima,h=31)

autoplot(atm2.pred, PI=T) + theme_light() +
  labs(title="Withdrawals from ATM2",
       subtitle = "ARIMA(0,0,0)(0,1,2)[7] Model",
       y="Amount (100's of $)",
       x="Week")

We’ll check the accuracy over the testing data set to see how well the models generalize:

# Check test set accuracy measures
testMeasures <- cbind(method = "Seasonal Naive", 
                      data.frame(
                        as.list(accuracy(atm2.snaive, atm1.test)["Test set",])
                      ))

testMeasures <- rbind(testMeasures,
  cbind(method = "Holt-Winters", 
                      data.frame(
                        as.list(accuracy(atm2.hw, atm1.test)["Test set",])
                      ))
)

atm2.pred <- Arima(atm2.test,model=atm2.arima,h=31, biasadj=T)

testMeasures <- rbind(testMeasures,
  cbind(method = "ARIMA", 
                      data.frame(
                        as.list(accuracy(atm2.pred)["Training set",])
                      ), Theil.s.U = NA))

testMeasures %>% select(method, RMSE, MASE) %>%
  kable() %>% kable_styling(bootstrap_options = "striped", full_width = F)

method	RMSE	MASE
Seasonal Naive	14.703665	2.0390192
Holt-Winters	12.178901	1.8777154
ARIMA	4.629508	0.7121923

As before, ARIMA does better with data it has not seen. So, we’ll use this model to make our forecasts:

atm2.forecast <- forecast(atm2, model = atm2.arima, h=31, biasadj = T)

atm2.forecast %>%
  kable() %>% kable_styling(bootstrap_options = "striped") %>%
  scroll_box(width = "100%", height = "200px")

	Point Forecast	Lo 80	Hi 80	Lo 95	Hi 95
59.42857	64.80428	56.108830	73.49972	51.505739	78.10281
59.57143	73.03464	64.339191	81.73008	59.736100	86.33318
59.71429	17.74685	9.051407	26.44230	4.448316	31.04539
59.85714	10.85213	2.156684	19.54758	-2.446407	24.15067
60.00000	93.34042	84.644969	102.03586	80.041878	106.63895
60.14286	81.31553	72.620085	90.01098	68.016994	94.61407
60.28571	67.14308	58.447630	75.83852	53.844538	80.44161
60.42857	65.44402	56.430659	74.45739	51.659272	79.22878
60.57143	71.08524	62.071880	80.09861	57.300492	84.87000
60.71429	18.44272	9.429351	27.45608	4.657964	32.22747
60.85714	11.72797	2.714601	20.74133	-2.056786	25.51272
61.00000	91.90718	82.893816	100.92055	78.122429	105.69193
61.14286	80.67954	71.666178	89.69291	66.894791	94.46430
61.28571	64.36705	55.353684	73.38041	50.582296	78.15180
61.42857	65.44402	56.280326	74.60772	51.429358	79.45869
61.57143	71.08524	61.921546	80.24894	57.070578	85.09991
61.71429	18.44272	9.279018	27.60641	4.428050	32.45738
61.85714	11.72797	2.564268	20.89166	-2.286701	25.74263
62.00000	91.90718	82.743483	101.07088	77.892514	105.92185
62.14286	80.67954	71.515845	89.84324	66.664877	94.69421
62.28571	64.36705	55.203350	73.53075	50.352382	78.38172
62.42857	65.44402	56.132420	74.75563	51.203154	79.68489
62.57143	71.08524	61.773640	80.39685	56.844375	85.32611
62.71429	18.44272	9.131112	27.75432	4.201846	32.68359
62.85714	11.72797	2.416362	21.03957	-2.512904	25.96884
63.00000	91.90718	82.595576	101.21878	77.666311	106.14805
63.14286	80.67954	71.367939	89.99115	66.438673	94.92041
63.28571	64.36705	55.055444	73.67865	50.126179	78.60792
63.42857	65.44402	55.986827	74.90122	50.980488	79.90756
63.57143	71.08524	61.628047	80.54244	56.621709	85.54878
63.71429	18.44272	8.985518	27.89991	3.979180	32.90625

# Output point forecasts
output <- cbind(date = seq(from=as.Date("2010-05-01"),
                           to=as.Date("2010-05-31"),by=1),
      point_forecast = as.data.frame(atm2.forecast$mean))

write.xlsx(output,"forecast_ATM2.xlsx")

autoplot(atm2.forecast, PI=T) +
  theme_light() +
  labs(title="Withdrawals from ATM2",
       subtitle = "ARIMA(0,0,0)(0,1,2)[7] Forecasts",
       y="Amount (100's of $)",
       x="Week")

ATM3

Given the fact that there are only 3 data points for ATM3, there isn’t a good model choice. The best we could likely do is a naive forecast, but even that I would not be comfortable using in practice:

autoplot(atm3) + autolayer(naive(atm3))

ATM4

The data for ATM4, as mentioned above, seemed to have no real seasonal pattern and almost resembled white noise. So, we’ll take a slightly different approach.

First, though, we will check to see if a Box-Cox transformation is useful:

l <- BoxCox.lambda(atm4)
atm4.trans <- BoxCox(atm4,lambda=l)

autoplot(cbind(original = atm4, transformed=atm4.trans), facets=T) +
  theme_light() +
  labs(title="Withdrawals from ATM4", y="Amount (100's of $)",
       x="Week")

The transformation did reduce some of the variance, so we’ll keep it.

Next, we’ll see if there is any evidence of seasonality that we did not see in the overall plot.

ggAcf(atm4.trans)

Surprisingly, there does appear to be some weekly seasonality like ATM1 and ATM2. We’ll split the data and use an STL decomposition to see what that looks like:

atm4.train <- subset(atm4.trans,end=length(atm2.trans)-60)
atm4.test <- subset(atm4.trans, start=length(atm2.trans)-59)

atm4.decomp <- stl(atm4.train, s.window="periodic")
autoplot(atm4.decomp)

The stl() function seemed to tease out the seasonality pretty well. We’ll try to use this decomposition to forecast, and see how well it does:

atm4.stl <- seasadj(atm4.decomp) %>% snaive(h=31)

accuracy(atm4.stl)

##                     ME     RMSE      MAE       MPE    MAPE MASE       ACF1
## Training set 0.1070168 17.97907 14.21384 -34.43799 174.691    1 0.04499377

atm4.stl %>% autoplot() +
  labs(title="Withdrawals from ATM4", 
       subtitle="STL Seasonal Naive Model",
       y="Amount (100's of $)",
       x="Week")

The STL method does ok, but we can probably do better. We’ll try a Holt-Winter model:

atm4.hw <- hw(atm4.train, h = 31, seasonal = "additive", biasadj = T,
              lambda=l)

accuracy(atm4.hw)

##                     ME    RMSE      MAE       MPE     MAPE      MASE      ACF1
## Training set -0.337626 13.1643 10.47503 -59.01754 82.22031 0.7369594 0.0649929

The Holt-Winter model does well in both RMSE and MASE. Let’s plot the results and see what the model predictions look like based on the training data:

autoplot(atm4.hw, PI = T) +
  theme_light() +
  labs(title="Withdrawals from ATM4",
       subtitle = "Holt-Winters Seasonal Model",
       y="Amount (100's of $)",
       x="Week")

The predictions look ok, though they have a large prediction interval.

Finally, we try an ARIMA model next to see if it works better:

atm4.arima <- auto.arima(atm4.train, biasadj = T)

atm4.arima

## Series: atm4.train 
## ARIMA(0,0,1)(2,0,0)[7] with non-zero mean 
## 
## Coefficients:
##          ma1    sar1    sar2     mean
##       0.0883  0.2092  0.1443  30.0424
## s.e.  0.0576  0.0577  0.0586   1.3337
## 
## sigma^2 = 200.8:  log likelihood = -1239.71
## AIC=2489.43   AICc=2489.63   BIC=2508.03

The auto.arima() function chose an ARIMA(0,0,1)(2,0,0)[7] model. We’ll first check the accuracy measures on the training set and then plot them.

accuracy(atm2.arima)

##                      ME     RMSE      MAE  MPE MAPE      MASE       ACF1
## Training set -0.4826825 6.684235 4.445472 -Inf  Inf 0.7890405 0.04564369

atm4.pred <- forecast(atm4.arima,h=31)

autoplot(atm4.pred, PI=T) + theme_light() +
  labs(title="Withdrawals from ATM4",
       subtitle = "ARIMA(0,0,1)(2,0,0)[7] Model",
       y="Amount (100's of $)",
       x="Week")

The RMSE accuracy measure appears better than the STL Seasonal Naive model, but the MASE is actually a bit worse. Also, looking at the graph above, the predictions don’t look very stable.

We’ll check the residuals next:

checkresiduals(atm4.arima)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(0,0,1)(2,0,0)[7] with non-zero mean
## Q* = 11.319, df = 11, p-value = 0.4169
## 
## Model df: 3.   Total lags used: 14

The residuals aren’t exactly normal, though there appear to be no significant correlations between them and the Ljung-Box confirms the same.

Finally, let’s look at all of the models’ performance with the test data:

# Check test set accuracy measures
testMeasures <- cbind(method = "STL", 
                      data.frame(
                        as.list(accuracy(atm4.stl, atm4.test)["Test set",])
                      ))

testMeasures <- rbind(testMeasures,
  cbind(method = "Holt-Winters", 
                      data.frame(
                        as.list(accuracy(atm4.hw, atm4.test)["Test set",])
                      ))
)

atm4.pred <- Arima(atm4.test,model=atm4.arima,biasadj=T)

testMeasures <- rbind(testMeasures,
  cbind(method = "ARIMA", 
                      data.frame(
                        as.list(accuracy(atm4.pred)["Training set",])
                      ), Theil.s.U = NA))

testMeasures %>% select(method, RMSE, MASE) %>%
  kable() %>% kable_styling(bootstrap_options = "striped", full_width = F)

method	RMSE	MASE
STL	21.28007	1.1667665
Holt-Winters	17.59295	0.9856173
ARIMA	12.33961	0.8749154

Surprisingly, the ARIMA model does better than all the others in terms of our chosen accuracy measures on the test data set. However, the instability of the model predictions makes me hesitant to use it.

In conclusion, I would choose the Holt-Winter model here, despite it’s worse performance in accuracy measures, because I feel it provides more stable predictions of the three models.

atm4.forecast <- forecast(atm4.trans, model=atm4.hw, h=31,
                          use.initial.values=TRUE,
                          biasadj = T)

atm4.forecast %>%
  kable() %>% kable_styling(bootstrap_options = "striped") %>%
  scroll_box(width = "100%", height = "200px")

	Point Forecast	Lo 80	Hi 80	Lo 95	Hi 95
59.42857	26.05965	10.470474	49.50389	5.2010605	65.25192
59.57143	27.51257	11.324439	51.64204	5.7757149	67.78627
59.71429	28.18119	11.699643	52.66995	6.0224496	69.03117
59.85714	27.16074	11.031793	51.32039	5.5456069	67.52032
60.00000	23.22162	8.635555	45.77789	3.9248643	61.11479
60.14286	10.16172	1.927026	25.85077	0.2560098	37.39999
60.28571	28.53184	11.760629	53.51730	6.0073660	70.23031
60.42857	26.05043	10.209555	50.09025	4.9321807	66.30303
60.57143	27.50308	11.052246	52.24166	5.4910460	68.85870
60.71429	28.17157	11.422702	53.27568	5.7313119	70.11364
60.85714	27.15131	10.763731	51.91736	5.2675011	68.58935
61.00000	23.21296	8.401166	46.33875	3.6950086	62.12720
61.14286	10.15619	1.824018	26.26222	0.2059312	38.17594
61.28571	28.52215	11.483438	54.12705	5.7171902	71.32070
61.42857	26.04121	9.953083	50.67825	4.6719199	67.35948
61.57143	27.49359	10.784547	52.84291	5.2150999	69.93647
61.71429	28.16195	11.150264	53.88306	5.4489254	71.20148
61.85714	27.14189	10.500105	52.51602	4.9980256	69.66383
62.00000	23.20431	8.171013	46.90141	3.4733746	63.14521
62.14286	10.15067	1.724389	26.67582	0.1618406	38.95794
62.28571	28.51247	11.210683	54.73854	5.4356753	72.41664
62.42857	26.03200	9.700924	51.26806	4.4200735	68.42159
62.57143	27.48410	10.521203	53.44597	4.9476692	71.01990
62.71429	28.15234	10.882190	54.49225	5.1750833	72.29501
62.85714	27.13246	10.240785	53.11654	4.7369799	70.74407
63.00000	23.19565	7.944979	47.46604	3.2597776	64.16911
63.14286	10.14514	1.628061	27.09168	0.1235512	39.74622
63.28571	28.50279	10.942235	55.35195	5.1626242	73.51843
63.42857	26.02278	9.452956	51.85985	4.1764548	69.48964
63.57143	27.47461	10.262092	54.05102	4.6885653	72.10929
63.71429	28.14273	10.618357	55.10345	4.9095971	73.39452

# Output point forecasts
output <- cbind(date = seq(from=as.Date("2010-05-01"),
                           to=as.Date("2010-05-31"),by=1),
      point_forecast = as.data.frame(InvBoxCox(atm4.forecast$mean,lambda=l)))

write.xlsx(output,"forecast_ATM4.xlsx")

autoplot(atm4.forecast, PI=T) +
  theme_light() +
  labs(title="Withdrawals from ATM4",
       subtitle = "Holt-Winters Forecasts",
       y="Amount (100's of $)",
       x="Week")

Part B - Forecasting Power

Read in Data

The date column needs to be adjusted to a yearmonth date rather than a character. There is also a missing value in the KWH column. I will replace the missing value with the prior year value of the same month. For example, the missing value for September, 2008 will be filled with the September, 2007 KWH value.

b <- read_excel("ResidentialCustomerForecastLoad-624.xlsx")

head(b)

## # A tibble: 6 × 3
##   CaseSequence `YYYY-MMM`     KWH
##          <dbl> <chr>        <dbl>
## 1          733 1998-Jan   6862583
## 2          734 1998-Feb   5838198
## 3          735 1998-Mar   5420658
## 4          736 1998-Apr   5010364
## 5          737 1998-May   4665377
## 6          738 1998-Jun   6467147

b <- b %>%
        rename(YearMonth = "YYYY-MMM") %>%
        mutate(YearMonth = yearmonth(YearMonth))

b.data <- as_tsibble(b, index = YearMonth) 

head(b.data)

## # A tsibble: 6 x 3 [1M]
##   CaseSequence YearMonth     KWH
##          <dbl>     <mth>   <dbl>
## 1          733  1998 Jan 6862583
## 2          734  1998 Feb 5838198
## 3          735  1998 Mar 5420658
## 4          736  1998 Apr 5010364
## 5          737  1998 May 4665377
## 6          738  1998 Jun 6467147

summary(b.data$KWH)

##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max.     NA's 
##   770523  5429912  6283324  6502475  7620524 10655730        1

paste0("Num of missing values: ", sum(is.na(b.data)))

## [1] "Num of missing values: 1"

paste0("Row index of NA value: " ,which(is.na(b.data$KWH)))

## [1] "Row index of NA value: 129"

b.data[129,]

## # A tsibble: 1 x 3 [1M]
##   CaseSequence YearMonth   KWH
##          <dbl>     <mth> <dbl>
## 1          861  2008 Sep    NA

b.data[129,3] <- b.data[117,3]

paste0("Num of missing values: ", sum(is.na(b.data)))

## [1] "Num of missing values: 0"

Data Exploration

The data is seasonal and contains and outlier for July 2010 with a much lower power usage than any other month. There also appears to be a slight upward trend in energy usage which could be due to a rise / fall in local temperature month over month especially during summer and winter months. The outlier will be replaced with the previous year’s value for that month (Ex July 2010 will be replaced with July 2009).

b.data %>%
        autoplot(KWH) +
        ggtitle("Residential Customer Power Consumption w/ Outlier")

paste0("Row index of outlier: " ,which.min(b.data$KWH))

## [1] "Row index of outlier: 151"

b.data[151,]

## # A tsibble: 1 x 3 [1M]
##   CaseSequence YearMonth    KWH
##          <dbl>     <mth>  <dbl>
## 1          883  2010 Jul 770523

b.data[151,3] <- b.data[139,3]

b.data %>%
        autoplot(KWH) +
        ggtitle("Residential Customer Power Consumption w/ Adjusted Outlier")

Data Modeling

Check if Data is Stationary

Like in Part A, checking the stationarity of the time series is key to using ARIMA models. The data appears to have strong seasonal autocorrelation every 12 months. After taking 1 seasonal difference the data appears to be stationary.

gg_tsdisplay(b.data, y = KWH, plot_type='partial') +
        ggtitle("Before Differencing")

b.data %>%
  features(KWH, unitroot_kpss, lag = 12)

## # A tibble: 1 × 2
##   kpss_stat kpss_pvalue
##       <dbl>       <dbl>
## 1      1.67        0.01

nsd <- b.data %>%
  features(KWH, unitroot_nsdiffs, lag = 12)

paste0("Number of Seasonal differences needed for stationarity: ", nsd$nsdiffs)

## [1] "Number of Seasonal differences needed for stationarity: 1"

b.data %>%
  features(difference(KWH, lag = 12), unitroot_kpss)

## # A tibble: 1 × 2
##   kpss_stat kpss_pvalue
##       <dbl>       <dbl>
## 1     0.162         0.1

gg_tsdisplay(b.data, difference(KWH, lag = 12), plot_type='partial') +
        ggtitle("ATM1 After Differencing")

## Warning: Removed 12 rows containing missing values (`geom_line()`).

## Warning: Removed 12 rows containing missing values (`geom_point()`).

b.data <- b.data %>%
        mutate(diff_KWH = difference(KWH, lag = 12))

Model Creation

A few different models with varying degrees of complexity were created to attempt to find a model that could be effective in forecasting future consumer power usage. Basic models like MEAN and SNAIVE were used as well as more complicated ETS and ARIMA models. Each model will be compared to find the best fitting model to use for forecasting.

fit5 <- b.data %>%
  model(
        MEAN = MEAN(KWH),
        SNAIVE = SNAIVE(KWH),
        ETS_Additive = ETS(KWH ~ error("A") + trend("A") + season("A")),
        ETS_Multiplicative = ETS(KWH ~ error("M") + trend("A") + season("M")),
        #ETS_Additive_Boxcox = ETS(box_cox(KWH, lambda) ~ error("A") + trend("N") + season("A")),
        #ETS_Multiplicative_Boxcox  = ETS(box_cox(KWH, lambda) ~ error("M") + trend("N") + season("M"))
        ARIMA = ARIMA(KWH)
        )

Model Evaluation

The ARIMA(0,0,4)(2,1,0)[12] w/ drift model outperformed all of the other models in terms of RMSE and MAE. This model will be used for forecasting.

# acc.met <- fit1 %>%
#        glance() %>%
#        select(.model:BIC)

# acc.met2 <- fit1.arima %>%
#        glance() %>%
#        select(.model:BIC)

#acc <- rbind(acc.met, acc.met2)

#acc.met10 <- fit5 %>%
#        accuracy() %>%
#        select(.model, RMSE, MAE)

# RMSE2 <- fit1.arima %>%
#        accuracy() %>%
#        select(RMSE)

#rmse <- rbind(RMSE, RMSE2)

#kableExtra::kable(cbind(acc.met, acc.met2))

# kableExtra::kable(acc.met10)

Model Diagnostics

The ARIMA(0,0,4)(2,1,0)[12] w/ drift meet most of the expectations for model forecasting. The AFC plot shows that innovation residuals are likely from white noise. The mean of the innovation residuals are close to zero given the scale of the innovation residuals is in the millions (mean = -8653). The homoscedascity property is not met, however, as the variance changes throughout the series. Finally the data is not exactly from a normal distribution. The homoscedascity and normality properties are helpful but not necessary for forecasting.

fit5.forecast <- b.data %>%
  model(
        ARIMA = ARIMA(KWH)
        )

fit5.forecast %>%
        gg_tsresiduals()

resid <- residuals(fit5.forecast, type = "innovation")
mean(resid$.resid)

## [1] -8653.906

Forecast

The forecast for the Electricity Consumption is provided below. The forecasts have been stored in a .csv file that will be up on my Github and attached to the Project submission.

fc5.final <- fit5.forecast %>%
  forecast(h = "1 year")


fc5.final %>%
  autoplot(b.data) +
  ggtitle("Residential Power Consumption: ARIMA(0,0,4)(2,1,0)[12] w/ drift Forecast")

DATA 624 Project 1