suppressPackageStartupMessages(library(dplyr))
suppressPackageStartupMessages(library(scales))
suppressPackageStartupMessages(library(tidyr))
suppressPackageStartupMessages(library(ggplot2))
suppressPackageStartupMessages(library(purrr))
suppressPackageStartupMessages(library(vroom))
suppressPackageStartupMessages(library(readr))
suppressPackageStartupMessages(library(lubridate))
suppressPackageStartupMessages(library(readxl))
suppressPackageStartupMessages(library(forecast))

Project 1

PART A - ATM Forecast, ATM624Data.xlsx

In part A, I want you to forecast how much cash is taken out of 4 different ATM machines for May 2010. The data is given in a single file. The variable ‘Cash’ is provided in hundreds of dollars, other than that it is straight forward. I am being somewhat ambiguous on purpose to make this have a little more business feeling. Explain and demonstrate your process, techniques used and not used, and your actual forecast. I am giving you data via an excel file, please provide your written report on your findings, visuals, discussion and your R code via an RPubs link along with the actual.rmd file Also please submit the forecast which you will put in an Excel readable file.

# Importing and cleaning up the data

ATMdata = readxl::read_excel("ATM624Data.xlsx")

ATMdata<-ATMdata %>%
  drop_na() %>%
  spread(ATM, Cash) %>% 
  mutate(DATE = as.Date(DATE, origin = "1899-12-30")) 

ATMts<-ts(ATMdata %>% select(-DATE))

# plotting withdrawn cash from all 4 ATMS individually.

autoplot(ATMts) +
  labs(title = "Withdrawn Cash", subtitle = "05/09 - 04/10", x = "Day") +
  scale_y_continuous("Cash (hundreds)", labels = dollar) +
  scale_color_discrete(NULL) +
  theme(legend.position = c(0.1, 0.8))

ATMdata %>% gather(atm, Cash, -DATE) %>% 
  ggplot(aes(x = DATE, y = Cash, col = atm)) +
  geom_line(show.legend = FALSE) +
  facet_wrap(~ atm, ncol = 1, scales = "free_y") +
  labs(title = "Withdrawn Cash", subtitle = "05/09 - 04/10", x = "Date") +
  scale_y_continuous("Cash (hundreds)", labels = dollar)

Each ATM is unique in behavior, therefore each will require a different approach to forecast

# Treating each ATM as separate time series

atm1<-ATMts[, "ATM1"]
atm1<-ts(atm1, frequency = 7)

atm2<-ATMts[, "ATM2"]
atm2<-ts(atm2, frequency = 7)
atm2[which(is.na(atm2))]<-median(atm2, na.rm = TRUE)

atm3<-ATMts[(nrow(ATMts) - 2):nrow(ATMts), "ATM3"]
atm3<-ts(atm3, start = 363)
atm3<-ATMts[, "ATM3"]
atm3[which(atm3 == 0)]<-NA

atm4<-ATMts[, "ATM4"]
atm4[which.max(atm4)]<-median(atm4, na.rm = TRUE)
atm4<-ts(atm4, frequency = 7)

ATM 1

ggtsdisplay(atm1, points = FALSE, main = "Withdrawals from ATM1", xlab = "Week", ylab = "Cash (hundreds)")

To account for seasonality the series is differenced with lag of 7

ggtsdisplay(diff(atm1, 7), points = FALSE,  main = "ATM1 Differenced (lag-7)", xlab = "Week", ylab = "Cash (hundreds")

# define function to create models & return AIC values for timeseries
# create model with Box-Cox and specified ARIMA parameters; extract AIC

atm1_lambda<-BoxCox.lambda(atm1)

atm_aic<-function(p, d, q, P, D, Q) {
  AIC(Arima(atm1, order = c(p, d, q), seasonal = c(P, D, Q), lambda = atm1_lambda))
}

# create possible combinations of p, q, P, Q except all zero

expand.grid(p = 0:1, q = 0:1, P = 0:1, Q = 0:1) %>%
  filter(p > 0 | q > 0 | P > 0 | Q > 0) %>% 
  mutate(aic = pmap_dbl(list(p, 0, q, P, 1, Q), atm_aic)) %>% 
  slice(which.min(aic))

##   p q P Q      aic
## 1 1 1 0 1 1096.583

# Ljung-Box test and diagnostic plotting to investigate residuals

atm1_fit<-Arima(atm1, order = c(1, 0, 1), seasonal = c(0, 1, 1), lambda = atm1_lambda)

Box.test(resid(atm1_fit), type = "L", fitdf = 3, lag = 7)

## 
##  Box-Ljung test
## 
## data:  resid(atm1_fit)
## X-squared = 8.0185, df = 4, p-value = 0.0909

ggtsdisplay(resid(atm1_fit), points = FALSE, plot.type = "histogram", main = "Residuals for ARIMA(1,0,1)(0,1,1) fit of ATM1 withdrawals", xlab = "Week", ylab = "Residual")

## Warning: Removed 3 rows containing non-finite values (stat_bin).

The residuals are normally distributed with a mean of zero. Based on the Ljung-Box the residuals are white noise. (p-value > 0.05) The Model is good to be used for forecasting

ATM 2

ggtsdisplay(atm2, points = FALSE, main = "Withdrawals from ATM2", xlab = "Week", ylab = "Cash (hundreds)")

ggtsdisplay(diff(atm2, 7), points = FALSE, main = "ATM2 Differenced (lag-7)", xlab = "Week", ylab = "Cash in hundreds")

# define function to create models & return AIC values for timeseries
# create model with Box-Cox and specified ARIMA parameters; extract AIC

atm2_lambda<-BoxCox.lambda(atm2)

## Warning in guerrero(x, lower, upper): Guerrero's method for selecting a Box-Cox
## parameter (lambda) is given for strictly positive data.

atm_aic<-function(p, d, q, P, D, Q) {
  AIC(Arima(atm2, order = c(p, d, q), seasonal = c(P, D, Q), lambda = atm2_lambda))
}

# create possible combinations of p, q, P, Q except all zero
expand.grid(p = c(0, 2, 5), q = c(0, 2, 5), P = 0:1, Q = 0:1) %>%
  filter(p > 0 | q > 0 | P > 0 | Q > 0) %>% 
  mutate(aic = pmap_dbl(list(p, 0, q, P, 1, Q), atm_aic)) %>% 
  slice(which.min(aic))

##   p q P Q      aic
## 1 5 5 0 1 2544.791

# Ljung-Box test and diagnostic plotting to investigate residuals

atm2_fit<-Arima(atm2, order = c(5, 0, 5), seasonal = c(0, 1, 1), lambda = atm2_lambda)

Box.test(resid(atm2_fit), type = "L", fitdf = 11, lag = 14)

## 
##  Box-Ljung test
## 
## data:  resid(atm2_fit)
## X-squared = 4.6959, df = 3, p-value = 0.1955

ggtsdisplay(resid(atm2_fit), points = FALSE, plot.type = "histogram", main = "Residuals for ARIMA(5,0,5)(0,1,1) of ATM2 withdrawals", xlab = "Week", ylab = "Residual")

The residuals are normally distributed with a mean of zero. Based on the Ljung-Box the residuals are white noise. (p-value > 0.05) The Model is good to be used for forecasting

ATM 3

Since only 3 observations at ATM3 and only these observations are used for the forecast. A simple mean forecast will be used for this ATM

ATM 4

ggtsdisplay(atm4, points = FALSE, main = "Withdrawals from ATM4", xlab = "Week", ylab = "Cash (hundreds)")

ggtsdisplay(diff(atm4, 7), points = FALSE, main = "ATM4 Differenced (lag-7)", xlab = "Week", ylab = "Cash in hundreds")

# define function to create models & return AIC values for timeseries
# create model with Box-Cox and specified ARIMA parameters; extract AIC
atm4_lambda<-BoxCox.lambda(atm4)
atm_aic<-function(p, d, q, P, D, Q) {
  AIC(Arima(atm4, order = c(p, d, q), seasonal = c(P, D, Q), lambda = atm4_lambda))
}

# create possible combinations of p, q, P, Q except all zero
expand.grid(p = c(0, 2, 5), q = c(0, 2, 5), P = 0:1, Q = 0:1) %>%
  filter(p > 0 | q > 0 | P > 0 | Q > 0) %>% 
  mutate(aic = pmap_dbl(list(p, 0, q, P, 1, Q), atm_aic)) %>% 
  slice(which.min(aic))

##   p q P Q      aic
## 1 5 2 0 1 2863.061

# Ljung-Box test and diagnostic plotting to investigate residuals
atm4_fit<-Arima(atm4, order = c(0, 0, 2), seasonal = c(0, 1, 1), lambda = atm4_lambda)

Box.test(resid(atm4_fit), type = "L", fitdf = 3, lag = 7)

## 
##  Box-Ljung test
## 
## data:  resid(atm4_fit)
## X-squared = 8.6819, df = 4, p-value = 0.06956

ggtsdisplay(resid(atm4_fit), points = FALSE, plot.type = "histogram", main = "Residuals for ARIMA(0,0,2)(0,1,1) of ATM4 withdrawals", xlab = "Week", ylab = "Residual")

The residuals are normally distributed with a mean of zero. Based on the Ljung-Box the residuals are white noise. (p-value > 0.05) The Model is good to be used for forecasting

atm1_forecast<-forecast(atm1_fit, 31, level = 95)
atm2_forecast<-forecast(atm2_fit, 31, level = 95)
atm3_forecast<-meanf(atm3, 31, level = 95)
atm4_forecast<-forecast(atm4_fit, 31, level = 95)

gridExtra::grid.arrange(
  autoplot(atm1_forecast) + 
    labs(title = "ATM1: ARIMA(1,0,1)(0,1,1)", x = "Week", y = NULL) +
    theme(legend.position = "none"),
  autoplot(atm2_forecast) + 
    labs(title = "ATM2: ARIMA(5,0,5)(0,1,1)", x = "Week", y = NULL) +
    theme(legend.position = "none"),
  autoplot(atm3_forecast) + 
    labs(title = "ATM3: mean", x = "Day", y = NULL) +
    theme(legend.position = "none"),
  autoplot(atm4_forecast) + 
    labs(title = "ATM4: ARIMA(0,0,2)(0,1,1)", x = "Week", y = NULL) +
    theme(legend.position = "none"),
  top = grid::textGrob("Forecasted ATM withdrawals (in hundreds of dollars) for 05/10\n")
)

## Warning: Removed 362 row(s) containing missing values (geom_path).

data_frame(DATE = rep(max(ATMdata$DATE) + 1:31, 4), ATM = rep(names(df)[-1], each = 31), Cash = c(atm1_forecast$mean, atm2_forecast$mean, atm3_forecast$mean, atm4_forecast$mean)) %>% 
  write_csv("output_proj1_partA.csv")

## Warning: `data_frame()` was deprecated in tibble 1.1.0.
## Please use `tibble()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.

PART B - Forecasting Power, ResidentialCustomerForecastLoad-624.xlsx

Part B consists of a simple dataset of residential power usage for January 1998 until December 2013. Your assignment is to model these data and a monthly forecast for 2014. The data is given in a single file. The variable ‘KWH’ is power consumption in Kilowatt hours, the rest is straight forward. Add this to your existing files above.

kwh<-read_excel("ResidentialCustomerForecastLoad-624.xlsx")
kwh<-ts(kwh[, "KWH"], start = c(1998, 1), frequency = 12)

autoplot(kwh) +
  labs(title = "Monthly Residential Power Usage", subtitle = "01/98 - 12/13")

There is a great deal of seasonality and high variance.

#Use Box-cox to reduce variance
kwh_lambda<-BoxCox.lambda(kwh)
kwh_trans<-BoxCox(kwh, kwh_lambda)

ggtsdisplay(diff(kwh_trans, 12), points = FALSE, main = "Differenced (lag-12) Box-Cox transformed residential power usage")

# define function to create models & return AIC values for timeseries
# create model with Box-Cox and specified ARIMA parameters; extract AIC
kwh_aic<-function(p, q, P, Q) {
  AIC(Arima(kwh, order = c(p, 0, q), seasonal = c(P, 1, Q), lambda = kwh_lambda))
}

expand.grid(p = c(0, 1, 4), q = c(0, 1, 4), P = 0:1, Q = 0:1) %>%
  filter(p > 0 | q > 0 | P > 0 | Q > 0, p < 4 | q < 4 | P < 1 | Q < 1) %>%
  mutate(aic = pmap_dbl(list(p, q, P, Q), kwh_aic)) %>% 
  slice(which.min(aic))

##   p q P Q      aic
## 1 1 0 0 1 575.2659

# Ljung-Box test and diagnostic plotting to investigate residuals
kwh_fit<-Arima(kwh, order = c(1, 0, 0), seasonal = c(0, 1, 1), lambda = kwh_lambda)

Box.test(resid(kwh_fit), type = "L", fitdf = 3, lag = 12)

## 
##  Box-Ljung test
## 
## data:  resid(kwh_fit)
## X-squared = 6.4543, df = 9, p-value = 0.6937

ggtsdisplay(resid(kwh_fit), points = FALSE, main = "Residential Power Usage Residuals for ARIMA(1,0,0)(0,1,1)")

expand.grid(p = c(1, 3), q = c(1, 3)) %>%
  mutate(aic = pmap_dbl(list(p, q, 0, 1), kwh_aic))

##   p q      aic
## 1 1 1 575.7090
## 2 3 1 579.7385
## 3 1 3 579.9391
## 4 3 3 581.4511

ggtsdisplay(resid(kwh_fit), points = FALSE, plot.type = "histogram", main = "Residuals for ARIMA(1,0,0)(0,1,1) of residential power usage")

## Warning: Removed 1 rows containing non-finite values (stat_bin).

The residuals are normally distributed with a mean of zero. Based on the Ljung-Box the residuals are white noise. (p-value > 0.05) The Model is good to be used for forecasting

kwh_forecast<-forecast(kwh_fit, 12, level = 95)
autoplot(kwh_forecast) + 
    labs(title = "Residential Energy Forecast for 2014", subtitle = "Using ARIMA(1,0,0)(0,1,1)", x = "Month", y = "kWh") +
    theme(legend.position = "none")

data_frame(`YYYY-MMM` = paste0(2014, "-", month.abb), KWH = kwh_forecast$mean) %>% 
  write_csv("output_proj1_partB.csv")

DATA 624 Project 1

Christopher Ayre

April 3, 2022

PART A - ATM Forecast, ATM624Data.xlsx

ATM 1

ATM 2

ATM 3

ATM 4

PART B - Forecasting Power, ResidentialCustomerForecastLoad-624.xlsx