This project consists of 3 parts - two required and one bonus and is worth 15% of your grade. The project is due at 11:59 PM on Sunday March 31. I will accept late submissions with a penalty until the meetup after that when we review some projects.
library(tidyverse)
library(scales)
library(readxl)
library(forecast)
library(lubridate)
Part A – ATM Forecast, ATM624Data.xlsx
In part A, I want you to forecast how much cash is taken out of 4 different ATM machines for May 2010. The data is given in a single file. The variable ‘Cash’ is provided in hundreds of dollars, other than that it is straight forward. I am being somewhat ambiguous on purpose to make this have a little more business feeling. Explain and demonstrate your process, techniques used and not used, and your actual forecast. I am giving you data via an excel file, please provide your written report on your findings, visuals, discussion and your R code via an RPubs link along with the actual.rmd file Also please submit the forecast which you will put in an Excel readable file.df<-read_excel("/Users/hovig/Downloads/ATM624Data.xlsx")
df<-df %>%
drop_na() %>%
spread(ATM, Cash) %>%
mutate(DATE = as.Date(DATE, origin = "1899-12-30"))
atm_ts<-ts(df %>% select(-DATE))
autoplot(atm_ts) +
labs(title = "Withdrawn Cash From All 4 ATMS", subtitle = "05/09 - 04/10", x = "Day") +
scale_y_continuous("Cash withdrawn in hundreds", labels = dollar) +
scale_color_discrete(NULL) +
theme(legend.position = c(0.1, 0.8))
df %>% gather(atm, Cash, -DATE) %>%
ggplot(aes(x = DATE, y = Cash, col = atm)) +
geom_line(show.legend = FALSE) +
facet_wrap(~ atm, ncol = 1, scales = "free_y") +
labs(title = "Withdrawn Cash From All 4 ATMs", subtitle = "05/09 - 04/10", x = "Date") +
scale_y_continuous("Cash withdrawn in hundreds", labels = dollar)
As explained above, each of the ATMs behaves differently, so each will be forecasted separately using the following approach based on the previously mentioned observations:
Separate timeseries objects are created to perform these forecasts:
atm1<-atm_ts[, "ATM1"]
atm2<-atm_ts[, "ATM2"]
atm3<-atm_ts[(nrow(atm_ts) - 2):nrow(atm_ts), "ATM3"]
atm3<-ts(atm3, start = 363)
atm3<-atm_ts[, "ATM3"]
atm3[which(atm3 == 0)]<-NA
atm4<-atm_ts[, "ATM4"]
atm4[which.max(atm4)]<-median(atm4, na.rm = TRUE)
autoplot(ts(atm_ts[1:61, ])) +
labs(title = "Withdrawn Cash From All ATMs", subtitle = "05/09 - 06/10", x = "Day") +
scale_y_continuous("Withdrawn Cash in hundreds", labels = dollar) +
scale_color_discrete(NULL)
ts
objects with frequency = 1
) can not be decomposed to capture the seasonality identified above.frequency = 7
for ATMs 1, 2, & 4atm1<-ts(atm1, frequency = 7)
atm2<-ts(atm2, frequency = 7)
atm2[which(is.na(atm2))]<-median(atm2, na.rm = TRUE)
atm4<-ts(atm4, frequency = 7)
ggtsdisplay(atm1, points = FALSE, main = "Withdrawals from ATM1", xlab = "Week", ylab = "Cash in hundreds")
ggtsdisplay(diff(atm1, 7), points = FALSE, main = "Differenced (lag-7) withdrawals from ATM1", xlab = "Week", ylab = "Cash in hundreds")
atm1_lambda<-BoxCox.lambda(atm1)
atm_aic<-function(p, d, q, P, D, Q) {
AIC(Arima(atm1, order = c(p, d, q), seasonal = c(P, D, Q), lambda = atm1_lambda))
}
expand.grid(p = 0:1, q = 0:1, P = 0:1, Q = 0:1) %>%
filter(p > 0 | q > 0 | P > 0 | Q > 0) %>%
mutate(aic = pmap_dbl(list(p, 0, q, P, 1, Q), atm_aic)) %>%
slice(which.min(aic))
## p q P Q aic
## 1 1 1 0 1 1096.583
atm1_fit<-Arima(atm1, order = c(1, 0, 1), seasonal = c(0, 1, 1), lambda = atm1_lambda)
Box.test(resid(atm1_fit), type = "L", fitdf = 3, lag = 7)
##
## Box-Ljung test
##
## data: resid(atm1_fit)
## X-squared = 8.0185, df = 4, p-value = 0.0909
ggtsdisplay(resid(atm1_fit), points = FALSE, plot.type = "histogram", main = "Residuals for ARIMA(1,0,1)(0,1,1) fit of ATM1 withdrawals", xlab = "Week", ylab = "Residual")
## Warning: Removed 3 rows containing non-finite values (stat_bin).
ggtsdisplay(atm2, points = FALSE, main = "Withdrawals from ATM2", xlab = "Week", ylab = "Cash in hundreds")
ggtsdisplay(diff(atm2, 7), points = FALSE, main = "Differenced (lag-7) withdrawals from ATM2", xlab = "Week", ylab = "Cash in hundreds")
atm2_lambda<-BoxCox.lambda(atm2)
atm_aic<-function(p, d, q, P, D, Q) {
AIC(Arima(atm2, order = c(p, d, q), seasonal = c(P, D, Q), lambda = atm2_lambda))
}
expand.grid(p = c(0, 2, 5), q = c(0, 2, 5), P = 0:1, Q = 0:1) %>%
filter(p > 0 | q > 0 | P > 0 | Q > 0) %>%
mutate(aic = pmap_dbl(list(p, 0, q, P, 1, Q), atm_aic)) %>%
slice(which.min(aic))
## p q P Q aic
## 1 5 5 0 1 2544.791
atm2_fit<-Arima(atm2, order = c(5, 0, 5), seasonal = c(0, 1, 1), lambda = atm2_lambda)
Box.test(resid(atm2_fit), type = "L", fitdf = 11, lag = 14)
##
## Box-Ljung test
##
## data: resid(atm2_fit)
## X-squared = 4.6959, df = 3, p-value = 0.1955
ggtsdisplay(resid(atm2_fit), points = FALSE, plot.type = "histogram", main = "Residuals for ARIMA(5,0,5)(0,1,1) of ATM2 withdrawals", xlab = "Week", ylab = "Residual")
ggtsdisplay(atm4, points = FALSE, main = "Withdrawals from ATM4", xlab = "Week", ylab = "Cash in hundreds")
ggtsdisplay(diff(atm4, 7), points = FALSE, main = "Differenced (lag-7) withdrawals from ATM4", xlab = "Week", ylab = "Cash in hundreds")
atm4_lambda<-BoxCox.lambda(atm4)
atm_aic<-function(p, d, q, P, D, Q) {
AIC(Arima(atm4, order = c(p, d, q), seasonal = c(P, D, Q), lambda = atm4_lambda))
}
expand.grid(p = c(0, 2, 5), q = c(0, 2, 5), P = 0:1, Q = 0:1) %>%
filter(p > 0 | q > 0 | P > 0 | Q > 0) %>%
mutate(aic = pmap_dbl(list(p, 0, q, P, 1, Q), atm_aic)) %>%
slice(which.min(aic))
## p q P Q aic
## 1 5 2 0 1 2863.061
atm4_fit<-Arima(atm4, order = c(0, 0, 2), seasonal = c(0, 1, 1), lambda = atm4_lambda)
The residuals are investigated using a Ljung-Box test and diagnostic plotting:
Box.test(resid(atm4_fit), type = "L", fitdf = 3, lag = 7)
##
## Box-Ljung test
##
## data: resid(atm4_fit)
## X-squared = 8.6819, df = 4, p-value = 0.06956
ggtsdisplay(resid(atm4_fit), points = FALSE, plot.type = "histogram", main = "Residuals for ARIMA(0,0,2)(0,1,1) of ATM4 withdrawals", xlab = "Week", ylab = "Residual")
The four forecasts identified above are performed for May 2010 (31 days):
atm1_forecast<-forecast(atm1_fit, 31, level = 95)
atm2_forecast<-forecast(atm2_fit, 31, level = 95)
atm3_forecast<-meanf(atm3, 31, level = 95)
atm4_forecast<-forecast(atm4_fit, 31, level = 95)
Each of the forecasts are plotted below:
gridExtra::grid.arrange(
autoplot(atm1_forecast) +
labs(title = "ATM1: ARIMA(1,0,1)(0,1,1)", x = "Week", y = NULL) +
theme(legend.position = "none"),
autoplot(atm2_forecast) +
labs(title = "ATM2: ARIMA(5,0,5)(0,1,1)", x = "Week", y = NULL) +
theme(legend.position = "none"),
autoplot(atm3_forecast) +
labs(title = "ATM3: mean", x = "Day", y = NULL) +
theme(legend.position = "none"),
autoplot(atm4_forecast) +
labs(title = "ATM4: ARIMA(0,0,2)(0,1,1)", x = "Week", y = NULL) +
theme(legend.position = "none"),
top = grid::textGrob("Forecasted ATM withdrawals (in hundreds of dollars) for 05/10\n")
)
## Warning: Removed 362 rows containing missing values (geom_path).
data_frame(DATE = rep(max(df$DATE) + 1:31, 4), ATM = rep(names(df)[-1], each = 31), Cash = c(atm1_forecast$mean, atm2_forecast$mean, atm3_forecast$mean, atm4_forecast$mean)) %>%
write_csv("/Users/hovig/Downloads/output_data624_project1_partA.csv")
## Warning: `data_frame()` is deprecated, use `tibble()`.
## This warning is displayed once per session.
Part B – Forecasting Power, ResidentialCustomerForecastLoad-624.xlsx
Part B consists of a simple dataset of residential power usage for January 1998 until December 2013. Your assignment is to model these data and a monthly forecast for 2014. The data is given in a single file. The variable ‘KWH’ is power consumption in Kilowatt hours, the rest is straight forward. Add this to your existing files above.kwh<-read_excel("/Users/hovig/Downloads/ResidentialCustomerForecastLoad-624.xlsx")
kwh<-ts(kwh[, "KWH"], start = c(1998, 1), frequency = 12)
The timeseries is plotted to inspect its features:
autoplot(kwh) +
labs(title = "Monthly Residential Power Usage", subtitle = "01/98 - 12/13")
kwh_lambda<-BoxCox.lambda(kwh)
kwh_trans<-BoxCox(kwh, kwh_lambda)
The data transformed using \(\lambda = 0.113\) are plotted below with lag-12 differencing:
ggtsdisplay(diff(kwh_trans, 12), points = FALSE, main = "Differenced (lag-12) Box-Cox transformed residential power usage")
kWh
timeseries with \(D = 1\) and \(d = 0\)kwh_aic<-function(p, q, P, Q) {
AIC(Arima(kwh, order = c(p, 0, q), seasonal = c(P, 1, Q), lambda = kwh_lambda))
}
expand.grid(p = c(0, 1, 4), q = c(0, 1, 4), P = 0:1, Q = 0:1) %>%
filter(p > 0 | q > 0 | P > 0 | Q > 0, p < 4 | q < 4 | P < 1 | Q < 1) %>%
mutate(aic = pmap_dbl(list(p, q, P, Q), kwh_aic)) %>%
slice(which.min(aic))
## p q P Q aic
## 1 1 0 0 1 575.2659
kwh_fit<-Arima(kwh, order = c(1, 0, 0), seasonal = c(0, 1, 1), lambda = kwh_lambda)
Box.test(resid(kwh_fit), type = "L", fitdf = 3, lag = 12)
##
## Box-Ljung test
##
## data: resid(kwh_fit)
## X-squared = 6.4543, df = 9, p-value = 0.6937
ggtsdisplay(resid(kwh_fit), points = FALSE, main = "Residential Power Usage Residuals for ARIMA(1,0,0)(0,1,1)")
expand.grid(p = c(1, 3), q = c(1, 3)) %>%
mutate(aic = pmap_dbl(list(p, q, 0, 1), kwh_aic))
## p q aic
## 1 1 1 575.7090
## 2 3 1 579.7385
## 3 1 3 579.9391
## 4 3 3 581.4511
ggtsdisplay(resid(kwh_fit), points = FALSE, plot.type = "histogram", main = "Residuals for ARIMA(1,0,0)(0,1,1) of residential power usage")
## Warning: Removed 1 rows containing non-finite values (stat_bin).
kwh_forecast<-forecast(kwh_fit, 12, level = 95)
autoplot(kwh_forecast) +
labs(title = "Residential Energy Forecast for 2014", subtitle = "Using ARIMA(1,0,0)(0,1,1)", x = "Month", y = "kWh") +
theme(legend.position = "none")
data_frame(`YYYY-MMM` = paste0(2014, "-", month.abb), KWH = kwh_forecast$mean) %>%
write_csv("/Users/hovig/Downloads/output_data624_project1_partB.csv")
Part C – BONUS, optional (part or all), Waterflow_Pipe1.xlsx and Waterflow_Pipe2.xlsx
Part C consists of two data sets. These are simple 2 columns sets, however they have different time stamps. Your optional assignment is to time-base sequence the data and aggregate based on hour (example of what this looks like, follows). Note for multiple recordings within an hour, take the mean. Then to determine if the data is stationary and can it be forecast. If so, provide a week forward forecast and present results via Rpubs and .rmd and the forecast in an Excel readable file.Prior to loading in the provided data for the two pipelines, the following changes are made in both files to make the data more easily readable by R:
water1<-read_excel("/Users/hovig/Downloads/Waterflow_Pipe1.xlsx",col_types =c("date", "numeric"))
water2<-read_excel("/Users/hovig/Downloads/Waterflow_Pipe2.xlsx",col_types =c("date", "numeric"))
colnames(water1)= c("DateTime","WaterFlow")
colnames(water2)= c("DateTime","WaterFlow")
water1<-water1 %>%
mutate(Date = date(DateTime),
Hour = hour(DateTime) + 1) %>%
group_by(Date, Hour) %>%
summarize(WaterFlow = mean(WaterFlow)) %>%
ungroup() %>%
mutate(DateTime = ymd_h(paste(Date, Hour))) %>%
select(DateTime, WaterFlow)
water1
## # A tibble: 236 x 2
## DateTime WaterFlow
## <dttm> <dbl>
## 1 2015-10-23 01:00:00 26.1
## 2 2015-10-23 02:00:00 18.9
## 3 2015-10-23 03:00:00 15.2
## 4 2015-10-23 04:00:00 23.1
## 5 2015-10-23 05:00:00 15.5
## 6 2015-10-23 06:00:00 22.7
## 7 2015-10-23 07:00:00 20.6
## 8 2015-10-23 08:00:00 18.4
## 9 2015-10-23 09:00:00 20.8
## 10 2015-10-23 10:00:00 21.2
## # … with 226 more rows
water_df<-full_join(water1, water2, by = "DateTime", suffix = c("_1", "_2")) %>%
mutate(WaterFlow_1 = ifelse(is.na(WaterFlow_1), 0, WaterFlow_1)) %>%
mutate(WaterFlow = WaterFlow_1 + WaterFlow_2) %>%
select(DateTime, WaterFlow)
water_ts<-ts(water_df$WaterFlow, frequency = 24)
Plot timeseries:
autoplot(water_ts) +
labs(title = "Two Pipelines Hourly Water Flow", subtitle = "[October 23, 2015 - December 3, 2015]", x = "Day", y = "Total Waterflow")
This plot shows a decent amount of variability across the whole range, with an initial downward trend before day 10 followed by a roughly flat period through the end of the time window.
Due to the apparent non-stationarity, a lag-1 difference is taken
water_lambda<-BoxCox.lambda(water_ts)
water_trans<-BoxCox(water_ts, water_lambda)
ggtsdisplay(diff(water_trans), points = FALSE, main = "Transformed WaterFlow Differenced Box-Cox")
ARIMA(1,1,1) model is used:
water_fit<-Arima(water_ts, order = c(1, 1, 1), lambda = water_lambda)
The residuals of this fit are investigated:
Box.test(resid(water_fit), type = "L")
##
## Box-Ljung test
##
## data: resid(water_fit)
## X-squared = 2.0096e-05, df = 1, p-value = 0.9964
ggtsdisplay(resid(water_fit), points = FALSE, plot.type = "histogram", main = "WaterFlow Residuals for ARIMA(1,1,1)")
Using the ARIMA(1,1,1) model with 168 hours for forecast:
water_forecast<-forecast(water_fit, 168, level = 95)
autoplot(water_forecast) +
labs(title = "WaterFlow Forecasted", subtitle = "Using ARIMA(1,1,1)", x = "Day", y = "Total WaterFlow") +
theme(legend.position = "none")
data_frame(DateTime = max(water_df$DateTime) +
hours(1:168), WaterFlow = water_forecast$mean) %>%
write_csv("/Users/hovig/Downloads/output_data624_project1_partC.csv")