One-step Ahead Prediction for Cash Withdrawals: Comparing Three Approaches to Forecasting
R for Fun
Nguyen Chi Dung
# Load packages and all objects:
library(magrittr)
library(tidyverse)
library(data.table)
library(lubridate)
library(timetk)
library(stringr)
library(h2o)
# Set environment for using h2o package with 16 GB RAM:
h2o.init(nthreads = -1, max_mem_size = "16G")
## Connection successful!
##
## R is connected to the H2O cluster:
## H2O cluster uptime: 46 minutes 36 seconds
## H2O cluster timezone: Asia/Ho_Chi_Minh
## H2O data parsing timezone: UTC
## H2O cluster version: 3.20.0.2
## H2O cluster version age: 3 months and 18 days !!!
## H2O cluster name: H2O_started_from_R_chidung_rwi275
## H2O cluster total nodes: 1
## H2O cluster total memory: 15.91 GB
## H2O cluster total cores: 32
## H2O cluster allowed cores: 32
## H2O cluster healthy: TRUE
## H2O Connection ip: localhost
## H2O Connection port: 54321
## H2O Connection proxy: NA
## H2O Internal Security: FALSE
## H2O API Extensions: XGBoost, Algos, AutoML, Core V3, Core V4
## R Version: R version 3.5.1 (2018-07-02)
h2o.no_progress()
# Load objects:
rm(list = ls())
system.time(load("/home/chidung/all_objects_atms_10_09.RData"))
## user system elapsed
## 147.374 3.004 150.382
# Write a function for black theme:
my_theme <- function(...) {
theme(
axis.line = element_blank(),
axis.text.x = element_text(color = "white", lineheight = 0.9),
axis.text.y = element_text(color = "white", lineheight = 0.9),
axis.ticks = element_line(color = "white", size = 0.2),
axis.title.x = element_text(color = "white", margin = margin(0, 10, 0, 0)),
axis.title.y = element_text(color = "white", angle = 90, margin = margin(0, 10, 0, 0)),
axis.ticks.length = unit(0.3, "lines"),
legend.background = element_rect(color = NA, fill = "gray10"),
legend.key = element_rect(color = "white", fill = "gray10"),
legend.key.size = unit(1.2, "lines"),
legend.key.height = NULL,
legend.key.width = NULL,
legend.text = element_text(color = "white"),
legend.title = element_text(face = "bold", hjust = 0, color = "white"),
legend.text.align = NULL,
legend.title.align = NULL,
legend.direction = "vertical",
legend.box = NULL,
panel.background = element_rect(fill = "gray10", color = NA),
panel.border = element_blank(),
panel.grid.major = element_line(color = "grey35"),
panel.grid.minor = element_line(color = "grey20"),
panel.spacing = unit(0.5, "lines"),
strip.background = element_rect(fill = "grey30", color = "grey10"),
strip.text.x = element_text(color = "white"),
strip.text.y = element_text(color = "white", angle = -90),
plot.background = element_rect(color = "gray10", fill = "gray10"),
plot.title = element_text(color = "white", hjust = 0, lineheight = 1.25,
margin = margin(2, 2, 2, 2)),
plot.subtitle = element_text(color = "white", hjust = 0, margin = margin(2, 2, 2, 2)),
plot.caption = element_text(color = "white", hjust = 0),
plot.margin = unit(rep(1, 4), "lines"))
}
# Define components of daily cash demand:
hanoi_atm_trans$RESP_TEXT %>% unique() ->> trans_type
cash_demand <- trans_type[c(1, 17, 5, 2)]
atm_demand_cash <- hanoi_atm_trans %>% filter(RESP_TEXT %in% cash_demand)
# Calculate daily cash demand:
atm_demand_cash %>%
mutate(DAYID = mdy(DAYID)) %>%
group_by(DAYID, TERMINAL_ID) %>%
summarise(daily_cash = sum(AMOUNT)) %>%
ungroup() %>%
mutate(daily_cash = daily_cash / 1000000) ->> daily_cash_demand_atms
# ATMs have over 500 transaction days:
over_500_atm <- daily_cash_demand_atms %>%
group_by(TERMINAL_ID) %>%
summarise(n_days = n_distinct(DAYID)) %>%
ungroup() %>%
arrange(-n_days) %>%
filter(n_days >= 500)
daily_cash_demand_atms_500 <- daily_cash_demand_atms %>%
filter(TERMINAL_ID %in% over_500_atm$TERMINAL_ID)
# Some specific cases:
case1_cycle_TERMINAL_ID <- "00002009"
case2_random_TERMINAL_ID <- "00002019"
# A specific case:
my_case1 <- daily_cash_demand_atms_500 %>%
filter(TERMINAL_ID == case1_cycle_TERMINAL_ID) %>%
select(-TERMINAL_ID)
# Visualize daily cash demands for above case:
my_case1 %>%
ggplot(aes(DAYID, daily_cash)) +
geom_line(color = "cyan") +
my_theme() +
labs(x = NULL, y = NULL, title = "Daily Amount of Cash Transactions for 00002009")

# Wrire a function to convert to h2o frame:
convert_to_h2o <- function(your_df) {
your_df %>%
select(-DAYID) %>%
as.h2o() %>%
return()
}
#--------------------------------------
# Version 1: Use lag 1 as a feature
#--------------------------------------
# A function for data pre-processing:
pre_proccess_v1 <- function(your_df) {
your_df %>%
mutate(lag1 = lag(daily_cash, n = 1L)) %>%
tk_augment_timeseries_signature() %>%
na.omit() %>%
mutate_if(is.ordered, as.character) %>%
mutate_if(is.character, as.factor)%>%
return()
}
# Used the function:
pre_proccess_v1(my_case1) ->> cash_clean
# Wrire a function for conducting One-step Ahead Prediction:
my_predict <- function(k) {
range_validation <- 60
n_ahead <- 1
train_tbl <- cash_clean %>%
slice((1 + k):(400 + k))
valid_tbl <- cash_clean %>%
slice((400 + 1 + k):(400 + k + range_validation))
test_tbl <- cash_clean %>%
slice((400 + k + range_validation + 1):(400 + k + range_validation + n_ahead))
train_h2o <- train_tbl %>% convert_to_h2o()
valid_h2o <- valid_tbl %>% convert_to_h2o()
test_h2o <- test_tbl %>% convert_to_h2o()
# Set target and features:
y <- "daily_cash"
x <- setdiff(names(train_h2o), y)
# Automatic ML:
automl_models_h2o <- h2o.automl(x = x, y = y,
training_frame = train_h2o,
validation_frame = valid_h2o,
leaderboard_frame = test_h2o,
max_runtime_secs = 60,
max_models = 30,
stopping_metric = "MAE",
seed = 29)
# Use for forecasting:
pred_h2o <- h2o.predict(automl_models_h2o, newdata = test_h2o) %>%
as.data.frame() %>%
pull(predict)
actual_predicted_df_test <- test_tbl %>%
mutate(predicted = pred_h2o, error = daily_cash - predicted, pct = error / daily_cash) %>%
select(DAYID, daily_cash, predicted, everything())
return(actual_predicted_df_test)
}
# Function for conducting One-step Ahead Prediction for (n + 1) consecutive days:
my_predict_n_days <- function(n_days) {
results <- lapply(0:n_days, my_predict)
results_df <- do.call("bind_rows", results)
return(results_df)
}
# Use this function:
system.time(my_predict_n_days(30) ->> results_df)
## user system elapsed
## 618.797 13.712 2584.947
# Visualizing predictions vs actuals:
vis_results <- function(r_df) {
r_df %>%
select(DAYID, Actual = daily_cash, Predicted = predicted) %>%
gather(a, b, -DAYID) %>%
ggplot(aes(DAYID, b, color = a)) +
geom_line() +
geom_point() +
scale_color_manual(values = c("purple", "orange"), name = "") +
my_theme()
}
results_df %>%
vis_results() +
labs(x = NULL, y = NULL, subtitle = "Approach Used: Included Lag 1 as a Feature",
title = "Version 1: One-step Ahead Prediction for Cash Withdrawals (31 Consecutive Days)")

# Numeric results:
print_results <- function(df) {
df %>%
select(DAYID, Actual = daily_cash, Predicted = predicted) %>%
mutate_if(is.numeric, function(x) {round(x, 2)}) %>%
knitr::kable()
}
results_df %>%
print_results()
2018-04-28 |
85.13 |
71.84 |
2018-04-29 |
62.13 |
62.19 |
2018-04-30 |
27.38 |
30.53 |
2018-05-01 |
21.24 |
29.20 |
2018-05-02 |
65.50 |
66.23 |
2018-05-03 |
67.31 |
65.28 |
2018-05-04 |
17.34 |
20.46 |
2018-05-05 |
19.39 |
19.85 |
2018-05-06 |
16.55 |
19.13 |
2018-05-07 |
27.35 |
26.93 |
2018-05-08 |
12.87 |
47.04 |
2018-05-09 |
341.75 |
342.17 |
2018-05-10 |
406.48 |
343.96 |
2018-05-11 |
181.22 |
180.89 |
2018-05-12 |
181.20 |
180.54 |
2018-05-13 |
3.60 |
33.78 |
2018-05-14 |
124.67 |
79.27 |
2018-05-15 |
122.93 |
116.29 |
2018-05-16 |
74.98 |
77.16 |
2018-05-17 |
88.67 |
85.83 |
2018-05-18 |
17.00 |
37.79 |
2018-05-20 |
38.50 |
39.06 |
2018-05-22 |
49.51 |
46.03 |
2018-05-23 |
71.90 |
60.39 |
2018-05-24 |
59.91 |
57.56 |
2018-05-25 |
10.60 |
9.35 |
2018-05-26 |
34.35 |
39.33 |
2018-05-27 |
40.80 |
37.13 |
2018-05-28 |
36.75 |
31.65 |
2018-05-29 |
40.70 |
42.13 |
2018-05-30 |
38.30 |
37.10 |
# Total actual cash demand vs predicted:
results_df %>%
select(DAYID, daily_cash, predicted) %>%
gather(a, value, -DAYID) %>%
group_by(a) %>%
summarise(cash = sum(value))
## # A tibble: 2 x 2
## a cash
## <chr> <dbl>
## 1 daily_cash 2386.
## 2 predicted 2336.
#-------------------------------------------
# Version 2: Not use lag 1 as a feature
#-------------------------------------------
pre_proccess_v2 <- function(your_df) {
your_df %>%
tk_augment_timeseries_signature() %>%
na.omit() %>%
mutate_if(is.ordered, as.character) %>%
mutate_if(is.character, as.factor)%>%
return()
}
pre_proccess_v2(my_case1) ->> cash_clean
my_predict_n_days(30) ->> results_df
results_df %>%
vis_results() +
labs(x = NULL, y = NULL, subtitle = "Approach Used: Removed Lag 1",
title = "Version 2: One-step Ahead Prediction for Cash Withdrawals (31 Consecutive Days)")

results_df %>%
print_results()
2018-04-28 |
85.13 |
84.53 |
2018-04-29 |
62.13 |
61.63 |
2018-04-30 |
27.38 |
29.84 |
2018-05-01 |
21.24 |
35.57 |
2018-05-02 |
65.50 |
66.61 |
2018-05-03 |
67.31 |
67.00 |
2018-05-04 |
17.34 |
16.88 |
2018-05-05 |
19.39 |
17.11 |
2018-05-06 |
16.55 |
21.74 |
2018-05-07 |
27.35 |
46.03 |
2018-05-08 |
12.87 |
64.10 |
2018-05-09 |
341.75 |
335.66 |
2018-05-10 |
406.48 |
357.01 |
2018-05-11 |
181.22 |
186.83 |
2018-05-12 |
181.20 |
180.75 |
2018-05-13 |
3.60 |
6.68 |
2018-05-14 |
124.67 |
137.32 |
2018-05-15 |
122.93 |
105.49 |
2018-05-16 |
74.98 |
74.93 |
2018-05-17 |
88.67 |
87.24 |
2018-05-18 |
17.00 |
37.57 |
2018-05-20 |
38.50 |
36.68 |
2018-05-22 |
49.51 |
49.44 |
2018-05-23 |
71.90 |
66.29 |
2018-05-24 |
59.91 |
55.61 |
2018-05-25 |
10.60 |
15.28 |
2018-05-26 |
34.35 |
33.69 |
2018-05-27 |
40.80 |
43.57 |
2018-05-28 |
36.75 |
40.25 |
2018-05-29 |
40.70 |
42.11 |
2018-05-30 |
38.30 |
44.22 |
results_df %>%
select(DAYID, daily_cash, predicted) %>%
gather(a, value, -DAYID) %>%
group_by(a) %>%
summarise(cash = sum(value))
## # A tibble: 2 x 2
## a cash
## <chr> <dbl>
## 1 daily_cash 2386.
## 2 predicted 2448.
#---------------------
# ARIMA Approach
#---------------------
library(fpp2)
predict_from_arima <- function(k) {
range_validation <- 60
n_ahead <- 1
train_tbl <- cash_clean %>% slice((1 + k):(400 + k))
valid_tbl <- cash_clean %>% slice((400 + 1 + k):(400 + k + range_validation))
test_tbl <- cash_clean %>% slice((400 + k + range_validation + 1):(400 + k + range_validation + n_ahead))
train_arima <- bind_rows(train_tbl, valid_tbl) %>% select(1:2)
test_arima <- test_tbl %>% select(1:2)
# ARIMA model:
my_arima <- auto.arima(train_arima[, 2] %>% ts(start = 1))
# Use the model for forecasting:
predicted_arima <- forecast(my_arima, h = 1)$mean %>% as.vector()
actual_predicted_df_test <- test_arima %>%
mutate(predicted = predicted_arima)
return(actual_predicted_df_test)
}
# Use this function:
lapply(0:30, predict_from_arima) ->> arima_results
arima_results <- do.call("bind_rows", arima_results)
# Compare cash demands predicted by ARIMA approach and actuals:
arima_results %>%
vis_results() +
labs(x = NULL, y = NULL, subtitle = "Approach Used: ARIMA",
title = "Version 3: One-step Ahead Prediction for Cash Withdrawals (31 Consecutive Days)")

arima_results %>%
print_results()
2018-04-28 |
85.13 |
52.07 |
2018-04-29 |
62.13 |
75.00 |
2018-04-30 |
27.38 |
60.91 |
2018-05-01 |
21.24 |
40.84 |
2018-05-02 |
65.50 |
39.39 |
2018-05-03 |
67.31 |
66.37 |
2018-05-04 |
17.34 |
68.40 |
2018-05-05 |
19.39 |
39.71 |
2018-05-06 |
16.55 |
42.38 |
2018-05-07 |
27.35 |
42.74 |
2018-05-08 |
12.87 |
51.20 |
2018-05-09 |
341.75 |
43.81 |
2018-05-10 |
406.48 |
233.72 |
2018-05-11 |
181.22 |
270.92 |
2018-05-12 |
181.20 |
117.83 |
2018-05-13 |
3.60 |
114.23 |
2018-05-14 |
124.67 |
3.29 |
2018-05-15 |
122.93 |
109.67 |
2018-05-16 |
74.98 |
100.30 |
2018-05-17 |
88.67 |
48.04 |
2018-05-18 |
17.00 |
59.17 |
2018-05-20 |
38.50 |
17.50 |
2018-05-22 |
49.51 |
34.39 |
2018-05-23 |
71.90 |
43.62 |
2018-05-24 |
59.91 |
58.95 |
2018-05-25 |
10.60 |
52.62 |
2018-05-26 |
34.35 |
24.60 |
2018-05-27 |
40.80 |
42.20 |
2018-05-28 |
36.75 |
48.19 |
2018-05-29 |
40.70 |
47.50 |
2018-05-30 |
38.30 |
51.60 |