Forecasting Daily Cash Demand: A Short Comparision between Automatic Machine Learning and ARIMA Aprroach (Case 1)
R for Pleasure
Nguyen Chi Dung
#==============================
# Exploratory Data Analysis
#==============================
# Inspect data path:
rm(list = ls())
my_path <- dir("/home/chidung/Desktop/atm_cash_data", full.names = TRUE)
# Import data:
library(tidyverse)
library(magrittr)
my_case1 <- read_csv(my_path[1])
my_case2 <- read_csv(my_path[2])
# A function for black theme:
my_theme <- function(...) {
theme(
axis.line = element_blank(),
axis.text.x = element_text(color = "white", lineheight = 0.9),
axis.text.y = element_text(color = "white", lineheight = 0.9),
axis.ticks = element_line(color = "white", size = 0.2),
axis.title.x = element_text(color = "white", margin = margin(0, 10, 0, 0)),
axis.title.y = element_text(color = "white", angle = 90, margin = margin(0, 10, 0, 0)),
axis.ticks.length = unit(0.3, "lines"),
legend.background = element_rect(color = NA, fill = "gray10"),
legend.key = element_rect(color = "white", fill = "gray10"),
legend.key.size = unit(1.2, "lines"),
legend.key.height = NULL,
legend.key.width = NULL,
legend.text = element_text(color = "white"),
legend.title = element_text(face = "bold", hjust = 0, color = "white"),
legend.text.align = NULL,
legend.title.align = NULL,
legend.direction = "vertical",
legend.box = NULL,
panel.background = element_rect(fill = "gray10", color = NA),
panel.border = element_blank(),
panel.grid.major = element_line(color = "grey35"),
panel.grid.minor = element_line(color = "grey20"),
panel.spacing = unit(0.5, "lines"),
strip.background = element_rect(fill = "grey30", color = "grey10"),
strip.text.x = element_text(color = "white"),
strip.text.y = element_text(color = "white", angle = -90),
plot.background = element_rect(color = "gray10", fill = "gray10"),
plot.title = element_text(color = "white", hjust = 0, lineheight = 1.25, margin = margin(2, 2, 2, 2)),
plot.subtitle = element_text(color = "white", hjust = 0, margin = margin(2, 2, 2, 2)),
plot.caption = element_text(color = "white", hjust = 0),
plot.margin = unit(rep(1, 4), "lines"))
}
# Daily Withdrawals of cash for case 1 and case 2:
my_case1 %>%
ggplot(aes(DAYID, daily_cash)) +
geom_line(color = "cyan") +
my_theme() +
labs(x = NULL, y = NULL, title = "Daily Cash Withdrawals (Terminal ID: 00002009)")

my_case2 %>%
ggplot(aes(DAYID, daily_cash)) +
geom_line(color = "cyan") +
my_theme() +
labs(x = NULL, y = NULL, title = "Daily Cash Withdrawals (Terminal ID: 00002019)")

# A function for data pre-processing:
pre_proccess <- function(your_df) {
your_df %>%
timetk::tk_augment_timeseries_signature() %>%
mutate_if(is.ordered, as.character) %>%
mutate_if(is.character, as.factor)%>%
na.omit() %>%
select(-hour, -minute, -second, -am.pm, -index.num, -year.iso, -year, -month, -day) %>%
return()
}
# Used the function:
pre_proccess(my_case1) ->> cash_clean
#=======================================
# Automatic Machine Learning Approach
#=======================================
# Wrire a function to convert to h2o frame:
convert_to_h2o <- function(your_df) {
your_df %>%
h2o::as.h2o() %>%
return()
}
# Wrire a function for conducting One-step Ahead Prediction:
library(h2o)
h2o.init(nthreads = 32, max_mem_size = "16G")
## Connection successful!
##
## R is connected to the H2O cluster:
## H2O cluster uptime: 1 hours 1 minutes
## H2O cluster timezone: Asia/Ho_Chi_Minh
## H2O data parsing timezone: UTC
## H2O cluster version: 3.20.0.2
## H2O cluster version age: 3 months and 19 days !!!
## H2O cluster name: H2O_started_from_R_chidung_rjy105
## H2O cluster total nodes: 1
## H2O cluster total memory: 15.96 GB
## H2O cluster total cores: 32
## H2O cluster allowed cores: 32
## H2O cluster healthy: TRUE
## H2O Connection ip: localhost
## H2O Connection port: 54321
## H2O Connection proxy: NA
## H2O Internal Security: FALSE
## H2O API Extensions: XGBoost, Algos, AutoML, Core V3, Core V4
## R Version: R version 3.5.1 (2018-07-02)
h2o.no_progress()
h2o.removeAll()
## [1] 0
my_predict <- function(k) {
range_validation <- 60
n_ahead <- 1
train_tbl <- cash_clean %>%
slice((1 + k):(400 + k))
valid_tbl <- cash_clean %>%
slice((400 + 1 + k):(400 + k + range_validation))
test_tbl <- cash_clean %>%
slice((400 + k + range_validation + 1):(400 + k + range_validation + n_ahead))
train_h2o <- train_tbl %>% select(-DAYID) %>% convert_to_h2o()
valid_h2o <- valid_tbl %>% select(-DAYID) %>% convert_to_h2o()
test_h2o <- test_tbl %>% select(-DAYID) %>% convert_to_h2o()
# Set target and features:
y <- "daily_cash"
x <- setdiff(names(train_h2o), y)
# Automatic ML:
automl_models_h2o <- h2o.automl(x = x, y = y,
training_frame = train_h2o,
validation_frame = valid_h2o,
leaderboard_frame = test_h2o,
max_runtime_secs = 60,
max_models = 30,
stopping_metric = "MAE",
seed = 29)
# Use for forecasting:
pred_h2o <- h2o.predict(automl_models_h2o, newdata = test_h2o) %>%
as.data.frame() %>%
pull(predict)
actual_predicted_df_test <- test_tbl %>%
mutate(predicted = pred_h2o, error = daily_cash - predicted, pct = error / daily_cash) %>%
select(DAYID, daily_cash, predicted, everything())
return(actual_predicted_df_test)
}
# Function for conducting One-step Ahead Prediction for (n + 1) consecutive days:
my_predict_n_days <- function(n_days) {
results <- lapply(0:n_days, my_predict)
results_df <- do.call("bind_rows", results)
return(results_df)
}
# Use this function:
system.time(my_predict_n_days(30) ->> results_df)
## user system elapsed
## 51.868 1.977 2019.704
# Function visualizes predictions vs actuals:
vis_results <- function(r_df) {
r_df %>%
select(DAYID, Actual = daily_cash, Predicted = predicted) %>%
gather(a, b, -DAYID) %>%
ggplot(aes(DAYID, b, color = a)) +
geom_line() +
geom_point() +
scale_color_manual(values = c("purple", "orange"), name = "") +
my_theme()
}
# Visualize predicted - actual results
results_df %>%
vis_results() +
labs(x = NULL, y = NULL, subtitle = "Approach Used: Automatic Machine Learning",
title = "One-step Ahead Prediction for Cash Withdrawals (31 Consecutive Days)")

# Function presents predicted - actual results:
print_results <- function(df) {
df %>%
select(DAYID, Actual = daily_cash, Predicted = predicted) %>%
mutate_if(is.numeric, function(x) {round(x, 2)}) %>%
knitr::kable()
}
results_df %>%
print_results()
2018-04-30 |
27.38 |
27.94 |
2018-05-01 |
21.24 |
20.54 |
2018-05-02 |
65.50 |
67.81 |
2018-05-03 |
67.31 |
69.20 |
2018-05-04 |
17.34 |
15.30 |
2018-05-05 |
19.39 |
13.50 |
2018-05-06 |
16.55 |
16.24 |
2018-05-07 |
27.35 |
27.92 |
2018-05-08 |
12.87 |
65.52 |
2018-05-09 |
341.75 |
311.15 |
2018-05-10 |
406.48 |
281.85 |
2018-05-11 |
181.22 |
177.19 |
2018-05-12 |
181.20 |
178.21 |
2018-05-13 |
3.60 |
63.08 |
2018-05-14 |
116.37 |
97.57 |
2018-05-15 |
122.93 |
88.60 |
2018-05-16 |
74.98 |
71.46 |
2018-05-17 |
88.67 |
90.95 |
2018-05-18 |
17.00 |
20.53 |
2018-05-20 |
38.50 |
31.92 |
2018-05-22 |
49.51 |
56.79 |
2018-05-23 |
71.90 |
67.21 |
2018-05-24 |
59.91 |
57.81 |
2018-05-25 |
10.60 |
13.73 |
2018-05-26 |
34.35 |
31.92 |
2018-05-27 |
40.80 |
26.86 |
2018-05-28 |
36.75 |
36.42 |
2018-05-29 |
40.70 |
39.85 |
2018-05-30 |
38.30 |
31.94 |
2018-05-31 |
49.48 |
46.67 |
2018-06-01 |
33.65 |
20.72 |
# Compare total actual vs predicted demands (31 days):
results_df %>%
select(DAYID, daily_cash, predicted) %>%
gather(a, value, -DAYID) %>%
group_by(a) %>%
summarise(cash = sum(value))
## # A tibble: 2 x 2
## a cash
## <chr> <dbl>
## 1 daily_cash 2314.
## 2 predicted 2166.
#========================
# ARIMA Approach
#========================
library(fpp2)
predict_from_arima <- function(k) {
range_validation <- 60
n_ahead <- 1
train_tbl <- cash_clean %>% slice((1 + k):(400 + k))
valid_tbl <- cash_clean %>% slice((400 + 1 + k):(400 + k + range_validation))
test_tbl <- cash_clean %>% slice((400 + k + range_validation + 1):(400 + k + range_validation + n_ahead))
train_arima <- bind_rows(train_tbl, valid_tbl) %>% select(1:2)
test_arima <- test_tbl %>% select(1:2)
# Automatic ARIMA model as proposed by Hyndman and Khandakar (2008):
my_arima <- auto.arima(train_arima[, 2] %>% ts(start = 1))
# Use the model for forecasting:
predicted_arima <- forecast(my_arima, h = 1)$mean %>% as.vector()
actual_predicted_df_test <- test_arima %>%
mutate(predicted = predicted_arima)
return(actual_predicted_df_test)
}
# Use this function:
system.time(lapply(0:30, predict_from_arima) ->> arima_results)
## user system elapsed
## 13.460 21.515 5.335
arima_results <- do.call("bind_rows", arima_results)
# Compare cash demands predicted by ARIMA approach and actuals:
arima_results %>%
vis_results() +
labs(x = NULL, y = NULL, subtitle = "Approach Used: ARIMA",
title = "One-step Ahead Prediction for Cash Withdrawals (31 Consecutive Days)")

# Presents predicted - actual results:
arima_results %>%
print_results()
2018-04-30 |
27.38 |
60.89 |
2018-05-01 |
21.24 |
40.70 |
2018-05-02 |
65.50 |
39.11 |
2018-05-03 |
67.31 |
67.96 |
2018-05-04 |
17.34 |
69.26 |
2018-05-05 |
19.39 |
38.88 |
2018-05-06 |
16.55 |
42.66 |
2018-05-07 |
27.35 |
58.43 |
2018-05-08 |
12.87 |
47.82 |
2018-05-09 |
341.75 |
43.65 |
2018-05-10 |
406.48 |
244.21 |
2018-05-11 |
181.22 |
268.28 |
2018-05-12 |
181.20 |
117.84 |
2018-05-13 |
3.60 |
114.13 |
2018-05-14 |
116.37 |
2.32 |
2018-05-15 |
122.93 |
76.22 |
2018-05-16 |
74.98 |
100.38 |
2018-05-17 |
88.67 |
48.33 |
2018-05-18 |
17.00 |
60.34 |
2018-05-20 |
38.50 |
17.06 |
2018-05-22 |
49.51 |
34.14 |
2018-05-23 |
71.90 |
43.54 |
2018-05-24 |
59.91 |
59.02 |
2018-05-25 |
10.60 |
52.63 |
2018-05-26 |
34.35 |
24.31 |
2018-05-27 |
40.80 |
42.12 |
2018-05-28 |
36.75 |
48.16 |
2018-05-29 |
40.70 |
47.45 |
2018-05-30 |
38.30 |
51.61 |
2018-05-31 |
49.48 |
51.63 |
2018-06-01 |
33.65 |
59.78 |
#====================================================================================
# Comparision between the two approaches besed on some Forecast Accuracy Measures
#====================================================================================
# Function calculate some accuracy measures:
get_accuracy_measures <- function(your_result_df) {
act <- your_result_df %>% pull(daily_cash)
pred <- your_result_df %>% pull(predicted)
err <- act - pred
per_err <- abs(err / act)
# Mean Absolute Error (MAE):
mae <- err %>% abs() %>% mean()
# Mean Squared Error (MSE):
mse <- mean(err^2)
# Mean Absolute Percentage Error (MAPE):
mape <- 100*mean(per_err, trim = 5)
# Return results:
return(data.frame(MAE = mae, MSE = mse, MAPE = mape))
}
# Use above function for comparing:
bind_rows(results_df %>% get_accuracy_measures(), arima_results %>% get_accuracy_measures) %>%
mutate(Approach = c("Automatic ML", "ARIMA")) %>%
select(Approach, everything()) %>%
mutate_if(is.numeric, function(x) {round(x, 2)}) %>%
knitr::kable()
Automatic ML |
13.37 |
805.30 |
7.08 |
ARIMA |
45.61 |
5512.86 |
45.49 |