One-step Ahead Prediction for Cash Withdrawals: Comparing Three Approaches to Forecasting

R for Fun

Nguyen Chi Dung

# Load packages and all objects: 
library(magrittr)
library(tidyverse)
library(data.table)
library(lubridate)
library(timetk) 
library(stringr)
library(h2o)

# Set environment for using h2o package with 16 GB RAM: 

h2o.init(nthreads = -1, max_mem_size = "16G")
##  Connection successful!
## 
## R is connected to the H2O cluster: 
##     H2O cluster uptime:         46 minutes 36 seconds 
##     H2O cluster timezone:       Asia/Ho_Chi_Minh 
##     H2O data parsing timezone:  UTC 
##     H2O cluster version:        3.20.0.2 
##     H2O cluster version age:    3 months and 18 days !!! 
##     H2O cluster name:           H2O_started_from_R_chidung_rwi275 
##     H2O cluster total nodes:    1 
##     H2O cluster total memory:   15.91 GB 
##     H2O cluster total cores:    32 
##     H2O cluster allowed cores:  32 
##     H2O cluster healthy:        TRUE 
##     H2O Connection ip:          localhost 
##     H2O Connection port:        54321 
##     H2O Connection proxy:       NA 
##     H2O Internal Security:      FALSE 
##     H2O API Extensions:         XGBoost, Algos, AutoML, Core V3, Core V4 
##     R Version:                  R version 3.5.1 (2018-07-02)
h2o.no_progress()

# Load objects: 
rm(list = ls())
system.time(load("/home/chidung/all_objects_atms_10_09.RData"))
##    user  system elapsed 
## 147.374   3.004 150.382
# Write a function for black theme: 

my_theme <- function(...) {
  theme(
    axis.line = element_blank(),  
    axis.text.x = element_text(color = "white", lineheight = 0.9),  
    axis.text.y = element_text(color = "white", lineheight = 0.9),  
    axis.ticks = element_line(color = "white", size  =  0.2),  
    axis.title.x = element_text(color = "white", margin = margin(0, 10, 0, 0)),  
    axis.title.y = element_text(color = "white", angle = 90, margin = margin(0, 10, 0, 0)),  
    axis.ticks.length = unit(0.3, "lines"),   
    legend.background = element_rect(color = NA, fill = "gray10"),  
    legend.key = element_rect(color = "white",  fill = "gray10"),  
    legend.key.size = unit(1.2, "lines"),  
    legend.key.height = NULL,  
    legend.key.width = NULL,      
    legend.text = element_text(color = "white"),  
    legend.title = element_text(face = "bold", hjust = 0, color = "white"),  
    legend.text.align = NULL,  
    legend.title.align = NULL,  
    legend.direction = "vertical",  
    legend.box = NULL, 
    panel.background = element_rect(fill = "gray10", color  =  NA),  
    panel.border = element_blank(),
    panel.grid.major = element_line(color = "grey35"),  
    panel.grid.minor = element_line(color = "grey20"),  
    panel.spacing = unit(0.5, "lines"),   
    strip.background = element_rect(fill = "grey30", color = "grey10"),  
    strip.text.x = element_text(color = "white"),  
    strip.text.y = element_text(color = "white", angle = -90),  
    plot.background = element_rect(color = "gray10", fill = "gray10"),  
    plot.title = element_text(color = "white", hjust = 0, lineheight = 1.25,
                              margin = margin(2, 2, 2, 2)),  
    plot.subtitle = element_text(color = "white", hjust = 0, margin = margin(2, 2, 2, 2)),  
    plot.caption = element_text(color = "white", hjust = 0),  
    plot.margin = unit(rep(1, 4), "lines"))
  
}

# Define components of daily cash demand:

hanoi_atm_trans$RESP_TEXT %>% unique() ->> trans_type
cash_demand <- trans_type[c(1, 17, 5, 2)]
atm_demand_cash <- hanoi_atm_trans %>% filter(RESP_TEXT %in% cash_demand)

# Calculate daily cash demand: 

atm_demand_cash %>% 
  mutate(DAYID = mdy(DAYID)) %>% 
  group_by(DAYID, TERMINAL_ID) %>% 
  summarise(daily_cash = sum(AMOUNT)) %>% 
  ungroup() %>% 
  mutate(daily_cash = daily_cash / 1000000) ->> daily_cash_demand_atms

# ATMs have over 500 transaction days:
over_500_atm <- daily_cash_demand_atms %>% 
  group_by(TERMINAL_ID) %>% 
  summarise(n_days = n_distinct(DAYID)) %>% 
  ungroup() %>% 
  arrange(-n_days) %>% 
  filter(n_days >= 500)

daily_cash_demand_atms_500 <- daily_cash_demand_atms %>% 
  filter(TERMINAL_ID %in% over_500_atm$TERMINAL_ID)

# Some specific cases: 
case1_cycle_TERMINAL_ID <- "00002009"
case2_random_TERMINAL_ID <- "00002019"

# A specific case: 
my_case1 <- daily_cash_demand_atms_500 %>% 
  filter(TERMINAL_ID == case1_cycle_TERMINAL_ID) %>% 
  select(-TERMINAL_ID)


# Visualize daily cash demands for above case: 
my_case1 %>% 
  ggplot(aes(DAYID, daily_cash)) + 
  geom_line(color = "cyan") + 
  my_theme() + 
  labs(x = NULL, y = NULL, title = "Daily Amount of Cash Transactions for 00002009")

# Wrire a function to convert to h2o frame: 

convert_to_h2o <- function(your_df) {
  your_df %>% 
    select(-DAYID) %>% 
    as.h2o() %>% 
    return()
}


#--------------------------------------
#   Version 1: Use lag 1 as a feature
#--------------------------------------

# A function for data pre-processing:  

pre_proccess_v1 <- function(your_df) {
  your_df %>% 
    mutate(lag1 = lag(daily_cash, n = 1L)) %>% 
    tk_augment_timeseries_signature() %>% 
    na.omit() %>% 
    mutate_if(is.ordered, as.character) %>% 
    mutate_if(is.character, as.factor)%>% 
    return()
}

# Used the function: 
pre_proccess_v1(my_case1) ->> cash_clean

# Wrire a function for conducting One-step Ahead Prediction: 

my_predict <- function(k) {
  
  range_validation <- 60
  n_ahead <- 1
  
  train_tbl <- cash_clean %>% 
    slice((1 + k):(400 + k))
  
  valid_tbl <- cash_clean %>% 
    slice((400 + 1 + k):(400 + k + range_validation)) 
  
  test_tbl  <- cash_clean %>% 
    slice((400 + k + range_validation + 1):(400 + k + range_validation + n_ahead))
  

  train_h2o <- train_tbl %>% convert_to_h2o()
  valid_h2o <- valid_tbl %>% convert_to_h2o()
  test_h2o <- test_tbl %>% convert_to_h2o()
  
  # Set target and features: 
  
  y <- "daily_cash"
  x <- setdiff(names(train_h2o), y)
  
  # Automatic ML: 
  automl_models_h2o <- h2o.automl(x = x, y = y, 
                                 training_frame = train_h2o,
                                 validation_frame = valid_h2o, 
                                 leaderboard_frame = test_h2o, 
                                 max_runtime_secs = 60, 
                                 max_models = 30, 
                                 stopping_metric = "MAE", 
                                 seed = 29)
  
  
  # Use for forecasting: 
  pred_h2o <- h2o.predict(automl_models_h2o, newdata = test_h2o) %>% 
    as.data.frame() %>% 
    pull(predict)
  
  actual_predicted_df_test <- test_tbl %>% 
    mutate(predicted = pred_h2o, error = daily_cash - predicted, pct = error / daily_cash) %>% 
    select(DAYID, daily_cash, predicted, everything())
  
  return(actual_predicted_df_test)
  
}

# Function for conducting One-step Ahead Prediction for (n + 1) consecutive days: 

my_predict_n_days <- function(n_days) {
  results <- lapply(0:n_days, my_predict)
  results_df <- do.call("bind_rows", results)
  return(results_df)
  
}

# Use this function: 

system.time(my_predict_n_days(30) ->> results_df)
##     user   system  elapsed 
##  618.797   13.712 2584.947
# Visualizing predictions vs actuals: 

vis_results <- function(r_df) {
  r_df %>% 
  select(DAYID, Actual = daily_cash, Predicted = predicted) %>% 
  gather(a, b, -DAYID) %>% 
  ggplot(aes(DAYID, b, color = a)) + 
  geom_line() + 
  geom_point() + 
  scale_color_manual(values = c("purple", "orange"), name = "") + 
  my_theme()
  
}


results_df %>% 
  vis_results() + 
  labs(x = NULL, y = NULL, subtitle = "Approach Used: Included Lag 1 as a Feature", 
       title = "Version 1: One-step Ahead Prediction for Cash Withdrawals (31 Consecutive Days)")

# Numeric results: 

print_results <- function(df) {
  df %>% 
  select(DAYID, Actual = daily_cash, Predicted = predicted) %>% 
  mutate_if(is.numeric, function(x) {round(x, 2)}) %>% 
  knitr::kable()
}


results_df %>% 
  print_results()
DAYID Actual Predicted
2018-04-28 85.13 71.84
2018-04-29 62.13 62.19
2018-04-30 27.38 30.53
2018-05-01 21.24 29.20
2018-05-02 65.50 66.23
2018-05-03 67.31 65.28
2018-05-04 17.34 20.46
2018-05-05 19.39 19.85
2018-05-06 16.55 19.13
2018-05-07 27.35 26.93
2018-05-08 12.87 47.04
2018-05-09 341.75 342.17
2018-05-10 406.48 343.96
2018-05-11 181.22 180.89
2018-05-12 181.20 180.54
2018-05-13 3.60 33.78
2018-05-14 124.67 79.27
2018-05-15 122.93 116.29
2018-05-16 74.98 77.16
2018-05-17 88.67 85.83
2018-05-18 17.00 37.79
2018-05-20 38.50 39.06
2018-05-22 49.51 46.03
2018-05-23 71.90 60.39
2018-05-24 59.91 57.56
2018-05-25 10.60 9.35
2018-05-26 34.35 39.33
2018-05-27 40.80 37.13
2018-05-28 36.75 31.65
2018-05-29 40.70 42.13
2018-05-30 38.30 37.10
# Total actual cash demand vs predicted: 
results_df %>% 
  select(DAYID, daily_cash, predicted) %>% 
  gather(a, value, -DAYID) %>% 
  group_by(a) %>% 
  summarise(cash = sum(value))
## # A tibble: 2 x 2
##   a           cash
##   <chr>      <dbl>
## 1 daily_cash 2386.
## 2 predicted  2336.
#-------------------------------------------
#   Version 2: Not use lag 1 as a feature
#-------------------------------------------

pre_proccess_v2 <- function(your_df) {
  your_df %>% 
    tk_augment_timeseries_signature() %>% 
    na.omit() %>% 
    mutate_if(is.ordered, as.character) %>% 
    mutate_if(is.character, as.factor)%>% 
    return()
}

pre_proccess_v2(my_case1) ->> cash_clean

my_predict_n_days(30) ->> results_df

results_df %>% 
  vis_results() + 
  labs(x = NULL, y = NULL, subtitle = "Approach Used: Removed Lag 1", 
       title = "Version 2: One-step Ahead Prediction for Cash Withdrawals (31 Consecutive Days)")

results_df %>% 
  print_results()
DAYID Actual Predicted
2018-04-28 85.13 84.53
2018-04-29 62.13 61.63
2018-04-30 27.38 29.84
2018-05-01 21.24 35.57
2018-05-02 65.50 66.61
2018-05-03 67.31 67.00
2018-05-04 17.34 16.88
2018-05-05 19.39 17.11
2018-05-06 16.55 21.74
2018-05-07 27.35 46.03
2018-05-08 12.87 64.10
2018-05-09 341.75 335.66
2018-05-10 406.48 357.01
2018-05-11 181.22 186.83
2018-05-12 181.20 180.75
2018-05-13 3.60 6.68
2018-05-14 124.67 137.32
2018-05-15 122.93 105.49
2018-05-16 74.98 74.93
2018-05-17 88.67 87.24
2018-05-18 17.00 37.57
2018-05-20 38.50 36.68
2018-05-22 49.51 49.44
2018-05-23 71.90 66.29
2018-05-24 59.91 55.61
2018-05-25 10.60 15.28
2018-05-26 34.35 33.69
2018-05-27 40.80 43.57
2018-05-28 36.75 40.25
2018-05-29 40.70 42.11
2018-05-30 38.30 44.22
results_df %>% 
  select(DAYID, daily_cash, predicted) %>% 
  gather(a, value, -DAYID) %>% 
  group_by(a) %>% 
  summarise(cash = sum(value))
## # A tibble: 2 x 2
##   a           cash
##   <chr>      <dbl>
## 1 daily_cash 2386.
## 2 predicted  2448.
#---------------------
#    ARIMA Approach
#---------------------
library(fpp2)

predict_from_arima <- function(k) {

  range_validation <- 60
  n_ahead <- 1
  
  train_tbl <- cash_clean %>% slice((1 + k):(400 + k))
  valid_tbl <- cash_clean %>% slice((400 + 1 + k):(400 + k + range_validation)) 
  test_tbl  <- cash_clean %>% slice((400 + k + range_validation + 1):(400 + k + range_validation + n_ahead))
  
  train_arima <- bind_rows(train_tbl, valid_tbl) %>% select(1:2)
  test_arima <- test_tbl %>% select(1:2)
  
  # ARIMA model: 
  my_arima <- auto.arima(train_arima[, 2] %>% ts(start = 1))

  # Use the model for forecasting: 
  predicted_arima <- forecast(my_arima, h = 1)$mean %>% as.vector()

  actual_predicted_df_test <- test_arima %>% 
    mutate(predicted = predicted_arima) 
  
  return(actual_predicted_df_test)
  
}

# Use this function: 

lapply(0:30, predict_from_arima) ->> arima_results
arima_results <- do.call("bind_rows", arima_results)

# Compare cash demands predicted by ARIMA approach and actuals: 

arima_results %>% 
  vis_results() + 
  labs(x = NULL, y = NULL, subtitle = "Approach Used: ARIMA", 
       title = "Version 3: One-step Ahead Prediction for Cash Withdrawals (31 Consecutive Days)")

arima_results %>% 
  print_results()
DAYID Actual Predicted
2018-04-28 85.13 52.07
2018-04-29 62.13 75.00
2018-04-30 27.38 60.91
2018-05-01 21.24 40.84
2018-05-02 65.50 39.39
2018-05-03 67.31 66.37
2018-05-04 17.34 68.40
2018-05-05 19.39 39.71
2018-05-06 16.55 42.38
2018-05-07 27.35 42.74
2018-05-08 12.87 51.20
2018-05-09 341.75 43.81
2018-05-10 406.48 233.72
2018-05-11 181.22 270.92
2018-05-12 181.20 117.83
2018-05-13 3.60 114.23
2018-05-14 124.67 3.29
2018-05-15 122.93 109.67
2018-05-16 74.98 100.30
2018-05-17 88.67 48.04
2018-05-18 17.00 59.17
2018-05-20 38.50 17.50
2018-05-22 49.51 34.39
2018-05-23 71.90 43.62
2018-05-24 59.91 58.95
2018-05-25 10.60 52.62
2018-05-26 34.35 24.60
2018-05-27 40.80 42.20
2018-05-28 36.75 48.19
2018-05-29 40.70 47.50
2018-05-30 38.30 51.60