One-step Ahead Prediction for Cash Withdrawals: Comparing Three Approaches to Forecasting

R for Fun

Nguyen Chi Dung

# Load packages and all objects: 
library(magrittr)
library(tidyverse)
library(data.table)
library(lubridate)
library(timetk) 
library(stringr)
library(h2o)

# Set environment for using h2o package with 16 GB RAM: 

h2o.init(nthreads = -1, max_mem_size = "16G")

##  Connection successful!
## 
## R is connected to the H2O cluster: 
##     H2O cluster uptime:         46 minutes 36 seconds 
##     H2O cluster timezone:       Asia/Ho_Chi_Minh 
##     H2O data parsing timezone:  UTC 
##     H2O cluster version:        3.20.0.2 
##     H2O cluster version age:    3 months and 18 days !!! 
##     H2O cluster name:           H2O_started_from_R_chidung_rwi275 
##     H2O cluster total nodes:    1 
##     H2O cluster total memory:   15.91 GB 
##     H2O cluster total cores:    32 
##     H2O cluster allowed cores:  32 
##     H2O cluster healthy:        TRUE 
##     H2O Connection ip:          localhost 
##     H2O Connection port:        54321 
##     H2O Connection proxy:       NA 
##     H2O Internal Security:      FALSE 
##     H2O API Extensions:         XGBoost, Algos, AutoML, Core V3, Core V4 
##     R Version:                  R version 3.5.1 (2018-07-02)

h2o.no_progress()

# Load objects: 
rm(list = ls())
system.time(load("/home/chidung/all_objects_atms_10_09.RData"))

##    user  system elapsed 
## 147.374   3.004 150.382

# Write a function for black theme: 

my_theme <- function(...) {
  theme(
    axis.line = element_blank(),  
    axis.text.x = element_text(color = "white", lineheight = 0.9),  
    axis.text.y = element_text(color = "white", lineheight = 0.9),  
    axis.ticks = element_line(color = "white", size  =  0.2),  
    axis.title.x = element_text(color = "white", margin = margin(0, 10, 0, 0)),  
    axis.title.y = element_text(color = "white", angle = 90, margin = margin(0, 10, 0, 0)),  
    axis.ticks.length = unit(0.3, "lines"),   
    legend.background = element_rect(color = NA, fill = "gray10"),  
    legend.key = element_rect(color = "white",  fill = "gray10"),  
    legend.key.size = unit(1.2, "lines"),  
    legend.key.height = NULL,  
    legend.key.width = NULL,      
    legend.text = element_text(color = "white"),  
    legend.title = element_text(face = "bold", hjust = 0, color = "white"),  
    legend.text.align = NULL,  
    legend.title.align = NULL,  
    legend.direction = "vertical",  
    legend.box = NULL, 
    panel.background = element_rect(fill = "gray10", color  =  NA),  
    panel.border = element_blank(),
    panel.grid.major = element_line(color = "grey35"),  
    panel.grid.minor = element_line(color = "grey20"),  
    panel.spacing = unit(0.5, "lines"),   
    strip.background = element_rect(fill = "grey30", color = "grey10"),  
    strip.text.x = element_text(color = "white"),  
    strip.text.y = element_text(color = "white", angle = -90),  
    plot.background = element_rect(color = "gray10", fill = "gray10"),  
    plot.title = element_text(color = "white", hjust = 0, lineheight = 1.25,
                              margin = margin(2, 2, 2, 2)),  
    plot.subtitle = element_text(color = "white", hjust = 0, margin = margin(2, 2, 2, 2)),  
    plot.caption = element_text(color = "white", hjust = 0),  
    plot.margin = unit(rep(1, 4), "lines"))
  
}

# Define components of daily cash demand:

hanoi_atm_trans$RESP_TEXT %>% unique() ->> trans_type
cash_demand <- trans_type[c(1, 17, 5, 2)]
atm_demand_cash <- hanoi_atm_trans %>% filter(RESP_TEXT %in% cash_demand)

# Calculate daily cash demand: 

atm_demand_cash %>% 
  mutate(DAYID = mdy(DAYID)) %>% 
  group_by(DAYID, TERMINAL_ID) %>% 
  summarise(daily_cash = sum(AMOUNT)) %>% 
  ungroup() %>% 
  mutate(daily_cash = daily_cash / 1000000) ->> daily_cash_demand_atms

# ATMs have over 500 transaction days:
over_500_atm <- daily_cash_demand_atms %>% 
  group_by(TERMINAL_ID) %>% 
  summarise(n_days = n_distinct(DAYID)) %>% 
  ungroup() %>% 
  arrange(-n_days) %>% 
  filter(n_days >= 500)

daily_cash_demand_atms_500 <- daily_cash_demand_atms %>% 
  filter(TERMINAL_ID %in% over_500_atm$TERMINAL_ID)

# Some specific cases: 
case1_cycle_TERMINAL_ID <- "00002009"
case2_random_TERMINAL_ID <- "00002019"

# A specific case: 
my_case1 <- daily_cash_demand_atms_500 %>% 
  filter(TERMINAL_ID == case1_cycle_TERMINAL_ID) %>% 
  select(-TERMINAL_ID)


# Visualize daily cash demands for above case: 
my_case1 %>% 
  ggplot(aes(DAYID, daily_cash)) + 
  geom_line(color = "cyan") + 
  my_theme() + 
  labs(x = NULL, y = NULL, title = "Daily Amount of Cash Transactions for 00002009")

# Wrire a function to convert to h2o frame: 

convert_to_h2o <- function(your_df) {
  your_df %>% 
    select(-DAYID) %>% 
    as.h2o() %>% 
    return()
}


#--------------------------------------
#   Version 1: Use lag 1 as a feature
#--------------------------------------

# A function for data pre-processing:  

pre_proccess_v1 <- function(your_df) {
  your_df %>% 
    mutate(lag1 = lag(daily_cash, n = 1L)) %>% 
    tk_augment_timeseries_signature() %>% 
    na.omit() %>% 
    mutate_if(is.ordered, as.character) %>% 
    mutate_if(is.character, as.factor)%>% 
    return()
}

# Used the function: 
pre_proccess_v1(my_case1) ->> cash_clean

# Wrire a function for conducting One-step Ahead Prediction: 

my_predict <- function(k) {
  
  range_validation <- 60
  n_ahead <- 1
  
  train_tbl <- cash_clean %>% 
    slice((1 + k):(400 + k))
  
  valid_tbl <- cash_clean %>% 
    slice((400 + 1 + k):(400 + k + range_validation)) 
  
  test_tbl  <- cash_clean %>% 
    slice((400 + k + range_validation + 1):(400 + k + range_validation + n_ahead))
  

  train_h2o <- train_tbl %>% convert_to_h2o()
  valid_h2o <- valid_tbl %>% convert_to_h2o()
  test_h2o <- test_tbl %>% convert_to_h2o()
  
  # Set target and features: 
  
  y <- "daily_cash"
  x <- setdiff(names(train_h2o), y)
  
  # Automatic ML: 
  automl_models_h2o <- h2o.automl(x = x, y = y, 
                                 training_frame = train_h2o,
                                 validation_frame = valid_h2o, 
                                 leaderboard_frame = test_h2o, 
                                 max_runtime_secs = 60, 
                                 max_models = 30, 
                                 stopping_metric = "MAE", 
                                 seed = 29)
  
  
  # Use for forecasting: 
  pred_h2o <- h2o.predict(automl_models_h2o, newdata = test_h2o) %>% 
    as.data.frame() %>% 
    pull(predict)
  
  actual_predicted_df_test <- test_tbl %>% 
    mutate(predicted = pred_h2o, error = daily_cash - predicted, pct = error / daily_cash) %>% 
    select(DAYID, daily_cash, predicted, everything())
  
  return(actual_predicted_df_test)
  
}

# Function for conducting One-step Ahead Prediction for (n + 1) consecutive days: 

my_predict_n_days <- function(n_days) {
  results <- lapply(0:n_days, my_predict)
  results_df <- do.call("bind_rows", results)
  return(results_df)
  
}

# Use this function: 

system.time(my_predict_n_days(30) ->> results_df)

##     user   system  elapsed 
##  618.797   13.712 2584.947

# Visualizing predictions vs actuals: 

vis_results <- function(r_df) {
  r_df %>% 
  select(DAYID, Actual = daily_cash, Predicted = predicted) %>% 
  gather(a, b, -DAYID) %>% 
  ggplot(aes(DAYID, b, color = a)) + 
  geom_line() + 
  geom_point() + 
  scale_color_manual(values = c("purple", "orange"), name = "") + 
  my_theme()
  
}


results_df %>% 
  vis_results() + 
  labs(x = NULL, y = NULL, subtitle = "Approach Used: Included Lag 1 as a Feature", 
       title = "Version 1: One-step Ahead Prediction for Cash Withdrawals (31 Consecutive Days)")

# Numeric results: 

print_results <- function(df) {
  df %>% 
  select(DAYID, Actual = daily_cash, Predicted = predicted) %>% 
  mutate_if(is.numeric, function(x) {round(x, 2)}) %>% 
  knitr::kable()
}


results_df %>% 
  print_results()

DAYID	Actual	Predicted
2018-04-28	85.13	71.84
2018-04-29	62.13	62.19
2018-04-30	27.38	30.53
2018-05-01	21.24	29.20
2018-05-02	65.50	66.23
2018-05-03	67.31	65.28
2018-05-04	17.34	20.46
2018-05-05	19.39	19.85
2018-05-06	16.55	19.13
2018-05-07	27.35	26.93
2018-05-08	12.87	47.04
2018-05-09	341.75	342.17
2018-05-10	406.48	343.96
2018-05-11	181.22	180.89
2018-05-12	181.20	180.54
2018-05-13	3.60	33.78
2018-05-14	124.67	79.27
2018-05-15	122.93	116.29
2018-05-16	74.98	77.16
2018-05-17	88.67	85.83
2018-05-18	17.00	37.79
2018-05-20	38.50	39.06
2018-05-22	49.51	46.03
2018-05-23	71.90	60.39
2018-05-24	59.91	57.56
2018-05-25	10.60	9.35
2018-05-26	34.35	39.33
2018-05-27	40.80	37.13
2018-05-28	36.75	31.65
2018-05-29	40.70	42.13
2018-05-30	38.30	37.10

# Total actual cash demand vs predicted: 
results_df %>% 
  select(DAYID, daily_cash, predicted) %>% 
  gather(a, value, -DAYID) %>% 
  group_by(a) %>% 
  summarise(cash = sum(value))

## # A tibble: 2 x 2
##   a           cash
##   <chr>      <dbl>
## 1 daily_cash 2386.
## 2 predicted  2336.

#-------------------------------------------
#   Version 2: Not use lag 1 as a feature
#-------------------------------------------

pre_proccess_v2 <- function(your_df) {
  your_df %>% 
    tk_augment_timeseries_signature() %>% 
    na.omit() %>% 
    mutate_if(is.ordered, as.character) %>% 
    mutate_if(is.character, as.factor)%>% 
    return()
}

pre_proccess_v2(my_case1) ->> cash_clean

my_predict_n_days(30) ->> results_df

results_df %>% 
  vis_results() + 
  labs(x = NULL, y = NULL, subtitle = "Approach Used: Removed Lag 1", 
       title = "Version 2: One-step Ahead Prediction for Cash Withdrawals (31 Consecutive Days)")

results_df %>% 
  print_results()

DAYID	Actual	Predicted
2018-04-28	85.13	84.53
2018-04-29	62.13	61.63
2018-04-30	27.38	29.84
2018-05-01	21.24	35.57
2018-05-02	65.50	66.61
2018-05-03	67.31	67.00
2018-05-04	17.34	16.88
2018-05-05	19.39	17.11
2018-05-06	16.55	21.74
2018-05-07	27.35	46.03
2018-05-08	12.87	64.10
2018-05-09	341.75	335.66
2018-05-10	406.48	357.01
2018-05-11	181.22	186.83
2018-05-12	181.20	180.75
2018-05-13	3.60	6.68
2018-05-14	124.67	137.32
2018-05-15	122.93	105.49
2018-05-16	74.98	74.93
2018-05-17	88.67	87.24
2018-05-18	17.00	37.57
2018-05-20	38.50	36.68
2018-05-22	49.51	49.44
2018-05-23	71.90	66.29
2018-05-24	59.91	55.61
2018-05-25	10.60	15.28
2018-05-26	34.35	33.69
2018-05-27	40.80	43.57
2018-05-28	36.75	40.25
2018-05-29	40.70	42.11
2018-05-30	38.30	44.22

results_df %>% 
  select(DAYID, daily_cash, predicted) %>% 
  gather(a, value, -DAYID) %>% 
  group_by(a) %>% 
  summarise(cash = sum(value))

## # A tibble: 2 x 2
##   a           cash
##   <chr>      <dbl>
## 1 daily_cash 2386.
## 2 predicted  2448.

#---------------------
#    ARIMA Approach
#---------------------
library(fpp2)

predict_from_arima <- function(k) {

  range_validation <- 60
  n_ahead <- 1
  
  train_tbl <- cash_clean %>% slice((1 + k):(400 + k))
  valid_tbl <- cash_clean %>% slice((400 + 1 + k):(400 + k + range_validation)) 
  test_tbl  <- cash_clean %>% slice((400 + k + range_validation + 1):(400 + k + range_validation + n_ahead))
  
  train_arima <- bind_rows(train_tbl, valid_tbl) %>% select(1:2)
  test_arima <- test_tbl %>% select(1:2)
  
  # ARIMA model: 
  my_arima <- auto.arima(train_arima[, 2] %>% ts(start = 1))

  # Use the model for forecasting: 
  predicted_arima <- forecast(my_arima, h = 1)$mean %>% as.vector()

  actual_predicted_df_test <- test_arima %>% 
    mutate(predicted = predicted_arima) 
  
  return(actual_predicted_df_test)
  
}

# Use this function: 

lapply(0:30, predict_from_arima) ->> arima_results
arima_results <- do.call("bind_rows", arima_results)

# Compare cash demands predicted by ARIMA approach and actuals: 

arima_results %>% 
  vis_results() + 
  labs(x = NULL, y = NULL, subtitle = "Approach Used: ARIMA", 
       title = "Version 3: One-step Ahead Prediction for Cash Withdrawals (31 Consecutive Days)")

arima_results %>% 
  print_results()

DAYID	Actual	Predicted
2018-04-28	85.13	52.07
2018-04-29	62.13	75.00
2018-04-30	27.38	60.91
2018-05-01	21.24	40.84
2018-05-02	65.50	39.39
2018-05-03	67.31	66.37
2018-05-04	17.34	68.40
2018-05-05	19.39	39.71
2018-05-06	16.55	42.38
2018-05-07	27.35	42.74
2018-05-08	12.87	51.20
2018-05-09	341.75	43.81
2018-05-10	406.48	233.72
2018-05-11	181.22	270.92
2018-05-12	181.20	117.83
2018-05-13	3.60	114.23
2018-05-14	124.67	3.29
2018-05-15	122.93	109.67
2018-05-16	74.98	100.30
2018-05-17	88.67	48.04
2018-05-18	17.00	59.17
2018-05-20	38.50	17.50
2018-05-22	49.51	34.39
2018-05-23	71.90	43.62
2018-05-24	59.91	58.95
2018-05-25	10.60	52.62
2018-05-26	34.35	24.60
2018-05-27	40.80	42.20
2018-05-28	36.75	48.19
2018-05-29	40.70	47.50
2018-05-30	38.30	51.60