Forecasting Daily Cash Demand: A Short Comparision between Automatic Machine Learning and ARIMA Aprroach (Case 1)

R for Pleasure

Nguyen Chi Dung

#==============================
#  Exploratory Data Analysis
#==============================

# Inspect data path: 

rm(list = ls())
my_path <- dir("/home/chidung/Desktop/atm_cash_data", full.names = TRUE)

# Import data: 

library(tidyverse)
library(magrittr)

my_case1 <- read_csv(my_path[1])
my_case2 <- read_csv(my_path[2])

# A function for black theme: 

my_theme <- function(...) {
  theme(
    axis.line = element_blank(),  
    axis.text.x = element_text(color = "white", lineheight = 0.9),  
    axis.text.y = element_text(color = "white", lineheight = 0.9),  
    axis.ticks = element_line(color = "white", size  =  0.2),  
    axis.title.x = element_text(color = "white", margin = margin(0, 10, 0, 0)),  
    axis.title.y = element_text(color = "white", angle = 90, margin = margin(0, 10, 0, 0)),  
    axis.ticks.length = unit(0.3, "lines"),   
    legend.background = element_rect(color = NA, fill = "gray10"),  
    legend.key = element_rect(color = "white",  fill = "gray10"),  
    legend.key.size = unit(1.2, "lines"),  
    legend.key.height = NULL,  
    legend.key.width = NULL,      
    legend.text = element_text(color = "white"),  
    legend.title = element_text(face = "bold", hjust = 0, color = "white"),  
    legend.text.align = NULL,  
    legend.title.align = NULL,  
    legend.direction = "vertical",  
    legend.box = NULL, 
    panel.background = element_rect(fill = "gray10", color  =  NA),  
    panel.border = element_blank(),
    panel.grid.major = element_line(color = "grey35"),  
    panel.grid.minor = element_line(color = "grey20"),  
    panel.spacing = unit(0.5, "lines"),   
    strip.background = element_rect(fill = "grey30", color = "grey10"),  
    strip.text.x = element_text(color = "white"),  
    strip.text.y = element_text(color = "white", angle = -90),  
    plot.background = element_rect(color = "gray10", fill = "gray10"),  
    plot.title = element_text(color = "white", hjust = 0, lineheight = 1.25, margin = margin(2, 2, 2, 2)),  
    plot.subtitle = element_text(color = "white", hjust = 0, margin = margin(2, 2, 2, 2)),  
    plot.caption = element_text(color = "white", hjust = 0),  
    plot.margin = unit(rep(1, 4), "lines"))
  
}

# Daily Withdrawals of cash for case 1 and case 2: 
my_case1 %>% 
  ggplot(aes(DAYID, daily_cash)) + 
  geom_line(color = "cyan") + 
  my_theme() + 
  labs(x = NULL, y = NULL, title = "Daily Cash Withdrawals (Terminal ID: 00002009)")

my_case2 %>% 
  ggplot(aes(DAYID, daily_cash)) + 
  geom_line(color = "cyan") + 
  my_theme() + 
  labs(x = NULL, y = NULL, title = "Daily Cash Withdrawals (Terminal ID: 00002019)")

# A function for data pre-processing: 

pre_proccess <- function(your_df) {
  your_df %>% 
    timetk::tk_augment_timeseries_signature() %>% 
    mutate_if(is.ordered, as.character) %>% 
    mutate_if(is.character, as.factor)%>% 
    na.omit() %>% 
    select(-hour, -minute, -second, -am.pm, -index.num, -year.iso, -year, -month, -day) %>% 
    return()
}


# Used the function: 
pre_proccess(my_case1) ->> cash_clean

#=======================================
#  Automatic Machine Learning Approach
#=======================================

# Wrire a function to convert to h2o frame: 

convert_to_h2o <- function(your_df) {
  your_df %>% 
    h2o::as.h2o() %>% 
    return()
}

# Wrire a function for conducting One-step Ahead Prediction: 

library(h2o)
h2o.init(nthreads = 32, max_mem_size = "16G")

##  Connection successful!
## 
## R is connected to the H2O cluster: 
##     H2O cluster uptime:         1 hours 1 minutes 
##     H2O cluster timezone:       Asia/Ho_Chi_Minh 
##     H2O data parsing timezone:  UTC 
##     H2O cluster version:        3.20.0.2 
##     H2O cluster version age:    3 months and 19 days !!! 
##     H2O cluster name:           H2O_started_from_R_chidung_rjy105 
##     H2O cluster total nodes:    1 
##     H2O cluster total memory:   15.96 GB 
##     H2O cluster total cores:    32 
##     H2O cluster allowed cores:  32 
##     H2O cluster healthy:        TRUE 
##     H2O Connection ip:          localhost 
##     H2O Connection port:        54321 
##     H2O Connection proxy:       NA 
##     H2O Internal Security:      FALSE 
##     H2O API Extensions:         XGBoost, Algos, AutoML, Core V3, Core V4 
##     R Version:                  R version 3.5.1 (2018-07-02)

h2o.no_progress()
h2o.removeAll()

## [1] 0

my_predict <- function(k) {
  
  range_validation <- 60
  n_ahead <- 1
  
  train_tbl <- cash_clean %>% 
    slice((1 + k):(400 + k))
  
  valid_tbl <- cash_clean %>% 
    slice((400 + 1 + k):(400 + k + range_validation)) 
  
  test_tbl  <- cash_clean %>% 
    slice((400 + k + range_validation + 1):(400 + k + range_validation + n_ahead))
  
  
  train_h2o <- train_tbl %>% select(-DAYID) %>% convert_to_h2o()
  valid_h2o <- valid_tbl %>% select(-DAYID) %>% convert_to_h2o()
  test_h2o <- test_tbl %>% select(-DAYID) %>% convert_to_h2o()
  
  # Set target and features: 
  
  y <- "daily_cash"
  x <- setdiff(names(train_h2o), y)
  
  # Automatic ML: 
  automl_models_h2o <- h2o.automl(x = x, y = y, 
                                  training_frame = train_h2o,
                                  validation_frame = valid_h2o, 
                                  leaderboard_frame = test_h2o, 
                                  max_runtime_secs = 60, 
                                  max_models = 30, 
                                  stopping_metric = "MAE", 
                                  seed = 29)
  
  
  # Use for forecasting: 
  pred_h2o <- h2o.predict(automl_models_h2o, newdata = test_h2o) %>% 
    as.data.frame() %>% 
    pull(predict)
  
  actual_predicted_df_test <- test_tbl %>% 
    mutate(predicted = pred_h2o, error = daily_cash - predicted, pct = error / daily_cash) %>% 
    select(DAYID, daily_cash, predicted, everything())
  
  return(actual_predicted_df_test)
  
}

# Function for conducting One-step Ahead Prediction for (n + 1) consecutive days: 

my_predict_n_days <- function(n_days) {
  results <- lapply(0:n_days, my_predict)
  results_df <- do.call("bind_rows", results)
  return(results_df)
  
}

# Use this function: 

system.time(my_predict_n_days(30) ->> results_df)

##     user   system  elapsed 
##   51.868    1.977 2019.704

# Function visualizes predictions vs actuals: 

vis_results <- function(r_df) {
  r_df %>% 
    select(DAYID, Actual = daily_cash, Predicted = predicted) %>% 
    gather(a, b, -DAYID) %>% 
    ggplot(aes(DAYID, b, color = a)) + 
    geom_line() + 
    geom_point() + 
    scale_color_manual(values = c("purple", "orange"), name = "") + 
    my_theme()
  
}

# Visualize predicted - actual results
results_df %>% 
  vis_results() + 
  labs(x = NULL, y = NULL, subtitle = "Approach Used: Automatic Machine Learning", 
       title = "One-step Ahead Prediction for Cash Withdrawals (31 Consecutive Days)")

# Function presents predicted - actual results: 

print_results <- function(df) {
  df %>% 
    select(DAYID, Actual = daily_cash, Predicted = predicted) %>% 
    mutate_if(is.numeric, function(x) {round(x, 2)}) %>% 
    knitr::kable()
}

results_df %>% 
  print_results()

DAYID	Actual	Predicted
2018-04-30	27.38	27.94
2018-05-01	21.24	20.54
2018-05-02	65.50	67.81
2018-05-03	67.31	69.20
2018-05-04	17.34	15.30
2018-05-05	19.39	13.50
2018-05-06	16.55	16.24
2018-05-07	27.35	27.92
2018-05-08	12.87	65.52
2018-05-09	341.75	311.15
2018-05-10	406.48	281.85
2018-05-11	181.22	177.19
2018-05-12	181.20	178.21
2018-05-13	3.60	63.08
2018-05-14	116.37	97.57
2018-05-15	122.93	88.60
2018-05-16	74.98	71.46
2018-05-17	88.67	90.95
2018-05-18	17.00	20.53
2018-05-20	38.50	31.92
2018-05-22	49.51	56.79
2018-05-23	71.90	67.21
2018-05-24	59.91	57.81
2018-05-25	10.60	13.73
2018-05-26	34.35	31.92
2018-05-27	40.80	26.86
2018-05-28	36.75	36.42
2018-05-29	40.70	39.85
2018-05-30	38.30	31.94
2018-05-31	49.48	46.67
2018-06-01	33.65	20.72

# Compare total actual vs predicted demands (31 days): 

results_df %>% 
  select(DAYID, daily_cash, predicted) %>% 
  gather(a, value, -DAYID) %>% 
  group_by(a) %>% 
  summarise(cash = sum(value))

## # A tibble: 2 x 2
##   a           cash
##   <chr>      <dbl>
## 1 daily_cash 2314.
## 2 predicted  2166.

#========================
#     ARIMA Approach
#========================

library(fpp2)

predict_from_arima <- function(k) {
  
  range_validation <- 60
  n_ahead <- 1
  
  train_tbl <- cash_clean %>% slice((1 + k):(400 + k))
  valid_tbl <- cash_clean %>% slice((400 + 1 + k):(400 + k + range_validation)) 
  test_tbl  <- cash_clean %>% slice((400 + k + range_validation + 1):(400 + k + range_validation + n_ahead))
  
  train_arima <- bind_rows(train_tbl, valid_tbl) %>% select(1:2)
  test_arima <- test_tbl %>% select(1:2)
  
  # Automatic ARIMA model as proposed by Hyndman and Khandakar (2008): 
  my_arima <- auto.arima(train_arima[, 2] %>% ts(start = 1))
  
  # Use the model for forecasting: 
  predicted_arima <- forecast(my_arima, h = 1)$mean %>% as.vector()
  
  actual_predicted_df_test <- test_arima %>% 
    mutate(predicted = predicted_arima) 
  
  return(actual_predicted_df_test)
  
}

# Use this function: 

system.time(lapply(0:30, predict_from_arima) ->> arima_results)

##    user  system elapsed 
##  13.460  21.515   5.335

arima_results <- do.call("bind_rows", arima_results)

# Compare cash demands predicted by ARIMA approach and actuals: 

arima_results %>% 
  vis_results() + 
  labs(x = NULL, y = NULL, subtitle = "Approach Used: ARIMA", 
       title = "One-step Ahead Prediction for Cash Withdrawals (31 Consecutive Days)")

#  Presents predicted - actual results: 
arima_results %>% 
  print_results()

DAYID	Actual	Predicted
2018-04-30	27.38	60.89
2018-05-01	21.24	40.70
2018-05-02	65.50	39.11
2018-05-03	67.31	67.96
2018-05-04	17.34	69.26
2018-05-05	19.39	38.88
2018-05-06	16.55	42.66
2018-05-07	27.35	58.43
2018-05-08	12.87	47.82
2018-05-09	341.75	43.65
2018-05-10	406.48	244.21
2018-05-11	181.22	268.28
2018-05-12	181.20	117.84
2018-05-13	3.60	114.13
2018-05-14	116.37	2.32
2018-05-15	122.93	76.22
2018-05-16	74.98	100.38
2018-05-17	88.67	48.33
2018-05-18	17.00	60.34
2018-05-20	38.50	17.06
2018-05-22	49.51	34.14
2018-05-23	71.90	43.54
2018-05-24	59.91	59.02
2018-05-25	10.60	52.63
2018-05-26	34.35	24.31
2018-05-27	40.80	42.12
2018-05-28	36.75	48.16
2018-05-29	40.70	47.45
2018-05-30	38.30	51.61
2018-05-31	49.48	51.63
2018-06-01	33.65	59.78

#====================================================================================
#  Comparision between the two approaches besed on some Forecast Accuracy Measures
#====================================================================================

# Function calculate some accuracy measures: 

get_accuracy_measures <- function(your_result_df) {
  
  act <- your_result_df %>% pull(daily_cash)
  pred <- your_result_df %>% pull(predicted)
  err <- act - pred 
  per_err <- abs(err / act)
  
  # Mean Absolute Error (MAE): 
  mae <- err %>% abs() %>% mean()
  
  # Mean Squared Error (MSE): 
  mse <- mean(err^2)
  
  # Mean Absolute Percentage Error (MAPE): 
  mape <- 100*mean(per_err, trim = 5)
  
  # Return results: 
  return(data.frame(MAE = mae, MSE = mse, MAPE = mape))
}


# Use above function for comparing: 

bind_rows(results_df %>% get_accuracy_measures(), arima_results %>% get_accuracy_measures) %>% 
  mutate(Approach = c("Automatic ML", "ARIMA")) %>% 
  select(Approach, everything()) %>% 
  mutate_if(is.numeric, function(x) {round(x, 2)}) %>% 
  knitr::kable()

Approach	MAE	MSE	MAPE
Automatic ML	13.37	805.30	7.08
ARIMA	45.61	5512.86	45.49