Step 07: Models - Keras LSTM

Setup

Load the relevant libraries.

# rm(list = ls())
# .rs.restartR()


# data manipulation
library("plyr")
library("tidyverse")
library("magrittr")
library("data.table")
library("lubridate")
library("sqldf")


# time series specific packages
library("timetk")
library("zoo")
library("tibbletime")


# modeling
library("fpp2")
library("prophet")
library("caret")
library("randomForest")
library("xgboost")
library("h2o")
library("keras")
# use_session_with_seed(123456789) # setting the seed to obtain reproducible results
# see https://keras.rstudio.com/articles/faq.html#how-can-i-obtain-reproducible-results-using-keras-during-development and https://cran.r-project.org/web/packages/keras/vignettes/faq.html
# can also re-enable gpu and parallel processing by using:  use_session_with_seed(42, disable_gpu = FALSE, disable_parallel_cpu = FALSE)



# other
library("geosphere")          # specific for distance calculations from lat-lon pairs
library("naniar")             # inspecting missing data
library("rlang")              # building functions
library("recipes")            # used in Keras modeling to design matrices
library("rsample")            # rolling samples for validation stats
library("tfruns")             # used in Keras modeling for trainin runs
library("stringr")            # string manipulation
library("ggplot2")            # viz
library("sweep")              # more easily pull out model statistics
library("yardstick")          # easily calculate accuracy stats
library("doParallel")         # parallel processing

Session Info.

sessionInfo()

## R version 3.5.1 (2018-07-02)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS High Sierra 10.13.6
## 
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] parallel  stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] doParallel_1.0.14   iterators_1.0.10    foreach_1.4.4      
##  [4] yardstick_0.0.2     sweep_0.2.1.1       tfruns_1.4         
##  [7] rsample_0.0.3       recipes_0.1.4       rlang_0.3.0.1      
## [10] naniar_0.4.1        geosphere_1.5-7     keras_2.2.4        
## [13] h2o_3.20.0.8        xgboost_0.71.2      randomForest_4.6-14
## [16] caret_6.0-81        lattice_0.20-38     prophet_0.3.0.1    
## [19] Rcpp_1.0.0          fpp2_2.3            expsmooth_2.3      
## [22] fma_2.3             forecast_8.4        tibbletime_0.1.1   
## [25] zoo_1.8-4           timetk_0.1.1.1      sqldf_0.4-11       
## [28] RSQLite_2.1.1       gsubfn_0.7          proto_1.0.0        
## [31] lubridate_1.7.4     data.table_1.11.8   magrittr_1.5       
## [34] forcats_0.3.0       stringr_1.3.1       dplyr_0.7.8        
## [37] purrr_0.2.5         readr_1.2.1         tidyr_0.8.2        
## [40] tibble_1.4.2        ggplot2_3.1.0       tidyverse_1.2.1    
## [43] plyr_1.8.4         
## 
## loaded via a namespace (and not attached):
##  [1] colorspace_1.3-2   class_7.3-14       visdat_0.5.1      
##  [4] rprojroot_1.3-2    base64enc_0.1-3    rstudioapi_0.8    
##  [7] rstan_2.18.2       bit64_0.9-7        prodlim_2018.04.18
## [10] xml2_1.2.0         codetools_0.2-15   splines_3.5.1     
## [13] knitr_1.20         zeallot_0.1.0      jsonlite_1.5      
## [16] pROC_1.13.0        broom_0.5.0        compiler_3.5.1    
## [19] httr_1.3.1         backports_1.1.2    assertthat_0.2.0  
## [22] Matrix_1.2-15      lazyeval_0.2.1     cli_1.0.1         
## [25] htmltools_0.3.6    prettyunits_1.0.2  tools_3.5.1       
## [28] bindrcpp_0.2.2     gtable_0.2.0       glue_1.3.0        
## [31] reshape2_1.4.3     cellranger_1.1.0   fracdiff_1.4-2    
## [34] urca_1.3-0         debugme_1.1.0      nlme_3.1-137      
## [37] lmtest_0.9-36      timeDate_3043.102  gower_0.1.2       
## [40] ps_1.2.1           rvest_0.3.2        MASS_7.3-51.1     
## [43] scales_1.0.0       ipred_0.9-8        hms_0.4.2         
## [46] inline_0.3.15      yaml_2.2.0         quantmod_0.4-13   
## [49] curl_3.2           reticulate_1.10    memoise_1.1.0     
## [52] gridExtra_2.3      loo_2.0.0          StanHeaders_2.18.0
## [55] uroot_2.0-9        rpart_4.1-13       stringi_1.2.4     
## [58] tensorflow_1.10    tseries_0.10-46    TTR_0.23-4        
## [61] pkgbuild_1.0.2     lava_1.6.4         chron_2.3-53      
## [64] bitops_1.0-6       pkgconfig_2.0.2    matrixStats_0.54.0
## [67] evaluate_0.12      bindr_0.1.1        bit_1.1-14        
## [70] processx_3.2.0     tidyselect_0.2.5   R6_2.3.0          
## [73] generics_0.0.2     DBI_1.0.0          whisker_0.3-2     
## [76] pillar_1.3.0       haven_2.0.0        withr_2.1.2       
## [79] xts_0.11-2         sp_1.3-1           RCurl_1.95-4.11   
## [82] survival_2.43-3    nnet_7.3-12        modelr_0.1.2      
## [85] crayon_1.3.4       rmarkdown_1.10     grid_3.5.1        
## [88] readxl_1.1.0       blob_1.1.1         callr_3.0.0       
## [91] ModelMetrics_1.2.2 digest_0.6.18      stats4_3.5.1      
## [94] munsell_0.5.0      tcltk_3.5.1        quadprog_1.5-5

Setup the root directory.

Setting wd as the working directory.

wd <- getwd()

wd

## [1] "/Users/mdturse/Desktop/Analytics/Chicago_El_Divvy"

Modeling

NOTE: add_el_stop_id the output produced in Step 06

add_el_stop_id <-
  readRDS(paste0(wd,
                 "/Models/",
                 "add_el_stop_id.Rds"
                 )
          )

Keras for LSTM

Based on examples from http://www.business-science.io/timeseries-analysis/2018/07/01/keras-lstm-sunspots-part2.html and https://tensorflow.rstudio.com/blog/sunspots-lstm.html

Try an example on just the most recent split for one `el_stop_id`.

First, I create the train/val/test datasets.

example_split <- add_el_stop_id$`40600`$splits[[13]]
example_split_id <- add_el_stop_id$`40600`$id[[13]]


##### split into train (2/3 of original train data), validation (1/3 of original train data), and test (original validation data)
keras_split_pt <-
  ((analysis(example_split) %>% 
      nrow()
    ) * (2/3)
   ) %>% 
  round(digits = 0) 
  
df_trn <-
  analysis(example_split)[1:keras_split_pt,
                           ,
                          drop = FALSE
                          ] %>% 
  select(el_date, el_rides)

df_val <-
  analysis(example_split)[(keras_split_pt + 1):nrow(analysis(example_split)
                                                    ),
                           ,
                          drop = FALSE
                          ] %>% 
  select(el_date, el_rides)

df_tst <- assessment(example_split) %>% 
  select(el_date, el_rides)


rm(keras_split_pt)


df <- bind_rows(
  df_trn %>% add_column(key = "01_training"),
  df_val %>% add_column(key = "02_validation"),
  df_tst %>% add_column(key = "03_testing")
  ) %>% 
  as_tbl_time(index = el_date) %>% 
  select(el_date,
         key,
         el_rides
         )

## Warning: `list_len()` is soft-deprecated as of rlang 0.2.0.
## Please use `new_list()` instead
## This warning is displayed once per session.

# View(df)

df %>% 
  group_by(key) %>% 
  summarize(min_date = min(el_date),
            max_date = max(el_date),
            rows = n()
         ) %>% 
  ungroup()

# rm(list = ls(pattern = "df_"))
rm(list = ls(pattern = "example_split"))

Now I center and scale the data.

rec_obj <- recipe(el_rides ~ ., df) %>% 
  step_sqrt(el_rides) %>% 
  step_center(el_rides) %>% 
  step_scale(el_rides) %>% 
  prep()

df_processed_tbl <- bake(rec_obj, df)

# View(df_processed_tbl)

center_history <- rec_obj$steps[[2]]$means["el_rides"]
scale_history  <- rec_obj$steps[[3]]$sds["el_rides"]

c("center" = center_history, "scale" = scale_history)

## center.el_rides  scale.el_rides 
##       21.464909        3.464935

Functions used to build out the LSTM infrastructure.

# create the matrix
build_matrix <- function(tseries, overall_timesteps) {
  t(sapply(1:(length(tseries) - overall_timesteps + 1),
           function(x) 
             tseries[x:(x + overall_timesteps - 1)]
           )
    )
  }


# update the dimensiions
reshape_X_3d <- function(X) {
  dim(X) <- c(dim(X)[1],
              dim(X)[2],
              1
              )
  X
  }

Extract el_rides from data frames.

train_vals <- df_processed_tbl %>% 
  filter(key == "01_training") %>%
  select(el_rides) %>%
  pull()

valid_vals <- df_processed_tbl %>%
  filter(key == "02_validation") %>%
  select(el_rides) %>%
  pull()

test_vals <- df_processed_tbl %>%
  filter(key == "03_testing") %>%
  select(el_rides) %>%
  pull()

Here I shape the data for LSTM modeling.

# From [http://www.business-science.io/timeseries-analysis/2018/07/01/keras-lstm-sunspots-part2.html](http://www.business-science.io/timeseries-analysis/2018/07/01/keras-lstm-sunspots-part2.html)
# Keras LSTM expects the input as well as the target data to be in a specific shape. The input has to be a 3-d array of size num_samples, num_timesteps, num_features.
# 
    # Num_samples is the number of observations in the set. This will get fed to the model in portions of batch_size. 
    # The second dimension, num_timesteps, is the length of the hidden state we were talking about above. 
    # Finally, the third dimension is the number of predictors we’re using. For univariate time series, this is 1.

n_timesteps <- 7
n_predictions <- n_timesteps
batch_size <- 28


###### build the windowed matrices
train_matrix <- build_matrix(train_vals, n_timesteps + n_predictions)
valid_matrix <- build_matrix(valid_vals, n_timesteps + n_predictions)
test_matrix <- build_matrix(test_vals, n_timesteps + n_predictions)


###### separate matrices into training and testing parts
# also, discard last batch if there are fewer than batch_size samples (a purely technical requirement)
X_train <- train_matrix[, 1:n_timesteps]
y_train <- train_matrix[, (n_timesteps + 1):(n_timesteps * 2)]
X_train <- X_train[1:(nrow(X_train) %/% batch_size * batch_size), ]
y_train <- y_train[1:(nrow(y_train) %/% batch_size * batch_size), ]

X_valid <- valid_matrix[, 1:n_timesteps]
y_valid <- valid_matrix[, (n_timesteps + 1):(n_timesteps * 2)]
X_valid <- X_valid[1:(nrow(X_valid) %/% batch_size * batch_size), ]
y_valid <- y_valid[1:(nrow(y_valid) %/% batch_size * batch_size), ]

X_test <- test_matrix[, 1:n_timesteps]
y_test <- test_matrix[, (n_timesteps + 1):(n_timesteps * 2)]
X_test <- X_test[1:(nrow(X_test) %/% batch_size * batch_size), ]
y_test <- y_test[1:(nrow(y_test) %/% batch_size * batch_size), ]


###### add on the required third axis
X_train <- reshape_X_3d(X_train)
X_valid <- reshape_X_3d(X_valid)
X_test <- reshape_X_3d(X_test)

y_train <- reshape_X_3d(y_train)
y_valid <- reshape_X_3d(y_valid)
y_test <- reshape_X_3d(y_test)

Now, loop through various parameters settings.

# devtools::install_github("rstudio/keras")
# install_keras()  # should only have to install once
# user  system elapsed 
#   4.610   5.864 700.531
# ~ 12 min
start <- proc.time()

tot_cores <- detectCores()
cl <- makeCluster(tot_cores - 1)
registerDoParallel(cl)

# 3 * 3 * 2 * 2 * 2 = 72 different combinations tried
# set the outer-most-loop with lrn_rate as it has the most number of possibilities

keras_parameter_tuning_trials <-
  foreach(lrn_rate = seq(0.001, 0.003, 0.001), # seq(0.00, 0.003, 0.001)
          .packages = c("tfruns", "keras", "tidyverse", "purrr", "yardstick"),
          .combine = c,
          .verbose = FALSE
          ) %:%
  foreach(decay = seq(0.001, 0.003, 0.001), # seq(0.002, 0.003, 0.001)
          packages = c("tfruns", "keras", "tidyverse", "purrr", "yardstick"),
          .combine = c,
          .verbose = FALSE
          ) %:% 
  foreach(drpout = seq(0.1, 0.2, 0.1), # seq(0.0, 0.2, 0.1)
          packages = c("tfruns", "keras", "tidyverse", "purrr", "yardstick"),
          .combine = c,
          .verbose = FALSE
          ) %:% 
  foreach(rec_drpout = seq(0.1, 0.2, 0.1), # seq(0.0, 0.2, 0.1)
          .packages = c("tfruns", "keras", "tidyverse", "purrr", "yardstick"),
          .combine = c,
          .verbose = FALSE
          ) %:%
  foreach(momntm = seq(0.8, 0.9, 0.1), # eq(0.7, 0.9, 0.1)
          packages = c("tfruns", "keras", "tidyverse", "purrr", "yardstick"),
          .combine = c,
          .verbose = FALSE
          ) %dopar% {
            
###### Build the LSTM Model
FLAGS <- flags(
  # There is a so-called "stateful LSTM" in Keras. While LSTM is stateful per se,
  # this adds a further tweak where the hidden states get initialized with values 
  # from the item at same position in the previous batch.
  # This is helpful just under specific circumstances, or if you want to create an
  # "infinite stream" of states, in which case you'd use 1 as the batch size.
  # Below, we show how the code would have to be changed to use this, but it won't be further
  # discussed here.
  flag_boolean("stateful", FALSE),
  # Should we use several layers of LSTM?
  # Again, just included for completeness, it did not yield any superior performance on this task.
  # This will actually stack exactly one additional layer of LSTM units.
  flag_boolean("stack_layers", FALSE),
  # number of samples fed to the model in one go
  flag_integer("batch_size", batch_size),  # 28  --  # 180
  # size of the hidden state, equals size of predictions
  flag_integer("n_timesteps", n_timesteps), # 7
  # how many epochs to train for
  flag_integer("n_epochs", 100),
  # loss function. Found to work better for this specific case than mean squared error
  flag_string("loss", "logcosh"),  # mape
  # optimizer = stochastic gradient descent. Seemed to work better than adam or rmsprop here
  # (as indicated by limited testing)
  flag_string("optimizer_type", "adam"),  # sgd
  # size of the LSTM layer
  flag_integer("n_units", 128),
  # parameter to the early stopping callback
  flag_integer("patience", 10),
  
  # learning rate
  flag_numeric("lr", 0.003), # 0.003  lrn_rate
  # learning rate decay
  flag_numeric("decay", 0.003), # 0.003  decay
  # fraction of the units to drop for the linear transformation of the inputs
  flag_numeric("dropout", 0.2), # 0.2  drpout
  # fraction of the units to drop for the linear transformation of the recurrent state
  flag_numeric("recurrent_dropout", 0.2), # 0.2  rec_drpout
  # momentum, an additional parameter to the SGD optimizer
  flag_numeric("momentum", 0.9) # 0.9  momntm
)

# the number of predictions we'll make equals the length of the hidden state
n_predictions <- FLAGS$n_timesteps

# how many features = predictors we have
n_features <- 1

# just in case we wanted to try different optimizers, we could add here
optimizer <- switch(FLAGS$optimizer_type,
                    sgd = optimizer_sgd(lr = FLAGS$lr,
                                        momentum = FLAGS$momentum
                                        ),
                    adam = optimizer_adam(lr = FLAGS$lr,
                                          decay = FLAGS$decay
                                          )
                    )

# callbacks to be passed to the fit() function
# We just use one here: we may stop before n_epochs if the loss on the validation set
# does not decrease (by a configurable amount, over a configurable time)
callbacks <-
  list(callback_early_stopping(patience = FLAGS$patience)
       )


##### Run longer version
model <- keras_model_sequential()

model %>%
  layer_lstm(
    units = FLAGS$n_units,
    batch_input_shape  = c(FLAGS$batch_size, FLAGS$n_timesteps, n_features),
    dropout = FLAGS$dropout,
    recurrent_dropout = FLAGS$recurrent_dropout,
    return_sequences = TRUE,
    stateful = FLAGS$stateful
  )

if (FLAGS$stack_layers) {
  model %>%
    layer_lstm(
      units = FLAGS$n_units,
      dropout = FLAGS$dropout,
      recurrent_dropout = FLAGS$recurrent_dropout,
      return_sequences = TRUE,
      stateful = FLAGS$stateful
    )
  }

model %>% time_distributed(layer_dense(units = 1))


# summary(model)


##### Root Mean Squared Error (needed for accuracy stats)
# this must be defined within the Keras loop
rmse_metric <-
  custom_metric("rmse_metric",
                function(y_true, y_pred) {
                  K <- backend()
                  K$sqrt(K$mean(K$square(K$relu(y_pred - y_true)
                                         )
                                )
                         )
                  }
                )


model %>%
  compile(
    loss = FLAGS$loss,
    optimizer = optimizer,
    metrics = list(mape = "mean_absolute_percentage_error",
                   mse = "mean_squared_error",
                   rmse_metric
                   )
  )

if (!FLAGS$stateful) {
  history <-
    model %>% 
    fit(
      x = X_train,
      y = y_train,
      validation_data = list(X_valid, y_valid),
      batch_size = FLAGS$batch_size,
      epochs     = FLAGS$n_epochs,
      callbacks = callbacks,
      verbose = 0
      )
  } else {
    history <-
      for (i in 1:FLAGS$n_epochs) {
        model %>% 
          fit(
            x = X_train,
            y = y_train,
            validation_data = list(X_valid, y_valid),
            callbacks = callbacks,
            batch_size = FLAGS$batch_size,
            epochs = 1,
            shuffle = FALSE,
            verbose = 0
            )
        
        model %>% reset_states()
        }
    }

if (FLAGS$stateful)
  model %>% reset_states()


# plot(history)
# plot(history, metrics = "loss")
# plot(history, metrics = "rmse_metric")


##### Calculate MAPE
## Training Set
pred_train <- model %>%
  predict(X_train, batch_size = FLAGS$batch_size) %>%
  .[, , 1]

# Retransform values to original scale
pred_train_norm <- (pred_train * scale_history + center_history) ^2

compare_train <- df %>% filter(key == "01_training")


# build a dataframe that has both actual and predicted values
for (i in 1:nrow(pred_train_norm)
     ) {
  varname <- paste0("pred_train", i)
  
  compare_train <-
    mutate(compare_train,
           !!varname := c(rep(NA,
                              FLAGS$n_timesteps + i - 1
                              ),
                          pred_train_norm[i,],
                          rep(NA, 
                              nrow(compare_train) - FLAGS$n_timesteps * 2 - i + 1
                              )
                          )
           )
  }

coln <- colnames(compare_train)[4:ncol(compare_train)]
cols <- coln %>% 
  map(~ sym(.x)
      )

# mape_train <-
#   map_dbl(cols,
#           function(col) {
#             data = compare_train %>%
#               mutate(#error = el_rides - !!col,
#                      pct_error = el_rides - !!col / el_rides * 100
#                      )
# 
#             mape = data$pct_error %>%
#               abs() %>%
#               mean(na.rm = TRUE)
#             
#             # rmse = (data$error)^2 %>% 
#             #   mean(na.rm = TRUE) %>% 
#             #   sqrt()
# 
#             # return(list(mape_ = mape,
#             #             rmse_ = rmse
#             #             )
#             #        )
#             return(mape)
#             }
#           ) %>%
#   mean()
# 
# mape_train

rmse_train <-
  map(cols,
      function(col)
        rmse(data = compare_train,
             truth = el_rides,
             estimate = !!col,
             na.rm = TRUE
             ) %>% 
        select(.estimate)
      ) %>% 
  bind_rows()

rmse_train <- mean(rmse_train$.estimate, na.rm = TRUE)

rmse_train

name <- paste0("learningrate_",
               lrn_rate,
               "___",
               "decay_",
               decay,
               "___",
               "dropout_",
               drpout,
               "___",
               "recurrentdropout_",
               rec_drpout,
               "___",
               "momentum_",
               momntm
               )


rmse_train_list <- list()
rmse_train_list[[name]] <- rmse_train

return(rmse_train_list)
    }


stopCluster(cl)
rm(cl)

runtime_dopar <- proc.time() - start
runtime_dopar

##    user  system elapsed 
##   3.502   5.789 865.858

rm(start)

Munge to get a more presentable listing (as a dataframe) of models tried.

rmse_vals_list <-
  pmap(.l = list(a = keras_parameter_tuning_trials,
                 b = names(keras_parameter_tuning_trials)
                 ),
       .f = function(a, b) {
         rmse_val = a
         
         df = data.frame(parameters = b,
                         rmse = rmse_val
                         )
         
         return(df)
         }
       )

rmse_vals_df <-
  bind_rows(rmse_vals_list) %>% 
  separate(col = parameters,
           into = c("learn_rate",
                    "decay",
                    "dropout",
                    "recurrent_dropout",
                    "momentum"
                    ),
           sep = "___",
           remove = FALSE
           ) %>% 
  separate(col = learn_rate,
           into = c("lrn_rate",
                    "lrn_rate_value"
                    ),
           sep = "_"
           ) %>% 
  separate(col = decay,
           into = c("decay",
                    "decay_value"
                    ),
           sep = "_"
           ) %>% 
  separate(col = dropout,
           into = c("drpout",
                    "drpout_value"
                    ),
           sep = "_"
           ) %>% 
  separate(col = recurrent_dropout,
           into = c("rec_drpout",
                    "rec_drpout_value"
                    ),
           sep = "_"
           ) %>% 
  separate(col = momentum,
           into = c("momtum",
                    "momtum_value"
                    ),
           sep = "_"
           )


runtime_dopar

##    user  system elapsed 
##   3.502   5.789 865.858

# View(rmse_vals_df)
rmse_vals_df %>% 
  arrange(rmse)

View(rmse_vals_df %>% 
       arrange(rmse)
     )


rm(rmse_vals_list)

Now, run the model using the best parameter settings.

Setup the model.

# use the best parameter settings
best_settings <-
  rmse_vals_df %>% 
  filter(rmse == min(rmse)
        )

saveRDS(best_settings,
        paste0(wd,
               "/Models/",
               "best_settings.Rds"
               )
        )

# best_settings <-
#   readRDS(paste0(wd,
#                  "/Models/",
#                  "best_settings.Rds"
#                  )
#           )


###### Build the LSTM Model

# Update FLAG parameters with `best_settings` values
FLAGS <- flags(
  # There is a so-called "stateful LSTM" in Keras. While LSTM is stateful per se,
  # this adds a further tweak where the hidden states get initialized with values 
  # from the item at same position in the previous batch.
  # This is helpful just under specific circumstances, or if you want to create an
  # "infinite stream" of states, in which case you'd use 1 as the batch size.
  # Below, we show how the code would have to be changed to use this, but it won't be further
  # discussed here.
  flag_boolean("stateful", FALSE),
  # Should we use several layers of LSTM?
  # Again, just included for completeness, it did not yield any superior performance on this task.
  # This will actually stack exactly one additional layer of LSTM units.
  flag_boolean("stack_layers", FALSE),
  # number of samples fed to the model in one go
  flag_integer("batch_size", batch_size),  # 28  # 180
  # size of the hidden state, equals size of predictions
  flag_integer("n_timesteps", n_timesteps),  # 7
  # how many epochs to train for
  flag_integer("n_epochs", 100),
  # loss function. Found to work better for this specific case than mean squared error
  flag_string("loss", "logcosh"),  # mape
  # optimizer = stochastic gradient descent. Seemed to work better than adam or rmsprop here
  # (as indicated by limited testing)
  flag_string("optimizer_type", "adam"),  # sgd
  # size of the LSTM layer
  flag_integer("n_units", 128),
  # parameter to the early stopping callback
  flag_integer("patience", 10),
  
  # learning rate
  flag_numeric("lr", best_settings$lrn_rate_value), # 0.003  lrn_rate
  # learning rate decay
  flag_numeric("decay", best_settings$decay_value), # 0.003  decay
  # fraction of the units to drop for the linear transformation of the inputs
  flag_numeric("dropout", best_settings$drpout_value), # 0.2  drpout
  # fraction of the units to drop for the linear transformation of the recurrent state
  flag_numeric("recurrent_dropout", best_settings$rec_drpout_value), # 0.2  rec_drpout
  # momentum, an additional parameter to the SGD optimizer
  flag_numeric("momentum", best_settings$momtum_value) # 0.9  momntm
  )


# the number of predictions we'll make equals the length of the hidden state
n_predictions <- FLAGS$n_timesteps

# how many features = predictors we have
n_features <- 1

# just in case we wanted to try different optimizers, we could add here
optimizer <- switch(FLAGS$optimizer_type,
                    sgd = optimizer_sgd(lr = FLAGS$lr,
                                        momentum = FLAGS$momentum
                                        ),
                    adam = optimizer_adam(lr = FLAGS$lr,
                                          decay = FLAGS$decay
                                          )
                    )

# callbacks to be passed to the fit() function
# We just use one here: we may stop before n_epochs if the loss on the validation set
# does not decrease (by a configurable amount, over a configurable time)
callbacks <-
  list(callback_early_stopping(patience = FLAGS$patience)
       )

Run the model.

model <- keras_model_sequential()

model %>%
  layer_lstm(
    units = FLAGS$n_units,
    batch_input_shape  = c(FLAGS$batch_size, FLAGS$n_timesteps, n_features),
    dropout = FLAGS$dropout,
    recurrent_dropout = FLAGS$recurrent_dropout,
    return_sequences = TRUE,
    stateful = FLAGS$stateful
  )

if (FLAGS$stack_layers) {
  model %>%
    layer_lstm(
      units = FLAGS$n_units,
      dropout = FLAGS$dropout,
      recurrent_dropout = FLAGS$recurrent_dropout,
      return_sequences = TRUE,
      stateful = FLAGS$stateful
    )
}

model %>% time_distributed(layer_dense(units = 1))

# Root Mean Squared Error (needed for accuracy stats)
rmse_metric <-
  custom_metric("rmse_metric",
                function(y_true, y_pred) {
                  K <- backend()
                  K$sqrt(K$mean(K$square(K$relu(y_pred - y_true)
                                         )
                                )
                         )
                  }
                )

model %>%
  compile(
    loss = FLAGS$loss,
    optimizer = optimizer,
    metrics = list(mape = "mean_absolute_percentage_error",
                   mse = "mean_squared_error",
                   rmse_metric
                   )
  )

if (!FLAGS$stateful) {
  history <-
    model %>% 
    fit(
      x = X_train,
      y = y_train,
      validation_data = list(X_valid, y_valid),
      batch_size = FLAGS$batch_size,
      epochs     = FLAGS$n_epochs,
      callbacks = callbacks,
      verbose = 0
      )
  } else {
    history <-
      for (i in 1:FLAGS$n_epochs) {
        model %>% 
          fit(
            x = X_train,
            y = y_train,
            validation_data = list(X_valid, y_valid),
            callbacks = callbacks,
            batch_size = FLAGS$batch_size,
            epochs = 1,
            shuffle = FALSE,
            verbose = 0
            )
        
        model %>% reset_states()
        }
    }

if (FLAGS$stateful)
  model %>% reset_states()

summary(model)

## ___________________________________________________________________________
## Layer (type)                     Output Shape                  Param #     
## ===========================================================================
## lstm_1 (LSTM)                    (28, 7, 128)                  66560       
## ___________________________________________________________________________
## time_distributed_1 (TimeDistribu (28, 7, 1)                    129         
## ===========================================================================
## Total params: 66,689
## Trainable params: 66,689
## Non-trainable params: 0
## ___________________________________________________________________________

plot(history)

plot(history, metrics = "loss")

plot(history, metrics = "rmse_metric")

save_model_hdf5(object = model,
                filepath = paste0(wd,
                                  "/Models/",
                                  'keras_optimal_model.h5'
                                  )
                )

# model <-
#   load_model_hdf5(filepath = paste0(wd,
#                                     "/Models/",
#                                     'keras_optimal_model.h5'
#                                     )
#                   )

Calculate accuracy (MAPE and RMSE) stats.

## Training Set
pred_train <- model %>%
  predict(X_train, batch_size = FLAGS$batch_size) %>%
  .[, , 1]

# Retransform values to original scale
pred_train_norm <- (pred_train * scale_history + center_history) ^2

compare_train <- df %>% filter(key == "01_training")


# build a dataframe that has both actual and predicted values
for (i in 1:nrow(pred_train_norm)
     ) {
  varname <- paste0("pred_train", i)
  
  compare_train <-
    mutate(compare_train,
           !!varname := c(rep(NA,
                              FLAGS$n_timesteps + i - 1
                              ),
                          pred_train_norm[i,],
                          rep(NA, 
                              nrow(compare_train) - FLAGS$n_timesteps * 2 - i + 1
                              )
                          )
           )
  }

coln <- colnames(compare_train)[4:ncol(compare_train)]
cols <- coln %>% 
  map(~ sym(.x)
      )

# MAPE
mape_train <-
  map_dbl(cols,
          function(col) {
            data = compare_train %>%
              mutate(pct_error = el_rides - !!col / el_rides * 100
                     )

            mape = data$pct_error %>%
              abs() %>%
              mean(na.rm = TRUE)

            return(mape)
            }
          ) %>%
  mean()


# RMSE
rmse_train <-
  map(cols,
      function(col)
        rmse(data = compare_train,
             truth = el_rides,
             estimate = !!col,
             na.rm = TRUE
             ) %>% 
        select(.estimate)
      ) %>% 
  bind_rows()

rmse_train <- mean(rmse_train$.estimate, na.rm = TRUE)

rmse_train

## [1] 63.94936

### Calculate on test data
pred_test <- model %>% 
  predict(X_test, batch_size = FLAGS$batch_size) %>% 
  .[, , 1]

# Retransform values to original scale
pred_test <- (pred_test * scale_history + center_history) ^2

compare_test <- df %>% filter(key == "03_testing")

# build a dataframe that has both actual and predicted values
for (i in 1:nrow(pred_test)
     ) {
  varname <- paste0("pred_test", i)
  compare_test <-
    mutate(compare_test,
           !!varname := c(rep(NA,
                              FLAGS$n_timesteps + i - 1
                              ),
                          pred_test[i,],
                          rep(NA,
                              nrow(compare_test) - FLAGS$n_timesteps * 2 - i + 1
                              )
                          )
           )
  }


coln <- colnames(compare_test)[4:ncol(compare_test)]
cols <- coln %>% 
  map(~ sym(.x)
      )

# MAPE
mape_test <-
  map_dbl(cols,
          function(col) {
            data = compare_test %>%
              mutate(pct_error = el_rides - !!col / el_rides * 100
                     )

            mape = data$pct_error %>%
              abs() %>%
              mean(na.rm = TRUE)

            return(mape)
            }
          ) %>%
  mean()


# RMSE
rmse_test <-
  map(cols,
      function(col)
        rmse(data = compare_test,
             truth = el_rides,
             estimate = !!col,
             na.rm = TRUE
             ) %>% 
        select(.estimate)
      ) %>% 
  bind_rows()

rmse_test <- mean(rmse_test$.estimate, na.rm = TRUE)

rmse_test

## [1] 65.49447

message("mape_train")

## mape_train

mape_train

## [1] 371.8885

message("mape_test")

## mape_test

mape_test

## [1] 377.3498

message("rmse_train")

## rmse_train

rmse_train

## [1] 63.94936

message("rmse_test")

## rmse_test

rmse_test

## [1] 65.49447

# View(compare_train)
# View(compare_test)

Plot actual vs. prediction on test data.

ggplot(compare_test,
       aes(x = el_date, 
           y = el_rides
           )
       ) +
  geom_line() +
  geom_line(aes(y = pred_test1), color = "cyan", na.rm = TRUE) +
  geom_line(aes(y = pred_test30), color = "red", na.rm = TRUE) +
  geom_line(aes(y = pred_test60), color = "green", na.rm = TRUE) +
  geom_line(aes(y = pred_test90), color = "violet", na.rm = TRUE) +
  geom_line(aes(y = pred_test120), color = "cyan", na.rm = TRUE) +
  geom_line(aes(y = pred_test150), color = "red", na.rm = TRUE) +
  geom_line(aes(y = pred_test180), color = "green", na.rm = TRUE) +
  labs(title = "Keras LSTM Predictions on Test Set",
       subtitle = "L Stop ID 40600:  split 13 of 13"
       ) +
  theme_minimal() +
  NULL

Run the LSTM model for each split.

# Build the function
obtain_predictions <-
  function(split) {
    # split into train (2/3 of original train data), validation (1/3 of original train data), and test (original validation data)
    keras_split_pt <-
      ((analysis(split) %>% 
          nrow()
        ) * (2/3)
       ) %>% 
      round(digits = 0)
    
    df_trn <-
      analysis(split)[1:keras_split_pt,
                       ,
                      drop = FALSE
                      ] %>% 
      select(el_date, el_rides)
    
    df_val <-
      analysis(split)[(keras_split_pt + 1):nrow(analysis(split)
                                                ),
                       ,
                      drop = FALSE
                      ] %>% 
      select(el_date, el_rides)
    
    df_tst <- assessment(split) %>% 
      select(el_date, el_rides)
    
    
    df <-
      bind_rows(df_trn %>% add_column(key = "01_training"),
                df_val %>% add_column(key = "02_validation"),
                df_tst %>% add_column(key = "03_testing")
                ) %>% 
      as_tbl_time(index = el_date) %>% 
      select(el_date,
             key,
             el_rides
             )
    
    
    ##### Center & Scaling the data
    rec_obj <- recipe(el_rides ~ ., df) %>% 
      step_sqrt(el_rides) %>% 
      step_center(el_rides) %>% 
      step_scale(el_rides) %>% 
      prep()
    
    df_processed_tbl <- bake(rec_obj, df)
    
    center_history <- rec_obj$steps[[2]]$means["el_rides"]
    scale_history  <- rec_obj$steps[[3]]$sds["el_rides"]
    
    
    ###### Build the LSTM Model
    FLAGS <- FLAGS
    
    n_predictions <- FLAGS$n_timesteps
    n_features <- 1
    
    optimizer <- switch(FLAGS$optimizer_type,
                        sgd = optimizer_sgd(lr = FLAGS$lr,
                                            momentum = FLAGS$momentum
                                            ),
                        adam = optimizer_adam(lr = FLAGS$lr,
                                              decay = FLAGS$decay
                                              )
                        )
    
    callbacks <- list(callback_early_stopping(patience = FLAGS$patience)
                      )
    
    
    ###### extract el_ridess from data frame
    train_vals <- df_processed_tbl %>% 
      filter(key == "01_training") %>%
      select(el_rides) %>% 
      pull()
    
    valid_vals <- df_processed_tbl %>% 
      filter(key == "02_validation") %>% 
      select(el_rides) %>% 
      pull()
    
    test_vals <- df_processed_tbl %>% 
      filter(key == "03_testing") %>% 
      select(el_rides) %>% 
      pull()
    
    
    train_matrix <- build_matrix(train_vals, FLAGS$n_timesteps + n_predictions)
    valid_matrix <- build_matrix(valid_vals, FLAGS$n_timesteps + n_predictions)
    test_matrix <- build_matrix(test_vals, FLAGS$n_timesteps + n_predictions)
    
    
    ###### separate matrices into training and testing parts
    # also, discard last batch if there are fewer than batch_size samples (a purely technical requirement)
    # nrow(X_train) %/% 360 * 360
    # nrow(X_valid) %/% 360 * 360
    X_train <- train_matrix[, 1:FLAGS$n_timesteps]
    y_train <- train_matrix[, (FLAGS$n_timesteps + 1):(FLAGS$n_timesteps * 2)]
    X_train <- X_train[1:(nrow(X_train) %/% FLAGS$batch_size * FLAGS$batch_size), ]
    y_train <- y_train[1:(nrow(y_train) %/% FLAGS$batch_size * FLAGS$batch_size), ]
    
    X_valid <- valid_matrix[, 1:FLAGS$n_timesteps]
    y_valid <- valid_matrix[, (FLAGS$n_timesteps + 1):(FLAGS$n_timesteps * 2)]
    X_valid <- X_valid[1:(nrow(X_valid) %/% FLAGS$batch_size * FLAGS$batch_size), ]
    y_valid <- y_valid[1:(nrow(y_valid) %/% FLAGS$batch_size * FLAGS$batch_size), ]
    
    X_test <- test_matrix[, 1:FLAGS$n_timesteps]
    y_test <- test_matrix[, (FLAGS$n_timesteps + 1):(FLAGS$n_timesteps * 2)]
    X_test <- X_test[1:(nrow(X_test) %/% FLAGS$batch_size * FLAGS$batch_size), ]
    y_test <- y_test[1:(nrow(y_test) %/% FLAGS$batch_size * FLAGS$batch_size), ]
    
    
    X_train <- reshape_X_3d(X_train)
    X_valid <- reshape_X_3d(X_valid)
    X_test <- reshape_X_3d(X_test)
    
    y_train <- reshape_X_3d(y_train)
    y_valid <- reshape_X_3d(y_valid)
    y_test <- reshape_X_3d(y_test)
    
    
    ##### Run longer version
    model <- keras_model_sequential()
    
    model %>%
      layer_lstm(
        units = FLAGS$n_units,
        batch_input_shape  = c(FLAGS$batch_size, FLAGS$n_timesteps, n_features),
        dropout = FLAGS$dropout,
        recurrent_dropout = FLAGS$recurrent_dropout,
        return_sequences = TRUE,
        stateful = FLAGS$stateful
      )
    
    if (FLAGS$stack_layers) {
      model %>%
        layer_lstm(
          units = FLAGS$n_units,
          dropout = FLAGS$dropout,
          recurrent_dropout = FLAGS$recurrent_dropout,
          return_sequences = TRUE,
          stateful = FLAGS$stateful
        )
    }
    
    model %>% time_distributed(layer_dense(units = 1))
    
    # Root Mean Squared Error (needed for accuracy stats)
    # (must be defined within the Keras loop)
    rmse_metric <-
      custom_metric("rmse_metric",
                    function(y_true, y_pred) {
                      K <- backend()
                      K$sqrt(K$mean(K$square(K$relu(y_pred - y_true)
                                             )
                                    )
                      )
                      }
                  )
    
    model %>% 
      compile(
        loss = FLAGS$loss,
        optimizer = optimizer,
        metrics = list(mape = "mean_absolute_percentage_error",
                       mse = "mean_squared_error",
                       rmse_metric
                       )
        )
    
    if (!FLAGS$stateful) {
      history <- model %>% 
        fit(
          x = X_train,
          y = y_train,
          validation_data = list(X_valid, y_valid),
          batch_size = FLAGS$batch_size,
          epochs     = FLAGS$n_epochs,
          callbacks = callbacks,
          verbose = 0
          )
      } else {
        history <-
          for (i in 1:FLAGS$n_epochs) {
            model %>% 
              fit(
                x = X_train,
                y = y_train,
                validation_data = list(X_valid, y_valid),
                callbacks = callbacks,
                batch_size = FLAGS$batch_size,
                epochs = 1,
                shuffle = FALSE,
                verbose = 0
                )
            
            model %>% reset_states()
            }
        }
    
    if (FLAGS$stateful)
      model %>% reset_states()
    
    
    ## Get predictions
    pred_train <- model %>%
      predict(X_train, batch_size = FLAGS$batch_size) %>%
      .[, , 1]
    
    pred_valid <- model %>%
      predict(X_valid, batch_size = FLAGS$batch_size) %>%
      .[, , 1]
    
    pred_test <- model %>% 
      predict(X_test, batch_size = FLAGS$batch_size) %>% 
      .[, , 1]
    
    # Retransform values to original scale
    pred_train_norm <- (pred_train * scale_history + center_history) ^2
    pred_valid_norm <- (pred_valid * scale_history + center_history) ^2
    pred_test_norm <- (pred_test * scale_history + center_history) ^2
    
    compare_train <- df %>% filter(key == "01_training")
    compare_valid <- df %>% filter(key == "02_validation")
    compare_test <- df %>% filter(key == "03_testing")
    
    ##### build a dataframe that has both actual and predicted values
    for (i in 1:nrow(pred_train_norm)
         ) {
      varname <- paste0("pred_train", i)
      
      compare_train <-
        mutate(compare_train,
               !!varname := c(rep(NA,
                                  FLAGS$n_timesteps + i - 1
                                  ),
                              pred_train_norm[i,],
                              rep(NA, 
                                  nrow(compare_train) - FLAGS$n_timesteps * 2 - i + 1
                                  )
                              )
               )
      }
    
    for (i in 1:nrow(pred_valid_norm)
         ) {
      varname <- paste0("pred_valid", i)
      
      compare_valid <-
        mutate(compare_valid,
               !!varname := c(rep(NA,
                                  FLAGS$n_timesteps + i - 1
                                  ),
                              pred_valid_norm[i,],
                              rep(NA, 
                                  nrow(compare_valid) - FLAGS$n_timesteps * 2 - i + 1
                                  )
                              )
               )
      }
    
    for (i in 1:nrow(pred_test_norm)
         ) {
      varname <- paste0("pred_test", i)
      compare_test <-
        mutate(compare_test,
               !!varname := c(rep(NA,
                                  FLAGS$n_timesteps + i - 1
                                  ),
                              pred_test_norm[i,],
                              rep(NA,
                                  nrow(compare_test) - FLAGS$n_timesteps * 2 - i + 1
                                  )
                              )
               )
     }
    
    ##### final output
    list(train = compare_train,
         valid = compare_valid,
         test = compare_test
         )
    }

Run the function.

# best_settings <-
#   readRDS(paste0(wd,
#                  "/Data_Processed/",
#                  "best_settings.Rds"
#                  )
#           )

# using foreach on tot_cores - 1 (i.e., 3) cores
# user   system  elapsed 
#  21.834   16.848 2159.961 
# ~ 36 min

# user   system  elapsed 
# 2308.015  241.913 1852.001
# ~ 31 min

start <- proc.time()

# tot_cores <- detectCores()
# cl <- makeCluster(tot_cores - 1)
# registerDoParallel(cl)
# 
# all_split_preds <-
#   foreach(i = add_el_stop_id,
#           .packages = c("rsample",
#                         "tibbletime",
#                         "recipes",
#                         "tfruns",
#                         "keras",
#                         "tidyverse",
#                         "purrr",
#                         "yardstick"
#                         ),
#           .combine = c,
#           .verbose = FALSE
#           ) %dopar% {
#             dat = i %>%
#               mutate(predict = map(splits,
#                                    obtain_predictions
#                                    )
#                      )
# 
#             return(dat)
#             }

all_split_preds <-
  pmap(.l = list(a = add_el_stop_id),
       .f = function(a) {
         a %>%
           mutate(predict = map(splits,
                                obtain_predictions
                                )
                  )
         }
       )


runtime_keras_all <- proc.time() - start
runtime_keras_all

##     user   system  elapsed 
## 2402.160  252.783 1878.307

# stopCluster(cl)
# rm(cl)
rm(start)


saveRDS(all_split_preds,
        paste0(wd,
               "/Models/",
               "all_split_preds.Rds"
               )
        )

# all_split_preds <-
#   readRDS(paste0(wd,
#                  "/Models/",
#                  "all_split_preds.Rds"
#                  )
#           )


# View(all_split_preds$`40910`$predict)
# add_el_stop_id$`40910`$splits %>% length()

Calculate accuracy stats.

First, create the function to calculate the stats.

accuracy_stats <-
  function(df, stat_type) {
    coln <- colnames(df)[4:ncol(df)]
    cols <- coln %>% 
      map(~ sym(.x)
          )
    
    if(stat_type == "rmse") {
      result = map(cols,
                   function(col)
                     rmse(data = df,
                          truth = el_rides,
                          estimate = !!col,
                          na.rm = TRUE
                          ) %>% 
                     select(.estimate)
                   ) %>% 
        bind_rows()
      
      result = mean(result$.estimate, na.rm = TRUE)
      } else {
        result = map_dbl(cols,
                         function(col) {
                           data = df %>% 
                             mutate(pct_error = el_rides - !!col / el_rides * 100
                                    )
                           
                           mape_stat = data$pct_error %>% 
                             abs() %>% 
                             mean(na.rm = TRUE)
                           
                           return(mape_stat)
                           }
                         ) %>% 
          mean()
        }
    
    return(result)
    }

Munge the data to be able to run the accuracy_stats function.

all_split_preds_unnest <-
  all_split_preds %>% 
  map(~ unnest(.x,
               predict
               )
      )

names(all_split_preds_unnest$`40600`)

##  [1] "id"                        "start_date"               
##  [3] "h2o.interpolation"         "h2o.extrapolation"        
##  [5] "h2o.limvars.interpolation" "h2o.limvars.extrapolation"
##  [7] "arima.interpolation"       "arima_xreg.interpolation" 
##  [9] "prophet.interpolation"     "prophet_hol.interpolation"
## [11] "rf.interpolation"          "rf.extrapolation"         
## [13] "xgb.interpolation"         "xgb.extrapolation"        
## [15] "arima.extrapolation"       "arima_xreg.extrapolation" 
## [17] "prophet.extrapolation"     "prophet_hol.extrapolation"
## [19] "el_stop_id"                "predict"

all_split_preds_individ <-
  pmap(.l = list(a = all_split_preds_unnest),
       .f = function(a) {
         lngth_tot = length(a$predict)
         
         train_ = a[seq(1, lngth_tot - 2, by = 3), ] # train is the 1st of every 3 sets
         valid_ = a[seq(2, lngth_tot - 1, by = 3), ] # valid is the 2nd of every 3 sets
         test_ = a[seq(3, lngth_tot, by = 3), ] # test is the 3rd of every 3 sets

         return(list(train = train_,
                     valid = valid_,
                     test = test_
                     )
                )
         }
       )

Now I can run the accuracy_stats function to calculate the stats.

# user   system  elapsed 
# 2050.926  337.664 2625.468
# ~ 44 min
start <- proc.time()
all_split_rmse_mape <-
  pmap(.l = list(a = all_split_preds_individ),
       .f = function(a) {
         trn = a$train
         vld = a$valid
         tst = a$test
         
         data_trn = trn %>% 
           mutate(keras.interpolation_rmse =
                    map_dbl(predict,
                            ~ accuracy_stats(df = .x,
                                             stat_type = "rmse"
                                             )
                            ),
                  keras.interpolation_mape =
                    map_dbl(predict,
                            ~ accuracy_stats(df = .x,
                                             stat_type = "mape"
                                             )
                            )
                  ) %>% 
           select(-predict)
         
         data_vld = vld %>% 
           select(el_stop_id, id, start_date, predict) %>% 
           mutate(keras.extrapolation_rmse =
                    map_dbl(predict,
                            ~ accuracy_stats(df = .x,
                                             stat_type = "rmse"
                                             )
                            ),
                  keras.extrapolation_mape =
                    map_dbl(predict,
                            ~ accuracy_stats(df = .x,
                                             stat_type = "mape"
                                             )
                            )
                  ) %>% 
           select(-predict)
         
         data_tst = tst %>% 
           select(el_stop_id, id, start_date, predict) %>% 
           mutate(keras.test_rmse =
                    map_dbl(predict,
                            ~ accuracy_stats(df = .x,
                                             stat_type = "rmse"
                                             )
                            ),
                  keras.test_mape =
                    map_dbl(predict,
                            ~ accuracy_stats(df = .x,
                                             stat_type = "mape"
                                             )
                            )
                  ) %>% 
           select(-predict)
         
         full_data =
           data_trn %>% 
           left_join(y = data_vld,
                     by = c("el_stop_id" = "el_stop_id",
                            "id" = "id",
                            "start_date" = "start_date"
                            )
                     ) %>% 
           left_join(y = data_tst,
                     by = c("el_stop_id" = "el_stop_id",
                            "id" = "id",
                            "start_date" = "start_date"
                            )
                     )
         
         return(full_data)
         }
       )

runtime_accuracy_stats_train <- proc.time() - start
runtime_accuracy_stats_train

##     user   system  elapsed 
## 1941.338  311.683 2278.183

rm(start)


saveRDS(all_split_rmse_mape,
        paste0(wd,
               "/Models/",
               "all_split_rmse_mape.Rds"
               )
        )

Now I calculate the median and mean stats for each type of model.

mean_med_stats <-
  all_split_rmse_mape %>% 
  map(~ gather(.x,
               key = "model",
               value = "value",
               -id,
               -start_date,
               -el_stop_id
               ) %>% 
        filter(#!str_detect(model, "extrapolation"),
               !str_detect(model, "mape"), # mape was only used for Keras, whereas rmse was used in all models (including Keras)
               ) %>% 
        group_by(el_stop_id,
                 model
                 ) %>% 
        summarise(mean = mean(value, na.rm = TRUE),
                  median = median(value, na.rm = TRUE)
                  ) %>% 
        ungroup() %>% 
        arrange(el_stop_id,
                median
                ) %>% 
        mutate(stat_type = case_when(str_detect(model, "interpolation") ~ "interpolation",
                                     str_detect(model, "extrapolation") ~ "extrapolation",
                                     TRUE ~ "other"
                                     )#,
               # sdf = str_locate_all(model, "\\.")
               )
      ) %>% 
  bind_rows() %>% 
  group_by(stat_type,
           el_stop_id
           ) %>% 
  arrange(median) %>% 
  mutate(row_num = row_number()) %>% 
  ungroup() %>% 
  arrange(stat_type,
          el_stop_id,
          median
          )

mean_med_stats