Load the relevant libraries.
# rm(list = ls())
# .rs.restartR()
# data manipulation
library("plyr")
library("tidyverse")
library("magrittr")
library("data.table")
library("lubridate")
library("sqldf")
# time series specific packages
library("timetk")
library("zoo")
library("tibbletime")
# modeling
library("fpp2")
library("prophet")
library("caret")
library("randomForest")
library("xgboost")
library("h2o")
library("keras")
# use_session_with_seed(123456789) # setting the seed to obtain reproducible results
# see https://keras.rstudio.com/articles/faq.html#how-can-i-obtain-reproducible-results-using-keras-during-development and https://cran.r-project.org/web/packages/keras/vignettes/faq.html
# can also re-enable gpu and parallel processing by using: use_session_with_seed(42, disable_gpu = FALSE, disable_parallel_cpu = FALSE)
# other
library("geosphere") # specific for distance calculations from lat-lon pairs
library("naniar") # inspecting missing data
library("rlang") # building functions
library("recipes") # used in Keras modeling to design matrices
library("rsample") # rolling samples for validation stats
library("tfruns") # used in Keras modeling for trainin runs
library("stringr") # string manipulation
library("ggplot2") # viz
library("sweep") # more easily pull out model statistics
library("yardstick") # easily calculate accuracy stats
library("doParallel") # parallel processing
Session Info.
sessionInfo()
## R version 3.5.1 (2018-07-02)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS High Sierra 10.13.6
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] parallel stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] doParallel_1.0.14 iterators_1.0.10 foreach_1.4.4
## [4] yardstick_0.0.2 sweep_0.2.1.1 tfruns_1.4
## [7] rsample_0.0.3 recipes_0.1.4 rlang_0.3.0.1
## [10] naniar_0.4.1 geosphere_1.5-7 keras_2.2.4
## [13] h2o_3.20.0.8 xgboost_0.71.2 randomForest_4.6-14
## [16] caret_6.0-81 lattice_0.20-38 prophet_0.3.0.1
## [19] Rcpp_1.0.0 fpp2_2.3 expsmooth_2.3
## [22] fma_2.3 forecast_8.4 tibbletime_0.1.1
## [25] zoo_1.8-4 timetk_0.1.1.1 sqldf_0.4-11
## [28] RSQLite_2.1.1 gsubfn_0.7 proto_1.0.0
## [31] lubridate_1.7.4 data.table_1.11.8 magrittr_1.5
## [34] forcats_0.3.0 stringr_1.3.1 dplyr_0.7.8
## [37] purrr_0.2.5 readr_1.2.1 tidyr_0.8.2
## [40] tibble_1.4.2 ggplot2_3.1.0 tidyverse_1.2.1
## [43] plyr_1.8.4
##
## loaded via a namespace (and not attached):
## [1] colorspace_1.3-2 class_7.3-14 visdat_0.5.1
## [4] rprojroot_1.3-2 base64enc_0.1-3 rstudioapi_0.8
## [7] rstan_2.18.2 bit64_0.9-7 prodlim_2018.04.18
## [10] xml2_1.2.0 codetools_0.2-15 splines_3.5.1
## [13] knitr_1.20 zeallot_0.1.0 jsonlite_1.5
## [16] pROC_1.13.0 broom_0.5.0 compiler_3.5.1
## [19] httr_1.3.1 backports_1.1.2 assertthat_0.2.0
## [22] Matrix_1.2-15 lazyeval_0.2.1 cli_1.0.1
## [25] htmltools_0.3.6 prettyunits_1.0.2 tools_3.5.1
## [28] bindrcpp_0.2.2 gtable_0.2.0 glue_1.3.0
## [31] reshape2_1.4.3 cellranger_1.1.0 fracdiff_1.4-2
## [34] urca_1.3-0 debugme_1.1.0 nlme_3.1-137
## [37] lmtest_0.9-36 timeDate_3043.102 gower_0.1.2
## [40] ps_1.2.1 rvest_0.3.2 MASS_7.3-51.1
## [43] scales_1.0.0 ipred_0.9-8 hms_0.4.2
## [46] inline_0.3.15 yaml_2.2.0 quantmod_0.4-13
## [49] curl_3.2 reticulate_1.10 memoise_1.1.0
## [52] gridExtra_2.3 loo_2.0.0 StanHeaders_2.18.0
## [55] uroot_2.0-9 rpart_4.1-13 stringi_1.2.4
## [58] tensorflow_1.10 tseries_0.10-46 TTR_0.23-4
## [61] pkgbuild_1.0.2 lava_1.6.4 chron_2.3-53
## [64] bitops_1.0-6 pkgconfig_2.0.2 matrixStats_0.54.0
## [67] evaluate_0.12 bindr_0.1.1 bit_1.1-14
## [70] processx_3.2.0 tidyselect_0.2.5 R6_2.3.0
## [73] generics_0.0.2 DBI_1.0.0 whisker_0.3-2
## [76] pillar_1.3.0 haven_2.0.0 withr_2.1.2
## [79] xts_0.11-2 sp_1.3-1 RCurl_1.95-4.11
## [82] survival_2.43-3 nnet_7.3-12 modelr_0.1.2
## [85] crayon_1.3.4 rmarkdown_1.10 grid_3.5.1
## [88] readxl_1.1.0 blob_1.1.1 callr_3.0.0
## [91] ModelMetrics_1.2.2 digest_0.6.18 stats4_3.5.1
## [94] munsell_0.5.0 tcltk_3.5.1 quadprog_1.5-5
Setup the root directory.
Setting wd as the working directory.
wd <- getwd()
wd
## [1] "/Users/mdturse/Desktop/Analytics/Chicago_El_Divvy"
NOTE: add_el_stop_id the output produced in Step 06
add_el_stop_id <-
readRDS(paste0(wd,
"/Models/",
"add_el_stop_id.Rds"
)
)
el_stop_id.First, I create the train/val/test datasets.
example_split <- add_el_stop_id$`40600`$splits[[13]]
example_split_id <- add_el_stop_id$`40600`$id[[13]]
##### split into train (2/3 of original train data), validation (1/3 of original train data), and test (original validation data)
keras_split_pt <-
((analysis(example_split) %>%
nrow()
) * (2/3)
) %>%
round(digits = 0)
df_trn <-
analysis(example_split)[1:keras_split_pt,
,
drop = FALSE
] %>%
select(el_date, el_rides)
df_val <-
analysis(example_split)[(keras_split_pt + 1):nrow(analysis(example_split)
),
,
drop = FALSE
] %>%
select(el_date, el_rides)
df_tst <- assessment(example_split) %>%
select(el_date, el_rides)
rm(keras_split_pt)
df <- bind_rows(
df_trn %>% add_column(key = "01_training"),
df_val %>% add_column(key = "02_validation"),
df_tst %>% add_column(key = "03_testing")
) %>%
as_tbl_time(index = el_date) %>%
select(el_date,
key,
el_rides
)
## Warning: `list_len()` is soft-deprecated as of rlang 0.2.0.
## Please use `new_list()` instead
## This warning is displayed once per session.
# View(df)
df %>%
group_by(key) %>%
summarize(min_date = min(el_date),
max_date = max(el_date),
rows = n()
) %>%
ungroup()
# rm(list = ls(pattern = "df_"))
rm(list = ls(pattern = "example_split"))
Now I center and scale the data.
rec_obj <- recipe(el_rides ~ ., df) %>%
step_sqrt(el_rides) %>%
step_center(el_rides) %>%
step_scale(el_rides) %>%
prep()
df_processed_tbl <- bake(rec_obj, df)
# View(df_processed_tbl)
center_history <- rec_obj$steps[[2]]$means["el_rides"]
scale_history <- rec_obj$steps[[3]]$sds["el_rides"]
c("center" = center_history, "scale" = scale_history)
## center.el_rides scale.el_rides
## 21.464909 3.464935
Functions used to build out the LSTM infrastructure.
# create the matrix
build_matrix <- function(tseries, overall_timesteps) {
t(sapply(1:(length(tseries) - overall_timesteps + 1),
function(x)
tseries[x:(x + overall_timesteps - 1)]
)
)
}
# update the dimensiions
reshape_X_3d <- function(X) {
dim(X) <- c(dim(X)[1],
dim(X)[2],
1
)
X
}
Extract el_rides from data frames.
train_vals <- df_processed_tbl %>%
filter(key == "01_training") %>%
select(el_rides) %>%
pull()
valid_vals <- df_processed_tbl %>%
filter(key == "02_validation") %>%
select(el_rides) %>%
pull()
test_vals <- df_processed_tbl %>%
filter(key == "03_testing") %>%
select(el_rides) %>%
pull()
Here I shape the data for LSTM modeling.
# From [http://www.business-science.io/timeseries-analysis/2018/07/01/keras-lstm-sunspots-part2.html](http://www.business-science.io/timeseries-analysis/2018/07/01/keras-lstm-sunspots-part2.html)
# Keras LSTM expects the input as well as the target data to be in a specific shape. The input has to be a 3-d array of size num_samples, num_timesteps, num_features.
#
# Num_samples is the number of observations in the set. This will get fed to the model in portions of batch_size.
# The second dimension, num_timesteps, is the length of the hidden state we were talking about above.
# Finally, the third dimension is the number of predictors we’re using. For univariate time series, this is 1.
n_timesteps <- 7
n_predictions <- n_timesteps
batch_size <- 28
###### build the windowed matrices
train_matrix <- build_matrix(train_vals, n_timesteps + n_predictions)
valid_matrix <- build_matrix(valid_vals, n_timesteps + n_predictions)
test_matrix <- build_matrix(test_vals, n_timesteps + n_predictions)
###### separate matrices into training and testing parts
# also, discard last batch if there are fewer than batch_size samples (a purely technical requirement)
X_train <- train_matrix[, 1:n_timesteps]
y_train <- train_matrix[, (n_timesteps + 1):(n_timesteps * 2)]
X_train <- X_train[1:(nrow(X_train) %/% batch_size * batch_size), ]
y_train <- y_train[1:(nrow(y_train) %/% batch_size * batch_size), ]
X_valid <- valid_matrix[, 1:n_timesteps]
y_valid <- valid_matrix[, (n_timesteps + 1):(n_timesteps * 2)]
X_valid <- X_valid[1:(nrow(X_valid) %/% batch_size * batch_size), ]
y_valid <- y_valid[1:(nrow(y_valid) %/% batch_size * batch_size), ]
X_test <- test_matrix[, 1:n_timesteps]
y_test <- test_matrix[, (n_timesteps + 1):(n_timesteps * 2)]
X_test <- X_test[1:(nrow(X_test) %/% batch_size * batch_size), ]
y_test <- y_test[1:(nrow(y_test) %/% batch_size * batch_size), ]
###### add on the required third axis
X_train <- reshape_X_3d(X_train)
X_valid <- reshape_X_3d(X_valid)
X_test <- reshape_X_3d(X_test)
y_train <- reshape_X_3d(y_train)
y_valid <- reshape_X_3d(y_valid)
y_test <- reshape_X_3d(y_test)
Now, loop through various parameters settings.
# devtools::install_github("rstudio/keras")
# install_keras() # should only have to install once
# user system elapsed
# 4.610 5.864 700.531
# ~ 12 min
start <- proc.time()
tot_cores <- detectCores()
cl <- makeCluster(tot_cores - 1)
registerDoParallel(cl)
# 3 * 3 * 2 * 2 * 2 = 72 different combinations tried
# set the outer-most-loop with lrn_rate as it has the most number of possibilities
keras_parameter_tuning_trials <-
foreach(lrn_rate = seq(0.001, 0.003, 0.001), # seq(0.00, 0.003, 0.001)
.packages = c("tfruns", "keras", "tidyverse", "purrr", "yardstick"),
.combine = c,
.verbose = FALSE
) %:%
foreach(decay = seq(0.001, 0.003, 0.001), # seq(0.002, 0.003, 0.001)
packages = c("tfruns", "keras", "tidyverse", "purrr", "yardstick"),
.combine = c,
.verbose = FALSE
) %:%
foreach(drpout = seq(0.1, 0.2, 0.1), # seq(0.0, 0.2, 0.1)
packages = c("tfruns", "keras", "tidyverse", "purrr", "yardstick"),
.combine = c,
.verbose = FALSE
) %:%
foreach(rec_drpout = seq(0.1, 0.2, 0.1), # seq(0.0, 0.2, 0.1)
.packages = c("tfruns", "keras", "tidyverse", "purrr", "yardstick"),
.combine = c,
.verbose = FALSE
) %:%
foreach(momntm = seq(0.8, 0.9, 0.1), # eq(0.7, 0.9, 0.1)
packages = c("tfruns", "keras", "tidyverse", "purrr", "yardstick"),
.combine = c,
.verbose = FALSE
) %dopar% {
###### Build the LSTM Model
FLAGS <- flags(
# There is a so-called "stateful LSTM" in Keras. While LSTM is stateful per se,
# this adds a further tweak where the hidden states get initialized with values
# from the item at same position in the previous batch.
# This is helpful just under specific circumstances, or if you want to create an
# "infinite stream" of states, in which case you'd use 1 as the batch size.
# Below, we show how the code would have to be changed to use this, but it won't be further
# discussed here.
flag_boolean("stateful", FALSE),
# Should we use several layers of LSTM?
# Again, just included for completeness, it did not yield any superior performance on this task.
# This will actually stack exactly one additional layer of LSTM units.
flag_boolean("stack_layers", FALSE),
# number of samples fed to the model in one go
flag_integer("batch_size", batch_size), # 28 -- # 180
# size of the hidden state, equals size of predictions
flag_integer("n_timesteps", n_timesteps), # 7
# how many epochs to train for
flag_integer("n_epochs", 100),
# loss function. Found to work better for this specific case than mean squared error
flag_string("loss", "logcosh"), # mape
# optimizer = stochastic gradient descent. Seemed to work better than adam or rmsprop here
# (as indicated by limited testing)
flag_string("optimizer_type", "adam"), # sgd
# size of the LSTM layer
flag_integer("n_units", 128),
# parameter to the early stopping callback
flag_integer("patience", 10),
# learning rate
flag_numeric("lr", 0.003), # 0.003 lrn_rate
# learning rate decay
flag_numeric("decay", 0.003), # 0.003 decay
# fraction of the units to drop for the linear transformation of the inputs
flag_numeric("dropout", 0.2), # 0.2 drpout
# fraction of the units to drop for the linear transformation of the recurrent state
flag_numeric("recurrent_dropout", 0.2), # 0.2 rec_drpout
# momentum, an additional parameter to the SGD optimizer
flag_numeric("momentum", 0.9) # 0.9 momntm
)
# the number of predictions we'll make equals the length of the hidden state
n_predictions <- FLAGS$n_timesteps
# how many features = predictors we have
n_features <- 1
# just in case we wanted to try different optimizers, we could add here
optimizer <- switch(FLAGS$optimizer_type,
sgd = optimizer_sgd(lr = FLAGS$lr,
momentum = FLAGS$momentum
),
adam = optimizer_adam(lr = FLAGS$lr,
decay = FLAGS$decay
)
)
# callbacks to be passed to the fit() function
# We just use one here: we may stop before n_epochs if the loss on the validation set
# does not decrease (by a configurable amount, over a configurable time)
callbacks <-
list(callback_early_stopping(patience = FLAGS$patience)
)
##### Run longer version
model <- keras_model_sequential()
model %>%
layer_lstm(
units = FLAGS$n_units,
batch_input_shape = c(FLAGS$batch_size, FLAGS$n_timesteps, n_features),
dropout = FLAGS$dropout,
recurrent_dropout = FLAGS$recurrent_dropout,
return_sequences = TRUE,
stateful = FLAGS$stateful
)
if (FLAGS$stack_layers) {
model %>%
layer_lstm(
units = FLAGS$n_units,
dropout = FLAGS$dropout,
recurrent_dropout = FLAGS$recurrent_dropout,
return_sequences = TRUE,
stateful = FLAGS$stateful
)
}
model %>% time_distributed(layer_dense(units = 1))
# summary(model)
##### Root Mean Squared Error (needed for accuracy stats)
# this must be defined within the Keras loop
rmse_metric <-
custom_metric("rmse_metric",
function(y_true, y_pred) {
K <- backend()
K$sqrt(K$mean(K$square(K$relu(y_pred - y_true)
)
)
)
}
)
model %>%
compile(
loss = FLAGS$loss,
optimizer = optimizer,
metrics = list(mape = "mean_absolute_percentage_error",
mse = "mean_squared_error",
rmse_metric
)
)
if (!FLAGS$stateful) {
history <-
model %>%
fit(
x = X_train,
y = y_train,
validation_data = list(X_valid, y_valid),
batch_size = FLAGS$batch_size,
epochs = FLAGS$n_epochs,
callbacks = callbacks,
verbose = 0
)
} else {
history <-
for (i in 1:FLAGS$n_epochs) {
model %>%
fit(
x = X_train,
y = y_train,
validation_data = list(X_valid, y_valid),
callbacks = callbacks,
batch_size = FLAGS$batch_size,
epochs = 1,
shuffle = FALSE,
verbose = 0
)
model %>% reset_states()
}
}
if (FLAGS$stateful)
model %>% reset_states()
# plot(history)
# plot(history, metrics = "loss")
# plot(history, metrics = "rmse_metric")
##### Calculate MAPE
## Training Set
pred_train <- model %>%
predict(X_train, batch_size = FLAGS$batch_size) %>%
.[, , 1]
# Retransform values to original scale
pred_train_norm <- (pred_train * scale_history + center_history) ^2
compare_train <- df %>% filter(key == "01_training")
# build a dataframe that has both actual and predicted values
for (i in 1:nrow(pred_train_norm)
) {
varname <- paste0("pred_train", i)
compare_train <-
mutate(compare_train,
!!varname := c(rep(NA,
FLAGS$n_timesteps + i - 1
),
pred_train_norm[i,],
rep(NA,
nrow(compare_train) - FLAGS$n_timesteps * 2 - i + 1
)
)
)
}
coln <- colnames(compare_train)[4:ncol(compare_train)]
cols <- coln %>%
map(~ sym(.x)
)
# mape_train <-
# map_dbl(cols,
# function(col) {
# data = compare_train %>%
# mutate(#error = el_rides - !!col,
# pct_error = el_rides - !!col / el_rides * 100
# )
#
# mape = data$pct_error %>%
# abs() %>%
# mean(na.rm = TRUE)
#
# # rmse = (data$error)^2 %>%
# # mean(na.rm = TRUE) %>%
# # sqrt()
#
# # return(list(mape_ = mape,
# # rmse_ = rmse
# # )
# # )
# return(mape)
# }
# ) %>%
# mean()
#
# mape_train
rmse_train <-
map(cols,
function(col)
rmse(data = compare_train,
truth = el_rides,
estimate = !!col,
na.rm = TRUE
) %>%
select(.estimate)
) %>%
bind_rows()
rmse_train <- mean(rmse_train$.estimate, na.rm = TRUE)
rmse_train
name <- paste0("learningrate_",
lrn_rate,
"___",
"decay_",
decay,
"___",
"dropout_",
drpout,
"___",
"recurrentdropout_",
rec_drpout,
"___",
"momentum_",
momntm
)
rmse_train_list <- list()
rmse_train_list[[name]] <- rmse_train
return(rmse_train_list)
}
stopCluster(cl)
rm(cl)
runtime_dopar <- proc.time() - start
runtime_dopar
## user system elapsed
## 3.502 5.789 865.858
rm(start)
Munge to get a more presentable listing (as a dataframe) of models tried.
rmse_vals_list <-
pmap(.l = list(a = keras_parameter_tuning_trials,
b = names(keras_parameter_tuning_trials)
),
.f = function(a, b) {
rmse_val = a
df = data.frame(parameters = b,
rmse = rmse_val
)
return(df)
}
)
rmse_vals_df <-
bind_rows(rmse_vals_list) %>%
separate(col = parameters,
into = c("learn_rate",
"decay",
"dropout",
"recurrent_dropout",
"momentum"
),
sep = "___",
remove = FALSE
) %>%
separate(col = learn_rate,
into = c("lrn_rate",
"lrn_rate_value"
),
sep = "_"
) %>%
separate(col = decay,
into = c("decay",
"decay_value"
),
sep = "_"
) %>%
separate(col = dropout,
into = c("drpout",
"drpout_value"
),
sep = "_"
) %>%
separate(col = recurrent_dropout,
into = c("rec_drpout",
"rec_drpout_value"
),
sep = "_"
) %>%
separate(col = momentum,
into = c("momtum",
"momtum_value"
),
sep = "_"
)
runtime_dopar
## user system elapsed
## 3.502 5.789 865.858
# View(rmse_vals_df)
rmse_vals_df %>%
arrange(rmse)
View(rmse_vals_df %>%
arrange(rmse)
)
rm(rmse_vals_list)
Now, run the model using the best parameter settings.
Setup the model.
# use the best parameter settings
best_settings <-
rmse_vals_df %>%
filter(rmse == min(rmse)
)
saveRDS(best_settings,
paste0(wd,
"/Models/",
"best_settings.Rds"
)
)
# best_settings <-
# readRDS(paste0(wd,
# "/Models/",
# "best_settings.Rds"
# )
# )
###### Build the LSTM Model
# Update FLAG parameters with `best_settings` values
FLAGS <- flags(
# There is a so-called "stateful LSTM" in Keras. While LSTM is stateful per se,
# this adds a further tweak where the hidden states get initialized with values
# from the item at same position in the previous batch.
# This is helpful just under specific circumstances, or if you want to create an
# "infinite stream" of states, in which case you'd use 1 as the batch size.
# Below, we show how the code would have to be changed to use this, but it won't be further
# discussed here.
flag_boolean("stateful", FALSE),
# Should we use several layers of LSTM?
# Again, just included for completeness, it did not yield any superior performance on this task.
# This will actually stack exactly one additional layer of LSTM units.
flag_boolean("stack_layers", FALSE),
# number of samples fed to the model in one go
flag_integer("batch_size", batch_size), # 28 # 180
# size of the hidden state, equals size of predictions
flag_integer("n_timesteps", n_timesteps), # 7
# how many epochs to train for
flag_integer("n_epochs", 100),
# loss function. Found to work better for this specific case than mean squared error
flag_string("loss", "logcosh"), # mape
# optimizer = stochastic gradient descent. Seemed to work better than adam or rmsprop here
# (as indicated by limited testing)
flag_string("optimizer_type", "adam"), # sgd
# size of the LSTM layer
flag_integer("n_units", 128),
# parameter to the early stopping callback
flag_integer("patience", 10),
# learning rate
flag_numeric("lr", best_settings$lrn_rate_value), # 0.003 lrn_rate
# learning rate decay
flag_numeric("decay", best_settings$decay_value), # 0.003 decay
# fraction of the units to drop for the linear transformation of the inputs
flag_numeric("dropout", best_settings$drpout_value), # 0.2 drpout
# fraction of the units to drop for the linear transformation of the recurrent state
flag_numeric("recurrent_dropout", best_settings$rec_drpout_value), # 0.2 rec_drpout
# momentum, an additional parameter to the SGD optimizer
flag_numeric("momentum", best_settings$momtum_value) # 0.9 momntm
)
# the number of predictions we'll make equals the length of the hidden state
n_predictions <- FLAGS$n_timesteps
# how many features = predictors we have
n_features <- 1
# just in case we wanted to try different optimizers, we could add here
optimizer <- switch(FLAGS$optimizer_type,
sgd = optimizer_sgd(lr = FLAGS$lr,
momentum = FLAGS$momentum
),
adam = optimizer_adam(lr = FLAGS$lr,
decay = FLAGS$decay
)
)
# callbacks to be passed to the fit() function
# We just use one here: we may stop before n_epochs if the loss on the validation set
# does not decrease (by a configurable amount, over a configurable time)
callbacks <-
list(callback_early_stopping(patience = FLAGS$patience)
)
Run the model.
model <- keras_model_sequential()
model %>%
layer_lstm(
units = FLAGS$n_units,
batch_input_shape = c(FLAGS$batch_size, FLAGS$n_timesteps, n_features),
dropout = FLAGS$dropout,
recurrent_dropout = FLAGS$recurrent_dropout,
return_sequences = TRUE,
stateful = FLAGS$stateful
)
if (FLAGS$stack_layers) {
model %>%
layer_lstm(
units = FLAGS$n_units,
dropout = FLAGS$dropout,
recurrent_dropout = FLAGS$recurrent_dropout,
return_sequences = TRUE,
stateful = FLAGS$stateful
)
}
model %>% time_distributed(layer_dense(units = 1))
# Root Mean Squared Error (needed for accuracy stats)
rmse_metric <-
custom_metric("rmse_metric",
function(y_true, y_pred) {
K <- backend()
K$sqrt(K$mean(K$square(K$relu(y_pred - y_true)
)
)
)
}
)
model %>%
compile(
loss = FLAGS$loss,
optimizer = optimizer,
metrics = list(mape = "mean_absolute_percentage_error",
mse = "mean_squared_error",
rmse_metric
)
)
if (!FLAGS$stateful) {
history <-
model %>%
fit(
x = X_train,
y = y_train,
validation_data = list(X_valid, y_valid),
batch_size = FLAGS$batch_size,
epochs = FLAGS$n_epochs,
callbacks = callbacks,
verbose = 0
)
} else {
history <-
for (i in 1:FLAGS$n_epochs) {
model %>%
fit(
x = X_train,
y = y_train,
validation_data = list(X_valid, y_valid),
callbacks = callbacks,
batch_size = FLAGS$batch_size,
epochs = 1,
shuffle = FALSE,
verbose = 0
)
model %>% reset_states()
}
}
if (FLAGS$stateful)
model %>% reset_states()
summary(model)
## ___________________________________________________________________________
## Layer (type) Output Shape Param #
## ===========================================================================
## lstm_1 (LSTM) (28, 7, 128) 66560
## ___________________________________________________________________________
## time_distributed_1 (TimeDistribu (28, 7, 1) 129
## ===========================================================================
## Total params: 66,689
## Trainable params: 66,689
## Non-trainable params: 0
## ___________________________________________________________________________
plot(history)
plot(history, metrics = "loss")
plot(history, metrics = "rmse_metric")
save_model_hdf5(object = model,
filepath = paste0(wd,
"/Models/",
'keras_optimal_model.h5'
)
)
# model <-
# load_model_hdf5(filepath = paste0(wd,
# "/Models/",
# 'keras_optimal_model.h5'
# )
# )
Calculate accuracy (MAPE and RMSE) stats.
## Training Set
pred_train <- model %>%
predict(X_train, batch_size = FLAGS$batch_size) %>%
.[, , 1]
# Retransform values to original scale
pred_train_norm <- (pred_train * scale_history + center_history) ^2
compare_train <- df %>% filter(key == "01_training")
# build a dataframe that has both actual and predicted values
for (i in 1:nrow(pred_train_norm)
) {
varname <- paste0("pred_train", i)
compare_train <-
mutate(compare_train,
!!varname := c(rep(NA,
FLAGS$n_timesteps + i - 1
),
pred_train_norm[i,],
rep(NA,
nrow(compare_train) - FLAGS$n_timesteps * 2 - i + 1
)
)
)
}
coln <- colnames(compare_train)[4:ncol(compare_train)]
cols <- coln %>%
map(~ sym(.x)
)
# MAPE
mape_train <-
map_dbl(cols,
function(col) {
data = compare_train %>%
mutate(pct_error = el_rides - !!col / el_rides * 100
)
mape = data$pct_error %>%
abs() %>%
mean(na.rm = TRUE)
return(mape)
}
) %>%
mean()
# RMSE
rmse_train <-
map(cols,
function(col)
rmse(data = compare_train,
truth = el_rides,
estimate = !!col,
na.rm = TRUE
) %>%
select(.estimate)
) %>%
bind_rows()
rmse_train <- mean(rmse_train$.estimate, na.rm = TRUE)
rmse_train
## [1] 63.94936
### Calculate on test data
pred_test <- model %>%
predict(X_test, batch_size = FLAGS$batch_size) %>%
.[, , 1]
# Retransform values to original scale
pred_test <- (pred_test * scale_history + center_history) ^2
compare_test <- df %>% filter(key == "03_testing")
# build a dataframe that has both actual and predicted values
for (i in 1:nrow(pred_test)
) {
varname <- paste0("pred_test", i)
compare_test <-
mutate(compare_test,
!!varname := c(rep(NA,
FLAGS$n_timesteps + i - 1
),
pred_test[i,],
rep(NA,
nrow(compare_test) - FLAGS$n_timesteps * 2 - i + 1
)
)
)
}
coln <- colnames(compare_test)[4:ncol(compare_test)]
cols <- coln %>%
map(~ sym(.x)
)
# MAPE
mape_test <-
map_dbl(cols,
function(col) {
data = compare_test %>%
mutate(pct_error = el_rides - !!col / el_rides * 100
)
mape = data$pct_error %>%
abs() %>%
mean(na.rm = TRUE)
return(mape)
}
) %>%
mean()
# RMSE
rmse_test <-
map(cols,
function(col)
rmse(data = compare_test,
truth = el_rides,
estimate = !!col,
na.rm = TRUE
) %>%
select(.estimate)
) %>%
bind_rows()
rmse_test <- mean(rmse_test$.estimate, na.rm = TRUE)
rmse_test
## [1] 65.49447
message("mape_train")
## mape_train
mape_train
## [1] 371.8885
message("mape_test")
## mape_test
mape_test
## [1] 377.3498
message("rmse_train")
## rmse_train
rmse_train
## [1] 63.94936
message("rmse_test")
## rmse_test
rmse_test
## [1] 65.49447
# View(compare_train)
# View(compare_test)
Plot actual vs. prediction on test data.
ggplot(compare_test,
aes(x = el_date,
y = el_rides
)
) +
geom_line() +
geom_line(aes(y = pred_test1), color = "cyan", na.rm = TRUE) +
geom_line(aes(y = pred_test30), color = "red", na.rm = TRUE) +
geom_line(aes(y = pred_test60), color = "green", na.rm = TRUE) +
geom_line(aes(y = pred_test90), color = "violet", na.rm = TRUE) +
geom_line(aes(y = pred_test120), color = "cyan", na.rm = TRUE) +
geom_line(aes(y = pred_test150), color = "red", na.rm = TRUE) +
geom_line(aes(y = pred_test180), color = "green", na.rm = TRUE) +
labs(title = "Keras LSTM Predictions on Test Set",
subtitle = "L Stop ID 40600: split 13 of 13"
) +
theme_minimal() +
NULL
Run the LSTM model for each split.
# Build the function
obtain_predictions <-
function(split) {
# split into train (2/3 of original train data), validation (1/3 of original train data), and test (original validation data)
keras_split_pt <-
((analysis(split) %>%
nrow()
) * (2/3)
) %>%
round(digits = 0)
df_trn <-
analysis(split)[1:keras_split_pt,
,
drop = FALSE
] %>%
select(el_date, el_rides)
df_val <-
analysis(split)[(keras_split_pt + 1):nrow(analysis(split)
),
,
drop = FALSE
] %>%
select(el_date, el_rides)
df_tst <- assessment(split) %>%
select(el_date, el_rides)
df <-
bind_rows(df_trn %>% add_column(key = "01_training"),
df_val %>% add_column(key = "02_validation"),
df_tst %>% add_column(key = "03_testing")
) %>%
as_tbl_time(index = el_date) %>%
select(el_date,
key,
el_rides
)
##### Center & Scaling the data
rec_obj <- recipe(el_rides ~ ., df) %>%
step_sqrt(el_rides) %>%
step_center(el_rides) %>%
step_scale(el_rides) %>%
prep()
df_processed_tbl <- bake(rec_obj, df)
center_history <- rec_obj$steps[[2]]$means["el_rides"]
scale_history <- rec_obj$steps[[3]]$sds["el_rides"]
###### Build the LSTM Model
FLAGS <- FLAGS
n_predictions <- FLAGS$n_timesteps
n_features <- 1
optimizer <- switch(FLAGS$optimizer_type,
sgd = optimizer_sgd(lr = FLAGS$lr,
momentum = FLAGS$momentum
),
adam = optimizer_adam(lr = FLAGS$lr,
decay = FLAGS$decay
)
)
callbacks <- list(callback_early_stopping(patience = FLAGS$patience)
)
###### extract el_ridess from data frame
train_vals <- df_processed_tbl %>%
filter(key == "01_training") %>%
select(el_rides) %>%
pull()
valid_vals <- df_processed_tbl %>%
filter(key == "02_validation") %>%
select(el_rides) %>%
pull()
test_vals <- df_processed_tbl %>%
filter(key == "03_testing") %>%
select(el_rides) %>%
pull()
train_matrix <- build_matrix(train_vals, FLAGS$n_timesteps + n_predictions)
valid_matrix <- build_matrix(valid_vals, FLAGS$n_timesteps + n_predictions)
test_matrix <- build_matrix(test_vals, FLAGS$n_timesteps + n_predictions)
###### separate matrices into training and testing parts
# also, discard last batch if there are fewer than batch_size samples (a purely technical requirement)
# nrow(X_train) %/% 360 * 360
# nrow(X_valid) %/% 360 * 360
X_train <- train_matrix[, 1:FLAGS$n_timesteps]
y_train <- train_matrix[, (FLAGS$n_timesteps + 1):(FLAGS$n_timesteps * 2)]
X_train <- X_train[1:(nrow(X_train) %/% FLAGS$batch_size * FLAGS$batch_size), ]
y_train <- y_train[1:(nrow(y_train) %/% FLAGS$batch_size * FLAGS$batch_size), ]
X_valid <- valid_matrix[, 1:FLAGS$n_timesteps]
y_valid <- valid_matrix[, (FLAGS$n_timesteps + 1):(FLAGS$n_timesteps * 2)]
X_valid <- X_valid[1:(nrow(X_valid) %/% FLAGS$batch_size * FLAGS$batch_size), ]
y_valid <- y_valid[1:(nrow(y_valid) %/% FLAGS$batch_size * FLAGS$batch_size), ]
X_test <- test_matrix[, 1:FLAGS$n_timesteps]
y_test <- test_matrix[, (FLAGS$n_timesteps + 1):(FLAGS$n_timesteps * 2)]
X_test <- X_test[1:(nrow(X_test) %/% FLAGS$batch_size * FLAGS$batch_size), ]
y_test <- y_test[1:(nrow(y_test) %/% FLAGS$batch_size * FLAGS$batch_size), ]
X_train <- reshape_X_3d(X_train)
X_valid <- reshape_X_3d(X_valid)
X_test <- reshape_X_3d(X_test)
y_train <- reshape_X_3d(y_train)
y_valid <- reshape_X_3d(y_valid)
y_test <- reshape_X_3d(y_test)
##### Run longer version
model <- keras_model_sequential()
model %>%
layer_lstm(
units = FLAGS$n_units,
batch_input_shape = c(FLAGS$batch_size, FLAGS$n_timesteps, n_features),
dropout = FLAGS$dropout,
recurrent_dropout = FLAGS$recurrent_dropout,
return_sequences = TRUE,
stateful = FLAGS$stateful
)
if (FLAGS$stack_layers) {
model %>%
layer_lstm(
units = FLAGS$n_units,
dropout = FLAGS$dropout,
recurrent_dropout = FLAGS$recurrent_dropout,
return_sequences = TRUE,
stateful = FLAGS$stateful
)
}
model %>% time_distributed(layer_dense(units = 1))
# Root Mean Squared Error (needed for accuracy stats)
# (must be defined within the Keras loop)
rmse_metric <-
custom_metric("rmse_metric",
function(y_true, y_pred) {
K <- backend()
K$sqrt(K$mean(K$square(K$relu(y_pred - y_true)
)
)
)
}
)
model %>%
compile(
loss = FLAGS$loss,
optimizer = optimizer,
metrics = list(mape = "mean_absolute_percentage_error",
mse = "mean_squared_error",
rmse_metric
)
)
if (!FLAGS$stateful) {
history <- model %>%
fit(
x = X_train,
y = y_train,
validation_data = list(X_valid, y_valid),
batch_size = FLAGS$batch_size,
epochs = FLAGS$n_epochs,
callbacks = callbacks,
verbose = 0
)
} else {
history <-
for (i in 1:FLAGS$n_epochs) {
model %>%
fit(
x = X_train,
y = y_train,
validation_data = list(X_valid, y_valid),
callbacks = callbacks,
batch_size = FLAGS$batch_size,
epochs = 1,
shuffle = FALSE,
verbose = 0
)
model %>% reset_states()
}
}
if (FLAGS$stateful)
model %>% reset_states()
## Get predictions
pred_train <- model %>%
predict(X_train, batch_size = FLAGS$batch_size) %>%
.[, , 1]
pred_valid <- model %>%
predict(X_valid, batch_size = FLAGS$batch_size) %>%
.[, , 1]
pred_test <- model %>%
predict(X_test, batch_size = FLAGS$batch_size) %>%
.[, , 1]
# Retransform values to original scale
pred_train_norm <- (pred_train * scale_history + center_history) ^2
pred_valid_norm <- (pred_valid * scale_history + center_history) ^2
pred_test_norm <- (pred_test * scale_history + center_history) ^2
compare_train <- df %>% filter(key == "01_training")
compare_valid <- df %>% filter(key == "02_validation")
compare_test <- df %>% filter(key == "03_testing")
##### build a dataframe that has both actual and predicted values
for (i in 1:nrow(pred_train_norm)
) {
varname <- paste0("pred_train", i)
compare_train <-
mutate(compare_train,
!!varname := c(rep(NA,
FLAGS$n_timesteps + i - 1
),
pred_train_norm[i,],
rep(NA,
nrow(compare_train) - FLAGS$n_timesteps * 2 - i + 1
)
)
)
}
for (i in 1:nrow(pred_valid_norm)
) {
varname <- paste0("pred_valid", i)
compare_valid <-
mutate(compare_valid,
!!varname := c(rep(NA,
FLAGS$n_timesteps + i - 1
),
pred_valid_norm[i,],
rep(NA,
nrow(compare_valid) - FLAGS$n_timesteps * 2 - i + 1
)
)
)
}
for (i in 1:nrow(pred_test_norm)
) {
varname <- paste0("pred_test", i)
compare_test <-
mutate(compare_test,
!!varname := c(rep(NA,
FLAGS$n_timesteps + i - 1
),
pred_test_norm[i,],
rep(NA,
nrow(compare_test) - FLAGS$n_timesteps * 2 - i + 1
)
)
)
}
##### final output
list(train = compare_train,
valid = compare_valid,
test = compare_test
)
}
Run the function.
# best_settings <-
# readRDS(paste0(wd,
# "/Data_Processed/",
# "best_settings.Rds"
# )
# )
# using foreach on tot_cores - 1 (i.e., 3) cores
# user system elapsed
# 21.834 16.848 2159.961
# ~ 36 min
# user system elapsed
# 2308.015 241.913 1852.001
# ~ 31 min
start <- proc.time()
# tot_cores <- detectCores()
# cl <- makeCluster(tot_cores - 1)
# registerDoParallel(cl)
#
# all_split_preds <-
# foreach(i = add_el_stop_id,
# .packages = c("rsample",
# "tibbletime",
# "recipes",
# "tfruns",
# "keras",
# "tidyverse",
# "purrr",
# "yardstick"
# ),
# .combine = c,
# .verbose = FALSE
# ) %dopar% {
# dat = i %>%
# mutate(predict = map(splits,
# obtain_predictions
# )
# )
#
# return(dat)
# }
all_split_preds <-
pmap(.l = list(a = add_el_stop_id),
.f = function(a) {
a %>%
mutate(predict = map(splits,
obtain_predictions
)
)
}
)
runtime_keras_all <- proc.time() - start
runtime_keras_all
## user system elapsed
## 2402.160 252.783 1878.307
# stopCluster(cl)
# rm(cl)
rm(start)
saveRDS(all_split_preds,
paste0(wd,
"/Models/",
"all_split_preds.Rds"
)
)
# all_split_preds <-
# readRDS(paste0(wd,
# "/Models/",
# "all_split_preds.Rds"
# )
# )
# View(all_split_preds$`40910`$predict)
# add_el_stop_id$`40910`$splits %>% length()
Calculate accuracy stats.
First, create the function to calculate the stats.
accuracy_stats <-
function(df, stat_type) {
coln <- colnames(df)[4:ncol(df)]
cols <- coln %>%
map(~ sym(.x)
)
if(stat_type == "rmse") {
result = map(cols,
function(col)
rmse(data = df,
truth = el_rides,
estimate = !!col,
na.rm = TRUE
) %>%
select(.estimate)
) %>%
bind_rows()
result = mean(result$.estimate, na.rm = TRUE)
} else {
result = map_dbl(cols,
function(col) {
data = df %>%
mutate(pct_error = el_rides - !!col / el_rides * 100
)
mape_stat = data$pct_error %>%
abs() %>%
mean(na.rm = TRUE)
return(mape_stat)
}
) %>%
mean()
}
return(result)
}
Munge the data to be able to run the accuracy_stats function.
all_split_preds_unnest <-
all_split_preds %>%
map(~ unnest(.x,
predict
)
)
names(all_split_preds_unnest$`40600`)
## [1] "id" "start_date"
## [3] "h2o.interpolation" "h2o.extrapolation"
## [5] "h2o.limvars.interpolation" "h2o.limvars.extrapolation"
## [7] "arima.interpolation" "arima_xreg.interpolation"
## [9] "prophet.interpolation" "prophet_hol.interpolation"
## [11] "rf.interpolation" "rf.extrapolation"
## [13] "xgb.interpolation" "xgb.extrapolation"
## [15] "arima.extrapolation" "arima_xreg.extrapolation"
## [17] "prophet.extrapolation" "prophet_hol.extrapolation"
## [19] "el_stop_id" "predict"
all_split_preds_individ <-
pmap(.l = list(a = all_split_preds_unnest),
.f = function(a) {
lngth_tot = length(a$predict)
train_ = a[seq(1, lngth_tot - 2, by = 3), ] # train is the 1st of every 3 sets
valid_ = a[seq(2, lngth_tot - 1, by = 3), ] # valid is the 2nd of every 3 sets
test_ = a[seq(3, lngth_tot, by = 3), ] # test is the 3rd of every 3 sets
return(list(train = train_,
valid = valid_,
test = test_
)
)
}
)
Now I can run the accuracy_stats function to calculate the stats.
# user system elapsed
# 2050.926 337.664 2625.468
# ~ 44 min
start <- proc.time()
all_split_rmse_mape <-
pmap(.l = list(a = all_split_preds_individ),
.f = function(a) {
trn = a$train
vld = a$valid
tst = a$test
data_trn = trn %>%
mutate(keras.interpolation_rmse =
map_dbl(predict,
~ accuracy_stats(df = .x,
stat_type = "rmse"
)
),
keras.interpolation_mape =
map_dbl(predict,
~ accuracy_stats(df = .x,
stat_type = "mape"
)
)
) %>%
select(-predict)
data_vld = vld %>%
select(el_stop_id, id, start_date, predict) %>%
mutate(keras.extrapolation_rmse =
map_dbl(predict,
~ accuracy_stats(df = .x,
stat_type = "rmse"
)
),
keras.extrapolation_mape =
map_dbl(predict,
~ accuracy_stats(df = .x,
stat_type = "mape"
)
)
) %>%
select(-predict)
data_tst = tst %>%
select(el_stop_id, id, start_date, predict) %>%
mutate(keras.test_rmse =
map_dbl(predict,
~ accuracy_stats(df = .x,
stat_type = "rmse"
)
),
keras.test_mape =
map_dbl(predict,
~ accuracy_stats(df = .x,
stat_type = "mape"
)
)
) %>%
select(-predict)
full_data =
data_trn %>%
left_join(y = data_vld,
by = c("el_stop_id" = "el_stop_id",
"id" = "id",
"start_date" = "start_date"
)
) %>%
left_join(y = data_tst,
by = c("el_stop_id" = "el_stop_id",
"id" = "id",
"start_date" = "start_date"
)
)
return(full_data)
}
)
runtime_accuracy_stats_train <- proc.time() - start
runtime_accuracy_stats_train
## user system elapsed
## 1941.338 311.683 2278.183
rm(start)
saveRDS(all_split_rmse_mape,
paste0(wd,
"/Models/",
"all_split_rmse_mape.Rds"
)
)
Now I calculate the median and mean stats for each type of model.
mean_med_stats <-
all_split_rmse_mape %>%
map(~ gather(.x,
key = "model",
value = "value",
-id,
-start_date,
-el_stop_id
) %>%
filter(#!str_detect(model, "extrapolation"),
!str_detect(model, "mape"), # mape was only used for Keras, whereas rmse was used in all models (including Keras)
) %>%
group_by(el_stop_id,
model
) %>%
summarise(mean = mean(value, na.rm = TRUE),
median = median(value, na.rm = TRUE)
) %>%
ungroup() %>%
arrange(el_stop_id,
median
) %>%
mutate(stat_type = case_when(str_detect(model, "interpolation") ~ "interpolation",
str_detect(model, "extrapolation") ~ "extrapolation",
TRUE ~ "other"
)#,
# sdf = str_locate_all(model, "\\.")
)
) %>%
bind_rows() %>%
group_by(stat_type,
el_stop_id
) %>%
arrange(median) %>%
mutate(row_num = row_number()) %>%
ungroup() %>%
arrange(stat_type,
el_stop_id,
median
)
mean_med_stats
View(mean_med_stats)