Load the relevant libraries.
# rm(list = ls())
# .rs.restartR()
# data manipulation
library("plyr")
library("tidyverse")
library("magrittr")
library("data.table")
library("lubridate")
library("sqldf")
# time series specific packages
library("timetk")
library("zoo")
library("tibbletime")
# modeling
library("fpp2")
library("prophet")
library("caret")
library("randomForest")
library("xgboost")
library("h2o")
library("keras")
# use_session_with_seed(123456789) # setting the seed to obtain reproducible results
# see https://keras.rstudio.com/articles/faq.html#how-can-i-obtain-reproducible-results-using-keras-during-development and https://cran.r-project.org/web/packages/keras/vignettes/faq.html
# can also re-enable gpu and parallel processing by using: use_session_with_seed(42, disable_gpu = FALSE, disable_parallel_cpu = FALSE)
# other
library("geosphere") # specific for distance calculations from lat-lon pairs
library("naniar") # inspecting missing data
library("rlang") # building functions
library("recipes") # used in Keras modeling to design matrices
library("rsample") # rolling samples for validation stats
library("tfruns") # used in Keras modeling for trainin runs
library("stringr") # string manipulation
library("ggplot2") # viz
library("sweep") # more easily pull out model statistics
library("yardstick") # easily calculate accuracy stats
library("doParallel") # parallel processing
Session Info.
sessionInfo()
## R version 3.5.1 (2018-07-02)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS High Sierra 10.13.6
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] parallel stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] doParallel_1.0.14 iterators_1.0.10 foreach_1.4.4
## [4] yardstick_0.0.2 sweep_0.2.1.1 tfruns_1.4
## [7] rsample_0.0.3 recipes_0.1.4 rlang_0.3.0.1
## [10] naniar_0.4.1 geosphere_1.5-7 keras_2.2.4
## [13] h2o_3.20.0.8 xgboost_0.71.2 randomForest_4.6-14
## [16] caret_6.0-81 lattice_0.20-38 prophet_0.3.0.1
## [19] Rcpp_1.0.0 fpp2_2.3 expsmooth_2.3
## [22] fma_2.3 forecast_8.4 tibbletime_0.1.1
## [25] zoo_1.8-4 timetk_0.1.1.1 sqldf_0.4-11
## [28] RSQLite_2.1.1 gsubfn_0.7 proto_1.0.0
## [31] lubridate_1.7.4 data.table_1.11.8 magrittr_1.5
## [34] forcats_0.3.0 stringr_1.3.1 dplyr_0.7.8
## [37] purrr_0.2.5 readr_1.2.1 tidyr_0.8.2
## [40] tibble_1.4.2 ggplot2_3.1.0 tidyverse_1.2.1
## [43] plyr_1.8.4
##
## loaded via a namespace (and not attached):
## [1] colorspace_1.3-2 class_7.3-14 visdat_0.5.1
## [4] rprojroot_1.3-2 base64enc_0.1-3 rstudioapi_0.8
## [7] rstan_2.18.2 bit64_0.9-7 prodlim_2018.04.18
## [10] xml2_1.2.0 codetools_0.2-15 splines_3.5.1
## [13] knitr_1.20 zeallot_0.1.0 jsonlite_1.5
## [16] pROC_1.13.0 broom_0.5.0 compiler_3.5.1
## [19] httr_1.3.1 backports_1.1.2 assertthat_0.2.0
## [22] Matrix_1.2-15 lazyeval_0.2.1 cli_1.0.1
## [25] htmltools_0.3.6 prettyunits_1.0.2 tools_3.5.1
## [28] bindrcpp_0.2.2 gtable_0.2.0 glue_1.3.0
## [31] reshape2_1.4.3 cellranger_1.1.0 fracdiff_1.4-2
## [34] urca_1.3-0 debugme_1.1.0 nlme_3.1-137
## [37] lmtest_0.9-36 timeDate_3043.102 gower_0.1.2
## [40] ps_1.2.1 rvest_0.3.2 MASS_7.3-51.1
## [43] scales_1.0.0 ipred_0.9-8 hms_0.4.2
## [46] inline_0.3.15 yaml_2.2.0 quantmod_0.4-13
## [49] curl_3.2 reticulate_1.10 memoise_1.1.0
## [52] gridExtra_2.3 loo_2.0.0 StanHeaders_2.18.0
## [55] uroot_2.0-9 rpart_4.1-13 stringi_1.2.4
## [58] tensorflow_1.10 tseries_0.10-46 TTR_0.23-4
## [61] pkgbuild_1.0.2 lava_1.6.4 chron_2.3-53
## [64] bitops_1.0-6 pkgconfig_2.0.2 matrixStats_0.54.0
## [67] evaluate_0.12 bindr_0.1.1 bit_1.1-14
## [70] processx_3.2.0 tidyselect_0.2.5 R6_2.3.0
## [73] generics_0.0.2 DBI_1.0.0 whisker_0.3-2
## [76] pillar_1.3.0 haven_2.0.0 withr_2.1.2
## [79] xts_0.11-2 sp_1.3-1 RCurl_1.95-4.11
## [82] survival_2.43-3 nnet_7.3-12 modelr_0.1.2
## [85] crayon_1.3.4 rmarkdown_1.10 grid_3.5.1
## [88] readxl_1.1.0 blob_1.1.1 callr_3.0.0
## [91] ModelMetrics_1.2.2 digest_0.6.18 stats4_3.5.1
## [94] munsell_0.5.0 tcltk_3.5.1 quadprog_1.5-5
Setup the root directory.
Setting wd as the working directory.
wd <- getwd()
wd
## [1] "/Users/mdturse/Desktop/Analytics/Chicago_El_Divvy"
NOTE: h2o.limvars_accuracy_stats, DV_Fit.Rf.corr_no, DV_Fit.Xgbtree.corr_yes, func_one_hot_vars, run_times, h2o.time, and h2o.limvars.time are the outputs produced in Step 02 - Step 05
h2o.limvars_accuracy_stats <-
readRDS(paste0(wd,
"/Models/",
"h2o.limvars_accuracy_stats.Rds"
)
)
DV_Fit.Rf.corr_no <-
readRDS(paste0(wd,
"/Models/",
"DV_Fit.Rf.corr_no.Rds"
)
)
DV_Fit.Xgbtree.corr_yes <-
readRDS(paste0(wd,
"/Models/",
"DV_Fit.Xgbtree.corr_yes.Rds"
)
)
func_one_hot_vars <-
readRDS(paste0(wd,
"/Data/Interim/",
"func_one_hot_vars.Rds"
)
)
run_times <-
readRDS(paste0(wd,
"/Models/",
"run_times.Rds"
)
)
h2o.time <-
readRDS(paste0(wd,
"/Models/",
"h2o.time.Rds"
)
)
h2o.limvars.time <-
readRDS(paste0(wd,
"/Models/",
"h2o.limvars.time.Rds"
)
)
Now we can check some summary (RMSE) statistics. NOTE: these are summary statistics based on the data used to build the models.
accuracy_interp_arima <-
pmap(.l = list(a = h2o.limvars_accuracy_stats),
.f = function(a) {
interp_a =
map_dbl(a$arima,
function(x)
sw_glance(x)[["RMSE"]]
)
interp_a_xreg =
map_dbl(a$arima_xreg,
function(x) {
y = x$best_fit %>%
sw_glance()
y[["RMSE"]]
}
)
a$arima.interpolation = interp_a
a$arima_xreg.interpolation = interp_a_xreg
return(a)
}
)
## Warning: `list_len()` is soft-deprecated as of rlang 0.2.0.
## Please use `new_list()` instead
## This warning is displayed once per session.
names(accuracy_interp_arima$`40600`)
## [1] "splits" "id"
## [3] "start_date" "arima"
## [5] "arima_xreg" "prophet"
## [7] "prophet_hol" "prophet.future"
## [9] "prophet_hol.future" "prophet.forecast"
## [11] "prophet_hol.forecast" "prophet.plots"
## [13] "prophet_hol.plots" "interp_data"
## [15] "extrap_data" "h2o.interp.forecast"
## [17] "h2o.extrap.forecast" "h2o.interpolation"
## [19] "h2o.extrapolation" "h2o.limvars.interp.forecast"
## [21] "h2o.limvars.extrap.forecast" "h2o.limvars.interpolation"
## [23] "h2o.limvars.extrapolation" "arima.interpolation"
## [25] "arima_xreg.interpolation"
accuracy_interp_arima$`40600`$arima.interpolation
## [1] 61.17978 59.01982 59.12690 61.16683 57.53232 59.93085 59.72882
## [8] 59.90099 59.88639 57.12052 58.46335 58.94525 59.29253
accuracy_interp_arima$`40600`$arima_xreg.interpolation
## [1] 45.92921 48.51186 48.68320 47.76805 46.71281 46.98059 47.57170
## [8] 45.84552 47.30054 45.78911 46.39437 46.35905 46.54615
message("arima.interpolation")
## arima.interpolation
accuracy_interp_arima %>%
map(~ summary(.x$arima.interpolation)
)
## $`40600`
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 57.12 58.95 59.29 59.33 59.90 61.18
##
## $`41140`
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 69.45 71.19 72.89 73.74 76.86 77.75
##
## $`40120`
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 351.0 364.2 368.0 368.5 376.9 378.4
##
## $`40910`
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 323.8 335.1 352.6 349.2 355.9 385.7
##
## $`40380`
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2369 2429 2649 2563 2656 2755
##
## $`41660`
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2166 2213 2240 2249 2292 2345
message("arima_xreg.interpolation")
## arima_xreg.interpolation
accuracy_interp_arima %>%
map(~ summary(.x$arima_xreg.interpolation)
)
## $`40600`
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 45.79 46.36 46.71 46.95 47.57 48.68
##
## $`41140`
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 58.26 59.07 59.18 60.57 63.12 64.99
##
## $`40120`
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 280.5 282.2 285.2 285.3 286.7 292.5
##
## $`40910`
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 272.1 274.2 281.9 281.6 289.2 291.4
##
## $`40380`
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1657 1721 1750 1739 1760 1793
##
## $`41660`
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1748 1757 1827 1806 1832 1856
rm(h2o.limvars_accuracy_stats)
get_interp_prophet <- function(split, pf) {
# Get analysis data
pred_dat =
analysis(split) %>%
select(el_date,
el_rides
) %>%
left_join(select(pf,
ds,
yhat,
yhat_zero_floor
) %>%
mutate_at(vars(ds), as.Date),
by = c("el_date" = "ds")
) %>%
mutate(sqrd_error = (el_rides - yhat_zero_floor)^2
) %>%
arrange(el_date)
sqrt(mean(pred_dat$sqrd_error,
na.rm = TRUE
)
)
}
accuracy_interp_prophet <-
pmap(.l = list(a = accuracy_interp_arima),
.f = function(a) {
interp =
map2_dbl(a$splits,
a$prophet.forecast,
get_interp_prophet
)
interp_hol =
map2_dbl(a$splits,
a$prophet_hol.forecast,
get_interp_prophet
)
a$prophet.interpolation = interp
a$prophet_hol.interpolation = interp_hol
return(a)
}
)
names(accuracy_interp_prophet$`40600`)
## [1] "splits" "id"
## [3] "start_date" "arima"
## [5] "arima_xreg" "prophet"
## [7] "prophet_hol" "prophet.future"
## [9] "prophet_hol.future" "prophet.forecast"
## [11] "prophet_hol.forecast" "prophet.plots"
## [13] "prophet_hol.plots" "interp_data"
## [15] "extrap_data" "h2o.interp.forecast"
## [17] "h2o.extrap.forecast" "h2o.interpolation"
## [19] "h2o.extrapolation" "h2o.limvars.interp.forecast"
## [21] "h2o.limvars.extrap.forecast" "h2o.limvars.interpolation"
## [23] "h2o.limvars.extrapolation" "arima.interpolation"
## [25] "arima_xreg.interpolation" "prophet.interpolation"
## [27] "prophet_hol.interpolation"
accuracy_interp_prophet$`40600`$prophet.interpolation
## [1] 68.96201 68.62795 69.35761 69.70989 69.45280 68.88785 70.01384
## [8] 70.17346 69.27674 68.85172 70.03093 69.98618 69.26562
accuracy_interp_prophet$`40600`$prophet_hol.interpolation
## [1] 53.98834 53.39593 53.67258 53.43274 54.14267 54.02384 54.51328
## [8] 53.80563 53.74292 54.19677 53.64952 53.80492 53.18090
message("prophet.interpolation")
## prophet.interpolation
accuracy_interp_prophet %>%
map(~ summary(.x$prophet.interpolation)
)
## $`40600`
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 68.63 68.96 69.36 69.43 69.99 70.17
##
## $`41140`
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 72.59 75.07 77.02 79.08 84.65 87.29
##
## $`40120`
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 409.8 412.5 418.3 417.6 421.1 424.0
##
## $`40910`
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 385.7 387.5 491.8 462.0 514.1 532.5
##
## $`40380`
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2633 2651 2678 2677 2700 2725
##
## $`41660`
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2430 2493 2530 2519 2552 2575
message("prophet_hol.interpolation")
## prophet_hol.interpolation
accuracy_interp_prophet %>%
map(~ summary(.x$prophet_hol.interpolation)
)
## $`40600`
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 53.18 53.65 53.80 53.81 54.02 54.51
##
## $`41140`
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 58.02 58.35 63.38 63.82 68.55 70.99
##
## $`40120`
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 274.9 281.0 282.3 282.8 283.6 290.7
##
## $`40910`
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 308.9 312.4 433.7 396.1 456.5 478.4
##
## $`40380`
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1436 1470 1478 1481 1503 1511
##
## $`41660`
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1685 1748 1775 1765 1784 1820
# saving is done to avoid having to run the models again
saveRDS(accuracy_interp_prophet,
paste0(wd,
"/Models/",
"accuracy_interp_prophet.Rds"
)
)
# accuracy_interp_prophet <-
# readRDS(paste0(wd,
# "/Models/",
# "accuracy_interp_prophet.Rds"
# )
# )
rm(accuracy_interp_arima)
Get the forecasted data for the random forest and the xgboost tree.
First, I create a function to get the error percentages.
get_pct_error <-
function(base, mod) {
data = bind_cols(base,
mod
) %>%
select(el_date,
el_rides,
yhat
) %>%
rename(y = el_rides
) %>%
# if_else is needed to prevent negative predictions
mutate(yhat_zero_floor = if_else(yhat < 0,
0,
yhat
),
sqrd_error = (y - yhat_zero_floor)^2
) %>%
arrange(el_date)
return(data)
}
Now I get the forecasts of all ML models as one list.
rf_xgb_forecasts_combined <-
pmap(.l = list(a = accuracy_interp_prophet,
b = DV_Fit.Rf.corr_no,
c = DV_Fit.Xgbtree.corr_yes
),
.f = function(a, b, c) {
splits = a$splits
rf = b
xgb = c
fcast =
pmap(.l = list(d = splits),
.f = function(d) {
# prep the data
analysis_assessment =
analysis(d) %>%
mutate(type = "interpolation") %>%
bind_rows(assessment(d) %>%
mutate(type = "extrapolation")
)
one_hot =
analysis_assessment %>%
func_one_hot_vars()
one_hot_interp =
one_hot %>%
filter(typeinterpolation == 1) %>%
select(-typeinterpolation,
-typeextrapolation
)
one_hot_extrap =
one_hot %>%
filter(typeextrapolation == 1) %>%
select(-typeinterpolation,
-typeextrapolation
)
complete_interp =
one_hot_interp[complete.cases(one_hot_interp), ]
complete_extrap =
one_hot_extrap[complete.cases(one_hot_extrap), ]
# predictions
pred_rf.interp =
predict(rf,
newdata = complete_interp
) %>%
as.data.frame()
names(pred_rf.interp) <- "yhat"
pred_rf.extrap =
predict(rf,
newdata = complete_extrap
) %>%
as.data.frame()
names(pred_rf.extrap) <- "yhat"
pred_xgb.interp =
predict(xgb,
newdata = complete_interp
) %>%
as.data.frame()
names(pred_xgb.interp) <- "yhat"
pred_xgb.extrap =
predict(xgb,
newdata = complete_extrap
) %>%
as.data.frame()
names(pred_xgb.extrap) <- "yhat"
# add percent errors
full_pred.rf.interp =
get_pct_error(base = complete_interp,
mod = pred_rf.interp
)
full_pred.rf.extrap =
get_pct_error(base = complete_extrap,
mod = pred_rf.extrap
)
full_pred.xgb.interp =
get_pct_error(base = complete_interp,
mod = pred_xgb.interp
)
full_pred.xgb.extrap =
get_pct_error(base = complete_extrap,
mod = pred_xgb.extrap
)
# result as a list
full_pred_list =
list(pred.rf.interp = full_pred.rf.interp,
pred.rf.extrap = full_pred.rf.extrap,
pred.xgb.interp = full_pred.xgb.interp,
pred.xgb.extrap = full_pred.xgb.extrap
)
# return value
return(full_pred_list)
}
)
a$ml_forecasts_all = fcast
return(a)
}
)
names(rf_xgb_forecasts_combined$`40600`)
## [1] "splits" "id"
## [3] "start_date" "arima"
## [5] "arima_xreg" "prophet"
## [7] "prophet_hol" "prophet.future"
## [9] "prophet_hol.future" "prophet.forecast"
## [11] "prophet_hol.forecast" "prophet.plots"
## [13] "prophet_hol.plots" "interp_data"
## [15] "extrap_data" "h2o.interp.forecast"
## [17] "h2o.extrap.forecast" "h2o.interpolation"
## [19] "h2o.extrapolation" "h2o.limvars.interp.forecast"
## [21] "h2o.limvars.extrap.forecast" "h2o.limvars.interpolation"
## [23] "h2o.limvars.extrapolation" "arima.interpolation"
## [25] "arima_xreg.interpolation" "prophet.interpolation"
## [27] "prophet_hol.interpolation" "ml_forecasts_all"
Now I separate the forecats into their own dataframes.
rf_xgb_forecasts_individualized <-
pmap(.l = list(a = rf_xgb_forecasts_combined),
.f = function(a) {
fc = a$ml_forecasts_all
# limit results to the relevant model
result.rf.interp =
pmap(.l = list(b = fc),
.f = function(b) {
b$pred.rf.interp
}
)
result.rf.extrap =
pmap(.l = list(b = fc),
.f = function(b) {
b$pred.rf.extrap
}
)
result.xgb.interp =
pmap(.l = list(b = fc),
.f = function(b) {
b$pred.xgb.interp
}
)
result.xgb.extrap =
pmap(.l = list(b = fc),
.f = function(b) {
b$pred.xgb.extrap
}
)
a$rf.forecast_interp <- result.rf.interp
a$rf.forecast_extrap <- result.rf.extrap
a$xgb.forecast_interp <- result.xgb.interp
a$xgb.forecast_extrap <- result.xgb.extrap
# remove no longer needed list of all forecasts
a$ml_forecasts_all <- NULL
return(a)
}
)
names(rf_xgb_forecasts_individualized$`40600`)
## [1] "splits" "id"
## [3] "start_date" "arima"
## [5] "arima_xreg" "prophet"
## [7] "prophet_hol" "prophet.future"
## [9] "prophet_hol.future" "prophet.forecast"
## [11] "prophet_hol.forecast" "prophet.plots"
## [13] "prophet_hol.plots" "interp_data"
## [15] "extrap_data" "h2o.interp.forecast"
## [17] "h2o.extrap.forecast" "h2o.interpolation"
## [19] "h2o.extrapolation" "h2o.limvars.interp.forecast"
## [21] "h2o.limvars.extrap.forecast" "h2o.limvars.interpolation"
## [23] "h2o.limvars.extrapolation" "arima.interpolation"
## [25] "arima_xreg.interpolation" "prophet.interpolation"
## [27] "prophet_hol.interpolation" "rf.forecast_interp"
## [29] "rf.forecast_extrap" "xgb.forecast_interp"
## [31] "xgb.forecast_extrap"
rm(accuracy_interp_prophet, rf_xgb_forecasts_combined)
Now I calculate the interpolation and extrapolation accuracy stats.
# function to get the RMSE value
get_interp_ML <- function(ML.fcast) {
pred_dat = ML.fcast
sqrt(mean(pred_dat$sqrd_error,
na.rm = TRUE
)
)
}
accuracy_interp_ML <-
pmap(.l = list(a = rf_xgb_forecasts_individualized),
.f = function(a) {
interp.rf =
map_dbl(a$rf.forecast_interp,
get_interp_ML
)
extrap.rf =
map_dbl(a$rf.forecast_extrap,
get_interp_ML
)
interp.xgb =
map_dbl(a$xgb.forecast_interp,
get_interp_ML
)
extrap.xgb =
map_dbl(a$xgb.forecast_extrap,
get_interp_ML
)
a$rf.interpolation <- interp.rf
a$rf.extrapolation <- extrap.rf
a$xgb.interpolation <- interp.xgb
a$xgb.extrapolation <- extrap.xgb
return(a)
}
)
names(accuracy_interp_ML$`40600`)
## [1] "splits" "id"
## [3] "start_date" "arima"
## [5] "arima_xreg" "prophet"
## [7] "prophet_hol" "prophet.future"
## [9] "prophet_hol.future" "prophet.forecast"
## [11] "prophet_hol.forecast" "prophet.plots"
## [13] "prophet_hol.plots" "interp_data"
## [15] "extrap_data" "h2o.interp.forecast"
## [17] "h2o.extrap.forecast" "h2o.interpolation"
## [19] "h2o.extrapolation" "h2o.limvars.interp.forecast"
## [21] "h2o.limvars.extrap.forecast" "h2o.limvars.interpolation"
## [23] "h2o.limvars.extrapolation" "arima.interpolation"
## [25] "arima_xreg.interpolation" "prophet.interpolation"
## [27] "prophet_hol.interpolation" "rf.forecast_interp"
## [29] "rf.forecast_extrap" "xgb.forecast_interp"
## [31] "xgb.forecast_extrap" "rf.interpolation"
## [33] "rf.extrapolation" "xgb.interpolation"
## [35] "xgb.extrapolation"
accuracy_interp_ML$`40600`$rf.interpolation
## [1] 20.78572 20.73984 20.74743 20.74830 20.21899 20.15955 20.13680
## [8] 20.14938 20.15622 20.04888 20.50682 20.56212 20.59679
accuracy_interp_ML$`40600`$rf.extrapolation
## [1] 17.98158 18.03256 18.17887 18.19466 18.19726 18.22872 18.40603
## [8] 19.99322 21.52102 21.41720 20.67071 21.21724 21.38974
message("rf.interpolation")
## rf.interpolation
accuracy_interp_ML %>%
map(~ summary(.x$rf.interpolation)
)
## $`40600`
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 20.05 20.16 20.51 20.43 20.74 20.79
##
## $`41140`
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 27.03 27.66 28.25 28.76 30.36 31.15
##
## $`40120`
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 124.3 124.9 126.6 126.4 128.2 128.8
##
## $`40910`
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 128.0 128.2 128.8 128.7 129.0 129.4
##
## $`40380`
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 767.8 771.4 777.1 782.3 791.6 805.0
##
## $`41660`
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 876.9 888.9 892.2 895.3 905.5 912.0
message("rf.extrapolation")
## rf.extrapolation
accuracy_interp_ML %>%
map(~ summary(.x$rf.extrapolation)
)
## $`40600`
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 17.98 18.19 18.41 19.49 21.22 21.52
##
## $`41140`
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 22.80 23.00 23.24 23.31 23.65 24.29
##
## $`40120`
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 101.9 102.3 102.9 114.1 128.3 133.0
##
## $`40910`
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 82.42 115.37 122.23 117.04 123.37 126.12
##
## $`40380`
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 833.9 849.7 862.1 918.8 1013.5 1020.5
##
## $`41660`
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 828.0 842.0 900.7 906.1 966.8 990.6
message("xgb.interpolation")
## xgb.interpolation
accuracy_interp_ML %>%
map(~ summary(.x$xgb.interpolation)
)
## $`40600`
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 58.39 58.74 59.31 59.19 59.36 60.06
##
## $`41140`
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 50.09 50.59 50.89 50.87 51.21 51.51
##
## $`40120`
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 146.7 147.6 147.8 147.9 148.3 149.1
##
## $`40910`
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 209.5 212.9 214.2 214.4 217.2 218.4
##
## $`40380`
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2317 2320 2362 2349 2376 2384
##
## $`41660`
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2207 2214 2221 2231 2246 2264
message("xgb.extrapolation")
## xgb.extrapolation
accuracy_interp_ML %>%
map(~ summary(.x$xgb.extrapolation)
)
## $`40600`
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 47.37 47.71 47.80 51.13 55.36 55.67
##
## $`41140`
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 41.05 43.40 44.18 43.65 44.56 45.10
##
## $`40120`
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 133.0 133.6 134.9 137.0 140.7 142.6
##
## $`40910`
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 183.4 189.4 189.9 191.1 194.3 198.1
##
## $`40380`
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1836 1857 1909 2120 2393 2489
##
## $`41660`
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1811 1866 1914 1998 2118 2227
# saving is done to avoid having to run the models again
saveRDS(accuracy_interp_ML,
paste0(wd,
"/Models/",
"accuracy_interp_ML.Rds"
)
)
# accuracy_interp_ML <-
# readRDS(paste0(wd,
# "/Models/",
# "accuracy_interp_ML.Rds"
# )
# )
rm(rf_xgb_forecasts_individualized)
Now we can check some summary (RMSE) statistics. NOTE: these are summary statistics based on the data NOT used to build the models.
Function to get the ARIMA predictions.
get_pred_arima <- function(split, mod) {
n <- nrow(assessment(split)
)
# Get assessment data
pred_dat <-
assessment(split) %>%
mutate(pred = as.vector(forecast(mod, h = n)$mean),
# if_else is used to put a floor of 0 (zero) for the prediction
pred_zero_floor = if_else(pred < 0,
0,
pred
),
# pct_error = (el_rides - pred_zero_floor) / el_rides * 100,
sqrd_error = (el_rides - pred_zero_floor)^2
)
return(pred_dat)
}
Function to calculate the extrapolation stats
get_extrap_arima <- function(arima.fcast) {
pred_dat = arima.fcast
sqrt(mean(pred_dat$sqrd_error,
na.rm = TRUE
)
)
}
Run the the forecast and extrapolation functions.
accuracy_extrap_arima <-
pmap(.l = list(a = accuracy_interp_ML),
.f = function(a) {
pred =
map2(a$splits,
a$arima,
get_pred_arima
)
a$arima.forecast <- pred
return(a)
}
)
accuracy_extrap_arima <-
pmap(.l = list(a = accuracy_extrap_arima),
.f = function(a) {
extrap <-
map_dbl(a$arima.forecast,
get_extrap_arima
)
a$arima.extrapolation <- extrap
return(a)
}
)
names(accuracy_extrap_arima$`40600`)
## [1] "splits" "id"
## [3] "start_date" "arima"
## [5] "arima_xreg" "prophet"
## [7] "prophet_hol" "prophet.future"
## [9] "prophet_hol.future" "prophet.forecast"
## [11] "prophet_hol.forecast" "prophet.plots"
## [13] "prophet_hol.plots" "interp_data"
## [15] "extrap_data" "h2o.interp.forecast"
## [17] "h2o.extrap.forecast" "h2o.interpolation"
## [19] "h2o.extrapolation" "h2o.limvars.interp.forecast"
## [21] "h2o.limvars.extrap.forecast" "h2o.limvars.interpolation"
## [23] "h2o.limvars.extrapolation" "arima.interpolation"
## [25] "arima_xreg.interpolation" "prophet.interpolation"
## [27] "prophet_hol.interpolation" "rf.forecast_interp"
## [29] "rf.forecast_extrap" "xgb.forecast_interp"
## [31] "xgb.forecast_extrap" "rf.interpolation"
## [33] "rf.extrapolation" "xgb.interpolation"
## [35] "xgb.extrapolation" "arima.forecast"
## [37] "arima.extrapolation"
accuracy_extrap_arima$`40600`$arima.extrapolation
## [1] 69.78843 71.21568 61.58047 55.39423 56.59482 54.82201 53.50198
## [8] 62.57218 63.72728 61.69955 63.98101 75.47126 73.22924
accuracy_extrap_arima %>%
map(~ summary(.x$arima.extrapolation)
)
## $`40600`
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 53.50 56.59 62.57 63.35 69.79 75.47
##
## $`41140`
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 69.84 92.69 98.45 114.41 134.24 193.21
##
## $`40120`
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 386.6 435.3 446.1 447.8 470.5 516.3
##
## $`40910`
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 313.2 358.3 379.3 379.6 401.1 434.0
##
## $`40380`
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2032 2544 2595 2831 3249 3762
##
## $`41660`
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 3004 3397 3479 3581 3774 4219
rm(accuracy_interp_ML)
Function to get the forecasts from the ARIMA XREG models.
get_pred_arima_xreg <-
function(split, mod) {
n = nrow(assessment(split)
)
data_og =
split %>%
assessment %>%
mutate(holiday_binary = if_else(holiday == FALSE,
0,
1
)
)
data_pred =
data_og %>%
mutate(pred = as.vector(forecast(mod$best_fit,
xreg = cbind(mod$best_ts_365_fourier_future,
# variables below identified based on "importance" from Random Forest and XGBTree models
data_og$holiday_binary,
data_og$year,
data_og$half,
data_og$quarter,
data_og$month,
data_og$mweek,
data_og$wday.lbl,
data_og$el_rides_l07,
data_og$el_rides_l14,
data_og$el_rides_l21,
data_og$el_rides_l28,
data_og$el_rides_ma07,
data_og$el_rides_ma14,
data_og$el_rides_ma21,
data_og$el_rides_ma28
),
h = n
)$mean
),
# if_else is used to put a floor of 0 (zero) for the prediction
pred_zero_floor = if_else(pred < 0,
0,
pred
),
# pct_error = (el_rides - pred_zero_floor) / el_rides * 100
sqrd_error = (el_rides - pred_zero_floor)^2
)
}
Function to calculate the extrapolation stats.
get_extrap_arima_xreg <-
function(arima.fcast) {
pred_dat = arima.fcast
sqrt(mean(pred_dat$sqrd_error,
na.rm = TRUE
)
)
}
Run the forecast and extrapolation functions.
accuracy_extrap_arima_xreg <-
pmap(.l = list(a = accuracy_extrap_arima),
.f = function(a) {
pred <-
map2(a$splits,
a$arima_xreg,
get_pred_arima_xreg
)
a$arima_xreg.forecast <- pred
return(a)
}
)
accuracy_extrap_arima_xreg <-
pmap(.l = list(a = accuracy_extrap_arima_xreg),
.f = function(a) {
extrap <-
map_dbl(a$arima_xreg.forecast,
get_extrap_arima
)
a$arima_xreg.extrapolation <- extrap
return(a)
}
)
names(accuracy_extrap_arima_xreg$`40600`)
## [1] "splits" "id"
## [3] "start_date" "arima"
## [5] "arima_xreg" "prophet"
## [7] "prophet_hol" "prophet.future"
## [9] "prophet_hol.future" "prophet.forecast"
## [11] "prophet_hol.forecast" "prophet.plots"
## [13] "prophet_hol.plots" "interp_data"
## [15] "extrap_data" "h2o.interp.forecast"
## [17] "h2o.extrap.forecast" "h2o.interpolation"
## [19] "h2o.extrapolation" "h2o.limvars.interp.forecast"
## [21] "h2o.limvars.extrap.forecast" "h2o.limvars.interpolation"
## [23] "h2o.limvars.extrapolation" "arima.interpolation"
## [25] "arima_xreg.interpolation" "prophet.interpolation"
## [27] "prophet_hol.interpolation" "rf.forecast_interp"
## [29] "rf.forecast_extrap" "xgb.forecast_interp"
## [31] "xgb.forecast_extrap" "rf.interpolation"
## [33] "rf.extrapolation" "xgb.interpolation"
## [35] "xgb.extrapolation" "arima.forecast"
## [37] "arima.extrapolation" "arima_xreg.forecast"
## [39] "arima_xreg.extrapolation"
accuracy_extrap_arima_xreg$`40600`$arima_xreg.extrapolation
## [1] 43.08196 41.36051 40.86218 41.19748 42.25116 40.87467 40.34515
## [8] 43.72419 46.16725 48.26305 46.41810 47.71259 49.56721
accuracy_extrap_arima_xreg %>%
map(~ summary(.x$arima_xreg.extrapolation)
)
## $`40600`
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 40.35 41.20 43.08 43.99 46.42 49.57
##
## $`41140`
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 50.61 52.02 53.54 53.36 53.80 56.40
##
## $`40120`
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 231.7 233.1 238.2 253.8 276.3 294.0
##
## $`40910`
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 224.3 277.2 292.8 284.6 297.2 307.4
##
## $`40380`
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1469 1486 1569 1662 1848 1931
##
## $`41660`
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1805 1869 1880 1963 2134 2179
rm(accuracy_extrap_arima)
Function to get the extrapolation stats.
get_extrap_prophet <- function(split, pf) {
# Get assessment data
pred_dat <-
assessment(split) %>%
select(el_date,
el_rides
) %>%
left_join(select(pf,
ds,
yhat,
yhat_zero_floor
) %>%
mutate_at(vars(ds), as.Date),
by = c("el_date" = "ds")
) %>%
mutate(sqrd_error = (el_rides - yhat_zero_floor)^2
) %>%
arrange(el_date)
sqrt(mean(pred_dat$sqrd_error,
na.rm = TRUE
)
)
}
Run the extrapolation stats function.
accuracy_extrap_prophet <-
pmap(.l = list(a = accuracy_extrap_arima_xreg),
.f = function(a) {
extrap <-
map2_dbl(a$splits,
a$prophet.forecast,
get_extrap_prophet
)
extrap_hol <-
map2_dbl(a$splits,
a$prophet_hol.forecast,
get_extrap_prophet
)
a$prophet.extrapolation <- extrap
a$prophet_hol.extrapolation <- extrap_hol
return(a)
}
)
# saving is done to avoid having to run the models again
saveRDS(accuracy_extrap_prophet,
paste0(wd,
"/Models/",
"accuracy_extrap_prophet.Rds"
)
)
# accuracy_extrap_prophet <-
# readRDS(paste0(wd,
# "/Models/",
# "accuracy_extrap_prophet.Rds"
# )
# )
names(accuracy_extrap_prophet$`40600`)
## [1] "splits" "id"
## [3] "start_date" "arima"
## [5] "arima_xreg" "prophet"
## [7] "prophet_hol" "prophet.future"
## [9] "prophet_hol.future" "prophet.forecast"
## [11] "prophet_hol.forecast" "prophet.plots"
## [13] "prophet_hol.plots" "interp_data"
## [15] "extrap_data" "h2o.interp.forecast"
## [17] "h2o.extrap.forecast" "h2o.interpolation"
## [19] "h2o.extrapolation" "h2o.limvars.interp.forecast"
## [21] "h2o.limvars.extrap.forecast" "h2o.limvars.interpolation"
## [23] "h2o.limvars.extrapolation" "arima.interpolation"
## [25] "arima_xreg.interpolation" "prophet.interpolation"
## [27] "prophet_hol.interpolation" "rf.forecast_interp"
## [29] "rf.forecast_extrap" "xgb.forecast_interp"
## [31] "xgb.forecast_extrap" "rf.interpolation"
## [33] "rf.extrapolation" "xgb.interpolation"
## [35] "xgb.extrapolation" "arima.forecast"
## [37] "arima.extrapolation" "arima_xreg.forecast"
## [39] "arima_xreg.extrapolation" "prophet.extrapolation"
## [41] "prophet_hol.extrapolation"
accuracy_extrap_prophet$`40600`$prophet.extrapolation
## [1] 124.98899 123.04937 106.03899 92.42239 77.78675 85.07669 72.08575
## [8] 68.87321 58.21356 59.61095 62.60217 74.32767 83.69122
accuracy_extrap_prophet$`40600`$prophet_hol.extrapolation
## [1] 127.10234 131.31208 105.91172 99.80493 85.14470 86.08135 60.29477
## [8] 43.93964 39.87920 39.81755 54.94782 65.78041 83.07772
message("prophet")
## prophet
accuracy_extrap_prophet %>%
map(~ summary(.x$prophet.extrapolation)
)
## $`40600`
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 58.21 68.87 77.79 83.75 92.42 124.99
##
## $`41140`
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 73.33 98.78 120.44 154.14 227.03 258.34
##
## $`40120`
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 425.4 464.1 528.6 526.5 568.7 660.7
##
## $`40910`
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 592.3 637.7 670.3 667.0 699.6 746.7
##
## $`40380`
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2786 3065 3326 3427 3648 4664
##
## $`41660`
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2646 3538 4636 4805 5761 7173
message("prophet_hol")
## prophet_hol
accuracy_extrap_prophet %>%
map(~ summary(.x$prophet_hol.extrapolation)
)
## $`40600`
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 39.82 54.95 83.08 78.70 99.80 131.31
##
## $`41140`
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 58.12 89.02 119.24 148.03 223.80 252.26
##
## $`40120`
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 215.6 334.5 517.0 460.6 589.0 689.4
##
## $`40910`
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 462.7 616.9 690.3 661.7 722.9 765.2
##
## $`40380`
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1403 1896 2926 2790 3224 4513
##
## $`41660`
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2028 2847 4439 4574 5860 7561
rm(accuracy_extrap_arima_xreg)
Add an identifier for the relevant el_stop_id value.
add_el_stop_id <-
pmap(.l = list(a = accuracy_extrap_prophet,
c = names(accuracy_extrap_prophet)
),
.f = function(a, c) {
a$el_stop_id <- c
return(a)
}
)
# saving is done to avoid having to run the forecasts again
saveRDS(add_el_stop_id,
paste0(wd,
"/Models/",
"add_el_stop_id.Rds"
)
)
# add_el_stop_id <-
# readRDS(paste0(wd,
# "/Models/",
# "add_el_stop_id.Rds"
# )
# )
names(add_el_stop_id$`40600`)
## [1] "splits" "id"
## [3] "start_date" "arima"
## [5] "arima_xreg" "prophet"
## [7] "prophet_hol" "prophet.future"
## [9] "prophet_hol.future" "prophet.forecast"
## [11] "prophet_hol.forecast" "prophet.plots"
## [13] "prophet_hol.plots" "interp_data"
## [15] "extrap_data" "h2o.interp.forecast"
## [17] "h2o.extrap.forecast" "h2o.interpolation"
## [19] "h2o.extrapolation" "h2o.limvars.interp.forecast"
## [21] "h2o.limvars.extrap.forecast" "h2o.limvars.interpolation"
## [23] "h2o.limvars.extrapolation" "arima.interpolation"
## [25] "arima_xreg.interpolation" "prophet.interpolation"
## [27] "prophet_hol.interpolation" "rf.forecast_interp"
## [29] "rf.forecast_extrap" "xgb.forecast_interp"
## [31] "xgb.forecast_extrap" "rf.interpolation"
## [33] "rf.extrapolation" "xgb.interpolation"
## [35] "xgb.extrapolation" "arima.forecast"
## [37] "arima.extrapolation" "arima_xreg.forecast"
## [39] "arima_xreg.extrapolation" "prophet.extrapolation"
## [41] "prophet_hol.extrapolation" "el_stop_id"
names(add_el_stop_id$`40910`)
## [1] "splits" "id"
## [3] "start_date" "arima"
## [5] "arima_xreg" "prophet"
## [7] "prophet_hol" "prophet.future"
## [9] "prophet_hol.future" "prophet.forecast"
## [11] "prophet_hol.forecast" "prophet.plots"
## [13] "prophet_hol.plots" "interp_data"
## [15] "extrap_data" "h2o.interp.forecast"
## [17] "h2o.extrap.forecast" "h2o.interpolation"
## [19] "h2o.extrapolation" "h2o.limvars.interp.forecast"
## [21] "h2o.limvars.extrap.forecast" "h2o.limvars.interpolation"
## [23] "h2o.limvars.extrapolation" "arima.interpolation"
## [25] "arima_xreg.interpolation" "prophet.interpolation"
## [27] "prophet_hol.interpolation" "rf.forecast_interp"
## [29] "rf.forecast_extrap" "xgb.forecast_interp"
## [31] "xgb.forecast_extrap" "rf.interpolation"
## [33] "rf.extrapolation" "xgb.interpolation"
## [35] "xgb.extrapolation" "arima.forecast"
## [37] "arima.extrapolation" "arima_xreg.forecast"
## [39] "arima_xreg.extrapolation" "prophet.extrapolation"
## [41] "prophet_hol.extrapolation" "el_stop_id"
rm(accuracy_extrap_prophet)
Now I simply update the run_times dataset with the relevant h2o info.
run_times[9:10] <-
list(h2o = as.list(h2o.time),
h2o.limvars = as.list(h2o.limvars.time)
)
names(run_times)[9:10] <- c("h2o", "h2o.limvars")
str(run_times)
## List of 10
## $ arima :List of 5
## ..$ user.self : num 995
## ..$ sys.self : num 14.3
## ..$ elapsed : num 1145
## ..$ user.child: num 0
## ..$ sys.child : num 0
## $ arima_xreg :List of 5
## ..$ user.self : num 653
## ..$ sys.self : num 25.5
## ..$ elapsed : num 719
## ..$ user.child: num 0
## ..$ sys.child : num 0
## $ prophet :List of 5
## ..$ user.self : num 10.4
## ..$ sys.self : num 0.777
## ..$ elapsed : num 11.4
## ..$ user.child: num 0
## ..$ sys.child : num 0
## $ prophet_hol :List of 5
## ..$ user.self : num 23.7
## ..$ sys.self : num 1.6
## ..$ elapsed : num 25.4
## ..$ user.child: num 0
## ..$ sys.child : num 0
## $ prophet.forecast :List of 5
## ..$ user.self : num 243
## ..$ sys.self : num 21.5
## ..$ elapsed : num 282
## ..$ user.child: num 0
## ..$ sys.child : num 0
## $ prophet_hol.forecast:List of 5
## ..$ user.self : num 833
## ..$ sys.self : num 79.9
## ..$ elapsed : num 960
## ..$ user.child: num 0
## ..$ sys.child : num 0
## $ rf_corr_no :List of 5
## ..$ user.self : num 56.2
## ..$ sys.self : num 2.88
## ..$ elapsed : num 471
## ..$ user.child: num 0
## ..$ sys.child : num 0
## $ xgbtree_corr_yes :List of 5
## ..$ user.self : num 10.6
## ..$ sys.self : num 2.53
## ..$ elapsed : num 215
## ..$ user.child: num 0
## ..$ sys.child : num 0
## $ h2o :List of 5
## ..$ user.self : num 6.03
## ..$ sys.self : num 2.14
## ..$ elapsed : num 202
## ..$ user.child: num 0
## ..$ sys.child : num 0
## $ h2o.limvars :List of 5
## ..$ user.self : num 5.99
## ..$ sys.self : num 2.22
## ..$ elapsed : num 203
## ..$ user.child: num 0
## ..$ sys.child : num 0
# saving is done to avoid having to run the forecasts again
saveRDS(run_times,
paste0(wd,
"/Models/",
"run_times.Rds"
)
)
# run_times <-
# readRDS(paste0(wd,
# "/Models/",
# "run_times.Rds"
# )
# )