Load the relevant libraries.
# rm(list = ls())
# .rs.restartR()
# data manipulation
library("plyr")
library("tidyverse")
library("magrittr")
library("data.table")
library("lubridate")
library("sqldf")
# time series specific packages
library("timetk")
library("zoo")
library("tibbletime")
# modeling
library("fpp2")
library("prophet")
library("caret")
library("randomForest")
library("xgboost")
library("h2o")
library("keras")
# use_session_with_seed(123456789) # setting the seed to obtain reproducible results
# see https://keras.rstudio.com/articles/faq.html#how-can-i-obtain-reproducible-results-using-keras-during-development and https://cran.r-project.org/web/packages/keras/vignettes/faq.html
# can also re-enable gpu and parallel processing by using: use_session_with_seed(42, disable_gpu = FALSE, disable_parallel_cpu = FALSE)
# other
library("geosphere") # specific for distance calculations from lat-lon pairs
library("naniar") # inspecting missing data
library("rlang") # building functions
library("recipes") # used in Keras modeling to design matrices
library("rsample") # rolling samples for validation stats
library("tfruns") # used in Keras modeling for trainin runs
library("stringr") # string manipulation
library("ggplot2") # viz
library("sweep") # more easily pull out model statistics
library("yardstick") # easily calculate accuracy stats
library("doParallel") # parallel processing
Session Info.
sessionInfo()
## R version 3.5.1 (2018-07-02)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS High Sierra 10.13.6
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] parallel stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] doParallel_1.0.14 iterators_1.0.10 foreach_1.4.4
## [4] yardstick_0.0.2 sweep_0.2.1.1 tfruns_1.4
## [7] rsample_0.0.3 recipes_0.1.4 rlang_0.3.0.1
## [10] naniar_0.4.1 geosphere_1.5-7 keras_2.2.4
## [13] h2o_3.20.0.8 xgboost_0.71.2 randomForest_4.6-14
## [16] caret_6.0-81 lattice_0.20-38 prophet_0.3.0.1
## [19] Rcpp_1.0.0 fpp2_2.3 expsmooth_2.3
## [22] fma_2.3 forecast_8.4 tibbletime_0.1.1
## [25] zoo_1.8-4 timetk_0.1.1.1 sqldf_0.4-11
## [28] RSQLite_2.1.1 gsubfn_0.7 proto_1.0.0
## [31] lubridate_1.7.4 data.table_1.11.8 magrittr_1.5
## [34] forcats_0.3.0 stringr_1.3.1 dplyr_0.7.8
## [37] purrr_0.2.5 readr_1.2.1 tidyr_0.8.2
## [40] tibble_1.4.2 ggplot2_3.1.0 tidyverse_1.2.1
## [43] plyr_1.8.4
##
## loaded via a namespace (and not attached):
## [1] colorspace_1.3-2 class_7.3-14 visdat_0.5.1
## [4] rprojroot_1.3-2 base64enc_0.1-3 rstudioapi_0.8
## [7] rstan_2.18.2 bit64_0.9-7 prodlim_2018.04.18
## [10] xml2_1.2.0 codetools_0.2-15 splines_3.5.1
## [13] knitr_1.20 zeallot_0.1.0 jsonlite_1.5
## [16] pROC_1.13.0 broom_0.5.0 compiler_3.5.1
## [19] httr_1.3.1 backports_1.1.2 assertthat_0.2.0
## [22] Matrix_1.2-15 lazyeval_0.2.1 cli_1.0.1
## [25] htmltools_0.3.6 prettyunits_1.0.2 tools_3.5.1
## [28] bindrcpp_0.2.2 gtable_0.2.0 glue_1.3.0
## [31] reshape2_1.4.3 cellranger_1.1.0 fracdiff_1.4-2
## [34] urca_1.3-0 debugme_1.1.0 nlme_3.1-137
## [37] lmtest_0.9-36 timeDate_3043.102 gower_0.1.2
## [40] ps_1.2.1 rvest_0.3.2 MASS_7.3-51.1
## [43] scales_1.0.0 ipred_0.9-8 hms_0.4.2
## [46] inline_0.3.15 yaml_2.2.0 quantmod_0.4-13
## [49] curl_3.2 reticulate_1.10 memoise_1.1.0
## [52] gridExtra_2.3 loo_2.0.0 StanHeaders_2.18.0
## [55] uroot_2.0-9 rpart_4.1-13 stringi_1.2.4
## [58] tensorflow_1.10 tseries_0.10-46 TTR_0.23-4
## [61] pkgbuild_1.0.2 lava_1.6.4 chron_2.3-53
## [64] bitops_1.0-6 pkgconfig_2.0.2 matrixStats_0.54.0
## [67] evaluate_0.12 bindr_0.1.1 bit_1.1-14
## [70] processx_3.2.0 tidyselect_0.2.5 R6_2.3.0
## [73] generics_0.0.2 DBI_1.0.0 whisker_0.3-2
## [76] pillar_1.3.0 haven_2.0.0 withr_2.1.2
## [79] xts_0.11-2 sp_1.3-1 RCurl_1.95-4.11
## [82] survival_2.43-3 nnet_7.3-12 modelr_0.1.2
## [85] crayon_1.3.4 rmarkdown_1.10 grid_3.5.1
## [88] readxl_1.1.0 blob_1.1.1 callr_3.0.0
## [91] ModelMetrics_1.2.2 digest_0.6.18 stats4_3.5.1
## [94] munsell_0.5.0 tcltk_3.5.1 quadprog_1.5-5
Setup the root directory.
Setting wd as the working directory.
wd <- getwd()
wd
## [1] "/Users/mdturse/Desktop/Analytics/Chicago_El_Divvy"
NOTE: add_trn_val_test, func_one_hot_vars, prophet.plots, and DV_corr_predict are the outputs produced in Step 02 - Step 04
add_trn_val_test <-
readRDS(paste0(wd,
"/Data/Interim/",
"add_trn_val_test.Rds"
)
)
func_one_hot_vars <-
readRDS(paste0(wd,
"/Data/Interim/",
"func_one_hot_vars.Rds"
)
)
prophet.plots <-
readRDS(paste0(wd,
"/Models/",
"prophet.plots.Rds"
)
)
DV_corr_predict <-
readRDS(paste0(wd,
"/Data/Interim/",
"DV_corr_predict.Rds"
)
)
Fire up h2o, but turn off the progress bar.
h2o.init()
##
## H2O is not running yet, starting it now...
##
## Note: In case of errors look at the following log files:
## /var/folders/yb/g384t9010db3z533jmxcp5280000gn/T//RtmpNxh4aQ/h2o_mdturse_started_from_r.out
## /var/folders/yb/g384t9010db3z533jmxcp5280000gn/T//RtmpNxh4aQ/h2o_mdturse_started_from_r.err
##
##
## Starting H2O JVM and connecting: .. Connection successful!
##
## R is connected to the H2O cluster:
## H2O cluster uptime: 2 seconds 705 milliseconds
## H2O cluster timezone: America/New_York
## H2O data parsing timezone: UTC
## H2O cluster version: 3.20.0.8
## H2O cluster version age: 2 months and 12 days
## H2O cluster name: H2O_started_from_R_mdturse_alm317
## H2O cluster total nodes: 1
## H2O cluster total memory: 1.78 GB
## H2O cluster total cores: 4
## H2O cluster allowed cores: 4
## H2O cluster healthy: TRUE
## H2O Connection ip: localhost
## H2O Connection port: 54321
## H2O Connection proxy: NA
## H2O Internal Security: FALSE
## H2O API Extensions: XGBoost, Algos, AutoML, Core V3, Core V4
## R Version: R version 3.5.1 (2018-07-02)
h2o.no_progress()
Now, create one-hot variables from the add_trn_val_test dataset, then separate the data into train, validation, and test datasets.
# One-Hot
h2o_one_hot <-
add_trn_val_test %>%
map(~ func_one_hot_vars(.x))
# create train-val-test for h2o
h2o_train_tbl <-
h2o_one_hot %>%
map(~ (filter(.x,
data_use_el_stop_id == "01_train"
)
)
)
h2o_valid_tbl <-
h2o_one_hot %>%
map(~ (filter(.x,
data_use_el_stop_id == "02_validation"
)
)
)
h2o_test_tbl <-
h2o_one_hot %>%
map(~ (filter(.x,
data_use_el_stop_id == "03_test"
)
)
)
# Convert to H2OFrame objects
h2o_train <- h2o_train_tbl %>% map(~ as.h2o(.x))
h2o_valid <- h2o_valid_tbl %>% map(~ as.h2o(.x))
h2o_test <- h2o_test_tbl %>% map(~ as.h2o(.x))
Create the Interpolation & Extrapolation Datasets
h2o_data <-
pmap(.l = list(a = prophet.plots),
.f = function(a) {
dat_interp = a$splits %>%
map(~ analysis(.x) %>%
func_one_hot_vars() %>%
as.h2o()
)
dat_extrap = a$splits %>%
map(~ assessment(.x) %>%
func_one_hot_vars() %>%
as.h2o()
)
a$interp_data = dat_interp
a$extrap_data = dat_extrap
return(a)
}
)
# h2o_data$`40600`$interp_data[[1]] %>% as.data.frame() %>% dim()
rm(prophet.plots)
Set the names for h2o, and run the h2o.automl.
# Set names for h2o
y <- "el_rides"
x <- h2o_train %>%
map(~ setdiff(names(.x),
y
)
)
# user system elapsed
# 9.730 5.819 215.764
# ~ 4 min
start <- proc.time()
h2o_automl_models <-
pmap(.l = list(a = x,
b = h2o_train,
c = h2o_valid,
d = h2o_test
),
.f = function(a, b, c, d) {
h2o.automl(
x = a,
y = y,
training_frame = b,
validation_frame = c,
leaderboard_frame = d,
nfolds = 13,
max_runtime_secs = 30,
stopping_metric = "deviance"
)
}
)
h2o.time <- proc.time() - start
rm(start)
h2o.time
## user system elapsed
## 6.033 2.140 202.057
saveRDS(h2o.time,
paste0(wd,
"/Models/",
"h2o.time.Rds"
)
)
# h2o.time <-
# readRDS(paste0(wd,
# "/Models/",
# "h2o.time.Rds"
# )
# )
# print(automl_models_h2o$`40600`@leaderboard)
Extract leader model.
h2o_automl_leader <-
pmap(.l = list(a = h2o_automl_models),
.f = function(a) {
ldr = a@leader
return(ldr)
}
)
# save the leader
saveRDS(h2o_automl_leader,
paste0(wd,
"/Models/",
"h2o_automl_leader.Rds"
)
)
# h2o_automl_leader <-
# readRDS(paste0(wd,
# "/Models/",
# "h2o_automl_leader.Rds"
# )
# )
Get the predictions.
# predictions on test set
h2o_pred <-
pmap(.l = list(a = h2o_automl_leader,
b = h2o_test
),
.f = function(a, b) {
res = h2o.predict(a, newdata = b)
return(res)
}
)
# predictions on interp & extrap
h2o_forecasts <-
pmap(.l = list(a = h2o_data,
b = h2o_automl_leader
),
.f = function(a, b) {
int_pred =
pmap(.l = list(c = a$interp_data),
.f = function(c) {
pred = h2o.predict(b, newdata = c) %>%
as.data.frame()
return(pred)
}
)
ext_pred =
pmap(.l = list(d = a$extrap_data),
.f = function(d) {
pred = h2o.predict(b, newdata = d) %>%
as.data.frame()
return(pred)
}
)
int_data =
pmap(.l = list(e = a$interp_data),
.f = function(e) {
df = e %>% as.data.frame()
return(df)
}
)
ext_data =
pmap(.l = list(f = a$extrap_data),
.f = function(f) {
df = f %>% as.data.frame()
return(df)
}
)
int_fcast =
pmap(.l = list(g = int_data,
h = int_pred
),
.f = function(g, h) {
res = g %>%
select(el_date, el_rides) %>%
bind_cols(h) %>%
rename(actual = el_rides,
yhat = predict
) %>%
mutate(sqrd_error = (actual - yhat)^2
)
}
)
ext_fcast =
pmap(.l = list(i = ext_data,
j = ext_pred
),
.f = function(i, j) {
res = i %>%
select(el_date, el_rides) %>%
bind_cols(j) %>%
rename(actual = el_rides,
yhat = predict
) %>%
mutate(sqrd_error = (actual - yhat)^2
)
}
)
a$h2o.interp.forecast = int_fcast
a$h2o.extrap.forecast = ext_fcast
return(a)
}
)
# add_el_stop_id$`40600`$h2o.extrap.forecast[[1]] %>% str()
rm(h2o_data)
Now I can calculate the accuracy (RMSE) stats.
# calculation of RMSE
h2o_accuracy_stats <-
pmap(.l = list(a = h2o_forecasts),
.f = function(a) {
rmse_interp =
a$h2o.interp.forecast %>%
map_dbl(~ sqrt(mean(.x$sqrd_error)
)
)
rmse_extrap =
a$h2o.extrap.forecast %>%
map_dbl(~ sqrt(mean(.x$sqrd_error)
)
)
a$h2o.interpolation = rmse_interp
a$h2o.extrapolation = rmse_extrap
return(a)
}
)
# save the dataset
saveRDS(h2o_accuracy_stats,
paste0(wd,
"/Models/",
"h2o_accuracy_stats.Rds"
)
)
# h2o_accuracy_stats <-
# readRDS(paste0(wd,
# "/Models/",
# "h2o_accuracy_stats.Rds"
# )
# )
rm(h2o_forecasts)
# Summary stats of accuracy
h2o_accuracy_stats %>%
map(~ summary(.x$h2o.interpolation)
)
## $`40600`
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 15.30 15.36 15.42 15.40 15.43 15.57
##
## $`41140`
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 26.01 26.34 26.57 26.82 27.66 27.84
##
## $`40120`
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 113.9 114.8 114.8 115.1 115.8 115.9
##
## $`40910`
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 112.4 113.7 114.2 114.3 115.1 115.7
##
## $`40380`
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 811.9 836.0 840.2 839.5 840.9 859.4
##
## $`41660`
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 437.6 440.1 441.9 443.6 446.7 453.1
h2o_accuracy_stats %>%
map(~ summary(.x$h2o.extrapolation)
)
## $`40600`
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 14.71 14.84 14.92 14.98 15.10 15.39
##
## $`41140`
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 22.32 22.87 22.99 23.27 23.76 25.02
##
## $`40120`
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 99.63 101.36 106.72 110.31 116.85 125.59
##
## $`40910`
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 85.19 104.39 105.46 103.85 107.17 108.00
##
## $`40380`
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 789.8 811.5 831.4 852.2 876.8 940.1
##
## $`41660`
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 469.9 480.5 482.6 484.7 491.7 494.8
Now I measure performance on the test dataset.
h2o_perf <-
pmap(.l = list(a = h2o_automl_leader,
b = h2o_test),
.f = function(a, b) {
per = h2o.performance(a, newdata = b)
return(per)
}
)
h2o_perf
## $`40600`
## H2ORegressionMetrics: stackedensemble
##
## MSE: 1146.714
## RMSE: 33.86317
## MAE: 26.11591
## RMSLE: 0.09431473
## Mean Residual Deviance : 1146.714
##
##
## $`41140`
## H2ORegressionMetrics: stackedensemble
##
## MSE: 1740.001
## RMSE: 41.71332
## MAE: 31.6018
## RMSLE: 0.1015169
## Mean Residual Deviance : 1740.001
##
##
## $`40120`
## H2ORegressionMetrics: stackedensemble
##
## MSE: 34860.02
## RMSE: 186.7084
## MAE: 137.7366
## RMSLE: 0.1135811
## Mean Residual Deviance : 34860.02
##
##
## $`40910`
## H2ORegressionMetrics: stackedensemble
##
## MSE: 31446.56
## RMSE: 177.3318
## MAE: 126.1339
## RMSLE: 0.07578638
## Mean Residual Deviance : 31446.56
##
##
## $`40380`
## H2ORegressionMetrics: gbm
##
## MSE: 2156051
## RMSE: 1468.35
## MAE: 972.8846
## RMSLE: 0.1589573
## Mean Residual Deviance : 2156051
##
##
## $`41660`
## H2ORegressionMetrics: gbm
##
## MSE: 2454560
## RMSE: 1566.703
## MAE: 1117.092
## RMSLE: 0.12669
## Mean Residual Deviance : 2454560
# save the leader performance
saveRDS(h2o_perf,
paste0(wd,
"/Models/",
"h2o_perf.Rds"
)
)
# h2o_perf <-
# readRDS(paste0(wd,
# "/Models/",
# "h2o_perf.Rds"
# )
# )
# h2o.removeAll()
Select specific columns to use
col_names <-
DV_corr_predict %>%
map(~ colnames(.x))
select_cols <-
pmap(.l = list(a = h2o_one_hot,
b = col_names
),
.f = function(a, b) {
res = a[, colnames(a) %in% b]
return(res)
}
)
rm(col_names)
Create train-val-test datasets for h2o.
h2o.limvars_train_tbl <-
select_cols %>%
map(~ (filter(.x,
data_use_el_stop_id == "01_train"
)
)
)
h2o.limvars_valid_tbl <-
select_cols %>%
map(~ (filter(.x,
data_use_el_stop_id == "02_validation"
)
)
)
h2o.limvars_test_tbl <-
select_cols %>%
map(~ (filter(.x,
data_use_el_stop_id == "03_test"
)
)
)
rm(select_cols)
# Convert to H2OFrame objects
h2o.limvars_train <- h2o.limvars_train_tbl %>% map(~ as.h2o(.x))
h2o.limvars_valid <- h2o.limvars_valid_tbl %>% map(~ as.h2o(.x))
h2o.limvars_test <- h2o.limvars_test_tbl %>% map(~ as.h2o(.x))
Set names for h2o and run h2o.automl.
y <- "el_rides"
x <- h2o.limvars_train %>%
map(~ setdiff(names(.x),
y
)
)
# user system elapsed
# 7.985 3.463 208.442
# ~ 4 min
start <- proc.time()
h2o.limvars_automl_models <-
pmap(.l = list(a = x,
b = h2o.limvars_train,
c = h2o.limvars_valid,
d = h2o.limvars_test
),
.f = function(a, b, c, d) {
h2o.automl(
x = a,
y = y,
training_frame = b,
validation_frame = c,
leaderboard_frame = d,
nfolds = 13,
# max_runtime_secs = 300,
max_runtime_secs = 30,
stopping_metric = "deviance"
)
}
)
h2o.limvars.time <- proc.time() - start
rm(start)
h2o.limvars.time
## user system elapsed
## 5.988 2.221 203.070
saveRDS(h2o.limvars.time,
paste0(wd,
"/Models/",
"h2o.limvars.time.Rds"
)
)
# h2o.limvars.time <-
# readRDS(paste0(wd,
# "/Models/",
# "h2o.limvars.time.Rds"
# )
# )
# print(h2o.limvars_automl_models$`40600`@leaderboard)
Extract leader model.
h2o.limvars_automl_leader <-
pmap(.l = list(a = h2o.limvars_automl_models),
.f = function(a) {
ldr = a@leader
return(ldr)
}
)
# save the leader
saveRDS(h2o.limvars_automl_leader,
paste0(wd,
"/Models/",
"h2o.limvars_automl_leader.Rds"
)
)
# h2o.limvars_automl_leader <-
# readRDS(paste0(wd,
# "/Models/",
# "h2o.limvars_automl_leader.Rds"
# )
# )
Get the predictions on the interpolation, and extrapolation, and test data.
# predictions on interp & extrap
h2o.limvars_forecasts <-
pmap(.l = list(a = h2o_accuracy_stats,
b = h2o.limvars_automl_leader
),
.f = function(a, b) {
int_pred =
pmap(.l = list(c = a$interp_data),
.f = function(c) {
pred = h2o.predict(b, newdata = c) %>%
as.data.frame()
return(pred)
}
)
ext_pred =
pmap(.l = list(d = a$extrap_data),
.f = function(d) {
pred = h2o.predict(b, newdata = d) %>%
as.data.frame()
return(pred)
}
)
int_data =
pmap(.l = list(e = a$interp_data),
.f = function(e) {
df = e %>% as.data.frame()
return(df)
}
)
ext_data =
pmap(.l = list(f = a$extrap_data),
.f = function(f) {
df = f %>% as.data.frame()
return(df)
}
)
int_fcast =
pmap(.l = list(g = int_data,
h = int_pred
),
.f = function(g, h) {
res = g %>%
select(el_date, el_rides) %>%
bind_cols(h) %>%
rename(actual = el_rides,
yhat = predict
) %>%
mutate(sqrd_error = (actual - yhat)^2
)
}
)
ext_fcast =
pmap(.l = list(i = ext_data,
j = ext_pred
),
.f = function(i, j) {
res = i %>%
select(el_date, el_rides) %>%
bind_cols(j) %>%
rename(actual = el_rides,
yhat = predict
) %>%
mutate(sqrd_error = (actual - yhat)^2
)
}
)
a$h2o.limvars.interp.forecast = int_fcast
a$h2o.limvars.extrap.forecast = ext_fcast
return(a)
}
)
rm(h2o_accuracy_stats)
# predictions on test set
h2o.limvars_pred <-
pmap(.l = list(a = h2o.limvars_automl_leader,
b = h2o.limvars_test
),
.f = function(a, b) {
res = h2o.predict(a, newdata = b)
return(res)
}
)
Calculate the accuracy (RMSE) stats.
# calculate RMSE
h2o.limvars_accuracy_stats <-
pmap(.l = list(a = h2o.limvars_forecasts),
.f = function(a) {
rmse_interp =
a$h2o.limvars.interp.forecast %>%
map_dbl(~ sqrt(mean(.x$sqrd_error)
)
)
rmse_extrap =
a$h2o.limvars.extrap.forecast %>%
map_dbl(~ sqrt(mean(.x$sqrd_error)
)
)
a$h2o.limvars.interpolation = rmse_interp
a$h2o.limvars.extrapolation = rmse_extrap
return(a)
}
)
# save the dataset
saveRDS(h2o.limvars_accuracy_stats,
paste0(wd,
"/Models/",
"h2o.limvars_accuracy_stats.Rds"
)
)
# h2o.limvars_accuracy_stats <-
# readRDS(paste0(wd,
# "/Models/",
# "h2o.limvars_accuracy_stats.Rds"
# )
# )
rm(h2o.limvars_forecasts)
# Summary stats of accuracy
h2o.limvars_accuracy_stats %>%
map(~ summary(.x$h2o.limvars.interpolation)
)
## $`40600`
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 20.73 20.84 21.18 21.03 21.22 21.26
##
## $`41140`
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 33.80 34.07 34.23 34.75 35.88 36.44
##
## $`40120`
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 137.8 138.6 138.7 139.6 139.4 143.0
##
## $`40910`
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 142.3 144.8 147.8 146.7 148.5 148.7
##
## $`40380`
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 824.9 837.4 838.1 843.3 848.5 869.2
##
## $`41660`
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 978.9 980.4 981.9 986.5 991.5 1005.9
h2o.limvars_accuracy_stats %>%
map(~ summary(.x$h2o.limvars.extrapolation)
)
## $`40600`
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 19.17 19.42 19.48 20.28 21.50 21.86
##
## $`41140`
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 32.60 32.98 33.41 33.43 33.92 34.09
##
## $`40120`
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 110.5 113.6 115.2 115.4 115.8 122.4
##
## $`40910`
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 103.8 131.8 142.0 135.7 142.6 145.3
##
## $`40380`
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 906.2 920.9 969.6 964.6 979.5 1035.8
##
## $`41660`
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 969.4 971.9 1030.7 1044.5 1104.9 1130.2
Measure performance on test dataset.
h2o.limvars_perf <-
pmap(.l = list(a = h2o.limvars_automl_leader,
b = h2o.limvars_test),
.f = function(a, b) {
per = h2o.performance(a, newdata = b)
return(per)
}
)
h2o.limvars_perf
## $`40600`
## H2ORegressionMetrics: stackedensemble
##
## MSE: 2572.714
## RMSE: 50.72193
## MAE: 36.84049
## RMSLE: 0.1434912
## Mean Residual Deviance : 2572.714
##
##
## $`41140`
## H2ORegressionMetrics: gbm
##
## MSE: 2465.854
## RMSE: 49.65736
## MAE: 37.5372
## RMSLE: 0.1167978
## Mean Residual Deviance : 2465.854
##
##
## $`40120`
## H2ORegressionMetrics: drf
##
## MSE: 73492.85
## RMSE: 271.0956
## MAE: 170.3585
## RMSLE: 0.1558966
## Mean Residual Deviance : 73492.85
##
##
## $`40910`
## H2ORegressionMetrics: stackedensemble
##
## MSE: 50273.03
## RMSE: 224.2165
## MAE: 147.4233
## RMSLE: 0.09721515
## Mean Residual Deviance : 50273.03
##
##
## $`40380`
## H2ORegressionMetrics: gbm
##
## MSE: 4560092
## RMSE: 2135.437
## MAE: 1342.835
## RMSLE: 0.2120997
## Mean Residual Deviance : 4560092
##
##
## $`41660`
## H2ORegressionMetrics: gbm
##
## MSE: 4751802
## RMSE: 2179.863
## MAE: 1447.372
## RMSLE: 0.1722674
## Mean Residual Deviance : 4751802
# save the leader performance
saveRDS(h2o.limvars_perf,
paste0(wd,
"/Models/",
"h2o.limvars_perf.Rds"
)
)
# h2o.limvars_perf <-
# readRDS(paste0(wd,
# "/Models/",
# "h2o.limvars_perf.Rds"
# )
# )
# h2o.removeAll()