Step 05: Models

Setup

Load the relevant libraries.

# rm(list = ls())
# .rs.restartR()


# data manipulation
library("plyr")
library("tidyverse")
library("magrittr")
library("data.table")
library("lubridate")
library("sqldf")


# time series specific packages
library("timetk")
library("zoo")
library("tibbletime")


# modeling
library("fpp2")
library("prophet")
library("caret")
library("randomForest")
library("xgboost")
library("h2o")
library("keras")
# use_session_with_seed(123456789) # setting the seed to obtain reproducible results
# see https://keras.rstudio.com/articles/faq.html#how-can-i-obtain-reproducible-results-using-keras-during-development and https://cran.r-project.org/web/packages/keras/vignettes/faq.html
# can also re-enable gpu and parallel processing by using:  use_session_with_seed(42, disable_gpu = FALSE, disable_parallel_cpu = FALSE)



# other
library("geosphere")          # specific for distance calculations from lat-lon pairs
library("naniar")             # inspecting missing data
library("rlang")              # building functions
library("recipes")            # used in Keras modeling to design matrices
library("rsample")            # rolling samples for validation stats
library("tfruns")             # used in Keras modeling for trainin runs
library("stringr")            # string manipulation
library("ggplot2")            # viz
library("sweep")              # more easily pull out model statistics
library("yardstick")          # easily calculate accuracy stats
library("doParallel")         # parallel processing

Session Info.

sessionInfo()

## R version 3.5.1 (2018-07-02)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS High Sierra 10.13.6
## 
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] parallel  stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] doParallel_1.0.14   iterators_1.0.10    foreach_1.4.4      
##  [4] yardstick_0.0.2     sweep_0.2.1.1       tfruns_1.4         
##  [7] rsample_0.0.3       recipes_0.1.4       rlang_0.3.0.1      
## [10] naniar_0.4.1        geosphere_1.5-7     keras_2.2.4        
## [13] h2o_3.20.0.8        xgboost_0.71.2      randomForest_4.6-14
## [16] caret_6.0-81        lattice_0.20-38     prophet_0.3.0.1    
## [19] Rcpp_1.0.0          fpp2_2.3            expsmooth_2.3      
## [22] fma_2.3             forecast_8.4        tibbletime_0.1.1   
## [25] zoo_1.8-4           timetk_0.1.1.1      sqldf_0.4-11       
## [28] RSQLite_2.1.1       gsubfn_0.7          proto_1.0.0        
## [31] lubridate_1.7.4     data.table_1.11.8   magrittr_1.5       
## [34] forcats_0.3.0       stringr_1.3.1       dplyr_0.7.8        
## [37] purrr_0.2.5         readr_1.2.1         tidyr_0.8.2        
## [40] tibble_1.4.2        ggplot2_3.1.0       tidyverse_1.2.1    
## [43] plyr_1.8.4         
## 
## loaded via a namespace (and not attached):
##  [1] colorspace_1.3-2   class_7.3-14       visdat_0.5.1      
##  [4] rprojroot_1.3-2    base64enc_0.1-3    rstudioapi_0.8    
##  [7] rstan_2.18.2       bit64_0.9-7        prodlim_2018.04.18
## [10] xml2_1.2.0         codetools_0.2-15   splines_3.5.1     
## [13] knitr_1.20         zeallot_0.1.0      jsonlite_1.5      
## [16] pROC_1.13.0        broom_0.5.0        compiler_3.5.1    
## [19] httr_1.3.1         backports_1.1.2    assertthat_0.2.0  
## [22] Matrix_1.2-15      lazyeval_0.2.1     cli_1.0.1         
## [25] htmltools_0.3.6    prettyunits_1.0.2  tools_3.5.1       
## [28] bindrcpp_0.2.2     gtable_0.2.0       glue_1.3.0        
## [31] reshape2_1.4.3     cellranger_1.1.0   fracdiff_1.4-2    
## [34] urca_1.3-0         debugme_1.1.0      nlme_3.1-137      
## [37] lmtest_0.9-36      timeDate_3043.102  gower_0.1.2       
## [40] ps_1.2.1           rvest_0.3.2        MASS_7.3-51.1     
## [43] scales_1.0.0       ipred_0.9-8        hms_0.4.2         
## [46] inline_0.3.15      yaml_2.2.0         quantmod_0.4-13   
## [49] curl_3.2           reticulate_1.10    memoise_1.1.0     
## [52] gridExtra_2.3      loo_2.0.0          StanHeaders_2.18.0
## [55] uroot_2.0-9        rpart_4.1-13       stringi_1.2.4     
## [58] tensorflow_1.10    tseries_0.10-46    TTR_0.23-4        
## [61] pkgbuild_1.0.2     lava_1.6.4         chron_2.3-53      
## [64] bitops_1.0-6       pkgconfig_2.0.2    matrixStats_0.54.0
## [67] evaluate_0.12      bindr_0.1.1        bit_1.1-14        
## [70] processx_3.2.0     tidyselect_0.2.5   R6_2.3.0          
## [73] generics_0.0.2     DBI_1.0.0          whisker_0.3-2     
## [76] pillar_1.3.0       haven_2.0.0        withr_2.1.2       
## [79] xts_0.11-2         sp_1.3-1           RCurl_1.95-4.11   
## [82] survival_2.43-3    nnet_7.3-12        modelr_0.1.2      
## [85] crayon_1.3.4       rmarkdown_1.10     grid_3.5.1        
## [88] readxl_1.1.0       blob_1.1.1         callr_3.0.0       
## [91] ModelMetrics_1.2.2 digest_0.6.18      stats4_3.5.1      
## [94] munsell_0.5.0      tcltk_3.5.1        quadprog_1.5-5

Setup the root directory.

Setting wd as the working directory.

wd <- getwd()

wd

## [1] "/Users/mdturse/Desktop/Analytics/Chicago_El_Divvy"

Modeling

NOTE: add_trn_val_test, func_one_hot_vars, prophet.plots, and DV_corr_predict are the outputs produced in Step 02 - Step 04

add_trn_val_test <-
  readRDS(paste0(wd,
                 "/Data/Interim/",
                 "add_trn_val_test.Rds"
                 )
          )

func_one_hot_vars <-
  readRDS(paste0(wd,
                 "/Data/Interim/",
                 "func_one_hot_vars.Rds"
                 )
          )

prophet.plots <-
  readRDS(paste0(wd,
                 "/Models/",
                 "prophet.plots.Rds"
                 )
          )

DV_corr_predict <-
  readRDS(paste0(wd,
                 "/Data/Interim/",
                 "DV_corr_predict.Rds"
                 )
          )

Modeling With H2O

All Variables

Fire up h2o, but turn off the progress bar.

h2o.init()

## 
## H2O is not running yet, starting it now...
## 
## Note:  In case of errors look at the following log files:
##     /var/folders/yb/g384t9010db3z533jmxcp5280000gn/T//RtmpNxh4aQ/h2o_mdturse_started_from_r.out
##     /var/folders/yb/g384t9010db3z533jmxcp5280000gn/T//RtmpNxh4aQ/h2o_mdturse_started_from_r.err
## 
## 
## Starting H2O JVM and connecting: .. Connection successful!
## 
## R is connected to the H2O cluster: 
##     H2O cluster uptime:         2 seconds 705 milliseconds 
##     H2O cluster timezone:       America/New_York 
##     H2O data parsing timezone:  UTC 
##     H2O cluster version:        3.20.0.8 
##     H2O cluster version age:    2 months and 12 days  
##     H2O cluster name:           H2O_started_from_R_mdturse_alm317 
##     H2O cluster total nodes:    1 
##     H2O cluster total memory:   1.78 GB 
##     H2O cluster total cores:    4 
##     H2O cluster allowed cores:  4 
##     H2O cluster healthy:        TRUE 
##     H2O Connection ip:          localhost 
##     H2O Connection port:        54321 
##     H2O Connection proxy:       NA 
##     H2O Internal Security:      FALSE 
##     H2O API Extensions:         XGBoost, Algos, AutoML, Core V3, Core V4 
##     R Version:                  R version 3.5.1 (2018-07-02)

h2o.no_progress()

Now, create one-hot variables from the add_trn_val_test dataset, then separate the data into train, validation, and test datasets.

# One-Hot
h2o_one_hot <-
  add_trn_val_test %>% 
  map(~ func_one_hot_vars(.x))


# create train-val-test for h2o
h2o_train_tbl <-
  h2o_one_hot %>% 
  map(~ (filter(.x,
                data_use_el_stop_id == "01_train"
                )
         )
      )

h2o_valid_tbl <-
  h2o_one_hot %>% 
  map(~ (filter(.x,
                data_use_el_stop_id == "02_validation"
                )
         )
      )

h2o_test_tbl <-
  h2o_one_hot %>% 
  map(~ (filter(.x,
                data_use_el_stop_id == "03_test"
                )
         )
      )


# Convert to H2OFrame objects
h2o_train <- h2o_train_tbl %>% map(~ as.h2o(.x))
h2o_valid <- h2o_valid_tbl %>% map(~ as.h2o(.x))
h2o_test  <- h2o_test_tbl %>% map(~ as.h2o(.x))

Create the Interpolation & Extrapolation Datasets

h2o_data <-
  pmap(.l = list(a = prophet.plots),
       .f = function(a) {
         dat_interp = a$splits %>% 
           map(~ analysis(.x) %>% 
                 func_one_hot_vars() %>% 
                 as.h2o()
               )
         
         dat_extrap = a$splits %>% 
           map(~ assessment(.x) %>% 
                 func_one_hot_vars() %>% 
                 as.h2o()
               )
         
         a$interp_data = dat_interp
         a$extrap_data = dat_extrap
         
         return(a)
         }
       )

# h2o_data$`40600`$interp_data[[1]] %>% as.data.frame() %>% dim()
rm(prophet.plots)

Set the names for h2o, and run the h2o.automl.

# Set names for h2o
y <- "el_rides"
x <- h2o_train %>% 
  map(~ setdiff(names(.x),
                y
                )
      )


# user  system elapsed 
#   9.730   5.819 215.764
# ~ 4 min
start <- proc.time()

h2o_automl_models <-
  pmap(.l = list(a = x,
                 b = h2o_train,
                 c = h2o_valid,
                 d = h2o_test
                 ),
       .f = function(a, b, c, d) {
         h2o.automl(
           x = a,
           y = y,
           training_frame = b,
           validation_frame = c,
           leaderboard_frame = d,
           nfolds = 13,
           max_runtime_secs = 30,
           stopping_metric = "deviance"
           )
         }
       )

h2o.time <- proc.time() - start
rm(start)

h2o.time

##    user  system elapsed 
##   6.033   2.140 202.057

saveRDS(h2o.time,
        paste0(wd,
               "/Models/",
               "h2o.time.Rds"
               )
        )

# h2o.time <-
#   readRDS(paste0(wd,
#                  "/Models/",
#                  "h2o.time.Rds"
#                  )
#           )

# print(automl_models_h2o$`40600`@leaderboard)

Extract leader model.

h2o_automl_leader <-
  pmap(.l = list(a = h2o_automl_models),
       .f = function(a) {
         ldr = a@leader
         
         return(ldr)
         }
       )


# save the leader
saveRDS(h2o_automl_leader,
        paste0(wd,
               "/Models/",
               "h2o_automl_leader.Rds"
               )
        )

# h2o_automl_leader <-
#   readRDS(paste0(wd,
#                  "/Models/",
#                  "h2o_automl_leader.Rds"
#                  )
#           )

Get the predictions.

# predictions on test set
h2o_pred <-
  pmap(.l = list(a = h2o_automl_leader,
                 b = h2o_test
                 ),
       .f = function(a, b) {
         res = h2o.predict(a, newdata = b)
         
         return(res)
         }
       )


# predictions on interp & extrap
h2o_forecasts <-
  pmap(.l = list(a = h2o_data,
                 b = h2o_automl_leader
                 ),
       .f = function(a, b) {
         int_pred =
           pmap(.l = list(c = a$interp_data),
                .f = function(c) {
                  pred = h2o.predict(b, newdata = c) %>% 
                    as.data.frame()
                  
                  return(pred)
                  }
                )

         ext_pred =
           pmap(.l = list(d = a$extrap_data),
                .f = function(d) {
                  pred = h2o.predict(b, newdata = d) %>% 
                    as.data.frame()
                  
                  return(pred)
                  }
                )
         
         
         int_data =
           pmap(.l = list(e = a$interp_data),
                .f = function(e) {
                  df = e %>% as.data.frame()
                  
                  return(df)
                  }
                )
         
         ext_data =
           pmap(.l = list(f = a$extrap_data),
                .f = function(f) {
                  df = f %>% as.data.frame()
                  
                  return(df)
                  }
                )
         
         
         int_fcast =
           pmap(.l = list(g = int_data,
                          h = int_pred
                          ),
                .f = function(g, h) {
                  res = g %>% 
                    select(el_date, el_rides) %>% 
                    bind_cols(h) %>% 
                    rename(actual = el_rides,
                           yhat = predict
                           ) %>% 
                    mutate(sqrd_error = (actual - yhat)^2
                           )
                  }
                )
         
         ext_fcast =
           pmap(.l = list(i = ext_data,
                          j = ext_pred
                          ),
                .f = function(i, j) {
                  res = i %>% 
                    select(el_date, el_rides) %>% 
                    bind_cols(j) %>% 
                    rename(actual = el_rides,
                           yhat = predict
                           ) %>% 
                    mutate(sqrd_error = (actual - yhat)^2
                           )
                  }
                )
         
         
         a$h2o.interp.forecast = int_fcast
         a$h2o.extrap.forecast = ext_fcast
         
         return(a)
         }
       )


# add_el_stop_id$`40600`$h2o.extrap.forecast[[1]] %>% str()
rm(h2o_data)

Now I can calculate the accuracy (RMSE) stats.

# calculation of RMSE
h2o_accuracy_stats <-
  pmap(.l = list(a = h2o_forecasts),
       .f = function(a) {
         rmse_interp =
           a$h2o.interp.forecast %>% 
           map_dbl(~ sqrt(mean(.x$sqrd_error)
                          )
               )
         
         rmse_extrap =
           a$h2o.extrap.forecast %>% 
           map_dbl(~ sqrt(mean(.x$sqrd_error)
                          )
               )
         
         a$h2o.interpolation = rmse_interp
         a$h2o.extrapolation = rmse_extrap
         
         return(a)
         }
       )


# save the dataset 
saveRDS(h2o_accuracy_stats,
        paste0(wd,
               "/Models/",
               "h2o_accuracy_stats.Rds"
               )
        )


# h2o_accuracy_stats <-
#   readRDS(paste0(wd,
#                  "/Models/",
#                  "h2o_accuracy_stats.Rds"
#                  )
#           )


rm(h2o_forecasts)


# Summary stats of accuracy  
h2o_accuracy_stats %>% 
  map(~ summary(.x$h2o.interpolation)
      )

## $`40600`
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   15.30   15.36   15.42   15.40   15.43   15.57 
## 
## $`41140`
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   26.01   26.34   26.57   26.82   27.66   27.84 
## 
## $`40120`
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   113.9   114.8   114.8   115.1   115.8   115.9 
## 
## $`40910`
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   112.4   113.7   114.2   114.3   115.1   115.7 
## 
## $`40380`
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   811.9   836.0   840.2   839.5   840.9   859.4 
## 
## $`41660`
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   437.6   440.1   441.9   443.6   446.7   453.1

h2o_accuracy_stats %>% 
  map(~ summary(.x$h2o.extrapolation)
      )

## $`40600`
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   14.71   14.84   14.92   14.98   15.10   15.39 
## 
## $`41140`
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   22.32   22.87   22.99   23.27   23.76   25.02 
## 
## $`40120`
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   99.63  101.36  106.72  110.31  116.85  125.59 
## 
## $`40910`
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   85.19  104.39  105.46  103.85  107.17  108.00 
## 
## $`40380`
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   789.8   811.5   831.4   852.2   876.8   940.1 
## 
## $`41660`
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   469.9   480.5   482.6   484.7   491.7   494.8

Now I measure performance on the test dataset.

h2o_perf <-
  pmap(.l = list(a = h2o_automl_leader,
                 b = h2o_test),
       .f = function(a, b) {
         per = h2o.performance(a, newdata = b)
         
         return(per)
         }
       )

h2o_perf

## $`40600`
## H2ORegressionMetrics: stackedensemble
## 
## MSE:  1146.714
## RMSE:  33.86317
## MAE:  26.11591
## RMSLE:  0.09431473
## Mean Residual Deviance :  1146.714
## 
## 
## $`41140`
## H2ORegressionMetrics: stackedensemble
## 
## MSE:  1740.001
## RMSE:  41.71332
## MAE:  31.6018
## RMSLE:  0.1015169
## Mean Residual Deviance :  1740.001
## 
## 
## $`40120`
## H2ORegressionMetrics: stackedensemble
## 
## MSE:  34860.02
## RMSE:  186.7084
## MAE:  137.7366
## RMSLE:  0.1135811
## Mean Residual Deviance :  34860.02
## 
## 
## $`40910`
## H2ORegressionMetrics: stackedensemble
## 
## MSE:  31446.56
## RMSE:  177.3318
## MAE:  126.1339
## RMSLE:  0.07578638
## Mean Residual Deviance :  31446.56
## 
## 
## $`40380`
## H2ORegressionMetrics: gbm
## 
## MSE:  2156051
## RMSE:  1468.35
## MAE:  972.8846
## RMSLE:  0.1589573
## Mean Residual Deviance :  2156051
## 
## 
## $`41660`
## H2ORegressionMetrics: gbm
## 
## MSE:  2454560
## RMSE:  1566.703
## MAE:  1117.092
## RMSLE:  0.12669
## Mean Residual Deviance :  2454560

# save the leader performance
saveRDS(h2o_perf,
        paste0(wd,
               "/Models/",
               "h2o_perf.Rds"
               )
        )


# h2o_perf <-
#   readRDS(paste0(wd,
#                  "/Models/",
#                  "h2o_perf.Rds"
#                  )
#           )


# h2o.removeAll()

Limited Variables

Select specific columns to use

col_names <-
  DV_corr_predict %>% 
  map(~ colnames(.x))


select_cols <-
  pmap(.l = list(a = h2o_one_hot,
                 b = col_names
                 ),
       .f = function(a, b) {
         res = a[, colnames(a) %in% b]
         
         return(res)
         }
       )


rm(col_names)

Create train-val-test datasets for h2o.

h2o.limvars_train_tbl <-
  select_cols %>% 
  map(~ (filter(.x,
                data_use_el_stop_id == "01_train"
                )
         )
      )

h2o.limvars_valid_tbl <-
  select_cols %>% 
  map(~ (filter(.x,
                data_use_el_stop_id == "02_validation"
                )
         )
      )

h2o.limvars_test_tbl <-
  select_cols %>% 
  map(~ (filter(.x,
                data_use_el_stop_id == "03_test"
                )
         )
      )


rm(select_cols)


# Convert to H2OFrame objects
h2o.limvars_train <- h2o.limvars_train_tbl %>% map(~ as.h2o(.x))
h2o.limvars_valid <- h2o.limvars_valid_tbl %>% map(~ as.h2o(.x))
h2o.limvars_test  <- h2o.limvars_test_tbl %>% map(~ as.h2o(.x))

Set names for h2o and run h2o.automl.

y <- "el_rides"
x <- h2o.limvars_train %>% 
  map(~ setdiff(names(.x),
                y
                )
      )


# user  system elapsed 
#   7.985   3.463 208.442 
# ~ 4 min
start <- proc.time()

h2o.limvars_automl_models <-
  pmap(.l = list(a = x,
                 b = h2o.limvars_train,
                 c = h2o.limvars_valid,
                 d = h2o.limvars_test
                 ),
       .f = function(a, b, c, d) {
         h2o.automl(
           x = a,
           y = y,
           training_frame = b,
           validation_frame = c,
           leaderboard_frame = d,
           nfolds = 13,
           # max_runtime_secs = 300,
           max_runtime_secs = 30,
           stopping_metric = "deviance"
           )
         }
       )

h2o.limvars.time <- proc.time() - start
rm(start)

h2o.limvars.time

##    user  system elapsed 
##   5.988   2.221 203.070

saveRDS(h2o.limvars.time,
        paste0(wd,
               "/Models/",
               "h2o.limvars.time.Rds"
               )
        )

# h2o.limvars.time <-
#   readRDS(paste0(wd,
#                  "/Models/",
#                  "h2o.limvars.time.Rds"
#                  )
#           )

# print(h2o.limvars_automl_models$`40600`@leaderboard)

Extract leader model.

h2o.limvars_automl_leader <-
  pmap(.l = list(a = h2o.limvars_automl_models),
       .f = function(a) {
         ldr = a@leader
         
         return(ldr)
         }
       )


# save the leader
saveRDS(h2o.limvars_automl_leader,
        paste0(wd,
               "/Models/",
               "h2o.limvars_automl_leader.Rds"
               )
        )

# h2o.limvars_automl_leader <-
#   readRDS(paste0(wd,
#                  "/Models/",
#                  "h2o.limvars_automl_leader.Rds"
#                  )
#           )

Get the predictions on the interpolation, and extrapolation, and test data.

# predictions on interp & extrap
h2o.limvars_forecasts <-
  pmap(.l = list(a = h2o_accuracy_stats,
                 b = h2o.limvars_automl_leader
                 ),
       .f = function(a, b) {
         int_pred =
           pmap(.l = list(c = a$interp_data),
                .f = function(c) {
                  pred = h2o.predict(b, newdata = c) %>% 
                    as.data.frame()
                  
                  return(pred)
                  }
                )
         
         ext_pred =
           pmap(.l = list(d = a$extrap_data),
                .f = function(d) {
                  pred = h2o.predict(b, newdata = d) %>% 
                    as.data.frame()
                  
                  return(pred)
                  }
                )
         
         
         int_data =
           pmap(.l = list(e = a$interp_data),
                .f = function(e) {
                  df = e %>% as.data.frame()
                  
                  return(df)
                  }
                )
         
         ext_data =
           pmap(.l = list(f = a$extrap_data),
                .f = function(f) {
                  df = f %>% as.data.frame()
                  
                  return(df)
                  }
                )
         
         
         int_fcast =
           pmap(.l = list(g = int_data,
                          h = int_pred
                          ),
                .f = function(g, h) {
                  res = g %>% 
                    select(el_date, el_rides) %>% 
                    bind_cols(h) %>% 
                    rename(actual = el_rides,
                           yhat = predict
                           ) %>% 
                    mutate(sqrd_error = (actual - yhat)^2
                           )
                  }
                )
         
         ext_fcast =
           pmap(.l = list(i = ext_data,
                          j = ext_pred
                          ),
                .f = function(i, j) {
                  res = i %>% 
                    select(el_date, el_rides) %>% 
                    bind_cols(j) %>% 
                    rename(actual = el_rides,
                           yhat = predict
                           ) %>% 
                    mutate(sqrd_error = (actual - yhat)^2
                           )
                  }
                )
         
         
         a$h2o.limvars.interp.forecast = int_fcast
         a$h2o.limvars.extrap.forecast = ext_fcast
         
         return(a)
         }
       )


rm(h2o_accuracy_stats)


# predictions on test set
h2o.limvars_pred <-
  pmap(.l = list(a = h2o.limvars_automl_leader,
                 b = h2o.limvars_test
                 ),
       .f = function(a, b) {
         res = h2o.predict(a, newdata = b)
         
         return(res)
         }
       )

Calculate the accuracy (RMSE) stats.

# calculate RMSE
h2o.limvars_accuracy_stats <-
  pmap(.l = list(a = h2o.limvars_forecasts),
       .f = function(a) {
         rmse_interp =
           a$h2o.limvars.interp.forecast %>% 
           map_dbl(~ sqrt(mean(.x$sqrd_error)
                          )
               )
         
         rmse_extrap =
           a$h2o.limvars.extrap.forecast %>% 
           map_dbl(~ sqrt(mean(.x$sqrd_error)
                          )
               )
         
         a$h2o.limvars.interpolation = rmse_interp
         a$h2o.limvars.extrapolation = rmse_extrap
         
         return(a)
         }
       )


# save the dataset 
saveRDS(h2o.limvars_accuracy_stats,
        paste0(wd,
               "/Models/",
               "h2o.limvars_accuracy_stats.Rds"
               )
        )


# h2o.limvars_accuracy_stats <-
#   readRDS(paste0(wd,
#                  "/Models/",
#                  "h2o.limvars_accuracy_stats.Rds"
#                  )
#           )


rm(h2o.limvars_forecasts)


# Summary stats of accuracy  
h2o.limvars_accuracy_stats %>% 
  map(~ summary(.x$h2o.limvars.interpolation)
      )

## $`40600`
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   20.73   20.84   21.18   21.03   21.22   21.26 
## 
## $`41140`
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   33.80   34.07   34.23   34.75   35.88   36.44 
## 
## $`40120`
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   137.8   138.6   138.7   139.6   139.4   143.0 
## 
## $`40910`
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   142.3   144.8   147.8   146.7   148.5   148.7 
## 
## $`40380`
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   824.9   837.4   838.1   843.3   848.5   869.2 
## 
## $`41660`
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   978.9   980.4   981.9   986.5   991.5  1005.9

h2o.limvars_accuracy_stats %>% 
  map(~ summary(.x$h2o.limvars.extrapolation)
      )

## $`40600`
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   19.17   19.42   19.48   20.28   21.50   21.86 
## 
## $`41140`
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   32.60   32.98   33.41   33.43   33.92   34.09 
## 
## $`40120`
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   110.5   113.6   115.2   115.4   115.8   122.4 
## 
## $`40910`
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   103.8   131.8   142.0   135.7   142.6   145.3 
## 
## $`40380`
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   906.2   920.9   969.6   964.6   979.5  1035.8 
## 
## $`41660`
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   969.4   971.9  1030.7  1044.5  1104.9  1130.2

Measure performance on test dataset.

h2o.limvars_perf <-
  pmap(.l = list(a = h2o.limvars_automl_leader,
                 b = h2o.limvars_test),
       .f = function(a, b) {
         per = h2o.performance(a, newdata = b)
         
         return(per)
         }
       )

h2o.limvars_perf

## $`40600`
## H2ORegressionMetrics: stackedensemble
## 
## MSE:  2572.714
## RMSE:  50.72193
## MAE:  36.84049
## RMSLE:  0.1434912
## Mean Residual Deviance :  2572.714
## 
## 
## $`41140`
## H2ORegressionMetrics: gbm
## 
## MSE:  2465.854
## RMSE:  49.65736
## MAE:  37.5372
## RMSLE:  0.1167978
## Mean Residual Deviance :  2465.854
## 
## 
## $`40120`
## H2ORegressionMetrics: drf
## 
## MSE:  73492.85
## RMSE:  271.0956
## MAE:  170.3585
## RMSLE:  0.1558966
## Mean Residual Deviance :  73492.85
## 
## 
## $`40910`
## H2ORegressionMetrics: stackedensemble
## 
## MSE:  50273.03
## RMSE:  224.2165
## MAE:  147.4233
## RMSLE:  0.09721515
## Mean Residual Deviance :  50273.03
## 
## 
## $`40380`
## H2ORegressionMetrics: gbm
## 
## MSE:  4560092
## RMSE:  2135.437
## MAE:  1342.835
## RMSLE:  0.2120997
## Mean Residual Deviance :  4560092
## 
## 
## $`41660`
## H2ORegressionMetrics: gbm
## 
## MSE:  4751802
## RMSE:  2179.863
## MAE:  1447.372
## RMSLE:  0.1722674
## Mean Residual Deviance :  4751802

# save the leader performance
saveRDS(h2o.limvars_perf,
        paste0(wd,
               "/Models/",
               "h2o.limvars_perf.Rds"
               )
        )


# h2o.limvars_perf <-
#   readRDS(paste0(wd,
#                  "/Models/",
#                  "h2o.limvars_perf.Rds"
#                  )
#           )


# h2o.removeAll()

Step 05: Models - H20

Setup

Modeling

Modeling With H2O

All Variables

Limited Variables