Step 03: Models - Random Forest and XGBoost

Setup

Load the relevant libraries.

# rm(list = ls())
# .rs.restartR()


# data manipulation
library("plyr")
library("tidyverse")
library("magrittr")
library("data.table")
library("lubridate")
library("sqldf")


# time series specific packages
library("timetk")
library("zoo")
library("tibbletime")


# modeling
library("fpp2")
library("prophet")
library("caret")
library("randomForest")
library("xgboost")
library("h2o")
library("keras")
# use_session_with_seed(123456789) # setting the seed to obtain reproducible results
# see https://keras.rstudio.com/articles/faq.html#how-can-i-obtain-reproducible-results-using-keras-during-development and https://cran.r-project.org/web/packages/keras/vignettes/faq.html
# can also re-enable gpu and parallel processing by using:  use_session_with_seed(42, disable_gpu = FALSE, disable_parallel_cpu = FALSE)



# other
library("geosphere")          # specific for distance calculations from lat-lon pairs
library("naniar")             # inspecting missing data
library("rlang")              # building functions
library("recipes")            # used in Keras modeling to design matrices
library("rsample")            # rolling samples for validation stats
library("tfruns")             # used in Keras modeling for trainin runs
library("stringr")            # string manipulation
library("ggplot2")            # viz
library("sweep")              # more easily pull out model statistics
library("yardstick")          # easily calculate accuracy stats
library("doParallel")         # parallel processing

Session Info.

# Print the R version, platform and attached/loaded package versions so the
# rendered report records the environment this run used.
sessionInfo()

## R version 3.5.1 (2018-07-02)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS High Sierra 10.13.6
## 
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] parallel  stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] doParallel_1.0.14   iterators_1.0.10    foreach_1.4.4      
##  [4] yardstick_0.0.2     sweep_0.2.1.1       tfruns_1.4         
##  [7] rsample_0.0.3       recipes_0.1.4       rlang_0.3.0.1      
## [10] naniar_0.4.1        geosphere_1.5-7     keras_2.2.4        
## [13] h2o_3.20.0.8        xgboost_0.71.2      randomForest_4.6-14
## [16] caret_6.0-81        lattice_0.20-38     prophet_0.3.0.1    
## [19] Rcpp_1.0.0          fpp2_2.3            expsmooth_2.3      
## [22] fma_2.3             forecast_8.4        tibbletime_0.1.1   
## [25] zoo_1.8-4           timetk_0.1.1.1      sqldf_0.4-11       
## [28] RSQLite_2.1.1       gsubfn_0.7          proto_1.0.0        
## [31] lubridate_1.7.4     data.table_1.11.8   magrittr_1.5       
## [34] forcats_0.3.0       stringr_1.3.1       dplyr_0.7.8        
## [37] purrr_0.2.5         readr_1.2.1         tidyr_0.8.2        
## [40] tibble_1.4.2        ggplot2_3.1.0       tidyverse_1.2.1    
## [43] plyr_1.8.4         
## 
## loaded via a namespace (and not attached):
##  [1] colorspace_1.3-2   class_7.3-14       visdat_0.5.1      
##  [4] rprojroot_1.3-2    base64enc_0.1-3    rstudioapi_0.8    
##  [7] rstan_2.18.2       bit64_0.9-7        prodlim_2018.04.18
## [10] xml2_1.2.0         codetools_0.2-15   splines_3.5.1     
## [13] knitr_1.20         zeallot_0.1.0      jsonlite_1.5      
## [16] pROC_1.13.0        broom_0.5.0        compiler_3.5.1    
## [19] httr_1.3.1         backports_1.1.2    assertthat_0.2.0  
## [22] Matrix_1.2-15      lazyeval_0.2.1     cli_1.0.1         
## [25] htmltools_0.3.6    prettyunits_1.0.2  tools_3.5.1       
## [28] bindrcpp_0.2.2     gtable_0.2.0       glue_1.3.0        
## [31] reshape2_1.4.3     cellranger_1.1.0   fracdiff_1.4-2    
## [34] urca_1.3-0         debugme_1.1.0      nlme_3.1-137      
## [37] lmtest_0.9-36      timeDate_3043.102  gower_0.1.2       
## [40] ps_1.2.1           rvest_0.3.2        MASS_7.3-51.1     
## [43] scales_1.0.0       ipred_0.9-8        hms_0.4.2         
## [46] inline_0.3.15      yaml_2.2.0         quantmod_0.4-13   
## [49] curl_3.2           reticulate_1.10    memoise_1.1.0     
## [52] gridExtra_2.3      loo_2.0.0          StanHeaders_2.18.0
## [55] uroot_2.0-9        rpart_4.1-13       stringi_1.2.4     
## [58] tensorflow_1.10    tseries_0.10-46    TTR_0.23-4        
## [61] pkgbuild_1.0.2     lava_1.6.4         chron_2.3-53      
## [64] bitops_1.0-6       pkgconfig_2.0.2    matrixStats_0.54.0
## [67] evaluate_0.12      bindr_0.1.1        bit_1.1-14        
## [70] processx_3.2.0     tidyselect_0.2.5   R6_2.3.0          
## [73] generics_0.0.2     DBI_1.0.0          whisker_0.3-2     
## [76] pillar_1.3.0       haven_2.0.0        withr_2.1.2       
## [79] xts_0.11-2         sp_1.3-1           RCurl_1.95-4.11   
## [82] survival_2.43-3    nnet_7.3-12        modelr_0.1.2      
## [85] crayon_1.3.4       rmarkdown_1.10     grid_3.5.1        
## [88] readxl_1.1.0       blob_1.1.1         callr_3.0.0       
## [91] ModelMetrics_1.2.2 digest_0.6.18      stats4_3.5.1      
## [94] munsell_0.5.0      tcltk_3.5.1        quadprog_1.5-5

Setup the root directory.

Setting wd as the working directory.

# Capture the current working directory; the data and model paths used later
# in this script are built by pasting sub-folders onto `wd`.
wd <- getwd()

# Print the directory so it appears in the rendered output.
wd

## [1] "/Users/mdturse/Desktop/Analytics/Chicago_El_Divvy"

Modeling

NOTE: DV_corr_predict, DV_nzv_predict, func_custom_accuracy_metrics, period_train, period_test, and skip_span are the outputs produced in Step 02

# Restore the six objects written by Step 02 from <wd>/Data/Interim/.
# Each file path is assembled first and then handed to readRDS().

# Two lists of modelling data sets (one train() call is run per list element).
DV_corr_predict <-
  paste0(wd, "/Data/Interim/", "DV_corr_predict.Rds") %>%
  readRDS()

DV_nzv_predict <-
  paste0(wd, "/Data/Interim/", "DV_nzv_predict.Rds") %>%
  readRDS()

# Summary function handed to trainControl(summaryFunction = ...).
func_custom_accuracy_metrics <-
  paste0(wd, "/Data/Interim/", "func_custom_accuracy_metrics.Rds") %>%
  readRDS()

# Time-slice settings: used below as initialWindow, horizon and skip.
period_train <-
  paste0(wd, "/Data/Interim/", "period_train.Rds") %>%
  readRDS()

period_test <-
  paste0(wd, "/Data/Interim/", "period_test.Rds") %>%
  readRDS()

skip_span <-
  paste0(wd, "/Data/Interim/", "skip_span.Rds") %>%
  readRDS()

Random Forest

Create one model with preprocessing that removes highly correlated variables, and one model that does not.

# Start a local cluster for the model fits below, leaving one core free.
# detectCores() may return 1 (single-core machine) or NA (count unknown);
# in both cases fall back to a single worker instead of asking makeCluster()
# for zero or NA workers, which would fail.
tot_cores <- detectCores()
cl <- makeCluster(max(1L, tot_cores - 1L, na.rm = TRUE))
registerDoParallel(cl)


# Fit one random forest per element of DV_corr_predict and time the whole run.
start <- proc.time()

DV_Fit.Rf.corr_yes <- map(
  .x = DV_corr_predict,
  .f = function(model_input) {
    # Rolling-origin ("timeslice") resampling with a fixed-width training
    # window; window, horizon and skip come from the Step 02 objects.
    ts_control <- trainControl(
      method = "timeslice",
      initialWindow = period_train,
      horizon = period_test,
      fixedWindow = TRUE,
      skip = skip_span,
      summaryFunction = func_custom_accuracy_metrics
    )

    # Exclude data_use_el_stop_id from the modelling frame
    # (el_stop_id itself is deliberately left in).
    model_df <- select(model_input, -data_use_el_stop_id)

    # Same seed before every fit so each list element is reproducible.
    set.seed(123456789)

    # The "nzv" and "corr" pre-processing steps are intentionally not applied
    # here; only centering, scaling and median imputation are requested.
    train(
      el_rides ~ .,
      data = model_df,
      preProcess = c("center", "scale", "medianImpute"),
      na.action = na.pass,
      method = "rf",
      metric = "RMSE",
      maximize = FALSE,
      importance = TRUE,
      trControl = ts_control,
      verbose = TRUE
    )
  }
)

time.Rf.corr_yes <- proc.time() - start

message("DV_Fit.Rf.corr_yes")

## DV_Fit.Rf.corr_yes

DV_Fit.Rf.corr_yes

## $`40600`
## Random Forest 
## 
## 905 samples
##  56 predictor
## 
## Pre-processing: centered (56), scaled (56), median imputation (56) 
## Resampling: Rolling Forecasting Origin Resampling (212 held-out with a fixed window) 
## Summary of sample sizes: 578, 578, 578, 578, 578, 578, ... 
## Resampling results across tuning parameters:
## 
##   mtry  MAE       MAPE       RMSE      R2       
##    2    53.36399  12.661148  66.12081  0.7628671
##   29    31.38680   7.566142  47.24439  0.8788246
##   56    31.99531   7.629171  47.73797  0.8761612
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was mtry = 29.
## 
## $`41140`
## Random Forest 
## 
## 905 samples
##  56 predictor
## 
## Pre-processing: centered (56), scaled (56), median imputation (56) 
## Resampling: Rolling Forecasting Origin Resampling (212 held-out with a fixed window) 
## Summary of sample sizes: 578, 578, 578, 578, 578, 578, ... 
## Resampling results across tuning parameters:
## 
##   mtry  MAE       MAPE       RMSE      R2       
##    2    69.12999  12.333349  83.30069  0.6295243
##   29    49.05100   8.337495  64.50003  0.7768664
##   56    45.72820   7.883337  61.33798  0.7986168
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was mtry = 56.
## 
## $`40120`
## Random Forest 
## 
## 905 samples
##  55 predictor
## 
## Pre-processing: centered (55), scaled (55), median imputation (55) 
## Resampling: Rolling Forecasting Origin Resampling (212 held-out with a fixed window) 
## Summary of sample sizes: 578, 578, 578, 578, 578, 578, ... 
## Resampling results across tuning parameters:
## 
##   mtry  MAE       MAPE       RMSE      R2       
##    2    360.3300  14.979545  437.4819  0.7308128
##   28    204.1010   8.621304  304.2583  0.8694514
##   55    203.0345   8.558621  303.7084  0.8699530
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was mtry = 55.
## 
## $`40910`
## Random Forest 
## 
## 875 samples
##  57 predictor
## 
## Pre-processing: centered (57), scaled (57), median imputation (57) 
## Resampling: Rolling Forecasting Origin Resampling (212 held-out with a fixed window) 
## Summary of sample sizes: 578, 578, 578, 578, 578, 578, ... 
## Resampling results across tuning parameters:
## 
##   mtry  MAE       MAPE      RMSE      R2       
##    2    274.7844  229.3202  371.2302  0.6457933
##   29    188.3720  217.8330  294.7416  0.7763492
##   57    195.3182  218.8777  304.3883  0.7615656
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was mtry = 29.
## 
## $`40380`
## Random Forest 
## 
## 905 samples
##  57 predictor
## 
## Pre-processing: centered (57), scaled (57), median imputation (57) 
## Resampling: Rolling Forecasting Origin Resampling (212 held-out with a fixed window) 
## Summary of sample sizes: 578, 578, 578, 578, 578, 578, ... 
## Resampling results across tuning parameters:
## 
##   mtry  MAE       MAPE      RMSE      R2       
##    2    3178.530  24.35459  3602.505  0.7246555
##   29    1911.570  15.97969  2553.785  0.8605411
##   57    1969.073  17.63183  2558.202  0.8594512
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was mtry = 29.
## 
## $`41660`
## Random Forest 
## 
## 905 samples
##  58 predictor
## 
## Pre-processing: centered (58), scaled (58), median imputation (58) 
## Resampling: Rolling Forecasting Origin Resampling (212 held-out with a fixed window) 
## Summary of sample sizes: 578, 578, 578, 578, 578, 578, ... 
## Resampling results across tuning parameters:
## 
##   mtry  MAE       MAPE       RMSE      R2       
##    2    2800.403  14.825966  3315.517  0.5100332
##   30    1784.122  10.106161  2465.759  0.7295317
##   58    1736.895   9.920884  2451.605  0.7322425
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was mtry = 58.

# Fit one random forest per element of DV_nzv_predict and time the whole run.
start <- proc.time()

DV_Fit.Rf.corr_no <- map(
  .x = DV_nzv_predict,
  .f = function(model_input) {
    # Rolling-origin ("timeslice") resampling with a fixed-width training
    # window; window, horizon and skip come from the Step 02 objects.
    ts_control <- trainControl(
      method = "timeslice",
      initialWindow = period_train,
      horizon = period_test,
      fixedWindow = TRUE,
      skip = skip_span,
      summaryFunction = func_custom_accuracy_metrics
    )

    # Exclude data_use_el_stop_id from the modelling frame
    # (el_stop_id itself is deliberately left in).
    model_df <- select(model_input, -data_use_el_stop_id)

    # Same seed before every fit so each list element is reproducible.
    set.seed(123456789)

    # The "nzv" and "corr" pre-processing steps are intentionally not applied
    # here; only centering, scaling and median imputation are requested.
    train(
      el_rides ~ .,
      data = model_df,
      preProcess = c("center", "scale", "medianImpute"),
      na.action = na.pass,
      method = "rf",
      metric = "RMSE",
      maximize = FALSE,
      importance = TRUE,
      trControl = ts_control,
      verbose = TRUE
    )
  }
)

time.Rf.corr_no <- proc.time() - start

message("DV_Fit.Rf.corr_no")

## DV_Fit.Rf.corr_no

DV_Fit.Rf.corr_no

## $`40600`
## Random Forest 
## 
## 905 samples
##  67 predictor
## 
## Pre-processing: centered (67), scaled (67), median imputation (67) 
## Resampling: Rolling Forecasting Origin Resampling (212 held-out with a fixed window) 
## Summary of sample sizes: 578, 578, 578, 578, 578, 578, ... 
## Resampling results across tuning parameters:
## 
##   mtry  MAE       MAPE       RMSE      R2       
##    2    54.67882  13.346597  67.24735  0.7549971
##   34    31.66811   7.623250  47.57247  0.8770904
##   67    32.14927   7.684513  48.02464  0.8746478
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was mtry = 34.
## 
## $`41140`
## Random Forest 
## 
## 905 samples
##  68 predictor
## 
## Pre-processing: centered (68), scaled (68), median imputation (68) 
## Resampling: Rolling Forecasting Origin Resampling (212 held-out with a fixed window) 
## Summary of sample sizes: 578, 578, 578, 578, 578, 578, ... 
## Resampling results across tuning parameters:
## 
##   mtry  MAE       MAPE       RMSE      R2       
##    2    71.72215  12.912940  85.38045  0.6104662
##   35    46.26375   7.993646  62.05579  0.7938126
##   68    45.72194   7.892781  61.70367  0.7961505
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was mtry = 68.
## 
## $`40120`
## Random Forest 
## 
## 905 samples
##  70 predictor
## 
## Pre-processing: centered (70), scaled (70), median imputation (70) 
## Resampling: Rolling Forecasting Origin Resampling (212 held-out with a fixed window) 
## Summary of sample sizes: 578, 578, 578, 578, 578, 578, ... 
## Resampling results across tuning parameters:
## 
##   mtry  MAE       MAPE       RMSE      R2       
##    2    396.4919  17.159898  470.7127  0.6884465
##   36    205.8296   8.834800  306.5976  0.8674667
##   70    205.6014   8.680993  307.5658  0.8666173
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was mtry = 36.
## 
## $`40910`
## Random Forest 
## 
## 875 samples
##  69 predictor
## 
## Pre-processing: centered (69), scaled (69), median imputation (69) 
## Resampling: Rolling Forecasting Origin Resampling (212 held-out with a fixed window) 
## Summary of sample sizes: 578, 578, 578, 578, 578, 578, ... 
## Resampling results across tuning parameters:
## 
##   mtry  MAE       MAPE      RMSE      R2       
##    2    285.8836  233.3822  380.8792  0.6271777
##   35    193.0397  220.7091  299.6599  0.7688630
##   69    195.3964  221.1577  303.9636  0.7622537
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was mtry = 35.
## 
## $`40380`
## Random Forest 
## 
## 905 samples
##  68 predictor
## 
## Pre-processing: centered (68), scaled (68), median imputation (68) 
## Resampling: Rolling Forecasting Origin Resampling (212 held-out with a fixed window) 
## Summary of sample sizes: 578, 578, 578, 578, 578, 578, ... 
## Resampling results across tuning parameters:
## 
##   mtry  MAE       MAPE      RMSE      R2       
##    2    3437.994  27.17536  3805.897  0.6926114
##   35    1912.006  15.87499  2555.334  0.8604150
##   68    1981.270  17.60880  2575.045  0.8573869
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was mtry = 35.
## 
## $`41660`
## Random Forest 
## 
## 905 samples
##  70 predictor
## 
## Pre-processing: centered (70), scaled (70), median imputation (70) 
## Resampling: Rolling Forecasting Origin Resampling (212 held-out with a fixed window) 
## Summary of sample sizes: 578, 578, 578, 578, 578, 578, ... 
## Resampling results across tuning parameters:
## 
##   mtry  MAE       MAPE      RMSE      R2       
##    2    2844.166  15.35689  3340.362  0.5033022
##   36    1835.743  10.35124  2490.086  0.7240566
##   70    1803.379  10.17824  2535.297  0.7131576
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was mtry = 36.

# Shut down the worker cluster started before the random forest fits, then
# remove the timing and cluster helper objects from the workspace.
stopCluster(cl)
rm(start, tot_cores, cl)

Compare the results.

# Timing noted by hand for the corr_yes fit (appears to be from an earlier
# run, since it differs from the value printed below):
# user  system elapsed
#  61.039   5.166 527.512
# ~ 9 min
message("time.Rf.corr_yes")

## time.Rf.corr_yes

# proc.time() difference recorded around the corr_yes random forest fit.
time.Rf.corr_yes

##    user  system elapsed 
##  57.047   2.769 416.736

# Timing noted by hand for the corr_no fit (appears to be from an earlier
# run, since it differs from the value printed below):
# user  system elapsed
#  58.048   3.563 486.738
# ~ 8 min
message("time.Rf.corr_no")

## time.Rf.corr_no

# proc.time() difference recorded around the corr_no random forest fit.
time.Rf.corr_no

##    user  system elapsed 
##  56.182   2.885 470.992

# Create a list of models: for every list element, pair the two random forest
# fits so they can be compared side by side.
# The labels must match the objects they describe: DV_Fit.Rf.corr_yes is
# labelled "Corr_Yes" and DV_Fit.Rf.corr_no is labelled "Corr_No"
# (they were previously crossed, which mislabelled every comparison).
Models.Rf <-
  pmap(.l = list(a = DV_Fit.Rf.corr_yes,
                 b = DV_Fit.Rf.corr_no
                 ),
       .f = function(a, b) {
         list(Corr_Yes = a,
              Corr_No = b
              )
         }
       )


# Gather the resampling results of each pair of models into one object per
# list element so their metrics can be compared.
Resample_Results.Rf <- lapply(Models.Rf, function(.x) resamples(.x))


# Print a summary of the resampled metrics for every list element
lapply(Resample_Results.Rf, function(.x) summary(.x))

## $`40600`
## 
## Call:
## summary.resamples(object = .x)
## 
## Models: Corr_No, Corr_Yes 
## Number of resamples: 13 
## 
## MAE 
##              Min.  1st Qu.   Median     Mean  3rd Qu.     Max. NA's
## Corr_No  29.49156 30.29677 31.27463 31.38680 32.49142 33.53614    0
## Corr_Yes 29.10133 30.54453 31.41446 31.66811 32.68464 34.82144    0
## 
## MAPE 
##              Min.  1st Qu.   Median     Mean  3rd Qu.     Max. NA's
## Corr_No  6.849371 7.016811 7.340792 7.566142 8.013743 8.623327    0
## Corr_Yes 6.846320 7.010051 7.509362 7.623250 8.027933 8.788167    0
## 
## R2 
##               Min.   1st Qu.    Median      Mean   3rd Qu.      Max. NA's
## Corr_No  0.8568696 0.8638475 0.8893933 0.8788246 0.8923240 0.8942618    0
## Corr_Yes 0.8515189 0.8629410 0.8865375 0.8770904 0.8920949 0.8933530    0
## 
## RMSE 
##              Min.  1st Qu.   Median     Mean  3rd Qu.     Max. NA's
## Corr_No  43.85291 44.41439 44.82802 47.24439 50.77109 51.77616    0
## Corr_Yes 43.87476 44.68568 45.54589 47.57247 51.33672 52.95681    0
## 
## 
## $`41140`
## 
## Call:
## summary.resamples(object = .x)
## 
## Models: Corr_No, Corr_Yes 
## Number of resamples: 13 
## 
## MAE 
##              Min.  1st Qu.   Median     Mean  3rd Qu.     Max. NA's
## Corr_No  41.83843 44.57632 45.70248 45.72820 46.93409 49.13770    0
## Corr_Yes 41.05446 44.69314 45.96692 45.72194 46.31655 51.37393    0
## 
## MAPE 
##              Min.  1st Qu.   Median     Mean  3rd Qu.     Max. NA's
## Corr_No  7.612844 7.754832 7.923190 7.883337 7.966813 8.306049    0
## Corr_Yes 7.509003 7.763323 7.834721 7.892781 7.982655 8.622716    0
## 
## R2 
##               Min.   1st Qu.    Median      Mean   3rd Qu.      Max. NA's
## Corr_No  0.7796583 0.7875419 0.7925303 0.7986168 0.8001393 0.8409655    0
## Corr_Yes 0.7645538 0.7841397 0.7926616 0.7961505 0.7985534 0.8416195    0
## 
## RMSE 
##              Min.  1st Qu.   Median     Mean  3rd Qu.     Max. NA's
## Corr_No  57.48317 60.28750 61.17986 61.33798 62.44682 64.81715    0
## Corr_Yes 57.36486 60.41439 61.31194 61.70367 62.55303 67.00195    0
## 
## 
## $`40120`
## 
## Call:
## summary.resamples(object = .x)
## 
## Models: Corr_No, Corr_Yes 
## Number of resamples: 13 
## 
## MAE 
##              Min.  1st Qu.   Median     Mean 3rd Qu.     Max. NA's
## Corr_No  176.4748 186.2569 192.8253 203.0345 224.498 237.3381    0
## Corr_Yes 175.1231 185.2949 195.1465 205.8296 221.869 259.9216    0
## 
## MAPE 
##              Min.  1st Qu.   Median     Mean  3rd Qu.     Max. NA's
## Corr_No  7.406341 7.886926 8.051402 8.558621 9.418846 10.45619    0
## Corr_Yes 7.485117 7.992040 8.195726 8.834800 9.377135 11.43588    0
## 
## R2 
##               Min.   1st Qu.    Median      Mean   3rd Qu.      Max. NA's
## Corr_No  0.8347500 0.8489616 0.8803475 0.8699530 0.8850913 0.8966807    0
## Corr_Yes 0.8283214 0.8467642 0.8807202 0.8674667 0.8833842 0.8973781    0
## 
## RMSE 
##              Min.  1st Qu.   Median     Mean  3rd Qu.     Max. NA's
## Corr_No  269.0210 278.6145 283.1965 303.7084 335.4447 350.4054    0
## Corr_Yes 268.1115 283.0184 284.7382 306.5976 337.4272 367.6232    0
## 
## 
## $`40910`
## 
## Call:
## summary.resamples(object = .x)
## 
## Models: Corr_No, Corr_Yes 
## Number of resamples: 10 
## 
## MAE 
##              Min.  1st Qu.   Median     Mean  3rd Qu.     Max. NA's
## Corr_No  150.9585 179.6485 190.6999 188.3720 202.0979 215.3847    0
## Corr_Yes 155.3571 186.0088 194.3332 193.0397 204.7638 217.8496    0
## 
## MAPE 
##              Min.  1st Qu.   Median     Mean  3rd Qu.     Max. NA's
## Corr_No  5.498962 237.7162 240.9771 217.8330 244.4084 245.5900    0
## Corr_Yes 5.616745 243.4091 243.5759 220.7091 244.3935 248.8788    0
## 
## R2 
##               Min.   1st Qu.    Median      Mean   3rd Qu.      Max. NA's
## Corr_No  0.7504095 0.7573360 0.7627025 0.7763492 0.7802347 0.8681174    0
## Corr_Yes 0.7442099 0.7523486 0.7552171 0.7688630 0.7706389 0.8624315    0
## 
## RMSE 
##              Min.  1st Qu.   Median     Mean  3rd Qu.     Max. NA's
## Corr_No  207.6478 287.9799 306.4173 294.7416 313.9330 323.5713    0
## Corr_Yes 212.0767 294.7318 311.0596 299.6599 315.3143 327.5652    0
## 
## 
## $`40380`
## 
## Call:
## summary.resamples(object = .x)
## 
## Models: Corr_No, Corr_Yes 
## Number of resamples: 13 
## 
## MAE 
##              Min.  1st Qu.   Median     Mean  3rd Qu.     Max. NA's
## Corr_No  1377.077 1804.829 1879.122 1911.570 2050.309 2738.133    0
## Corr_Yes 1339.499 1785.645 1885.284 1912.006 2117.726 2617.534    0
## 
## MAPE 
##              Min.  1st Qu.   Median     Mean  3rd Qu.     Max. NA's
## Corr_No  10.25874 15.56809 16.13926 15.97969 16.70598 22.34793    0
## Corr_Yes 10.02759 15.23272 15.99324 15.87499 16.46497 21.06954    0
## 
## R2 
##               Min.   1st Qu.    Median      Mean   3rd Qu.      Max. NA's
## Corr_No  0.7748433 0.8527297 0.8600544 0.8605411 0.8705118 0.9065783    0
## Corr_Yes 0.7849040 0.8462781 0.8567119 0.8604150 0.8723439 0.9093752    0
## 
## RMSE 
##              Min.  1st Qu.   Median     Mean  3rd Qu.     Max. NA's
## Corr_No  2052.713 2423.448 2595.687 2553.785 2660.415 3314.556    0
## Corr_Yes 2021.752 2406.242 2597.468 2555.334 2652.086 3239.657    0
## 
## 
## $`41660`
## 
## Call:
## summary.resamples(object = .x)
## 
## Models: Corr_No, Corr_Yes 
## Number of resamples: 13 
## 
## MAE 
##              Min.  1st Qu.   Median     Mean  3rd Qu.     Max. NA's
## Corr_No  1578.035 1688.407 1737.096 1736.895 1788.231 1929.228    0
## Corr_Yes 1667.649 1751.958 1800.107 1835.743 1843.362 2254.031    0
## 
## MAPE 
##              Min.  1st Qu.   Median      Mean  3rd Qu.     Max. NA's
## Corr_No  8.986980 9.415665 9.817172  9.920884 10.21256 11.25040    0
## Corr_Yes 9.343517 9.631941 9.959996 10.351242 10.71342 12.76414    0
## 
## R2 
##               Min.   1st Qu.    Median      Mean   3rd Qu.      Max. NA's
## Corr_No  0.7102085 0.7208719 0.7222698 0.7322425 0.7336277 0.7932970    0
## Corr_Yes 0.6592414 0.7168635 0.7293069 0.7240566 0.7376489 0.7790171    0
## 
## RMSE 
##              Min.  1st Qu.   Median     Mean  3rd Qu.     Max. NA's
## Corr_No  2354.167 2396.771 2420.436 2451.605 2479.233 2643.995    0
## Corr_Yes 2317.457 2376.487 2450.175 2490.086 2555.142 2867.090    0

# Box-and-whisker plot of the resampled metrics, one plot per list element
lapply(Resample_Results.Rf, function(.x) bwplot(.x))

## $`40600`

## 
## $`41140`

## 
## $`40120`

## 
## $`40910`

## 
## $`40380`

## 
## $`41660`

After inspecting the results, we choose to keep the model fitted without the correlation filter in the preprocessing stage (`DV_Fit.Rf.corr_no`, the object saved below) - the results and runtimes of the two models were similar.

# Remove every workspace object whose name contains "corr_yes".
rm(list = grep("corr_yes", ls(), value = TRUE))


# Persist the retained random forest fits under <wd>/Models/.
DV_Fit.Rf.corr_no %>%
  saveRDS(paste0(wd, "/Models/", "DV_Fit.Rf.corr_no.Rds"))


# Persist the matching run time next to the models.
time.Rf.corr_no %>%
  saveRDS(paste0(wd, "/Models/", "time.Rf.corr_no.Rds"))


# DV_Fit.Rf.corr_no <-
#   readRDS(paste0(wd,
#                  "/Models/",
#                  "DV_Fit.Rf.corr_no.Rds"
#                  )
#           )

# time.Rf.corr_no <-
#   readRDS(paste0(wd,
#                  "/Models/",
#                  "time.Rf.corr_no.Rds"
#                  )
#           )

Inspect variable importance.

# Permutation importance (type = 1) is used for the variable importance,
# rescaled via scale = TRUE.
# Based on the discussion here:  http://parrt.cs.usfca.edu/doc/rf-importance/index.html
VI <- map(DV_Fit.Rf.corr_no, varImp, type = 1, scale = TRUE)

# Print the importance tables, one per list element.
VI

## $`40600`
## rf variable importance
## 
##   only 20 most important variables shown (out of 67)
## 
##                                Overall
## el_rides_ma07                   100.00
## el_rides_l28                     89.15
## el_rides_l07                     81.22
## el_rides_l21                     58.40
## wday.lbl.Saturday                48.94
## el_rides_l14                     43.28
## el_rides_ma14                    40.43
## divvy_all_trip_cnt_cus_l7        37.61
## el_rides_ma28                    36.25
## divvy_all_triptime_med_sub_l7    36.23
## wday.lbl.Sunday                  34.11
## divvy_all_triptime_med_cus_l7    31.24
## el_date                          31.18
## month.11                         31.11
## divvy_all_trip_cnt_sub_l7        29.00
## el_rides_ma21                    28.76
## mweek.1                          24.61
## tmax_bands_l7.05_75to100         22.24
## wday.lbl.Monday                  20.86
## divvy_all_triptime_mean_cus_l7   19.26
## 
## $`41140`
## rf variable importance
## 
##   only 20 most important variables shown (out of 68)
## 
##                                    Overall
## el_rides_ma07                       100.00
## el_rides_l07                         80.94
## el_rides_l14                         51.36
## el_rides_l21                         49.01
## wday.lbl.Sunday                      41.29
## el_date                              40.78
## el_rides_ma14                        34.91
## divvy_all_trip_cnt_cus_l7            34.11
## wday.lbl.Saturday                    33.69
## el_rides_ma28                        33.00
## divvy_all_triptime_med_sub_l7        31.76
## el_rides_l28                         31.47
## divvy_all_trip_cnt_sub_l7            30.40
## el_rides_ma21                        25.28
## wday.lbl.Friday                      17.47
## divvy_mindist_triptime_mean_cus_l7   16.76
## tmax_bands.05_75to100                16.68
## month.9                              16.66
## divvy_mindist_miles                  16.63
## divvy_all_triptime_med_cus_l7        16.57
## 
## $`40120`
## rf variable importance
## 
##   only 20 most important variables shown (out of 70)
## 
##                                Overall
## el_rides_l07                    100.00
## el_rides_ma07                    91.88
## el_rides_l28                     81.99
## el_rides_l14                     53.57
## el_rides_l21                     47.78
## divvy_all_trip_cnt_cus_l7        38.85
## wday.lbl.Saturday                37.85
## el_rides_ma14                    37.19
## month.11                         35.48
## el_date                          34.36
## el_rides_ma28                    33.92
## wday.lbl.Sunday                  33.52
## divvy_all_triptime_med_sub_l7    32.89
## month.9                          31.02
## wday.lbl.Monday                  27.55
## el_rides_ma21                    25.24
## divvy_all_triptime_mean_sub_l7   24.31
## divvy_all_trip_cnt_sub_l7        23.06
## divvy_all_triptime_med_cus_l7    22.31
## tmax_bands.05_75to100            18.93
## 
## $`40910`
## rf variable importance
## 
##   only 20 most important variables shown (out of 69)
## 
##                                    Overall
## el_rides_l07                        100.00
## el_rides_ma07                        82.10
## wday.lbl.Saturday                    77.04
## el_rides_l28                         72.09
## wday.lbl.Sunday                      64.27
## el_rides_l14                         63.47
## el_rides_l21                         53.70
## el_rides_ma14                        47.24
## el_rides_ma28                        46.21
## el_rides_ma21                        41.10
## el_date                              40.20
## divvy_all_trip_cnt_cus_l7            30.83
## divvy_all_triptime_med_sub_l7        29.18
## divvy_mindist_miles                  24.32
## divvy_all_triptime_mean_sub_l7       22.89
## month.12                             22.26
## quarter.4                            22.18
## divvy_mindist_trip_cnt_sub_l7        22.16
## divvy_mindist_triptime_mean_sub_l7   21.99
## divvy_all_triptime_mean_cus_l7       20.11
## 
## $`40380`
## rf variable importance
## 
##   only 20 most important variables shown (out of 68)
## 
##                                    Overall
## el_rides_ma07                       100.00
## el_rides_l28                         96.19
## el_rides_l07                         88.16
## wday.lbl.Saturday                    54.25
## el_rides_l14                         50.78
## el_rides_l21                         48.19
## month.11                             43.00
## el_rides_ma14                        41.21
## el_rides_ma28                        36.70
## mweek.4                              36.07
## wday.lbl.Monday                      35.32
## el_rides_ma21                        32.28
## divvy_all_trip_cnt_cus_l7            31.72
## divvy_all_triptime_med_cus_l7        28.99
## divvy_pt5mi_trip_cnt_sub_l7          27.12
## divvy_all_triptime_med_sub_l7        27.01
## el_date                              26.74
## divvy_pt5mi_trip_cnt_cus_l7          24.49
## divvy_pt5mi_triptime_mean_sub_l7     23.15
## divvy_mindist_triptime_mean_cus_l7   22.57
## 
## $`41660`
## rf variable importance
## 
##   only 20 most important variables shown (out of 70)
## 
##                               Overall
## el_rides_l28                   100.00
## el_rides_ma07                   91.46
## el_rides_l07                    85.83
## el_rides_l21                    74.06
## el_rides_l14                    73.02
## wday.lbl.Saturday               62.20
## el_rides_ma21                   56.02
## el_rides_ma14                   48.39
## el_date                         47.23
## el_rides_ma28                   45.74
## divvy_all_trip_cnt_cus_l7       41.55
## divvy_all_trip_cnt_sub_l7       37.04
## divvy_pt5mi_trip_cnt_sub_l7     34.58
## wday.lbl.Monday                 31.42
## divvy_all_triptime_med_cus_l7   31.38
## divvy_mindist_trip_cnt_cus_l7   31.28
## divvy_all_triptime_med_sub_l7   31.21
## wday.lbl.Sunday                 28.94
## month.12                        27.64
## month.11                        25.87

# Plot the 20 most important variables for every list element
map(VI, plot, top = 20)

## $`40600`

## 
## $`41140`

## 
## $`40120`

## 
## $`40910`

## 
## $`40380`

## 
## $`41660`

# The importance tables have been printed and plotted; drop them from the
# workspace.
rm(VI)

Extreme Gradient Boosted Tree

Create one model with preprocessing that removes highly correlated variables, and one model that does not.

# Start a local worker cluster and register it as the parallel backend,
# leaving one core free for the rest of the system.
# detectCores() can return NA when the core count cannot be determined, and
# `tot_cores - 1` is 0 on a single-core machine; makeCluster() fails for both,
# so always request at least one worker.
tot_cores <- detectCores()
cl <- makeCluster(max(1L, tot_cores - 1L, na.rm = TRUE))
registerDoParallel(cl)


# Fit one xgbTree model per element of DV_corr_predict and time the whole run.
start <- proc.time()

DV_Fit.Xgbtree.corr_yes <-
  map(DV_corr_predict, function(station_data) {
    # Time-slice resampling: a fixed-size training window followed by a
    # forecast horizon, scored with the project's custom summary function.
    resampling_spec <- trainControl(
      method = "timeslice",
      initialWindow = period_train,
      horizon = period_test,
      fixedWindow = TRUE,
      skip = skip_span,
      summaryFunction = func_custom_accuracy_metrics
    )

    # Re-seed immediately before each fit so every model is reproducible.
    set.seed(123456789)

    train(
      el_rides ~ .,
      # The bookkeeping column is dropped so it is not used as a predictor.
      data = select(station_data, -data_use_el_stop_id),
      # Neither "nzv" nor "corr" is requested from caret at this stage.
      preProcess = c("center", "scale", "medianImpute"),
      na.action = na.pass,
      method = "xgbTree",
      metric = "RMSE",
      maximize = FALSE,
      importance = TRUE,
      trControl = resampling_spec,
      verbose = TRUE
    )
  })

time.Xgbtree.corr_yes <- proc.time() - start

# message("DV_Fit.Xgbtree.corr_yes")
# DV_Fit.Xgbtree.corr_yes



# Fit one xgbTree model per element of DV_nzv_predict and time the whole run.
start <- proc.time()

DV_Fit.Xgbtree.corr_no <-
  map(DV_nzv_predict, function(station_data) {
    # Time-slice resampling: a fixed-size training window followed by a
    # forecast horizon, scored with the project's custom summary function.
    resampling_spec <- trainControl(
      method = "timeslice",
      initialWindow = period_train,
      horizon = period_test,
      fixedWindow = TRUE,
      skip = skip_span,
      summaryFunction = func_custom_accuracy_metrics
    )

    # Re-seed immediately before each fit so every model is reproducible.
    set.seed(123456789)

    train(
      el_rides ~ .,
      # The bookkeeping column is dropped so it is not used as a predictor.
      data = select(station_data, -data_use_el_stop_id),
      # Neither "nzv" nor "corr" is requested from caret at this stage.
      preProcess = c("center", "scale", "medianImpute"),
      na.action = na.pass,
      method = "xgbTree",
      metric = "RMSE",
      maximize = FALSE,
      importance = TRUE,
      trControl = resampling_spec,
      verbose = TRUE
    )
  })

time.Xgbtree.corr_no <- proc.time() - start

# message("DV_Fit.Xgbtree.corr_no")
# DV_Fit.Xgbtree.corr_no


# Shut the workers down, then drop the timing and cluster helper objects.
stopCluster(cl)
rm(list = c("start", "tot_cores", "cl"))

Compare the results.

# Timing noted by the author for the corr_yes fit (presumably from an earlier
# run -- it differs from the output printed below):
#    user  system elapsed
#  10.888   2.333 179.411
# i.e. roughly 3 minutes of wall-clock time.
message("time.Xgbtree.corr_yes")

## time.Xgbtree.corr_yes

# Show the user/system/elapsed timing of the corr_yes fit.
print(time.Xgbtree.corr_yes)

##    user  system elapsed 
##  10.596   2.534 214.636

# Timing noted by the author for the corr_no fit (presumably from an earlier
# run -- it differs from the output printed below):
#    user  system elapsed
#  10.377   2.360 201.333
# i.e. roughly 3 minutes of wall-clock time.
message("time.Xgbtree.corr_no")

## time.Xgbtree.corr_no

# Show the user/system/elapsed timing of the corr_no fit.
print(time.Xgbtree.corr_no)

##    user  system elapsed 
##  10.379   2.605 238.173

# Create a list of models: for every list element, pair the two competing fits
# so they can be compared with resamples().
# The labels must follow the objects they name: `a` is the corr_yes fit and
# `b` is the corr_no fit. (They were previously swapped, which mislabelled
# every comparison table and plot built from this list.)
# Corr_No stays first so the row order of the summaries is unchanged.
Models.Xgbtree <-
  pmap(.l = list(a = DV_Fit.Xgbtree.corr_yes,
                 b = DV_Fit.Xgbtree.corr_no
                 ),
       .f = function(a, b) {
         list(Corr_No = b,
              Corr_Yes = a
              )
         }
       )


# Gather the resampling results of each model pair into a resamples object
Resample_Results.Xgbtree <-
  map(Models.Xgbtree, function(.x) resamples(.x))


# Print the per-metric summary tables for every model pair.
# The argument is deliberately named `.x`: summary.resamples() echoes its call
# in the printed output, so the name is part of what gets displayed.
map(Resample_Results.Xgbtree, function(.x) summary(.x))

## $`40600`
## 
## Call:
## summary.resamples(object = .x)
## 
## Models: Corr_No, Corr_Yes 
## Number of resamples: 13 
## 
## MAE 
##              Min.  1st Qu.   Median     Mean  3rd Qu.     Max. NA's
## Corr_No  32.19303 35.55575 37.49635 39.07401 40.58886 57.36103    0
## Corr_Yes 33.27039 34.71990 36.09276 37.70206 40.52589 45.22524    0
## 
## MAPE 
##              Min.  1st Qu.   Median     Mean   3rd Qu.     Max. NA's
## Corr_No  7.969871 8.645468 9.526889 9.679793 10.533630 12.01006    0
## Corr_Yes 7.446816 8.064649 8.922381 9.062954  9.074494 12.00199    0
## 
## R2 
##               Min.   1st Qu.    Median      Mean   3rd Qu.      Max. NA's
## Corr_No  0.7476391 0.7828786 0.8228306 0.8183967 0.8581394 0.8753876    0
## Corr_Yes 0.7329532 0.8212722 0.8402651 0.8280687 0.8649636 0.8769419    0
## 
## RMSE 
##              Min.  1st Qu.   Median     Mean  3rd Qu.     Max. NA's
## Corr_No  47.93771 50.79815 57.11459 57.56327 63.26301 69.03943    0
## Corr_Yes 47.30833 49.90248 54.26858 55.97031 57.68864 71.65853    0
## 
## 
## $`41140`
## 
## Call:
## summary.resamples(object = .x)
## 
## Models: Corr_No, Corr_Yes 
## Number of resamples: 13 
## 
## MAE 
##              Min.  1st Qu.   Median     Mean  3rd Qu.     Max. NA's
## Corr_No  44.25420 53.49472 59.14318 61.65002 65.98962 88.99007    0
## Corr_Yes 47.04107 53.15157 60.99780 61.28252 69.37796 80.08106    0
## 
## MAPE 
##              Min.  1st Qu.   Median     Mean  3rd Qu.     Max. NA's
## Corr_No  7.889643 8.990736 10.38354 10.39353 11.16079 14.11260    0
## Corr_Yes 8.282878 9.668258 10.63027 10.50186 11.44832 13.67824    0
## 
## R2 
##               Min.   1st Qu.    Median      Mean   3rd Qu.      Max. NA's
## Corr_No  0.4088250 0.6208651 0.6657441 0.6513493 0.7324304 0.7945801    0
## Corr_Yes 0.4807594 0.6239036 0.6791110 0.6686902 0.7177186 0.7735332    0
## 
## RMSE 
##              Min.  1st Qu.   Median     Mean  3rd Qu.      Max. NA's
## Corr_No  63.05710 69.90978 80.84457 79.89923 82.59053 103.68635    0
## Corr_Yes 66.20869 73.18454 77.04980 78.21456 85.16411  95.90091    0
## 
## 
## $`40120`
## 
## Call:
## summary.resamples(object = .x)
## 
## Models: Corr_No, Corr_Yes 
## Number of resamples: 13 
## 
## MAE 
##              Min.  1st Qu.   Median     Mean  3rd Qu.     Max. NA's
## Corr_No  193.4535 221.8841 260.6418 277.4664 313.4230 438.6116    0
## Corr_Yes 207.1631 230.2994 244.5939 273.2633 293.6921 403.2901    0
## 
## MAPE 
##              Min.  1st Qu.    Median     Mean  3rd Qu.     Max. NA's
## Corr_No  8.253393 9.708229 11.397093 11.68554 13.81434 16.91666    0
## Corr_Yes 7.900832 8.928526  9.569002 10.98873 13.65961 15.28340    0
## 
## R2 
##               Min.   1st Qu.    Median      Mean   3rd Qu.      Max. NA's
## Corr_No  0.6536373 0.7481348 0.8024843 0.7913396 0.8326028 0.8685105    0
## Corr_Yes 0.6900278 0.7599979 0.8316271 0.8015114 0.8447884 0.8775462    0
## 
## RMSE 
##              Min.  1st Qu.   Median     Mean  3rd Qu.     Max. NA's
## Corr_No  300.5154 350.4285 378.9381 381.7610 426.4795 491.6738    0
## Corr_Yes 292.6610 318.6677 352.5948 372.2966 436.5678 464.2249    0
## 
## 
## $`40910`
## 
## Call:
## summary.resamples(object = .x)
## 
## Models: Corr_No, Corr_Yes 
## Number of resamples: 10 
## 
## MAE 
##              Min.  1st Qu.   Median     Mean  3rd Qu.     Max. NA's
## Corr_No  182.5477 198.9882 215.4499 218.0884 242.3456 250.2286    0
## Corr_Yes 195.9871 204.0963 217.1693 222.7655 236.2191 276.7918    0
## 
## MAPE 
##              Min.  1st Qu.   Median     Mean  3rd Qu.     Max. NA's
## Corr_No  6.831365 231.9860 237.0799 216.7938 244.9702 255.6546    0
## Corr_Yes 7.189624 237.2212 243.3488 220.8403 247.4934 258.6670    0
## 
## R2 
##               Min.   1st Qu.    Median      Mean   3rd Qu.      Max. NA's
## Corr_No  0.6269560 0.6862052 0.7230603 0.7171828 0.7377268 0.8024803    0
## Corr_Yes 0.6414896 0.6880251 0.7105813 0.7116975 0.7439550 0.7860860    0
## 
## RMSE 
##              Min.  1st Qu.   Median     Mean  3rd Qu.     Max. NA's
## Corr_No  254.1199 319.2051 327.0842 331.2105 356.0990 393.3169    0
## Corr_Yes 264.4558 311.1674 338.3910 334.6585 354.3217 387.7991    0
## 
## 
## $`40380`
## 
## Call:
## summary.resamples(object = .x)
## 
## Models: Corr_No, Corr_Yes 
## Number of resamples: 13 
## 
## MAE 
##              Min.  1st Qu.   Median     Mean  3rd Qu.     Max. NA's
## Corr_No  1290.088 1674.217 1795.734 1808.918 1909.690 2217.246    0
## Corr_Yes 1330.699 1660.514 1702.089 1725.475 1867.537 2043.224    0
## 
## MAPE 
##              Min.  1st Qu.   Median     Mean  3rd Qu.     Max. NA's
## Corr_No  10.70217 12.30182 14.16940 14.82862 14.71599 21.66701    0
## Corr_Yes 10.49416 11.54394 12.82834 14.15156 14.30425 20.82751    0
## 
## R2 
##               Min.   1st Qu.    Median      Mean   3rd Qu.      Max. NA's
## Corr_No  0.8105004 0.8375893 0.8504863 0.8540175 0.8682326 0.9090692    0
## Corr_Yes 0.8173254 0.8561397 0.8640984 0.8619001 0.8828332 0.9082951    0
## 
## RMSE 
##              Min.  1st Qu.   Median     Mean  3rd Qu.     Max. NA's
## Corr_No  2025.838 2462.633 2629.306 2615.267 2706.523 3089.312    0
## Corr_Yes 2034.443 2305.265 2537.967 2544.655 2639.140 3033.171    0
## 
## 
## $`41660`
## 
## Call:
## summary.resamples(object = .x)
## 
## Models: Corr_No, Corr_Yes 
## Number of resamples: 13 
## 
## MAE 
##              Min.  1st Qu.   Median     Mean  3rd Qu.     Max. NA's
## Corr_No  1461.259 1762.976 2123.482 2380.587 2846.651 3324.812    0
## Corr_Yes 1442.951 1507.182 2004.050 2180.979 2302.439 3799.306    0
## 
## MAPE 
##              Min.  1st Qu.   Median     Mean  3rd Qu.     Max. NA's
## Corr_No  9.317611 9.901048 13.96259 13.17253 14.31943 18.61874    0
## Corr_Yes 8.296165 9.322468 11.69481 12.24441 13.71458 19.21908    0
## 
## R2 
##               Min.   1st Qu.    Median      Mean   3rd Qu.      Max. NA's
## Corr_No  0.3343301 0.4746273 0.6426096 0.5737701 0.7358578 0.7649216    0
## Corr_Yes 0.1613235 0.5885666 0.6736276 0.6114771 0.7597928 0.7944904    0
## 
## RMSE 
##              Min.  1st Qu.   Median     Mean  3rd Qu.     Max. NA's
## Corr_No  2320.089 2464.427 3095.532 3045.560 3310.782 3856.815    0
## Corr_Yes 2056.723 2362.283 2816.765 2876.031 3150.414 4194.559    0

# Box-and-whisker plots of the resampled metrics for every model pair.
map(Resample_Results.Xgbtree, function(.x) bwplot(.x))

## $`40600`

## 
## $`41140`

## 
## $`40120`

## 
## $`40910`

## 
## $`40380`

## 
## $`41660`

After inspecting the results, we choose to keep the `corr_yes` model (`DV_Fit.Xgbtree.corr_yes`, fit on `DV_corr_predict`) - the results were similar, and its run time was slightly shorter (about 215 vs. 238 seconds elapsed).

# Remove every object whose name matches the corr_no pattern
# (the discarded fits and their timing).
rm(list = grep("Xgbtree.corr_no", ls(), value = TRUE))


# Persist the retained fits under <wd>/Models/ so they can be reloaded
# without refitting.
saveRDS(
  object = DV_Fit.Xgbtree.corr_yes,
  file = paste0(wd, "/Models/", "DV_Fit.Xgbtree.corr_yes.Rds")
)


# Persist the recorded fit time next to the saved models.
saveRDS(
  object = time.Xgbtree.corr_yes,
  file = paste0(wd, "/Models/", "time.Xgbtree.corr_yes.Rds")
)

# DV_Fit.Xgbtree.corr_yes <-
#   readRDS(paste0(wd,
#                  "/Models/",
#                  "DV_Fit.Xgbtree.corr_yes.Rds"
#                  )
#           )

# time.Xgbtree.corr_yes <-
#   readRDS(paste0(wd,
#                  "/Models/",
#                  "time.Xgbtree.corr_yes.Rds"
#                  )
#           )

Inspect variable importance.

# Variable importance for each retained xgbTree fit, rescaled to 0-100.
# NOTE(review): the original note here said permutation importance is used
# (rationale: http://parrt.cs.usfca.edu/doc/rf-importance/index.html). That
# describes `type = 1` for random forests. For xgbTree, caret's varImp appears
# to rank variables with xgboost's own gain-based importance and to ignore
# `type` -- confirm before describing these scores as permutation importance.
VI <- map(DV_Fit.Xgbtree.corr_yes,
          function(fit) varImp(fit, type = 1, scale = TRUE)
          )

# Show the ranked importance table for every fit.
print(VI)

## $`40600`
## xgbTree variable importance
## 
##   only 20 most important variables shown (out of 56)
## 
##                             Overall
## wday.lbl.Sunday           100.00000
## wday.lbl.Saturday          54.21841
## el_date                    10.13680
## month.1                     4.19157
## month.10                    1.99296
## month.9                     1.20995
## month.12                    0.83474
## wday.lbl.Monday             0.51056
## month.3                     0.28139
## wday.lbl.Tuesday            0.22407
## wday.lbl.Wednesday          0.19511
## mweek.2                     0.14645
## mweek.4                     0.09541
## month.2                     0.09030
## month.7                     0.05361
## mweek.1                     0.03273
## divvy_all_trip_cnt_sub_l7   0.00000
## month.8                     0.00000
## wday.lbl.Thursday           0.00000
## month.6                     0.00000
## 
## $`41140`
## xgbTree variable importance
## 
##   only 20 most important variables shown (out of 56)
## 
##                    Overall
## el_date           100.0000
## wday.lbl.Sunday    54.9947
## wday.lbl.Saturday  27.9666
## quarter.1           3.9540
## quarter.3           2.9215
## month.1             1.9928
## month.12            1.0470
## wday.lbl.Friday     0.8736
## mweek.2             0.5172
## month.10            0.4596
## wday.lbl.Monday     0.3824
## mweek.1             0.3015
## month.9             0.2852
## month.11            0.2143
## wday.lbl.Thursday   0.1743
## month.8             0.1549
## mweek.4             0.1425
## wday.lbl.Tuesday    0.1368
## month.4             0.1275
## month.6             0.1143
## 
## $`40120`
## xgbTree variable importance
## 
##   only 20 most important variables shown (out of 55)
## 
##                     Overall
## wday.lbl.Sunday    100.0000
## wday.lbl.Saturday   68.7862
## el_date             26.0470
## wday.lbl.Thursday    2.4572
## mweek.5              2.2036
## month.10             2.1857
## month.12             1.9718
## month.1              1.9460
## month.9              1.5397
## mweek.4              1.2234
## month.11             0.7489
## wday.lbl.Tuesday     0.7077
## wday.lbl.Wednesday   0.6385
## wday.lbl.Friday      0.6051
## quarter.4            0.5559
## quarter.1            0.4561
## wday.lbl.Monday      0.4460
## month.5              0.3966
## quarter.3            0.3237
## mweek.1              0.3165
## 
## $`40910`
## xgbTree variable importance
## 
##   only 20 most important variables shown (out of 57)
## 
##                                 Overall
## el_date                        100.0000
## wday.lbl.Sunday                 27.5656
## wday.lbl.Saturday               14.5501
## year.2014                        2.2619
## month.10                         1.5704
## month.9                          1.5558
## divvy_all_triptime_med_cus_l7    1.5419
## divvy_all_triptime_mean_cus_l7   1.3195
## divvy_all_trip_cnt_cus_l7        1.2040
## tmax_bands_l7.02_00to25          0.6750
## wday.lbl.Monday                  0.4040
## tmax_bands_l7.04_50to75          0.3767
## mweek.5                          0.3318
## tmin_bands_l7.03_00to25          0.3060
## tmin_bands_l7.04_25to50          0.2792
## quarter.4                        0.2610
## month.11                         0.2323
## mweek.4                          0.2162
## month.1                          0.1861
## wday.lbl.Friday                  0.1359
## 
## $`40380`
## xgbTree variable importance
## 
##   only 20 most important variables shown (out of 57)
## 
##                        Overall
## wday.lbl.Sunday      100.00000
## wday.lbl.Saturday     84.92598
## el_date                8.96964
## month.12               1.57564
## month.10               0.84088
## quarter.3              0.77306
## month.1                0.73530
## wday.lbl.Tuesday       0.63052
## wday.lbl.Wednesday     0.45522
## mweek.5                0.41161
## mweek.3                0.37522
## mweek.2                0.35298
## month.6                0.29304
## wday.lbl.Thursday      0.23105
## tmax_bands.04_50to75   0.20358
## month.11               0.19781
## month.2                0.08966
## month.8                0.07502
## mweek.4                0.06973
## wday.lbl.Monday        0.06943
## 
## $`41660`
## xgbTree variable importance
## 
##   only 20 most important variables shown (out of 58)
## 
##                     Overall
## wday.lbl.Sunday    100.0000
## wday.lbl.Saturday   43.1934
## el_date             37.7939
## month.1              5.0212
## quarter.1            2.1457
## wday.lbl.Monday      1.7308
## month.12             1.3121
## wday.lbl.Friday      1.1027
## mweek.3              0.7702
## quarter.3            0.6994
## month.2              0.6061
## mweek.5              0.4890
## mweek.2              0.4380
## month.10             0.3641
## wday.lbl.Thursday    0.3233
## year.2014            0.2937
## mweek.4              0.2693
## wday.lbl.Wednesday   0.2294
## month.11             0.2073
## month.5              0.1991

# Draw the 20 top-ranked variables for every element of VI.
map(VI, function(importance) plot(importance, top = 20))

## $`40600`

## 
## $`41140`

## 
## $`40120`

## 
## $`40910`

## 
## $`40380`

## 
## $`41660`

# The importance list is no longer needed once it has been plotted.
rm(list = "VI")