Setup

Load the relevant libraries.

# rm(list = ls())
# .rs.restartR()


# data manipulation
library("plyr")
library("tidyverse")
library("magrittr")
library("data.table")
library("lubridate")
library("sqldf")


# time series specific packages
library("timetk")
library("zoo")
library("tibbletime")


# modeling
library("fpp2")
library("prophet")
library("caret")
library("randomForest")
library("xgboost")
library("h2o")
library("keras")
# use_session_with_seed(123456789) # setting the seed to obtain reproducible results
# see https://keras.rstudio.com/articles/faq.html#how-can-i-obtain-reproducible-results-using-keras-during-development and https://cran.r-project.org/web/packages/keras/vignettes/faq.html
# can also re-enable gpu and parallel processing by using:  use_session_with_seed(42, disable_gpu = FALSE, disable_parallel_cpu = FALSE)



# other
library("geosphere")          # specific for distance calculations from lat-lon pairs
library("naniar")             # inspecting missing data
library("rlang")              # building functions
library("recipes")            # used in Keras modeling to design matrices
library("rsample")            # rolling samples for validation stats
library("tfruns")             # used in Keras modeling for training runs
library("stringr")            # string manipulation
library("ggplot2")            # viz
library("sweep")              # more easily pull out model statistics
library("yardstick")          # easily calculate accuracy stats
library("doParallel")         # parallel processing

Session Info.

# Print the R version, platform, and attached/loaded package versions for this run.
sessionInfo()
## R version 3.5.1 (2018-07-02)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS High Sierra 10.13.6
## 
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] parallel  stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] doParallel_1.0.14   iterators_1.0.10    foreach_1.4.4      
##  [4] yardstick_0.0.2     sweep_0.2.1.1       tfruns_1.4         
##  [7] rsample_0.0.3       recipes_0.1.4       rlang_0.3.0.1      
## [10] naniar_0.4.1        geosphere_1.5-7     keras_2.2.4        
## [13] h2o_3.20.0.8        xgboost_0.71.2      randomForest_4.6-14
## [16] caret_6.0-81        lattice_0.20-38     prophet_0.3.0.1    
## [19] Rcpp_1.0.0          fpp2_2.3            expsmooth_2.3      
## [22] fma_2.3             forecast_8.4        tibbletime_0.1.1   
## [25] zoo_1.8-4           timetk_0.1.1.1      sqldf_0.4-11       
## [28] RSQLite_2.1.1       gsubfn_0.7          proto_1.0.0        
## [31] lubridate_1.7.4     data.table_1.11.8   magrittr_1.5       
## [34] forcats_0.3.0       stringr_1.3.1       dplyr_0.7.8        
## [37] purrr_0.2.5         readr_1.2.1         tidyr_0.8.2        
## [40] tibble_1.4.2        ggplot2_3.1.0       tidyverse_1.2.1    
## [43] plyr_1.8.4         
## 
## loaded via a namespace (and not attached):
##  [1] colorspace_1.3-2   class_7.3-14       visdat_0.5.1      
##  [4] rprojroot_1.3-2    base64enc_0.1-3    rstudioapi_0.8    
##  [7] rstan_2.18.2       bit64_0.9-7        prodlim_2018.04.18
## [10] xml2_1.2.0         codetools_0.2-15   splines_3.5.1     
## [13] knitr_1.20         zeallot_0.1.0      jsonlite_1.5      
## [16] pROC_1.13.0        broom_0.5.0        compiler_3.5.1    
## [19] httr_1.3.1         backports_1.1.2    assertthat_0.2.0  
## [22] Matrix_1.2-15      lazyeval_0.2.1     cli_1.0.1         
## [25] htmltools_0.3.6    prettyunits_1.0.2  tools_3.5.1       
## [28] bindrcpp_0.2.2     gtable_0.2.0       glue_1.3.0        
## [31] reshape2_1.4.3     cellranger_1.1.0   fracdiff_1.4-2    
## [34] urca_1.3-0         debugme_1.1.0      nlme_3.1-137      
## [37] lmtest_0.9-36      timeDate_3043.102  gower_0.1.2       
## [40] ps_1.2.1           rvest_0.3.2        MASS_7.3-51.1     
## [43] scales_1.0.0       ipred_0.9-8        hms_0.4.2         
## [46] inline_0.3.15      yaml_2.2.0         quantmod_0.4-13   
## [49] curl_3.2           reticulate_1.10    memoise_1.1.0     
## [52] gridExtra_2.3      loo_2.0.0          StanHeaders_2.18.0
## [55] uroot_2.0-9        rpart_4.1-13       stringi_1.2.4     
## [58] tensorflow_1.10    tseries_0.10-46    TTR_0.23-4        
## [61] pkgbuild_1.0.2     lava_1.6.4         chron_2.3-53      
## [64] bitops_1.0-6       pkgconfig_2.0.2    matrixStats_0.54.0
## [67] evaluate_0.12      bindr_0.1.1        bit_1.1-14        
## [70] processx_3.2.0     tidyselect_0.2.5   R6_2.3.0          
## [73] generics_0.0.2     DBI_1.0.0          whisker_0.3-2     
## [76] pillar_1.3.0       haven_2.0.0        withr_2.1.2       
## [79] xts_0.11-2         sp_1.3-1           RCurl_1.95-4.11   
## [82] survival_2.43-3    nnet_7.3-12        modelr_0.1.2      
## [85] crayon_1.3.4       rmarkdown_1.10     grid_3.5.1        
## [88] readxl_1.1.0       blob_1.1.1         callr_3.0.0       
## [91] ModelMetrics_1.2.2 digest_0.6.18      stats4_3.5.1      
## [94] munsell_0.5.0      tcltk_3.5.1        quadprog_1.5-5

Setup the root directory.

Setting wd as the working directory.

# Store the current working directory in `wd` and echo it (the outer
# parentheses make the assignment print its value).
(wd <- getwd())
## [1] "/Users/mdturse/Desktop/Analytics/Chicago_El_Divvy"

Modeling

Model Prep

Add An Indicator for Train-Validation-Test

NOTE: remove_first_28na_rows is the output produced in Step 01

# Load the interim object saved under <wd>/Data/Interim/.
remove_first_28na_rows <-
  paste0(wd, "/Data/Interim/", "remove_first_28na_rows.Rds") %>%
  readRDS()

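I’ll be testing models that use the entire dataset, and also models that are specific to each el_stop_id (i.e., a model for each El entry point). Therefore, I’ll create train-validation-test splits for each group: the first 80% of the date range is used for training plus validation (the first 70% of that portion is train, the rest is validation), and the final 20% is held out as test.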

First, I’ll create a function to create the proper value depending on the situation (i.e., creating the split for the entire dataset, or for each value of el_stop_id as a single entity).

# Label every row of `data` as train, validation or test according to where
# its date falls inside the overall date span of `data`.
#
# Arguments:
#   data            Data frame containing the date column.
#   date_var        Unquoted name of the date column used for the split.
#   train_val_pct   Fraction of the full date span (earliest to latest date)
#                   that forms the combined train + validation period.
#   train_pct       Fraction of the train + validation period that is train.
#   new_col_suffix  String appended to "data_use_" to name the new column.
#
# Returns:
#   `data` with one additional character column, "data_use_<new_col_suffix>",
#   holding "01_train", "02_validation" or "03_test". Rows whose date is
#   missing fall through to "03_test".
func_train_test_indicator <-
  function(data, date_var, train_val_pct, train_pct, new_col_suffix) {
    date_var_enquo <- enquo(date_var)
    new_col_name <- paste0("data_use_", new_col_suffix)

    # earliest and latest non-missing dates in the data
    date_values <- data %>% pull(!!date_var_enquo)
    date_min <- min(date_values, na.rm = TRUE)
    date_max <- max(date_values, na.rm = TRUE)

    # last date of the combined train + validation period
    days_trainval <- round(train_val_pct * (date_max - date_min))
    split_date_trainval <- date_min + days_trainval

    # first date of the validation period (train is strictly before it)
    days_train <- round(train_pct * (split_date_trainval - date_min))
    split_date_train <- date_min + days_train

    # The comparisons use the column supplied through `date_var` (not a fixed
    # column name), and the new column is created directly under its final
    # name so no existing column can be renamed by accident. Explicit
    # inclusive bounds avoid depending on which package's `between()` is
    # attached last.
    data %>%
      mutate(!!new_col_name :=
               case_when((!!date_var_enquo) < split_date_train ~ "01_train",
                         (!!date_var_enquo) >= split_date_train &
                           (!!date_var_enquo) <= split_date_trainval ~ "02_validation",
                         TRUE ~ "03_test"
                         )
             )
  }

Here I run the function for each value of el_stop_id.

# For each element of the input list, add the train / validation / test
# indicator column and rebuild `wday.lbl` as a factor from its character
# labels.
add_trn_val_test <-
  remove_first_28na_rows %>%
  map(function(stop_data) {
    flagged <-
      func_train_test_indicator(data = stop_data,
                                date_var = el_date,
                                train_val_pct = 0.8,
                                train_pct = 0.7,
                                new_col_suffix = "el_stop_id"
                                )

    mutate(flagged,
           wday.lbl = factor(as.character(wday.lbl))
           )
  })


## Save the data to the proper folder
saveRDS(object = add_trn_val_test,
        file = paste0(wd, "/Data/Interim/", "add_trn_val_test.Rds")
        )

# add_trn_val_test <-
#   readRDS(paste0(wd,
#                  "/Data/Interim/",
#                  "add_trn_val_test.Rds"
#                  )
#           )


str(add_trn_val_test[["41140"]])
## 'data.frame':    1617 obs. of  52 variables:
##  $ el_date                           : Date, format: "2013-07-29" "2013-07-30" ...
##  $ el_stop_id                        : num  41140 41140 41140 41140 41140 ...
##  $ divvy_pt5mi_stn_cnt               : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_mindist_miles               : num  1.49 1.49 1.49 1.49 1.49 ...
##  $ el_rides                          : num  1084 1098 1019 1204 1159 ...
##  $ holiday_name                      : Factor w/ 22 levels "--Not_Holiday--",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ holiday_comment                   : chr  NA NA NA NA ...
##  $ holiday                           : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ year                              : Factor w/ 6 levels "2013","2014",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ half                              : Factor w/ 2 levels "1","2": 2 2 2 2 2 2 2 2 2 2 ...
##  $ quarter                           : Factor w/ 4 levels "1","2","3","4": 3 3 3 3 3 3 3 3 3 3 ...
##  $ month                             : Factor w/ 12 levels "1","2","3","4",..: 7 7 7 8 8 8 8 8 8 8 ...
##  $ day                               : Factor w/ 31 levels "1","2","3","4",..: 29 30 31 1 2 3 4 5 6 7 ...
##  $ wday.lbl                          : Factor w/ 7 levels "Friday","Monday",..: 2 6 7 5 1 3 4 2 6 7 ...
##  $ mweek                             : Factor w/ 6 levels "1","2","3","4",..: 5 5 5 5 1 1 1 2 2 2 ...
##  $ tmin_bands                        : Factor w/ 5 levels "02_-25to00","03_00to25",..: 4 4 4 4 4 4 4 4 4 4 ...
##  $ tmax_bands                        : Factor w/ 4 levels "02_00to25","03_25to50",..: 4 3 3 4 4 4 4 4 4 4 ...
##  $ tmin_bands_l7                     : Factor w/ 5 levels "02_-25to00","03_00to25",..: 4 4 4 4 4 4 4 4 4 4 ...
##  $ tmax_bands_l7                     : Factor w/ 4 levels "02_00to25","03_25to50",..: 4 3 3 4 4 4 4 4 4 4 ...
##  $ divvy_pt5mi_trip_cnt_cus_l7       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_pt5mi_triptime_mean_cus_l7  : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_pt5mi_trip_cnt_dep_l7       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_pt5mi_triptime_mean_dep_l7  : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_pt5mi_trip_cnt_sub_l7       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_pt5mi_triptime_mean_sub_l7  : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_all_trip_cnt_cus_l7         : num  1705 1492 1472 1585 1166 ...
##  $ divvy_all_triptime_mean_cus_l7    : num  2165 1798 1826 1823 1828 ...
##  $ divvy_all_triptime_med_cus_l7     : num  1441 1274 1218 1278 1240 ...
##  $ divvy_all_trip_cnt_dep_l7         : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_all_triptime_mean_dep_l7    : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_all_triptime_med_dep_l7     : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_all_trip_cnt_sub_l7         : num  1029 1179 1191 1225 991 ...
##  $ divvy_all_triptime_mean_sub_l7    : num  819 746 761 845 704 ...
##  $ divvy_all_triptime_med_sub_l7     : num  635 614 620 617 597 ...
##  $ divvy_mindist_trip_cnt_cus_l7     : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_mindist_trip_cnt_dep_l7     : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_mindist_trip_cnt_sub_l7     : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_mindist_triptime_mean_cus_l7: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_mindist_triptime_mean_dep_l7: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_mindist_triptime_mean_sub_l7: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ holiday_name_l7                   : Factor w/ 22 levels "--Not_Holiday--",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ holiday_comment_l7                : chr  NA NA NA NA ...
##  $ holiday_l7                        : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ el_rides_l07                      : num  1140 1043 1081 1084 1094 ...
##  $ el_rides_l14                      : num  1167 1093 1102 1051 1101 ...
##  $ el_rides_l21                      : num  1063 1065 1127 1150 1210 ...
##  $ el_rides_l28                      : num  1179 1048 1177 687 951 ...
##  $ el_rides_ma07                     : num  956 964 955 972 982 ...
##  $ el_rides_ma14                     : num  971 971 965 976 980 ...
##  $ el_rides_ma21                     : num  990 992 987 989 987 ...
##  $ el_rides_ma28                     : num  968 970 965 983 991 ...
##  $ data_use_el_stop_id               : chr  "01_train" "01_train" "01_train" "01_train" ...
# For every list element, show the first and last date within each split.
add_trn_val_test %>%
  map(function(stop_df) {
    stop_df %>%
      group_by(data_use_el_stop_id) %>%
      summarise(date_min = min(el_date),
                date_max = max(el_date)
                )
  })
## $`40600`
## # A tibble: 3 x 3
##   data_use_el_stop_id date_min   date_max  
##   <chr>               <date>     <date>    
## 1 01_train            2013-07-29 2016-01-19
## 2 02_validation       2016-01-20 2017-02-11
## 3 03_test             2017-02-12 2017-12-31
## 
## $`41140`
## # A tibble: 3 x 3
##   data_use_el_stop_id date_min   date_max  
##   <chr>               <date>     <date>    
## 1 01_train            2013-07-29 2016-01-19
## 2 02_validation       2016-01-20 2017-02-11
## 3 03_test             2017-02-12 2017-12-31
## 
## $`40120`
## # A tibble: 3 x 3
##   data_use_el_stop_id date_min   date_max  
##   <chr>               <date>     <date>    
## 1 01_train            2013-07-29 2016-01-19
## 2 02_validation       2016-01-20 2017-02-11
## 3 03_test             2017-02-12 2017-12-31
## 
## $`40910`
## # A tibble: 3 x 3
##   data_use_el_stop_id date_min   date_max  
##   <chr>               <date>     <date>    
## 1 01_train            2013-07-29 2016-01-19
## 2 02_validation       2016-01-20 2017-02-11
## 3 03_test             2017-02-12 2017-12-31
## 
## $`40380`
## # A tibble: 3 x 3
##   data_use_el_stop_id date_min   date_max  
##   <chr>               <date>     <date>    
## 1 01_train            2013-07-29 2016-01-19
## 2 02_validation       2016-01-20 2017-02-11
## 3 03_test             2017-02-12 2017-12-31
## 
## $`41660`
## # A tibble: 3 x 3
##   data_use_el_stop_id date_min   date_max  
##   <chr>               <date>     <date>    
## 1 01_train            2013-07-29 2016-01-19
## 2 02_validation       2016-01-20 2017-02-11
## 3 03_test             2017-02-12 2017-12-31
# Remove the raw input list and the split helper function from the workspace.
rm(remove_first_28na_rows, func_train_test_indicator)

To Delete (below) when no longer needed

# add_trn_val_test <-
#   readRDS(paste0(wd,
#                  "/Data/Interim/",
#                  "add_trn_val_test.Rds"
#                  )
#           )
# 
# 
# str(add_trn_val_test$`41140`)

To Delete (above) when no longer needed

Handling Factor Variables

Turn factor variables into dummy variables. NOTE that this is done with all levels of the factor. That is, it does NOT use “full rank parameterization” to leave one level out (e.g., to be used as the linear model’s intercept).

# create the function to create dummy vars
# Expand the factor predictors of one data frame into dummy (one-hot) columns.
#
# Arguments:
#   x   Data frame containing `el_rides`, `el_date`, `el_stop_id`,
#       `data_use_el_stop_id`, the holiday/day columns dropped below, and the
#       predictor columns.
#
# Returns:
#   A data frame whose first four columns are `el_rides`, `el_date`,
#   `el_stop_id` and `data_use_el_stop_id`, followed by the encoded
#   predictors. Every factor level gets its own column (fullRank = FALSE).
func_one_hot_vars <-
  function(x) {
    # columns removed before encoding
    data_s <- x %>%
      select(-holiday_comment,
             -holiday,
             -day,
             -holiday_comment_l7,
             -holiday_l7
             )

    # Identifier columns are set aside so they are not encoded. This frame is
    # built once and used both to define and to apply the encoding.
    predictors <- data_s %>%
      select(-el_date,
             -el_stop_id,
             -data_use_el_stop_id
             )

    # named `dummy_spec` rather than `formula` to avoid shadowing stats::formula
    dummy_spec <-
      dummyVars(el_rides ~ .,
                data = predictors,
                fullRank = FALSE
                )

    encoded <-
      predict(object = dummy_spec,
              newdata = predictors
              ) %>%
      as.data.frame()

    # Returned as a plain expression (not an assignment) so the result is
    # visible to the caller.
    data_s %>%
      select(el_rides,
             el_date,
             el_stop_id,
             data_use_el_stop_id
             ) %>%
      bind_cols(encoded)
  }


# Write the encoder function to the interim data folder.
saveRDS(object = func_one_hot_vars,
        file = paste0(wd, "/Data/Interim/", "func_one_hot_vars.Rds")
        )

# func_one_hot_vars <-
#   readRDS(paste0(wd,
#                  "/Data/Interim/",
#                  "func_one_hot_vars.Rds"
#                  )
#           )


# run the function
# Apply the one-hot encoder to every element of the list.
DV_data <- map(add_trn_val_test, func_one_hot_vars)

# Inspect the encoded columns for one list element.
DV_data[["41140"]] %>% glimpse()
## Observations: 1,617
## Variables: 134
## $ el_rides                                      <dbl> 1084, 1098, 1019...
## $ el_date                                       <date> 2013-07-29, 201...
## $ el_stop_id                                    <dbl> 41140, 41140, 41...
## $ data_use_el_stop_id                           <chr> "01_train", "01_...
## $ divvy_pt5mi_stn_cnt                           <dbl> 0, 0, 0, 0, 0, 0...
## $ divvy_mindist_miles                           <dbl> 1.486693, 1.4866...
## $ `holiday_name.--Not_Holiday--`                <dbl> 1, 1, 1, 1, 1, 1...
## $ `holiday_name.Casimir Pulaski Day`            <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name.Christmas Day`                  <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name.Christmas Day (in lieu)`        <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name.Columbus Day`                   <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name.Day after Christmas`            <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name.Day after Thanksgiving`         <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name.Election Day`                   <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name.Father's Day`                   <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name.Independence Day`               <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name.Independence Day (observed)`    <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name.Labor Day`                      <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name.Lincoln's Birthday`             <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name.Martin Luther King Jr. Day`     <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name.Memorial Day`                   <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name.Mother's Day`                   <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name.New Years Day`                  <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name.New Years Day Holiday`          <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name.Presidents' Day`                <dbl> 0, 0, 0, 0, 0, 0...
## $ holiday_name.Thanksgiving                     <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name.Veterans Day`                   <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name.Veterans Day (observed)`        <dbl> 0, 0, 0, 0, 0, 0...
## $ year.2013                                     <dbl> 1, 1, 1, 1, 1, 1...
## $ year.2014                                     <dbl> 0, 0, 0, 0, 0, 0...
## $ year.2015                                     <dbl> 0, 0, 0, 0, 0, 0...
## $ year.2016                                     <dbl> 0, 0, 0, 0, 0, 0...
## $ year.2017                                     <dbl> 0, 0, 0, 0, 0, 0...
## $ year.2018                                     <dbl> 0, 0, 0, 0, 0, 0...
## $ half.1                                        <dbl> 0, 0, 0, 0, 0, 0...
## $ half.2                                        <dbl> 1, 1, 1, 1, 1, 1...
## $ quarter.1                                     <dbl> 0, 0, 0, 0, 0, 0...
## $ quarter.2                                     <dbl> 0, 0, 0, 0, 0, 0...
## $ quarter.3                                     <dbl> 1, 1, 1, 1, 1, 1...
## $ quarter.4                                     <dbl> 0, 0, 0, 0, 0, 0...
## $ month.1                                       <dbl> 0, 0, 0, 0, 0, 0...
## $ month.2                                       <dbl> 0, 0, 0, 0, 0, 0...
## $ month.3                                       <dbl> 0, 0, 0, 0, 0, 0...
## $ month.4                                       <dbl> 0, 0, 0, 0, 0, 0...
## $ month.5                                       <dbl> 0, 0, 0, 0, 0, 0...
## $ month.6                                       <dbl> 0, 0, 0, 0, 0, 0...
## $ month.7                                       <dbl> 1, 1, 1, 0, 0, 0...
## $ month.8                                       <dbl> 0, 0, 0, 1, 1, 1...
## $ month.9                                       <dbl> 0, 0, 0, 0, 0, 0...
## $ month.10                                      <dbl> 0, 0, 0, 0, 0, 0...
## $ month.11                                      <dbl> 0, 0, 0, 0, 0, 0...
## $ month.12                                      <dbl> 0, 0, 0, 0, 0, 0...
## $ wday.lbl.Friday                               <dbl> 0, 0, 0, 0, 1, 0...
## $ wday.lbl.Monday                               <dbl> 1, 0, 0, 0, 0, 0...
## $ wday.lbl.Saturday                             <dbl> 0, 0, 0, 0, 0, 1...
## $ wday.lbl.Sunday                               <dbl> 0, 0, 0, 0, 0, 0...
## $ wday.lbl.Thursday                             <dbl> 0, 0, 0, 1, 0, 0...
## $ wday.lbl.Tuesday                              <dbl> 0, 1, 0, 0, 0, 0...
## $ wday.lbl.Wednesday                            <dbl> 0, 0, 1, 0, 0, 0...
## $ mweek.1                                       <dbl> 0, 0, 0, 0, 1, 1...
## $ mweek.2                                       <dbl> 0, 0, 0, 0, 0, 0...
## $ mweek.3                                       <dbl> 0, 0, 0, 0, 0, 0...
## $ mweek.4                                       <dbl> 0, 0, 0, 0, 0, 0...
## $ mweek.5                                       <dbl> 1, 1, 1, 1, 0, 0...
## $ mweek.6                                       <dbl> 0, 0, 0, 0, 0, 0...
## $ `tmin_bands.02_-25to00`                       <dbl> 0, 0, 0, 0, 0, 0...
## $ tmin_bands.03_00to25                          <dbl> 0, 0, 0, 0, 0, 0...
## $ tmin_bands.04_25to50                          <dbl> 0, 0, 0, 0, 0, 0...
## $ tmin_bands.05_50to75                          <dbl> 1, 1, 1, 1, 1, 1...
## $ tmin_bands.06_75to100                         <dbl> 0, 0, 0, 0, 0, 0...
## $ tmax_bands.02_00to25                          <dbl> 0, 0, 0, 0, 0, 0...
## $ tmax_bands.03_25to50                          <dbl> 0, 0, 0, 0, 0, 0...
## $ tmax_bands.04_50to75                          <dbl> 0, 1, 1, 0, 0, 0...
## $ tmax_bands.05_75to100                         <dbl> 1, 0, 0, 1, 1, 1...
## $ `tmin_bands_l7.02_-25to00`                    <dbl> 0, 0, 0, 0, 0, 0...
## $ tmin_bands_l7.03_00to25                       <dbl> 0, 0, 0, 0, 0, 0...
## $ tmin_bands_l7.04_25to50                       <dbl> 0, 0, 0, 0, 0, 0...
## $ tmin_bands_l7.05_50to75                       <dbl> 1, 1, 1, 1, 1, 1...
## $ tmin_bands_l7.06_75to100                      <dbl> 0, 0, 0, 0, 0, 0...
## $ tmax_bands_l7.02_00to25                       <dbl> 0, 0, 0, 0, 0, 0...
## $ tmax_bands_l7.03_25to50                       <dbl> 0, 0, 0, 0, 0, 0...
## $ tmax_bands_l7.04_50to75                       <dbl> 0, 1, 1, 0, 0, 0...
## $ tmax_bands_l7.05_75to100                      <dbl> 1, 0, 0, 1, 1, 1...
## $ divvy_pt5mi_trip_cnt_cus_l7                   <dbl> 0, 0, 0, 0, 0, 0...
## $ divvy_pt5mi_triptime_mean_cus_l7              <dbl> 0, 0, 0, 0, 0, 0...
## $ divvy_pt5mi_trip_cnt_dep_l7                   <dbl> 0, 0, 0, 0, 0, 0...
## $ divvy_pt5mi_triptime_mean_dep_l7              <dbl> 0, 0, 0, 0, 0, 0...
## $ divvy_pt5mi_trip_cnt_sub_l7                   <dbl> 0, 0, 0, 0, 0, 0...
## $ divvy_pt5mi_triptime_mean_sub_l7              <dbl> 0, 0, 0, 0, 0, 0...
## $ divvy_all_trip_cnt_cus_l7                     <dbl> 1705, 1492, 1472...
## $ divvy_all_triptime_mean_cus_l7                <dbl> 2165.246, 1798.0...
## $ divvy_all_triptime_med_cus_l7                 <dbl> 1441.0, 1273.5, ...
## $ divvy_all_trip_cnt_dep_l7                     <dbl> 0, 0, 0, 0, 0, 0...
## $ divvy_all_triptime_mean_dep_l7                <dbl> 0, 0, 0, 0, 0, 0...
## $ divvy_all_triptime_med_dep_l7                 <dbl> 0, 0, 0, 0, 0, 0...
## $ divvy_all_trip_cnt_sub_l7                     <dbl> 1029, 1179, 1191...
## $ divvy_all_triptime_mean_sub_l7                <dbl> 819.4402, 745.83...
## $ divvy_all_triptime_med_sub_l7                 <dbl> 635.0, 614.0, 62...
## $ divvy_mindist_trip_cnt_cus_l7                 <dbl> 0, 0, 0, 0, 0, 0...
## $ divvy_mindist_trip_cnt_dep_l7                 <dbl> 0, 0, 0, 0, 0, 0...
## $ divvy_mindist_trip_cnt_sub_l7                 <dbl> 0, 0, 0, 0, 0, 0...
## $ divvy_mindist_triptime_mean_cus_l7            <dbl> 0.00, 0.00, 0.00...
## $ divvy_mindist_triptime_mean_dep_l7            <dbl> 0, 0, 0, 0, 0, 0...
## $ divvy_mindist_triptime_mean_sub_l7            <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name_l7.--Not_Holiday--`             <dbl> 1, 1, 1, 1, 1, 1...
## $ `holiday_name_l7.Casimir Pulaski Day`         <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name_l7.Christmas Day`               <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name_l7.Christmas Day (in lieu)`     <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name_l7.Columbus Day`                <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name_l7.Day after Christmas`         <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name_l7.Day after Thanksgiving`      <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name_l7.Election Day`                <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name_l7.Father's Day`                <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name_l7.Independence Day`            <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name_l7.Independence Day (observed)` <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name_l7.Labor Day`                   <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name_l7.Lincoln's Birthday`          <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name_l7.Martin Luther King Jr. Day`  <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name_l7.Memorial Day`                <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name_l7.Mother's Day`                <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name_l7.New Years Day`               <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name_l7.New Years Day Holiday`       <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name_l7.Presidents' Day`             <dbl> 0, 0, 0, 0, 0, 0...
## $ holiday_name_l7.Thanksgiving                  <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name_l7.Veterans Day`                <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name_l7.Veterans Day (observed)`     <dbl> 0, 0, 0, 0, 0, 0...
## $ el_rides_l07                                  <dbl> 1140, 1043, 1081...
## $ el_rides_l14                                  <dbl> 1167, 1093, 1102...
## $ el_rides_l21                                  <dbl> 1063, 1065, 1127...
## $ el_rides_l28                                  <dbl> 1179, 1048, 1177...
## $ el_rides_ma07                                 <dbl> 956.2857, 964.14...
## $ el_rides_ma14                                 <dbl> 970.8571, 971.21...
## $ el_rides_ma21                                 <dbl> 990.2381, 991.80...
## $ el_rides_ma28                                 <dbl> 968.5000, 970.28...
# rm(func_one_hot_vars)

Accuracy Metrics

I’ll try both caret::randomForest and caret::xgboost on the dataset USING dummy variables. But first, I need to create some custom accuracy metrics.

# Custom regression accuracy metrics.
#
# Arguments:
#   data    Data frame with numeric columns `obs` (observed values) and
#           `pred` (predicted values).
#   lev     Unused; kept in the signature with a NULL default.
#   model   Unused; kept in the signature with a NULL default.
#
# Returns:
#   Named numeric vector c(MAE, MAPE, RMSE, R2).
#
# All four metrics are computed on the same rows: those where
# `obs - pred` is not missing. R2 previously used every row, so a single
# missing value made it NA while the other three metrics stayed finite.
func_custom_accuracy_metrics <-
  function(data, lev = NULL, model = NULL) {
    complete <- !is.na(data$obs - data$pred)
    actual <- data$obs[complete]
    predicted <- data$pred[complete]
    errors <- actual - predicted

    out <- c(
      MAE = mean(abs(errors), na.rm = TRUE),
      # na.rm also drops the NaN produced by 0 / 0 when both the observed
      # value and the error are zero; a zero observed value with a non-zero
      # error still yields Inf
      MAPE = mean(abs(errors / actual * 100), na.rm = TRUE),
      RMSE = sqrt(mean(errors^2, na.rm = TRUE)),
      R2 = 1 - sum(errors^2) / sum((actual - mean(actual))^2)
    )

    out
  }


# Write the metrics function to the interim data folder.
saveRDS(func_custom_accuracy_metrics,
        paste0(wd,
               "/Data/Interim/",
               "func_custom_accuracy_metrics.Rds"
               )
        )

# Reload from the same folder the function is saved to above.
# func_custom_accuracy_metrics <-
#   readRDS(paste0(wd,
#                  "/Data/Interim/",
#                  "func_custom_accuracy_metrics.Rds"
#                  )
#           )

Variable Reduction

First, limit to just training data, and confirm the datasets to use are the same (except for the one-hot-encoding producing dummy variables).

# Keep only the rows flagged as training data, for both the original and
# the one-hot-encoded versions of each list element.
train_data <-
  add_trn_val_test %>%
  map(function(stop_df) {
    stop_df %>% filter(data_use_el_stop_id == "01_train")
  })

DV_train_data <-
  DV_data %>%
  map(function(stop_df) {
    stop_df %>% filter(data_use_el_stop_id == "01_train")
  })


saveRDS(object = train_data,
        file = paste0(wd, "/Models/", "train_data.Rds")
        )

# train_data <-
#   readRDS(paste0(wd,
#                  "/Models/",
#                  "train_data.Rds"
#                  )
#           )

saveRDS(object = DV_train_data,
        file = paste0(wd, "/Models/", "DV_train_data.Rds")
        )

# DV_train_data <-
#   readRDS(paste0(wd,
#                  "/Models/",
#                  "DV_train_data.Rds"
#                  )
#           )


# Rows x columns of each training data frame.
message("train_data")
## train_data
map(train_data, dim)
## $`40600`
## [1] 905  52
## 
## $`41140`
## [1] 905  52
## 
## $`40120`
## [1] 905  52
## 
## $`40910`
## [1] 875  52
## 
## $`40380`
## [1] 905  52
## 
## $`41660`
## [1] 905  52
# Rows x columns of each one-hot-encoded training data frame.
message("DV_train_data")
## DV_train_data
map(DV_train_data, dim)
## $`40600`
## [1] 905 134
## 
## $`41140`
## [1] 905 134
## 
## $`40120`
## [1] 905 134
## 
## $`40910`
## [1] 875 134
## 
## $`40380`
## [1] 905 134
## 
## $`41660`
## [1] 905 134
# Column-by-column preview of one training data frame.
message("train_data")
## train_data
glimpse(train_data[["41140"]])
## Observations: 905
## Variables: 52
## $ el_date                            <date> 2013-07-29, 2013-07-30, 20...
## $ el_stop_id                         <dbl> 41140, 41140, 41140, 41140,...
## $ divvy_pt5mi_stn_cnt                <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ divvy_mindist_miles                <dbl> 1.486693, 1.486693, 1.48669...
## $ el_rides                           <dbl> 1084, 1098, 1019, 1204, 115...
## $ holiday_name                       <fct> --Not_Holiday--, --Not_Holi...
## $ holiday_comment                    <chr> NA, NA, NA, NA, NA, NA, NA,...
## $ holiday                            <lgl> FALSE, FALSE, FALSE, FALSE,...
## $ year                               <fct> 2013, 2013, 2013, 2013, 201...
## $ half                               <fct> 2, 2, 2, 2, 2, 2, 2, 2, 2, ...
## $ quarter                            <fct> 3, 3, 3, 3, 3, 3, 3, 3, 3, ...
## $ month                              <fct> 7, 7, 7, 8, 8, 8, 8, 8, 8, ...
## $ day                                <fct> 29, 30, 31, 1, 2, 3, 4, 5, ...
## $ wday.lbl                           <fct> Monday, Tuesday, Wednesday,...
## $ mweek                              <fct> 5, 5, 5, 5, 1, 1, 1, 2, 2, ...
## $ tmin_bands                         <fct> 05_50to75, 05_50to75, 05_50...
## $ tmax_bands                         <fct> 05_75to100, 04_50to75, 04_5...
## $ tmin_bands_l7                      <fct> 05_50to75, 05_50to75, 05_50...
## $ tmax_bands_l7                      <fct> 05_75to100, 04_50to75, 04_5...
## $ divvy_pt5mi_trip_cnt_cus_l7        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ divvy_pt5mi_triptime_mean_cus_l7   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ divvy_pt5mi_trip_cnt_dep_l7        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ divvy_pt5mi_triptime_mean_dep_l7   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ divvy_pt5mi_trip_cnt_sub_l7        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ divvy_pt5mi_triptime_mean_sub_l7   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ divvy_all_trip_cnt_cus_l7          <dbl> 1705, 1492, 1472, 1585, 116...
## $ divvy_all_triptime_mean_cus_l7     <dbl> 2165.246, 1798.073, 1826.37...
## $ divvy_all_triptime_med_cus_l7      <dbl> 1441.0, 1273.5, 1217.5, 127...
## $ divvy_all_trip_cnt_dep_l7          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ divvy_all_triptime_mean_dep_l7     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ divvy_all_triptime_med_dep_l7      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ divvy_all_trip_cnt_sub_l7          <dbl> 1029, 1179, 1191, 1225, 991...
## $ divvy_all_triptime_mean_sub_l7     <dbl> 819.4402, 745.8338, 760.988...
## $ divvy_all_triptime_med_sub_l7      <dbl> 635.0, 614.0, 620.0, 617.0,...
## $ divvy_mindist_trip_cnt_cus_l7      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ divvy_mindist_trip_cnt_dep_l7      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ divvy_mindist_trip_cnt_sub_l7      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ divvy_mindist_triptime_mean_cus_l7 <dbl> 0.00, 0.00, 0.00, 0.00, 0.0...
## $ divvy_mindist_triptime_mean_dep_l7 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ divvy_mindist_triptime_mean_sub_l7 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ holiday_name_l7                    <fct> --Not_Holiday--, --Not_Holi...
## $ holiday_comment_l7                 <chr> NA, NA, NA, NA, NA, NA, NA,...
## $ holiday_l7                         <lgl> FALSE, FALSE, FALSE, FALSE,...
## $ el_rides_l07                       <dbl> 1140, 1043, 1081, 1084, 109...
## $ el_rides_l14                       <dbl> 1167, 1093, 1102, 1051, 110...
## $ el_rides_l21                       <dbl> 1063, 1065, 1127, 1150, 121...
## $ el_rides_l28                       <dbl> 1179, 1048, 1177, 687, 951,...
## $ el_rides_ma07                      <dbl> 956.2857, 964.1429, 955.285...
## $ el_rides_ma14                      <dbl> 970.8571, 971.2143, 965.285...
## $ el_rides_ma21                      <dbl> 990.2381, 991.8095, 986.666...
## $ el_rides_ma28                      <dbl> 968.5000, 970.2857, 964.642...
## $ data_use_el_stop_id                <chr> "01_train", "01_train", "01...
# Show the column layout of the DV_train_data entry for the same stop (41140).
message("DV_train_data")
## DV_train_data
glimpse(DV_train_data[["41140"]])
## Observations: 905
## Variables: 134
## $ el_rides                                      <dbl> 1084, 1098, 1019...
## $ el_date                                       <date> 2013-07-29, 201...
## $ el_stop_id                                    <dbl> 41140, 41140, 41...
## $ data_use_el_stop_id                           <chr> "01_train", "01_...
## $ divvy_pt5mi_stn_cnt                           <dbl> 0, 0, 0, 0, 0, 0...
## $ divvy_mindist_miles                           <dbl> 1.486693, 1.4866...
## $ `holiday_name.--Not_Holiday--`                <dbl> 1, 1, 1, 1, 1, 1...
## $ `holiday_name.Casimir Pulaski Day`            <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name.Christmas Day`                  <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name.Christmas Day (in lieu)`        <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name.Columbus Day`                   <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name.Day after Christmas`            <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name.Day after Thanksgiving`         <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name.Election Day`                   <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name.Father's Day`                   <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name.Independence Day`               <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name.Independence Day (observed)`    <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name.Labor Day`                      <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name.Lincoln's Birthday`             <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name.Martin Luther King Jr. Day`     <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name.Memorial Day`                   <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name.Mother's Day`                   <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name.New Years Day`                  <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name.New Years Day Holiday`          <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name.Presidents' Day`                <dbl> 0, 0, 0, 0, 0, 0...
## $ holiday_name.Thanksgiving                     <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name.Veterans Day`                   <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name.Veterans Day (observed)`        <dbl> 0, 0, 0, 0, 0, 0...
## $ year.2013                                     <dbl> 1, 1, 1, 1, 1, 1...
## $ year.2014                                     <dbl> 0, 0, 0, 0, 0, 0...
## $ year.2015                                     <dbl> 0, 0, 0, 0, 0, 0...
## $ year.2016                                     <dbl> 0, 0, 0, 0, 0, 0...
## $ year.2017                                     <dbl> 0, 0, 0, 0, 0, 0...
## $ year.2018                                     <dbl> 0, 0, 0, 0, 0, 0...
## $ half.1                                        <dbl> 0, 0, 0, 0, 0, 0...
## $ half.2                                        <dbl> 1, 1, 1, 1, 1, 1...
## $ quarter.1                                     <dbl> 0, 0, 0, 0, 0, 0...
## $ quarter.2                                     <dbl> 0, 0, 0, 0, 0, 0...
## $ quarter.3                                     <dbl> 1, 1, 1, 1, 1, 1...
## $ quarter.4                                     <dbl> 0, 0, 0, 0, 0, 0...
## $ month.1                                       <dbl> 0, 0, 0, 0, 0, 0...
## $ month.2                                       <dbl> 0, 0, 0, 0, 0, 0...
## $ month.3                                       <dbl> 0, 0, 0, 0, 0, 0...
## $ month.4                                       <dbl> 0, 0, 0, 0, 0, 0...
## $ month.5                                       <dbl> 0, 0, 0, 0, 0, 0...
## $ month.6                                       <dbl> 0, 0, 0, 0, 0, 0...
## $ month.7                                       <dbl> 1, 1, 1, 0, 0, 0...
## $ month.8                                       <dbl> 0, 0, 0, 1, 1, 1...
## $ month.9                                       <dbl> 0, 0, 0, 0, 0, 0...
## $ month.10                                      <dbl> 0, 0, 0, 0, 0, 0...
## $ month.11                                      <dbl> 0, 0, 0, 0, 0, 0...
## $ month.12                                      <dbl> 0, 0, 0, 0, 0, 0...
## $ wday.lbl.Friday                               <dbl> 0, 0, 0, 0, 1, 0...
## $ wday.lbl.Monday                               <dbl> 1, 0, 0, 0, 0, 0...
## $ wday.lbl.Saturday                             <dbl> 0, 0, 0, 0, 0, 1...
## $ wday.lbl.Sunday                               <dbl> 0, 0, 0, 0, 0, 0...
## $ wday.lbl.Thursday                             <dbl> 0, 0, 0, 1, 0, 0...
## $ wday.lbl.Tuesday                              <dbl> 0, 1, 0, 0, 0, 0...
## $ wday.lbl.Wednesday                            <dbl> 0, 0, 1, 0, 0, 0...
## $ mweek.1                                       <dbl> 0, 0, 0, 0, 1, 1...
## $ mweek.2                                       <dbl> 0, 0, 0, 0, 0, 0...
## $ mweek.3                                       <dbl> 0, 0, 0, 0, 0, 0...
## $ mweek.4                                       <dbl> 0, 0, 0, 0, 0, 0...
## $ mweek.5                                       <dbl> 1, 1, 1, 1, 0, 0...
## $ mweek.6                                       <dbl> 0, 0, 0, 0, 0, 0...
## $ `tmin_bands.02_-25to00`                       <dbl> 0, 0, 0, 0, 0, 0...
## $ tmin_bands.03_00to25                          <dbl> 0, 0, 0, 0, 0, 0...
## $ tmin_bands.04_25to50                          <dbl> 0, 0, 0, 0, 0, 0...
## $ tmin_bands.05_50to75                          <dbl> 1, 1, 1, 1, 1, 1...
## $ tmin_bands.06_75to100                         <dbl> 0, 0, 0, 0, 0, 0...
## $ tmax_bands.02_00to25                          <dbl> 0, 0, 0, 0, 0, 0...
## $ tmax_bands.03_25to50                          <dbl> 0, 0, 0, 0, 0, 0...
## $ tmax_bands.04_50to75                          <dbl> 0, 1, 1, 0, 0, 0...
## $ tmax_bands.05_75to100                         <dbl> 1, 0, 0, 1, 1, 1...
## $ `tmin_bands_l7.02_-25to00`                    <dbl> 0, 0, 0, 0, 0, 0...
## $ tmin_bands_l7.03_00to25                       <dbl> 0, 0, 0, 0, 0, 0...
## $ tmin_bands_l7.04_25to50                       <dbl> 0, 0, 0, 0, 0, 0...
## $ tmin_bands_l7.05_50to75                       <dbl> 1, 1, 1, 1, 1, 1...
## $ tmin_bands_l7.06_75to100                      <dbl> 0, 0, 0, 0, 0, 0...
## $ tmax_bands_l7.02_00to25                       <dbl> 0, 0, 0, 0, 0, 0...
## $ tmax_bands_l7.03_25to50                       <dbl> 0, 0, 0, 0, 0, 0...
## $ tmax_bands_l7.04_50to75                       <dbl> 0, 1, 1, 0, 0, 0...
## $ tmax_bands_l7.05_75to100                      <dbl> 1, 0, 0, 1, 1, 1...
## $ divvy_pt5mi_trip_cnt_cus_l7                   <dbl> 0, 0, 0, 0, 0, 0...
## $ divvy_pt5mi_triptime_mean_cus_l7              <dbl> 0, 0, 0, 0, 0, 0...
## $ divvy_pt5mi_trip_cnt_dep_l7                   <dbl> 0, 0, 0, 0, 0, 0...
## $ divvy_pt5mi_triptime_mean_dep_l7              <dbl> 0, 0, 0, 0, 0, 0...
## $ divvy_pt5mi_trip_cnt_sub_l7                   <dbl> 0, 0, 0, 0, 0, 0...
## $ divvy_pt5mi_triptime_mean_sub_l7              <dbl> 0, 0, 0, 0, 0, 0...
## $ divvy_all_trip_cnt_cus_l7                     <dbl> 1705, 1492, 1472...
## $ divvy_all_triptime_mean_cus_l7                <dbl> 2165.246, 1798.0...
## $ divvy_all_triptime_med_cus_l7                 <dbl> 1441.0, 1273.5, ...
## $ divvy_all_trip_cnt_dep_l7                     <dbl> 0, 0, 0, 0, 0, 0...
## $ divvy_all_triptime_mean_dep_l7                <dbl> 0, 0, 0, 0, 0, 0...
## $ divvy_all_triptime_med_dep_l7                 <dbl> 0, 0, 0, 0, 0, 0...
## $ divvy_all_trip_cnt_sub_l7                     <dbl> 1029, 1179, 1191...
## $ divvy_all_triptime_mean_sub_l7                <dbl> 819.4402, 745.83...
## $ divvy_all_triptime_med_sub_l7                 <dbl> 635.0, 614.0, 62...
## $ divvy_mindist_trip_cnt_cus_l7                 <dbl> 0, 0, 0, 0, 0, 0...
## $ divvy_mindist_trip_cnt_dep_l7                 <dbl> 0, 0, 0, 0, 0, 0...
## $ divvy_mindist_trip_cnt_sub_l7                 <dbl> 0, 0, 0, 0, 0, 0...
## $ divvy_mindist_triptime_mean_cus_l7            <dbl> 0.00, 0.00, 0.00...
## $ divvy_mindist_triptime_mean_dep_l7            <dbl> 0, 0, 0, 0, 0, 0...
## $ divvy_mindist_triptime_mean_sub_l7            <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name_l7.--Not_Holiday--`             <dbl> 1, 1, 1, 1, 1, 1...
## $ `holiday_name_l7.Casimir Pulaski Day`         <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name_l7.Christmas Day`               <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name_l7.Christmas Day (in lieu)`     <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name_l7.Columbus Day`                <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name_l7.Day after Christmas`         <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name_l7.Day after Thanksgiving`      <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name_l7.Election Day`                <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name_l7.Father's Day`                <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name_l7.Independence Day`            <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name_l7.Independence Day (observed)` <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name_l7.Labor Day`                   <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name_l7.Lincoln's Birthday`          <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name_l7.Martin Luther King Jr. Day`  <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name_l7.Memorial Day`                <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name_l7.Mother's Day`                <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name_l7.New Years Day`               <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name_l7.New Years Day Holiday`       <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name_l7.Presidents' Day`             <dbl> 0, 0, 0, 0, 0, 0...
## $ holiday_name_l7.Thanksgiving                  <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name_l7.Veterans Day`                <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name_l7.Veterans Day (observed)`     <dbl> 0, 0, 0, 0, 0, 0...
## $ el_rides_l07                                  <dbl> 1140, 1043, 1081...
## $ el_rides_l14                                  <dbl> 1167, 1093, 1102...
## $ el_rides_l21                                  <dbl> 1063, 1065, 1127...
## $ el_rides_l28                                  <dbl> 1179, 1048, 1177...
## $ el_rides_ma07                                 <dbl> 956.2857, 964.14...
## $ el_rides_ma14                                 <dbl> 970.8571, 971.21...
## $ el_rides_ma21                                 <dbl> 990.2381, 991.80...
## $ el_rides_ma28                                 <dbl> 968.5000, 970.28...

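Now I reduce the number of variables by using caret's near-zero-variance filter (caret::nearZeroVar) and its correlation filter (caret::preProcess with method = "corr"). The two filters are run one at a time, as separate steps, because caret will not handle a variable with zero standard deviation.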

First, I use caret::nearZeroVar (through caret::preProcess with method = "nzv") to remove variables with “near zero variance”.

# Fit caret's near-zero-variance filter separately on each element of
# DV_train_data (one data frame per el stop). Only "nzv" is requested; the
# combined variant c("nzv", "corr", "center", "scale", "medianImpute") stays
# disabled.
# NOTE(review): every column is handed to preProcess(), including el_rides and
# the id/date columns -- confirm that letting the filter drop those is intended.
DV_nzv_list <-
  map(DV_train_data,
      function(station_df) {
        preProcess(station_df, method = "nzv")
        }
      )

# Apply each fitted filter to the data frame it was fitted on; map2() pairs
# the two lists element by element.
DV_nzv_predict <-
  map2(DV_nzv_list, DV_train_data, predict)


# Write the filtered list to the interim data folder under wd.
saveRDS(object = DV_nzv_predict,
        file = paste0(wd, "/Data/Interim/", "DV_nzv_predict.Rds")
        )

# DV_nzv_predict <-
#   readRDS(paste0(wd,
#                  "/Data/Interim/",
#                  "DV_nzv_predict.Rds"
#                  )
#           )


# Rows x columns of every per-stop data frame before any filtering.
message("before reduction")
## before reduction
map(DV_train_data, dim)
## $`40600`
## [1] 905 134
## 
## $`41140`
## [1] 905 134
## 
## $`40120`
## [1] 905 134
## 
## $`40910`
## [1] 875 134
## 
## $`40380`
## [1] 905 134
## 
## $`41660`
## [1] 905 134
# Column layout for stop 41140 before filtering.
glimpse(DV_train_data[["41140"]])
## Observations: 905
## Variables: 134
## $ el_rides                                      <dbl> 1084, 1098, 1019...
## $ el_date                                       <date> 2013-07-29, 201...
## $ el_stop_id                                    <dbl> 41140, 41140, 41...
## $ data_use_el_stop_id                           <chr> "01_train", "01_...
## $ divvy_pt5mi_stn_cnt                           <dbl> 0, 0, 0, 0, 0, 0...
## $ divvy_mindist_miles                           <dbl> 1.486693, 1.4866...
## $ `holiday_name.--Not_Holiday--`                <dbl> 1, 1, 1, 1, 1, 1...
## $ `holiday_name.Casimir Pulaski Day`            <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name.Christmas Day`                  <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name.Christmas Day (in lieu)`        <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name.Columbus Day`                   <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name.Day after Christmas`            <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name.Day after Thanksgiving`         <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name.Election Day`                   <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name.Father's Day`                   <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name.Independence Day`               <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name.Independence Day (observed)`    <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name.Labor Day`                      <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name.Lincoln's Birthday`             <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name.Martin Luther King Jr. Day`     <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name.Memorial Day`                   <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name.Mother's Day`                   <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name.New Years Day`                  <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name.New Years Day Holiday`          <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name.Presidents' Day`                <dbl> 0, 0, 0, 0, 0, 0...
## $ holiday_name.Thanksgiving                     <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name.Veterans Day`                   <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name.Veterans Day (observed)`        <dbl> 0, 0, 0, 0, 0, 0...
## $ year.2013                                     <dbl> 1, 1, 1, 1, 1, 1...
## $ year.2014                                     <dbl> 0, 0, 0, 0, 0, 0...
## $ year.2015                                     <dbl> 0, 0, 0, 0, 0, 0...
## $ year.2016                                     <dbl> 0, 0, 0, 0, 0, 0...
## $ year.2017                                     <dbl> 0, 0, 0, 0, 0, 0...
## $ year.2018                                     <dbl> 0, 0, 0, 0, 0, 0...
## $ half.1                                        <dbl> 0, 0, 0, 0, 0, 0...
## $ half.2                                        <dbl> 1, 1, 1, 1, 1, 1...
## $ quarter.1                                     <dbl> 0, 0, 0, 0, 0, 0...
## $ quarter.2                                     <dbl> 0, 0, 0, 0, 0, 0...
## $ quarter.3                                     <dbl> 1, 1, 1, 1, 1, 1...
## $ quarter.4                                     <dbl> 0, 0, 0, 0, 0, 0...
## $ month.1                                       <dbl> 0, 0, 0, 0, 0, 0...
## $ month.2                                       <dbl> 0, 0, 0, 0, 0, 0...
## $ month.3                                       <dbl> 0, 0, 0, 0, 0, 0...
## $ month.4                                       <dbl> 0, 0, 0, 0, 0, 0...
## $ month.5                                       <dbl> 0, 0, 0, 0, 0, 0...
## $ month.6                                       <dbl> 0, 0, 0, 0, 0, 0...
## $ month.7                                       <dbl> 1, 1, 1, 0, 0, 0...
## $ month.8                                       <dbl> 0, 0, 0, 1, 1, 1...
## $ month.9                                       <dbl> 0, 0, 0, 0, 0, 0...
## $ month.10                                      <dbl> 0, 0, 0, 0, 0, 0...
## $ month.11                                      <dbl> 0, 0, 0, 0, 0, 0...
## $ month.12                                      <dbl> 0, 0, 0, 0, 0, 0...
## $ wday.lbl.Friday                               <dbl> 0, 0, 0, 0, 1, 0...
## $ wday.lbl.Monday                               <dbl> 1, 0, 0, 0, 0, 0...
## $ wday.lbl.Saturday                             <dbl> 0, 0, 0, 0, 0, 1...
## $ wday.lbl.Sunday                               <dbl> 0, 0, 0, 0, 0, 0...
## $ wday.lbl.Thursday                             <dbl> 0, 0, 0, 1, 0, 0...
## $ wday.lbl.Tuesday                              <dbl> 0, 1, 0, 0, 0, 0...
## $ wday.lbl.Wednesday                            <dbl> 0, 0, 1, 0, 0, 0...
## $ mweek.1                                       <dbl> 0, 0, 0, 0, 1, 1...
## $ mweek.2                                       <dbl> 0, 0, 0, 0, 0, 0...
## $ mweek.3                                       <dbl> 0, 0, 0, 0, 0, 0...
## $ mweek.4                                       <dbl> 0, 0, 0, 0, 0, 0...
## $ mweek.5                                       <dbl> 1, 1, 1, 1, 0, 0...
## $ mweek.6                                       <dbl> 0, 0, 0, 0, 0, 0...
## $ `tmin_bands.02_-25to00`                       <dbl> 0, 0, 0, 0, 0, 0...
## $ tmin_bands.03_00to25                          <dbl> 0, 0, 0, 0, 0, 0...
## $ tmin_bands.04_25to50                          <dbl> 0, 0, 0, 0, 0, 0...
## $ tmin_bands.05_50to75                          <dbl> 1, 1, 1, 1, 1, 1...
## $ tmin_bands.06_75to100                         <dbl> 0, 0, 0, 0, 0, 0...
## $ tmax_bands.02_00to25                          <dbl> 0, 0, 0, 0, 0, 0...
## $ tmax_bands.03_25to50                          <dbl> 0, 0, 0, 0, 0, 0...
## $ tmax_bands.04_50to75                          <dbl> 0, 1, 1, 0, 0, 0...
## $ tmax_bands.05_75to100                         <dbl> 1, 0, 0, 1, 1, 1...
## $ `tmin_bands_l7.02_-25to00`                    <dbl> 0, 0, 0, 0, 0, 0...
## $ tmin_bands_l7.03_00to25                       <dbl> 0, 0, 0, 0, 0, 0...
## $ tmin_bands_l7.04_25to50                       <dbl> 0, 0, 0, 0, 0, 0...
## $ tmin_bands_l7.05_50to75                       <dbl> 1, 1, 1, 1, 1, 1...
## $ tmin_bands_l7.06_75to100                      <dbl> 0, 0, 0, 0, 0, 0...
## $ tmax_bands_l7.02_00to25                       <dbl> 0, 0, 0, 0, 0, 0...
## $ tmax_bands_l7.03_25to50                       <dbl> 0, 0, 0, 0, 0, 0...
## $ tmax_bands_l7.04_50to75                       <dbl> 0, 1, 1, 0, 0, 0...
## $ tmax_bands_l7.05_75to100                      <dbl> 1, 0, 0, 1, 1, 1...
## $ divvy_pt5mi_trip_cnt_cus_l7                   <dbl> 0, 0, 0, 0, 0, 0...
## $ divvy_pt5mi_triptime_mean_cus_l7              <dbl> 0, 0, 0, 0, 0, 0...
## $ divvy_pt5mi_trip_cnt_dep_l7                   <dbl> 0, 0, 0, 0, 0, 0...
## $ divvy_pt5mi_triptime_mean_dep_l7              <dbl> 0, 0, 0, 0, 0, 0...
## $ divvy_pt5mi_trip_cnt_sub_l7                   <dbl> 0, 0, 0, 0, 0, 0...
## $ divvy_pt5mi_triptime_mean_sub_l7              <dbl> 0, 0, 0, 0, 0, 0...
## $ divvy_all_trip_cnt_cus_l7                     <dbl> 1705, 1492, 1472...
## $ divvy_all_triptime_mean_cus_l7                <dbl> 2165.246, 1798.0...
## $ divvy_all_triptime_med_cus_l7                 <dbl> 1441.0, 1273.5, ...
## $ divvy_all_trip_cnt_dep_l7                     <dbl> 0, 0, 0, 0, 0, 0...
## $ divvy_all_triptime_mean_dep_l7                <dbl> 0, 0, 0, 0, 0, 0...
## $ divvy_all_triptime_med_dep_l7                 <dbl> 0, 0, 0, 0, 0, 0...
## $ divvy_all_trip_cnt_sub_l7                     <dbl> 1029, 1179, 1191...
## $ divvy_all_triptime_mean_sub_l7                <dbl> 819.4402, 745.83...
## $ divvy_all_triptime_med_sub_l7                 <dbl> 635.0, 614.0, 62...
## $ divvy_mindist_trip_cnt_cus_l7                 <dbl> 0, 0, 0, 0, 0, 0...
## $ divvy_mindist_trip_cnt_dep_l7                 <dbl> 0, 0, 0, 0, 0, 0...
## $ divvy_mindist_trip_cnt_sub_l7                 <dbl> 0, 0, 0, 0, 0, 0...
## $ divvy_mindist_triptime_mean_cus_l7            <dbl> 0.00, 0.00, 0.00...
## $ divvy_mindist_triptime_mean_dep_l7            <dbl> 0, 0, 0, 0, 0, 0...
## $ divvy_mindist_triptime_mean_sub_l7            <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name_l7.--Not_Holiday--`             <dbl> 1, 1, 1, 1, 1, 1...
## $ `holiday_name_l7.Casimir Pulaski Day`         <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name_l7.Christmas Day`               <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name_l7.Christmas Day (in lieu)`     <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name_l7.Columbus Day`                <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name_l7.Day after Christmas`         <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name_l7.Day after Thanksgiving`      <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name_l7.Election Day`                <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name_l7.Father's Day`                <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name_l7.Independence Day`            <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name_l7.Independence Day (observed)` <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name_l7.Labor Day`                   <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name_l7.Lincoln's Birthday`          <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name_l7.Martin Luther King Jr. Day`  <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name_l7.Memorial Day`                <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name_l7.Mother's Day`                <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name_l7.New Years Day`               <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name_l7.New Years Day Holiday`       <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name_l7.Presidents' Day`             <dbl> 0, 0, 0, 0, 0, 0...
## $ holiday_name_l7.Thanksgiving                  <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name_l7.Veterans Day`                <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name_l7.Veterans Day (observed)`     <dbl> 0, 0, 0, 0, 0, 0...
## $ el_rides_l07                                  <dbl> 1140, 1043, 1081...
## $ el_rides_l14                                  <dbl> 1167, 1093, 1102...
## $ el_rides_l21                                  <dbl> 1063, 1065, 1127...
## $ el_rides_l28                                  <dbl> 1179, 1048, 1177...
## $ el_rides_ma07                                 <dbl> 956.2857, 964.14...
## $ el_rides_ma14                                 <dbl> 970.8571, 971.21...
## $ el_rides_ma21                                 <dbl> 990.2381, 991.80...
## $ el_rides_ma28                                 <dbl> 968.5000, 970.28...
# Rows x columns of every per-stop data frame once the nzv filter is applied.
message("after near-zero variable reduction")
## after near-zero variable reduction
map(DV_nzv_predict, dim)
## $`40600`
## [1] 905  69
## 
## $`41140`
## [1] 905  70
## 
## $`40120`
## [1] 905  72
## 
## $`40910`
## [1] 875  71
## 
## $`40380`
## [1] 905  70
## 
## $`41660`
## [1] 905  72
# Column layout for stop 41140 after the nzv filter.
glimpse(DV_nzv_predict[["41140"]])
## Observations: 905
## Variables: 70
## $ el_rides                           <dbl> 1084, 1098, 1019, 1204, 115...
## $ el_date                            <date> 2013-07-29, 2013-07-30, 20...
## $ data_use_el_stop_id                <chr> "01_train", "01_train", "01...
## $ divvy_pt5mi_stn_cnt                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ divvy_mindist_miles                <dbl> 1.486693, 1.486693, 1.48669...
## $ year.2013                          <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
## $ year.2014                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ year.2015                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ half.1                             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ half.2                             <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
## $ quarter.1                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ quarter.2                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ quarter.3                          <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
## $ quarter.4                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ month.1                            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ month.2                            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ month.3                            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ month.4                            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ month.5                            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ month.6                            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ month.7                            <dbl> 1, 1, 1, 0, 0, 0, 0, 0, 0, ...
## $ month.8                            <dbl> 0, 0, 0, 1, 1, 1, 1, 1, 1, ...
## $ month.9                            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ month.10                           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ month.11                           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ month.12                           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ wday.lbl.Friday                    <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, ...
## $ wday.lbl.Monday                    <dbl> 1, 0, 0, 0, 0, 0, 0, 1, 0, ...
## $ wday.lbl.Saturday                  <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, ...
## $ wday.lbl.Sunday                    <dbl> 0, 0, 0, 0, 0, 0, 1, 0, 0, ...
## $ wday.lbl.Thursday                  <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, ...
## $ wday.lbl.Tuesday                   <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 1, ...
## $ wday.lbl.Wednesday                 <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, ...
## $ mweek.1                            <dbl> 0, 0, 0, 0, 1, 1, 1, 0, 0, ...
## $ mweek.2                            <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 1, ...
## $ mweek.3                            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ mweek.4                            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ mweek.5                            <dbl> 1, 1, 1, 1, 0, 0, 0, 0, 0, ...
## $ tmin_bands.03_00to25               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ tmin_bands.04_25to50               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ tmin_bands.05_50to75               <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
## $ tmax_bands.02_00to25               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ tmax_bands.03_25to50               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ tmax_bands.04_50to75               <dbl> 0, 1, 1, 0, 0, 0, 0, 0, 0, ...
## $ tmax_bands.05_75to100              <dbl> 1, 0, 0, 1, 1, 1, 1, 1, 1, ...
## $ tmin_bands_l7.03_00to25            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ tmin_bands_l7.04_25to50            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ tmin_bands_l7.05_50to75            <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
## $ tmax_bands_l7.02_00to25            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ tmax_bands_l7.03_25to50            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ tmax_bands_l7.04_50to75            <dbl> 0, 1, 1, 0, 0, 0, 0, 0, 0, ...
## $ tmax_bands_l7.05_75to100           <dbl> 1, 0, 0, 1, 1, 1, 1, 1, 1, ...
## $ divvy_all_trip_cnt_cus_l7          <dbl> 1705, 1492, 1472, 1585, 116...
## $ divvy_all_triptime_mean_cus_l7     <dbl> 2165.246, 1798.073, 1826.37...
## $ divvy_all_triptime_med_cus_l7      <dbl> 1441.0, 1273.5, 1217.5, 127...
## $ divvy_all_trip_cnt_sub_l7          <dbl> 1029, 1179, 1191, 1225, 991...
## $ divvy_all_triptime_mean_sub_l7     <dbl> 819.4402, 745.8338, 760.988...
## $ divvy_all_triptime_med_sub_l7      <dbl> 635.0, 614.0, 620.0, 617.0,...
## $ divvy_mindist_trip_cnt_cus_l7      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ divvy_mindist_trip_cnt_sub_l7      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ divvy_mindist_triptime_mean_cus_l7 <dbl> 0.00, 0.00, 0.00, 0.00, 0.0...
## $ divvy_mindist_triptime_mean_sub_l7 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ el_rides_l07                       <dbl> 1140, 1043, 1081, 1084, 109...
## $ el_rides_l14                       <dbl> 1167, 1093, 1102, 1051, 110...
## $ el_rides_l21                       <dbl> 1063, 1065, 1127, 1150, 121...
## $ el_rides_l28                       <dbl> 1179, 1048, 1177, 687, 951,...
## $ el_rides_ma07                      <dbl> 956.2857, 964.1429, 955.285...
## $ el_rides_ma14                      <dbl> 970.8571, 971.2143, 965.285...
## $ el_rides_ma21                      <dbl> 990.2381, 991.8095, 986.666...
## $ el_rides_ma28                      <dbl> 968.5000, 970.2857, 964.642...
# Drop the fitted nzv filter objects from the workspace.
rm(list = "DV_nzv_list")

Next, I use caret's correlation filter (caret::preProcess with method = "corr") to remove highly correlated variables.

# Fit caret's correlation filter separately on each element of DV_nzv_predict
# (the nzv-filtered data, one data frame per el stop). Only "corr" is
# requested; the combined variant
# c("nzv", "corr", "center", "scale", "medianImpute") stays disabled.
# NOTE(review): el_rides is part of the data handed to preProcess() -- confirm
# that it should take part in the correlation filter.
DV_corr_list <-
  map(DV_nzv_predict,
      function(station_df) {
        preProcess(station_df, method = "corr")
        }
      )

# Apply each fitted filter to the data frame it was fitted on; map2() pairs
# the two lists element by element.
DV_corr_predict <-
  map2(DV_corr_list, DV_nzv_predict, predict)


# Write the filtered list to the interim data folder under wd.
saveRDS(object = DV_corr_predict,
        file = paste0(wd, "/Data/Interim/", "DV_corr_predict.Rds")
        )

# DV_corr_predict <-
#   readRDS(paste0(wd,
#                  "/Data/Interim/",
#                  "DV_corr_predict.Rds"
#                  )
#           )


# Rows x columns of every per-stop data frame going into the correlation filter.
message("before corr reduction")
## before corr reduction
map(DV_nzv_predict, dim)
## $`40600`
## [1] 905  69
## 
## $`41140`
## [1] 905  70
## 
## $`40120`
## [1] 905  72
## 
## $`40910`
## [1] 875  71
## 
## $`40380`
## [1] 905  70
## 
## $`41660`
## [1] 905  72
# Column layout for stop 41140 going into the correlation filter.
glimpse(DV_nzv_predict[["41140"]])
## Observations: 905
## Variables: 70
## $ el_rides                           <dbl> 1084, 1098, 1019, 1204, 115...
## $ el_date                            <date> 2013-07-29, 2013-07-30, 20...
## $ data_use_el_stop_id                <chr> "01_train", "01_train", "01...
## $ divvy_pt5mi_stn_cnt                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ divvy_mindist_miles                <dbl> 1.486693, 1.486693, 1.48669...
## $ year.2013                          <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
## $ year.2014                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ year.2015                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ half.1                             <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ half.2                             <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
## $ quarter.1                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ quarter.2                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ quarter.3                          <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
## $ quarter.4                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ month.1                            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ month.2                            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ month.3                            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ month.4                            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ month.5                            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ month.6                            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ month.7                            <dbl> 1, 1, 1, 0, 0, 0, 0, 0, 0, ...
## $ month.8                            <dbl> 0, 0, 0, 1, 1, 1, 1, 1, 1, ...
## $ month.9                            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ month.10                           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ month.11                           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ month.12                           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ wday.lbl.Friday                    <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, ...
## $ wday.lbl.Monday                    <dbl> 1, 0, 0, 0, 0, 0, 0, 1, 0, ...
## $ wday.lbl.Saturday                  <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, ...
## $ wday.lbl.Sunday                    <dbl> 0, 0, 0, 0, 0, 0, 1, 0, 0, ...
## $ wday.lbl.Thursday                  <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, ...
## $ wday.lbl.Tuesday                   <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 1, ...
## $ wday.lbl.Wednesday                 <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, ...
## $ mweek.1                            <dbl> 0, 0, 0, 0, 1, 1, 1, 0, 0, ...
## $ mweek.2                            <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 1, ...
## $ mweek.3                            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ mweek.4                            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ mweek.5                            <dbl> 1, 1, 1, 1, 0, 0, 0, 0, 0, ...
## $ tmin_bands.03_00to25               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ tmin_bands.04_25to50               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ tmin_bands.05_50to75               <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
## $ tmax_bands.02_00to25               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ tmax_bands.03_25to50               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ tmax_bands.04_50to75               <dbl> 0, 1, 1, 0, 0, 0, 0, 0, 0, ...
## $ tmax_bands.05_75to100              <dbl> 1, 0, 0, 1, 1, 1, 1, 1, 1, ...
## $ tmin_bands_l7.03_00to25            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ tmin_bands_l7.04_25to50            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ tmin_bands_l7.05_50to75            <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
## $ tmax_bands_l7.02_00to25            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ tmax_bands_l7.03_25to50            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ tmax_bands_l7.04_50to75            <dbl> 0, 1, 1, 0, 0, 0, 0, 0, 0, ...
## $ tmax_bands_l7.05_75to100           <dbl> 1, 0, 0, 1, 1, 1, 1, 1, 1, ...
## $ divvy_all_trip_cnt_cus_l7          <dbl> 1705, 1492, 1472, 1585, 116...
## $ divvy_all_triptime_mean_cus_l7     <dbl> 2165.246, 1798.073, 1826.37...
## $ divvy_all_triptime_med_cus_l7      <dbl> 1441.0, 1273.5, 1217.5, 127...
## $ divvy_all_trip_cnt_sub_l7          <dbl> 1029, 1179, 1191, 1225, 991...
## $ divvy_all_triptime_mean_sub_l7     <dbl> 819.4402, 745.8338, 760.988...
## $ divvy_all_triptime_med_sub_l7      <dbl> 635.0, 614.0, 620.0, 617.0,...
## $ divvy_mindist_trip_cnt_cus_l7      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ divvy_mindist_trip_cnt_sub_l7      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ divvy_mindist_triptime_mean_cus_l7 <dbl> 0.00, 0.00, 0.00, 0.00, 0.0...
## $ divvy_mindist_triptime_mean_sub_l7 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ el_rides_l07                       <dbl> 1140, 1043, 1081, 1084, 109...
## $ el_rides_l14                       <dbl> 1167, 1093, 1102, 1051, 110...
## $ el_rides_l21                       <dbl> 1063, 1065, 1127, 1150, 121...
## $ el_rides_l28                       <dbl> 1179, 1048, 1177, 687, 951,...
## $ el_rides_ma07                      <dbl> 956.2857, 964.1429, 955.285...
## $ el_rides_ma14                      <dbl> 970.8571, 971.2143, 965.285...
## $ el_rides_ma21                      <dbl> 990.2381, 991.8095, 986.666...
## $ el_rides_ma28                      <dbl> 968.5000, 970.2857, 964.642...
# Print the (rows, columns) of every element of DV_corr_predict, i.e. the data
# after the correlation filter dropped its flagged columns.
message("after corr variable reduction")
## after corr variable reduction
map(DV_corr_predict, dim)
## $`40600`
## [1] 905  58
## 
## $`41140`
## [1] 905  58
## 
## $`40120`
## [1] 905  57
## 
## $`40910`
## [1] 875  59
## 
## $`40380`
## [1] 905  59
## 
## $`41660`
## [1] 905  60
# Inspect the columns of the same element (key "41140") after filtering, for
# comparison with the pre-filter listing.
glimpse(DV_corr_predict[["41140"]])
## Observations: 905
## Variables: 58
## $ el_rides                           <dbl> 1084, 1098, 1019, 1204, 115...
## $ el_date                            <date> 2013-07-29, 2013-07-30, 20...
## $ data_use_el_stop_id                <chr> "01_train", "01_train", "01...
## $ divvy_pt5mi_stn_cnt                <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ year.2013                          <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
## $ year.2014                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ year.2015                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ half.2                             <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
## $ quarter.1                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ quarter.2                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ quarter.3                          <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
## $ quarter.4                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ month.1                            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ month.2                            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ month.3                            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ month.4                            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ month.5                            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ month.6                            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ month.7                            <dbl> 1, 1, 1, 0, 0, 0, 0, 0, 0, ...
## $ month.8                            <dbl> 0, 0, 0, 1, 1, 1, 1, 1, 1, ...
## $ month.9                            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ month.10                           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ month.11                           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ month.12                           <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ wday.lbl.Friday                    <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, ...
## $ wday.lbl.Monday                    <dbl> 1, 0, 0, 0, 0, 0, 0, 1, 0, ...
## $ wday.lbl.Saturday                  <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, ...
## $ wday.lbl.Sunday                    <dbl> 0, 0, 0, 0, 0, 0, 1, 0, 0, ...
## $ wday.lbl.Thursday                  <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, ...
## $ wday.lbl.Tuesday                   <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 1, ...
## $ wday.lbl.Wednesday                 <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, ...
## $ mweek.1                            <dbl> 0, 0, 0, 0, 1, 1, 1, 0, 0, ...
## $ mweek.2                            <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 1, ...
## $ mweek.3                            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ mweek.4                            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ mweek.5                            <dbl> 1, 1, 1, 1, 0, 0, 0, 0, 0, ...
## $ tmax_bands.04_50to75               <dbl> 0, 1, 1, 0, 0, 0, 0, 0, 0, ...
## $ tmin_bands_l7.03_00to25            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ tmin_bands_l7.04_25to50            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ tmin_bands_l7.05_50to75            <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
## $ tmax_bands_l7.02_00to25            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ tmax_bands_l7.03_25to50            <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ tmax_bands_l7.05_75to100           <dbl> 1, 0, 0, 1, 1, 1, 1, 1, 1, ...
## $ divvy_all_trip_cnt_cus_l7          <dbl> 1705, 1492, 1472, 1585, 116...
## $ divvy_all_triptime_mean_cus_l7     <dbl> 2165.246, 1798.073, 1826.37...
## $ divvy_all_triptime_med_cus_l7      <dbl> 1441.0, 1273.5, 1217.5, 127...
## $ divvy_all_trip_cnt_sub_l7          <dbl> 1029, 1179, 1191, 1225, 991...
## $ divvy_all_triptime_mean_sub_l7     <dbl> 819.4402, 745.8338, 760.988...
## $ divvy_all_triptime_med_sub_l7      <dbl> 635.0, 614.0, 620.0, 617.0,...
## $ divvy_mindist_trip_cnt_cus_l7      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ divvy_mindist_trip_cnt_sub_l7      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ divvy_mindist_triptime_mean_cus_l7 <dbl> 0.00, 0.00, 0.00, 0.00, 0.0...
## $ divvy_mindist_triptime_mean_sub_l7 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ el_rides_l07                       <dbl> 1140, 1043, 1081, 1084, 109...
## $ el_rides_l14                       <dbl> 1167, 1093, 1102, 1051, 110...
## $ el_rides_l21                       <dbl> 1063, 1065, 1127, 1150, 121...
## $ el_rides_l28                       <dbl> 1179, 1048, 1177, 687, 951,...
## $ el_rides_ma07                      <dbl> 956.2857, 964.1429, 955.285...
# Drop the fitted preProcess objects from the workspace now that
# DV_corr_predict has been built and saved.
rm(list = "DV_corr_list")

Modeling Parameters

Modeling parameters used in multiple models.

# Window lengths, in days, shared by several models. Each window is padded
# with 30 extra days that the LSTM Keras modeling needs.
# R rounds exact halves to the even integer, so round(547.5) is 548 and
# round(182.5) is 182; the resulting windows are 578 and 212 days.

# training window: 1.5 years + 30 days
period_train <- 30 + round(365 * 1.5)

# test window: 0.5 years + 30 days (even though we just predict 14 days out)
period_test <- 30 + round(365 * 0.5)

# gives 13 evenly distributed splits
skip_span <- 8


# Persist the training-window length under <wd>/Data/Interim/ so it can be
# reloaded with readRDS().
saveRDS(
  object = period_train,
  file = paste0(wd, "/Data/Interim/", "period_train.Rds")
)

# period_train <-
#   readRDS(paste0(wd,
#                  "/Data/Interim/",
#                  "period_train.Rds"
#                  )
#           )

# Persist the test-window length under <wd>/Data/Interim/ so it can be
# reloaded with readRDS().
saveRDS(
  object = period_test,
  file = paste0(wd, "/Data/Interim/", "period_test.Rds")
)

# period_test <-
#   readRDS(paste0(wd,
#                  "/Data/Interim/",
#                  "period_test.Rds"
#                  )
#           )

# Persist the skip setting under <wd>/Data/Interim/ so it can be reloaded
# with readRDS().
saveRDS(
  object = skip_span,
  file = paste0(wd, "/Data/Interim/", "skip_span.Rds")
)

# skip_span <-
#   readRDS(paste0(wd,
#                  "/Data/Interim/",
#                  "skip_span.Rds"
#                  )
#           )