Load the relevant libraries.
# rm(list = ls())
# .rs.restartR()
# data manipulation
library("plyr")
library("tidyverse")
library("magrittr")
library("data.table")
library("lubridate")
library("sqldf")
# time series specific packages
library("timetk")
library("zoo")
library("tibbletime")
# modeling
library("fpp2")
library("prophet")
library("caret")
library("randomForest")
library("xgboost")
library("h2o")
library("keras")
# use_session_with_seed(123456789) # setting the seed to obtain reproducible results
# see https://keras.rstudio.com/articles/faq.html#how-can-i-obtain-reproducible-results-using-keras-during-development and https://cran.r-project.org/web/packages/keras/vignettes/faq.html
# can also re-enable gpu and parallel processing by using: use_session_with_seed(42, disable_gpu = FALSE, disable_parallel_cpu = FALSE)
# other
library("geosphere") # specific for distance calculations from lat-lon pairs
library("naniar") # inspecting missing data
library("rlang") # building functions
library("recipes") # used in Keras modeling to design matrices
library("rsample") # rolling samples for validation stats
library("tfruns") # used in Keras modeling for training runs
library("stringr") # string manipulation
library("ggplot2") # viz
library("sweep") # more easily pull out model statistics
library("yardstick") # easily calculate accuracy stats
library("doParallel") # parallel processing
Session Info.
# Print the R version, platform, and attached/loaded package versions.
sessionInfo()
## R version 3.5.1 (2018-07-02)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS High Sierra 10.13.6
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] parallel stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] doParallel_1.0.14 iterators_1.0.10 foreach_1.4.4
## [4] yardstick_0.0.2 sweep_0.2.1.1 tfruns_1.4
## [7] rsample_0.0.3 recipes_0.1.4 rlang_0.3.0.1
## [10] naniar_0.4.1 geosphere_1.5-7 keras_2.2.4
## [13] h2o_3.20.0.8 xgboost_0.71.2 randomForest_4.6-14
## [16] caret_6.0-81 lattice_0.20-38 prophet_0.3.0.1
## [19] Rcpp_1.0.0 fpp2_2.3 expsmooth_2.3
## [22] fma_2.3 forecast_8.4 tibbletime_0.1.1
## [25] zoo_1.8-4 timetk_0.1.1.1 sqldf_0.4-11
## [28] RSQLite_2.1.1 gsubfn_0.7 proto_1.0.0
## [31] lubridate_1.7.4 data.table_1.11.8 magrittr_1.5
## [34] forcats_0.3.0 stringr_1.3.1 dplyr_0.7.8
## [37] purrr_0.2.5 readr_1.2.1 tidyr_0.8.2
## [40] tibble_1.4.2 ggplot2_3.1.0 tidyverse_1.2.1
## [43] plyr_1.8.4
##
## loaded via a namespace (and not attached):
## [1] colorspace_1.3-2 class_7.3-14 visdat_0.5.1
## [4] rprojroot_1.3-2 base64enc_0.1-3 rstudioapi_0.8
## [7] rstan_2.18.2 bit64_0.9-7 prodlim_2018.04.18
## [10] xml2_1.2.0 codetools_0.2-15 splines_3.5.1
## [13] knitr_1.20 zeallot_0.1.0 jsonlite_1.5
## [16] pROC_1.13.0 broom_0.5.0 compiler_3.5.1
## [19] httr_1.3.1 backports_1.1.2 assertthat_0.2.0
## [22] Matrix_1.2-15 lazyeval_0.2.1 cli_1.0.1
## [25] htmltools_0.3.6 prettyunits_1.0.2 tools_3.5.1
## [28] bindrcpp_0.2.2 gtable_0.2.0 glue_1.3.0
## [31] reshape2_1.4.3 cellranger_1.1.0 fracdiff_1.4-2
## [34] urca_1.3-0 debugme_1.1.0 nlme_3.1-137
## [37] lmtest_0.9-36 timeDate_3043.102 gower_0.1.2
## [40] ps_1.2.1 rvest_0.3.2 MASS_7.3-51.1
## [43] scales_1.0.0 ipred_0.9-8 hms_0.4.2
## [46] inline_0.3.15 yaml_2.2.0 quantmod_0.4-13
## [49] curl_3.2 reticulate_1.10 memoise_1.1.0
## [52] gridExtra_2.3 loo_2.0.0 StanHeaders_2.18.0
## [55] uroot_2.0-9 rpart_4.1-13 stringi_1.2.4
## [58] tensorflow_1.10 tseries_0.10-46 TTR_0.23-4
## [61] pkgbuild_1.0.2 lava_1.6.4 chron_2.3-53
## [64] bitops_1.0-6 pkgconfig_2.0.2 matrixStats_0.54.0
## [67] evaluate_0.12 bindr_0.1.1 bit_1.1-14
## [70] processx_3.2.0 tidyselect_0.2.5 R6_2.3.0
## [73] generics_0.0.2 DBI_1.0.0 whisker_0.3-2
## [76] pillar_1.3.0 haven_2.0.0 withr_2.1.2
## [79] xts_0.11-2 sp_1.3-1 RCurl_1.95-4.11
## [82] survival_2.43-3 nnet_7.3-12 modelr_0.1.2
## [85] crayon_1.3.4 rmarkdown_1.10 grid_3.5.1
## [88] readxl_1.1.0 blob_1.1.1 callr_3.0.0
## [91] ModelMetrics_1.2.2 digest_0.6.18 stats4_3.5.1
## [94] munsell_0.5.0 tcltk_3.5.1 quadprog_1.5-5
Setup the root directory.
Setting wd as the working directory.
# Store the current working directory; file paths below are built from it.
wd <- getwd()
# Print the path so it appears in the rendered output.
wd
## [1] "/Users/mdturse/Desktop/Analytics/Chicago_El_Divvy"
NOTE: remove_first_28na_rows is the output produced in Step 01
# Load the Step 01 output from the interim data folder.
remove_first_28na_rows <- readRDS(
  file = paste0(wd, "/Data/Interim/", "remove_first_28na_rows.Rds")
)
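I’ll be testing models that use the entire dataset, and also models that are specific to each el_stop_id (i.e., a model for each El entry point). Therefore, I’ll create train/validation/test splits for each group: the first 80% of the date range is used for training plus validation, and the first 70% of that period is used for training.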
First, I’ll create a function to create the proper value depending on the situation (i.e., creating the split for the entire dataset, or for each value of el_stop_id as a single entity).
# Label each row of `data` as train, validation, or test by calendar date.
#
# The span between the earliest and latest value of `date_var` is split
# chronologically: the first `train_val_pct` of the span is train + validation,
# and the first `train_pct` of that sub-span is train.
#
# Args:
#   data:           data frame containing the date column.
#   date_var:       unquoted name of the date column to split on.
#   train_val_pct:  fraction of the date span used for train + validation.
#   train_pct:      fraction of the train + validation span used for train.
#   new_col_suffix: suffix of the added column, named "data_use_<suffix>".
#
# Returns:
#   `data` with one added character column whose values are "01_train",
#   "02_validation", or "03_test".
func_train_test_indicator <-
  function(data, date_var, train_val_pct, train_pct, new_col_suffix) {
    date_var_enquo <- enquo(date_var)
    new_col_name <- paste0("data_use_", new_col_suffix)

    # Earliest and latest dates present; missing dates are ignored.
    dates <- pull(data, !!date_var_enquo)
    date_min <- min(dates, na.rm = TRUE)
    date_max <- max(dates, na.rm = TRUE)

    # Last date that still belongs to the train + validation period.
    days_trainval <- round(train_val_pct * (date_max - date_min))
    split_date_trainval <- date_min + days_trainval

    # First date of the validation period; train is strictly before it.
    days_train <- round(train_pct * (split_date_trainval - date_min))
    split_date_train <- date_min + days_train

    # The label is derived from the caller-supplied `date_var` column and is
    # written directly under its final name, so an existing column called
    # `new_var` is left untouched. Plain comparisons are used so the result
    # does not depend on which package's `between()` is attached last.
    # Rows that match neither comparison (including missing dates) fall
    # through to "03_test".
    mutate(
      data,
      !!new_col_name := case_when(
        (!!date_var_enquo) < split_date_train ~ "01_train",
        (!!date_var_enquo) <= split_date_trainval ~ "02_validation",
        TRUE ~ "03_test"
      )
    )
  }
Here I run the function for each value of el_stop_id.
# Add the train / validation / test label to every element of the list and
# re-create `wday.lbl` as a factor built from its character values.
add_trn_val_test <- map(
  remove_first_28na_rows,
  function(stop_data) {
    labelled <- func_train_test_indicator(
      data = stop_data,
      date_var = el_date,
      train_val_pct = 0.8,
      train_pct = 0.7,
      new_col_suffix = "el_stop_id"
    )
    mutate(labelled, wday.lbl = factor(as.character(wday.lbl)))
  }
)

## Save the data to the proper folder
saveRDS(
  object = add_trn_val_test,
  file = paste0(wd, "/Data/Interim/", "add_trn_val_test.Rds")
)

# add_trn_val_test <- readRDS(
#   paste0(wd, "/Data/Interim/", "add_trn_val_test.Rds")
# )

# Inspect the structure of one element.
str(add_trn_val_test[["41140"]])
## 'data.frame': 1617 obs. of 52 variables:
## $ el_date : Date, format: "2013-07-29" "2013-07-30" ...
## $ el_stop_id : num 41140 41140 41140 41140 41140 ...
## $ divvy_pt5mi_stn_cnt : int 0 0 0 0 0 0 0 0 0 0 ...
## $ divvy_mindist_miles : num 1.49 1.49 1.49 1.49 1.49 ...
## $ el_rides : num 1084 1098 1019 1204 1159 ...
## $ holiday_name : Factor w/ 22 levels "--Not_Holiday--",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ holiday_comment : chr NA NA NA NA ...
## $ holiday : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ year : Factor w/ 6 levels "2013","2014",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ half : Factor w/ 2 levels "1","2": 2 2 2 2 2 2 2 2 2 2 ...
## $ quarter : Factor w/ 4 levels "1","2","3","4": 3 3 3 3 3 3 3 3 3 3 ...
## $ month : Factor w/ 12 levels "1","2","3","4",..: 7 7 7 8 8 8 8 8 8 8 ...
## $ day : Factor w/ 31 levels "1","2","3","4",..: 29 30 31 1 2 3 4 5 6 7 ...
## $ wday.lbl : Factor w/ 7 levels "Friday","Monday",..: 2 6 7 5 1 3 4 2 6 7 ...
## $ mweek : Factor w/ 6 levels "1","2","3","4",..: 5 5 5 5 1 1 1 2 2 2 ...
## $ tmin_bands : Factor w/ 5 levels "02_-25to00","03_00to25",..: 4 4 4 4 4 4 4 4 4 4 ...
## $ tmax_bands : Factor w/ 4 levels "02_00to25","03_25to50",..: 4 3 3 4 4 4 4 4 4 4 ...
## $ tmin_bands_l7 : Factor w/ 5 levels "02_-25to00","03_00to25",..: 4 4 4 4 4 4 4 4 4 4 ...
## $ tmax_bands_l7 : Factor w/ 4 levels "02_00to25","03_25to50",..: 4 3 3 4 4 4 4 4 4 4 ...
## $ divvy_pt5mi_trip_cnt_cus_l7 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ divvy_pt5mi_triptime_mean_cus_l7 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ divvy_pt5mi_trip_cnt_dep_l7 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ divvy_pt5mi_triptime_mean_dep_l7 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ divvy_pt5mi_trip_cnt_sub_l7 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ divvy_pt5mi_triptime_mean_sub_l7 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ divvy_all_trip_cnt_cus_l7 : num 1705 1492 1472 1585 1166 ...
## $ divvy_all_triptime_mean_cus_l7 : num 2165 1798 1826 1823 1828 ...
## $ divvy_all_triptime_med_cus_l7 : num 1441 1274 1218 1278 1240 ...
## $ divvy_all_trip_cnt_dep_l7 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ divvy_all_triptime_mean_dep_l7 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ divvy_all_triptime_med_dep_l7 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ divvy_all_trip_cnt_sub_l7 : num 1029 1179 1191 1225 991 ...
## $ divvy_all_triptime_mean_sub_l7 : num 819 746 761 845 704 ...
## $ divvy_all_triptime_med_sub_l7 : num 635 614 620 617 597 ...
## $ divvy_mindist_trip_cnt_cus_l7 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ divvy_mindist_trip_cnt_dep_l7 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ divvy_mindist_trip_cnt_sub_l7 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ divvy_mindist_triptime_mean_cus_l7: num 0 0 0 0 0 0 0 0 0 0 ...
## $ divvy_mindist_triptime_mean_dep_l7: num 0 0 0 0 0 0 0 0 0 0 ...
## $ divvy_mindist_triptime_mean_sub_l7: num 0 0 0 0 0 0 0 0 0 0 ...
## $ holiday_name_l7 : Factor w/ 22 levels "--Not_Holiday--",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ holiday_comment_l7 : chr NA NA NA NA ...
## $ holiday_l7 : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ el_rides_l07 : num 1140 1043 1081 1084 1094 ...
## $ el_rides_l14 : num 1167 1093 1102 1051 1101 ...
## $ el_rides_l21 : num 1063 1065 1127 1150 1210 ...
## $ el_rides_l28 : num 1179 1048 1177 687 951 ...
## $ el_rides_ma07 : num 956 964 955 972 982 ...
## $ el_rides_ma14 : num 971 971 965 976 980 ...
## $ el_rides_ma21 : num 990 992 987 989 987 ...
## $ el_rides_ma28 : num 968 970 965 983 991 ...
## $ data_use_el_stop_id : chr "01_train" "01_train" "01_train" "01_train" ...
# For each element, show the first and last date within each data-use label.
map(
  add_trn_val_test,
  function(stop_data) {
    stop_data %>%
      group_by(data_use_el_stop_id) %>%
      summarise(
        date_min = min(el_date),
        date_max = max(el_date)
      )
  }
)
## $`40600`
## # A tibble: 3 x 3
## data_use_el_stop_id date_min date_max
## <chr> <date> <date>
## 1 01_train 2013-07-29 2016-01-19
## 2 02_validation 2016-01-20 2017-02-11
## 3 03_test 2017-02-12 2017-12-31
##
## $`41140`
## # A tibble: 3 x 3
## data_use_el_stop_id date_min date_max
## <chr> <date> <date>
## 1 01_train 2013-07-29 2016-01-19
## 2 02_validation 2016-01-20 2017-02-11
## 3 03_test 2017-02-12 2017-12-31
##
## $`40120`
## # A tibble: 3 x 3
## data_use_el_stop_id date_min date_max
## <chr> <date> <date>
## 1 01_train 2013-07-29 2016-01-19
## 2 02_validation 2016-01-20 2017-02-11
## 3 03_test 2017-02-12 2017-12-31
##
## $`40910`
## # A tibble: 3 x 3
## data_use_el_stop_id date_min date_max
## <chr> <date> <date>
## 1 01_train 2013-07-29 2016-01-19
## 2 02_validation 2016-01-20 2017-02-11
## 3 03_test 2017-02-12 2017-12-31
##
## $`40380`
## # A tibble: 3 x 3
## data_use_el_stop_id date_min date_max
## <chr> <date> <date>
## 1 01_train 2013-07-29 2016-01-19
## 2 02_validation 2016-01-20 2017-02-11
## 3 03_test 2017-02-12 2017-12-31
##
## $`41660`
## # A tibble: 3 x 3
## data_use_el_stop_id date_min date_max
## <chr> <date> <date>
## 1 01_train 2013-07-29 2016-01-19
## 2 02_validation 2016-01-20 2017-02-11
## 3 03_test 2017-02-12 2017-12-31
# Remove the raw input list and the split helper from the workspace.
rm(remove_first_28na_rows, func_train_test_indicator)
To Delete (below) when no longer needed
# add_trn_val_test <-
# readRDS(paste0(wd,
# "/Data/Interim/",
# "add_trn_val_test.Rds"
# )
# )
#
#
# str(add_trn_val_test$`41140`)
To Delete (above) when no longer needed
Turn factor variables into dummy variables. NOTE that this is done with all levels of the factor. That is, it does NOT use “full rank parameterization” to leave one level out (e.g., to be used as the linear model’s intercept).
# create the function to create dummy vars
# One-hot encode the factor predictors of one data frame.
#
# Args:
#   x: data frame containing `el_rides`, `el_date`, `el_stop_id`,
#      `data_use_el_stop_id`, the five columns dropped below, and the
#      predictor columns to encode.
#
# Returns:
#   A data frame (returned visibly) whose first four columns are `el_rides`,
#   `el_date`, `el_stop_id` and `data_use_el_stop_id`, followed by the encoded
#   predictors. Every factor level gets its own 0/1 column
#   (`fullRank = FALSE`).
func_one_hot_vars <-
  function(x) {
    # Columns excluded from the encoded output.
    data_s <- x %>%
      select(
        -holiday_comment,
        -holiday,
        -day,
        -holiday_comment_l7,
        -holiday_l7
      )

    # Outcome plus predictors, built once and reused to fit and to apply the
    # encoder; the identifier columns are kept out of the encoding.
    model_data <- data_s %>%
      select(-el_date, -el_stop_id, -data_use_el_stop_id)

    encoder <- dummyVars(el_rides ~ ., data = model_data, fullRank = FALSE)
    encoded <- as.data.frame(predict(object = encoder, newdata = model_data))

    data_s %>%
      select(el_rides, el_date, el_stop_id, data_use_el_stop_id) %>%
      bind_cols(encoded)
  }
# Save the encoder function to disk.
saveRDS(
  object = func_one_hot_vars,
  file = paste0(wd, "/Data/Interim/", "func_one_hot_vars.Rds")
)

# func_one_hot_vars <- readRDS(
#   paste0(wd, "/Data/Interim/", "func_one_hot_vars.Rds")
# )

# run the function on every element of the list
DV_data <- map(add_trn_val_test, func_one_hot_vars)

glimpse(DV_data[["41140"]])
## Observations: 1,617
## Variables: 134
## $ el_rides <dbl> 1084, 1098, 1019...
## $ el_date <date> 2013-07-29, 201...
## $ el_stop_id <dbl> 41140, 41140, 41...
## $ data_use_el_stop_id <chr> "01_train", "01_...
## $ divvy_pt5mi_stn_cnt <dbl> 0, 0, 0, 0, 0, 0...
## $ divvy_mindist_miles <dbl> 1.486693, 1.4866...
## $ `holiday_name.--Not_Holiday--` <dbl> 1, 1, 1, 1, 1, 1...
## $ `holiday_name.Casimir Pulaski Day` <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name.Christmas Day` <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name.Christmas Day (in lieu)` <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name.Columbus Day` <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name.Day after Christmas` <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name.Day after Thanksgiving` <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name.Election Day` <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name.Father's Day` <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name.Independence Day` <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name.Independence Day (observed)` <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name.Labor Day` <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name.Lincoln's Birthday` <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name.Martin Luther King Jr. Day` <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name.Memorial Day` <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name.Mother's Day` <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name.New Years Day` <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name.New Years Day Holiday` <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name.Presidents' Day` <dbl> 0, 0, 0, 0, 0, 0...
## $ holiday_name.Thanksgiving <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name.Veterans Day` <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name.Veterans Day (observed)` <dbl> 0, 0, 0, 0, 0, 0...
## $ year.2013 <dbl> 1, 1, 1, 1, 1, 1...
## $ year.2014 <dbl> 0, 0, 0, 0, 0, 0...
## $ year.2015 <dbl> 0, 0, 0, 0, 0, 0...
## $ year.2016 <dbl> 0, 0, 0, 0, 0, 0...
## $ year.2017 <dbl> 0, 0, 0, 0, 0, 0...
## $ year.2018 <dbl> 0, 0, 0, 0, 0, 0...
## $ half.1 <dbl> 0, 0, 0, 0, 0, 0...
## $ half.2 <dbl> 1, 1, 1, 1, 1, 1...
## $ quarter.1 <dbl> 0, 0, 0, 0, 0, 0...
## $ quarter.2 <dbl> 0, 0, 0, 0, 0, 0...
## $ quarter.3 <dbl> 1, 1, 1, 1, 1, 1...
## $ quarter.4 <dbl> 0, 0, 0, 0, 0, 0...
## $ month.1 <dbl> 0, 0, 0, 0, 0, 0...
## $ month.2 <dbl> 0, 0, 0, 0, 0, 0...
## $ month.3 <dbl> 0, 0, 0, 0, 0, 0...
## $ month.4 <dbl> 0, 0, 0, 0, 0, 0...
## $ month.5 <dbl> 0, 0, 0, 0, 0, 0...
## $ month.6 <dbl> 0, 0, 0, 0, 0, 0...
## $ month.7 <dbl> 1, 1, 1, 0, 0, 0...
## $ month.8 <dbl> 0, 0, 0, 1, 1, 1...
## $ month.9 <dbl> 0, 0, 0, 0, 0, 0...
## $ month.10 <dbl> 0, 0, 0, 0, 0, 0...
## $ month.11 <dbl> 0, 0, 0, 0, 0, 0...
## $ month.12 <dbl> 0, 0, 0, 0, 0, 0...
## $ wday.lbl.Friday <dbl> 0, 0, 0, 0, 1, 0...
## $ wday.lbl.Monday <dbl> 1, 0, 0, 0, 0, 0...
## $ wday.lbl.Saturday <dbl> 0, 0, 0, 0, 0, 1...
## $ wday.lbl.Sunday <dbl> 0, 0, 0, 0, 0, 0...
## $ wday.lbl.Thursday <dbl> 0, 0, 0, 1, 0, 0...
## $ wday.lbl.Tuesday <dbl> 0, 1, 0, 0, 0, 0...
## $ wday.lbl.Wednesday <dbl> 0, 0, 1, 0, 0, 0...
## $ mweek.1 <dbl> 0, 0, 0, 0, 1, 1...
## $ mweek.2 <dbl> 0, 0, 0, 0, 0, 0...
## $ mweek.3 <dbl> 0, 0, 0, 0, 0, 0...
## $ mweek.4 <dbl> 0, 0, 0, 0, 0, 0...
## $ mweek.5 <dbl> 1, 1, 1, 1, 0, 0...
## $ mweek.6 <dbl> 0, 0, 0, 0, 0, 0...
## $ `tmin_bands.02_-25to00` <dbl> 0, 0, 0, 0, 0, 0...
## $ tmin_bands.03_00to25 <dbl> 0, 0, 0, 0, 0, 0...
## $ tmin_bands.04_25to50 <dbl> 0, 0, 0, 0, 0, 0...
## $ tmin_bands.05_50to75 <dbl> 1, 1, 1, 1, 1, 1...
## $ tmin_bands.06_75to100 <dbl> 0, 0, 0, 0, 0, 0...
## $ tmax_bands.02_00to25 <dbl> 0, 0, 0, 0, 0, 0...
## $ tmax_bands.03_25to50 <dbl> 0, 0, 0, 0, 0, 0...
## $ tmax_bands.04_50to75 <dbl> 0, 1, 1, 0, 0, 0...
## $ tmax_bands.05_75to100 <dbl> 1, 0, 0, 1, 1, 1...
## $ `tmin_bands_l7.02_-25to00` <dbl> 0, 0, 0, 0, 0, 0...
## $ tmin_bands_l7.03_00to25 <dbl> 0, 0, 0, 0, 0, 0...
## $ tmin_bands_l7.04_25to50 <dbl> 0, 0, 0, 0, 0, 0...
## $ tmin_bands_l7.05_50to75 <dbl> 1, 1, 1, 1, 1, 1...
## $ tmin_bands_l7.06_75to100 <dbl> 0, 0, 0, 0, 0, 0...
## $ tmax_bands_l7.02_00to25 <dbl> 0, 0, 0, 0, 0, 0...
## $ tmax_bands_l7.03_25to50 <dbl> 0, 0, 0, 0, 0, 0...
## $ tmax_bands_l7.04_50to75 <dbl> 0, 1, 1, 0, 0, 0...
## $ tmax_bands_l7.05_75to100 <dbl> 1, 0, 0, 1, 1, 1...
## $ divvy_pt5mi_trip_cnt_cus_l7 <dbl> 0, 0, 0, 0, 0, 0...
## $ divvy_pt5mi_triptime_mean_cus_l7 <dbl> 0, 0, 0, 0, 0, 0...
## $ divvy_pt5mi_trip_cnt_dep_l7 <dbl> 0, 0, 0, 0, 0, 0...
## $ divvy_pt5mi_triptime_mean_dep_l7 <dbl> 0, 0, 0, 0, 0, 0...
## $ divvy_pt5mi_trip_cnt_sub_l7 <dbl> 0, 0, 0, 0, 0, 0...
## $ divvy_pt5mi_triptime_mean_sub_l7 <dbl> 0, 0, 0, 0, 0, 0...
## $ divvy_all_trip_cnt_cus_l7 <dbl> 1705, 1492, 1472...
## $ divvy_all_triptime_mean_cus_l7 <dbl> 2165.246, 1798.0...
## $ divvy_all_triptime_med_cus_l7 <dbl> 1441.0, 1273.5, ...
## $ divvy_all_trip_cnt_dep_l7 <dbl> 0, 0, 0, 0, 0, 0...
## $ divvy_all_triptime_mean_dep_l7 <dbl> 0, 0, 0, 0, 0, 0...
## $ divvy_all_triptime_med_dep_l7 <dbl> 0, 0, 0, 0, 0, 0...
## $ divvy_all_trip_cnt_sub_l7 <dbl> 1029, 1179, 1191...
## $ divvy_all_triptime_mean_sub_l7 <dbl> 819.4402, 745.83...
## $ divvy_all_triptime_med_sub_l7 <dbl> 635.0, 614.0, 62...
## $ divvy_mindist_trip_cnt_cus_l7 <dbl> 0, 0, 0, 0, 0, 0...
## $ divvy_mindist_trip_cnt_dep_l7 <dbl> 0, 0, 0, 0, 0, 0...
## $ divvy_mindist_trip_cnt_sub_l7 <dbl> 0, 0, 0, 0, 0, 0...
## $ divvy_mindist_triptime_mean_cus_l7 <dbl> 0.00, 0.00, 0.00...
## $ divvy_mindist_triptime_mean_dep_l7 <dbl> 0, 0, 0, 0, 0, 0...
## $ divvy_mindist_triptime_mean_sub_l7 <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name_l7.--Not_Holiday--` <dbl> 1, 1, 1, 1, 1, 1...
## $ `holiday_name_l7.Casimir Pulaski Day` <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name_l7.Christmas Day` <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name_l7.Christmas Day (in lieu)` <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name_l7.Columbus Day` <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name_l7.Day after Christmas` <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name_l7.Day after Thanksgiving` <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name_l7.Election Day` <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name_l7.Father's Day` <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name_l7.Independence Day` <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name_l7.Independence Day (observed)` <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name_l7.Labor Day` <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name_l7.Lincoln's Birthday` <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name_l7.Martin Luther King Jr. Day` <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name_l7.Memorial Day` <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name_l7.Mother's Day` <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name_l7.New Years Day` <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name_l7.New Years Day Holiday` <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name_l7.Presidents' Day` <dbl> 0, 0, 0, 0, 0, 0...
## $ holiday_name_l7.Thanksgiving <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name_l7.Veterans Day` <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name_l7.Veterans Day (observed)` <dbl> 0, 0, 0, 0, 0, 0...
## $ el_rides_l07 <dbl> 1140, 1043, 1081...
## $ el_rides_l14 <dbl> 1167, 1093, 1102...
## $ el_rides_l21 <dbl> 1063, 1065, 1127...
## $ el_rides_l28 <dbl> 1179, 1048, 1177...
## $ el_rides_ma07 <dbl> 956.2857, 964.14...
## $ el_rides_ma14 <dbl> 970.8571, 971.21...
## $ el_rides_ma21 <dbl> 990.2381, 991.80...
## $ el_rides_ma28 <dbl> 968.5000, 970.28...
# rm(func_one_hot_vars)
I’ll try both random forest and xgboost models (trained through caret) on the dataset USING dummy variables. But first, I need to create some custom accuracy metrics.
# Compute MAE, MAPE, RMSE and R2 from observed and predicted values.
#
# The signature follows caret's `summaryFunction` convention.
#
# Args:
#   data:  data frame with numeric columns `obs` (observed) and `pred`
#          (predicted).
#   lev:   unused; accepted for signature compatibility.
#   model: unused; accepted for signature compatibility.
#
# Returns:
#   Named numeric vector with elements "MAE", "MAPE", "RMSE" and "R2".
#   All four metrics ignore rows where the observed or predicted value is
#   missing.
func_custom_accuracy_metrics <-
  function(data, lev = NULL, model = NULL) {
    # The helpers live inside the function so that it stays self-contained
    # when it is saved and reloaded on its own.
    mae <- function(actual, predicted) {
      mean(abs(actual - predicted), na.rm = TRUE)
    }

    # Expressed in percent. A term is not finite when the observed value is 0
    # and the prediction differs from it.
    mape <- function(actual, predicted) {
      mean(abs((actual - predicted) / actual * 100), na.rm = TRUE)
    }

    rmse <- function(actual, predicted) {
      sqrt(mean((actual - predicted)^2, na.rm = TRUE))
    }

    # 1 - SS_residual / SS_total, computed on complete pairs only so that R2
    # treats missing values the same way as the other three metrics.
    r2 <- function(actual, predicted) {
      keep <- !is.na(actual) & !is.na(predicted)
      actual <- actual[keep]
      predicted <- predicted[keep]
      1 - sum((actual - predicted)^2) / sum((actual - mean(actual))^2)
    }

    c(
      MAE = mae(data$obs, data$pred),
      MAPE = mape(data$obs, data$pred),
      RMSE = rmse(data$obs, data$pred),
      R2 = r2(data$obs, data$pred)
    )
  }
# Save the metric function to disk.
saveRDS(
  object = func_custom_accuracy_metrics,
  file = paste0(
    wd,
    "/Data/Interim/",
    "func_custom_accuracy_metrics.Rds"
  )
)

# Reload from the same folder the file is saved to above.
# func_custom_accuracy_metrics <- readRDS(
#   paste0(
#     wd,
#     "/Data/Interim/",
#     "func_custom_accuracy_metrics.Rds"
#   )
# )
First, limit to just training data, and confirm the datasets to use are the same (except for the one-hot-encoding producing dummy variables).
# Keep only the rows labelled as training data, for both the original and the
# one-hot-encoded lists.
train_data <- map(
  add_trn_val_test,
  function(stop_data) {
    filter(stop_data, data_use_el_stop_id == "01_train")
  }
)

DV_train_data <- map(
  DV_data,
  function(stop_data) {
    filter(stop_data, data_use_el_stop_id == "01_train")
  }
)

# Save both training lists to the models folder.
saveRDS(
  object = train_data,
  file = paste0(wd, "/Models/", "train_data.Rds")
)

# train_data <- readRDS(
#   paste0(wd, "/Models/", "train_data.Rds")
# )

saveRDS(
  object = DV_train_data,
  file = paste0(wd, "/Models/", "DV_train_data.Rds")
)

# DV_train_data <- readRDS(
#   paste0(wd, "/Models/", "DV_train_data.Rds")
# )
message("train_data")
## train_data
# Rows and columns of each training data frame.
map(train_data, dim)
## $`40600`
## [1] 905 52
##
## $`41140`
## [1] 905 52
##
## $`40120`
## [1] 905 52
##
## $`40910`
## [1] 875 52
##
## $`40380`
## [1] 905 52
##
## $`41660`
## [1] 905 52
message("DV_train_data")
## DV_train_data
# Rows and columns of each one-hot-encoded training data frame.
map(DV_train_data, dim)
## $`40600`
## [1] 905 134
##
## $`41140`
## [1] 905 134
##
## $`40120`
## [1] 905 134
##
## $`40910`
## [1] 875 134
##
## $`40380`
## [1] 905 134
##
## $`41660`
## [1] 905 134
message("train_data")
## train_data
# Column overview of one training data frame.
glimpse(train_data[["41140"]])
## Observations: 905
## Variables: 52
## $ el_date <date> 2013-07-29, 2013-07-30, 20...
## $ el_stop_id <dbl> 41140, 41140, 41140, 41140,...
## $ divvy_pt5mi_stn_cnt <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ divvy_mindist_miles <dbl> 1.486693, 1.486693, 1.48669...
## $ el_rides <dbl> 1084, 1098, 1019, 1204, 115...
## $ holiday_name <fct> --Not_Holiday--, --Not_Holi...
## $ holiday_comment <chr> NA, NA, NA, NA, NA, NA, NA,...
## $ holiday <lgl> FALSE, FALSE, FALSE, FALSE,...
## $ year <fct> 2013, 2013, 2013, 2013, 201...
## $ half <fct> 2, 2, 2, 2, 2, 2, 2, 2, 2, ...
## $ quarter <fct> 3, 3, 3, 3, 3, 3, 3, 3, 3, ...
## $ month <fct> 7, 7, 7, 8, 8, 8, 8, 8, 8, ...
## $ day <fct> 29, 30, 31, 1, 2, 3, 4, 5, ...
## $ wday.lbl <fct> Monday, Tuesday, Wednesday,...
## $ mweek <fct> 5, 5, 5, 5, 1, 1, 1, 2, 2, ...
## $ tmin_bands <fct> 05_50to75, 05_50to75, 05_50...
## $ tmax_bands <fct> 05_75to100, 04_50to75, 04_5...
## $ tmin_bands_l7 <fct> 05_50to75, 05_50to75, 05_50...
## $ tmax_bands_l7 <fct> 05_75to100, 04_50to75, 04_5...
## $ divvy_pt5mi_trip_cnt_cus_l7 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ divvy_pt5mi_triptime_mean_cus_l7 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ divvy_pt5mi_trip_cnt_dep_l7 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ divvy_pt5mi_triptime_mean_dep_l7 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ divvy_pt5mi_trip_cnt_sub_l7 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ divvy_pt5mi_triptime_mean_sub_l7 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ divvy_all_trip_cnt_cus_l7 <dbl> 1705, 1492, 1472, 1585, 116...
## $ divvy_all_triptime_mean_cus_l7 <dbl> 2165.246, 1798.073, 1826.37...
## $ divvy_all_triptime_med_cus_l7 <dbl> 1441.0, 1273.5, 1217.5, 127...
## $ divvy_all_trip_cnt_dep_l7 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ divvy_all_triptime_mean_dep_l7 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ divvy_all_triptime_med_dep_l7 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ divvy_all_trip_cnt_sub_l7 <dbl> 1029, 1179, 1191, 1225, 991...
## $ divvy_all_triptime_mean_sub_l7 <dbl> 819.4402, 745.8338, 760.988...
## $ divvy_all_triptime_med_sub_l7 <dbl> 635.0, 614.0, 620.0, 617.0,...
## $ divvy_mindist_trip_cnt_cus_l7 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ divvy_mindist_trip_cnt_dep_l7 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ divvy_mindist_trip_cnt_sub_l7 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ divvy_mindist_triptime_mean_cus_l7 <dbl> 0.00, 0.00, 0.00, 0.00, 0.0...
## $ divvy_mindist_triptime_mean_dep_l7 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ divvy_mindist_triptime_mean_sub_l7 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ holiday_name_l7 <fct> --Not_Holiday--, --Not_Holi...
## $ holiday_comment_l7 <chr> NA, NA, NA, NA, NA, NA, NA,...
## $ holiday_l7 <lgl> FALSE, FALSE, FALSE, FALSE,...
## $ el_rides_l07 <dbl> 1140, 1043, 1081, 1084, 109...
## $ el_rides_l14 <dbl> 1167, 1093, 1102, 1051, 110...
## $ el_rides_l21 <dbl> 1063, 1065, 1127, 1150, 121...
## $ el_rides_l28 <dbl> 1179, 1048, 1177, 687, 951,...
## $ el_rides_ma07 <dbl> 956.2857, 964.1429, 955.285...
## $ el_rides_ma14 <dbl> 970.8571, 971.2143, 965.285...
## $ el_rides_ma21 <dbl> 990.2381, 991.8095, 986.666...
## $ el_rides_ma28 <dbl> 968.5000, 970.2857, 964.642...
## $ data_use_el_stop_id <chr> "01_train", "01_train", "01...
message("DV_train_data")
## DV_train_data
# Column overview of one one-hot-encoded training data frame.
glimpse(DV_train_data[["41140"]])
## Observations: 905
## Variables: 134
## $ el_rides <dbl> 1084, 1098, 1019...
## $ el_date <date> 2013-07-29, 201...
## $ el_stop_id <dbl> 41140, 41140, 41...
## $ data_use_el_stop_id <chr> "01_train", "01_...
## $ divvy_pt5mi_stn_cnt <dbl> 0, 0, 0, 0, 0, 0...
## $ divvy_mindist_miles <dbl> 1.486693, 1.4866...
## $ `holiday_name.--Not_Holiday--` <dbl> 1, 1, 1, 1, 1, 1...
## $ `holiday_name.Casimir Pulaski Day` <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name.Christmas Day` <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name.Christmas Day (in lieu)` <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name.Columbus Day` <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name.Day after Christmas` <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name.Day after Thanksgiving` <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name.Election Day` <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name.Father's Day` <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name.Independence Day` <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name.Independence Day (observed)` <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name.Labor Day` <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name.Lincoln's Birthday` <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name.Martin Luther King Jr. Day` <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name.Memorial Day` <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name.Mother's Day` <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name.New Years Day` <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name.New Years Day Holiday` <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name.Presidents' Day` <dbl> 0, 0, 0, 0, 0, 0...
## $ holiday_name.Thanksgiving <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name.Veterans Day` <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name.Veterans Day (observed)` <dbl> 0, 0, 0, 0, 0, 0...
## $ year.2013 <dbl> 1, 1, 1, 1, 1, 1...
## $ year.2014 <dbl> 0, 0, 0, 0, 0, 0...
## $ year.2015 <dbl> 0, 0, 0, 0, 0, 0...
## $ year.2016 <dbl> 0, 0, 0, 0, 0, 0...
## $ year.2017 <dbl> 0, 0, 0, 0, 0, 0...
## $ year.2018 <dbl> 0, 0, 0, 0, 0, 0...
## $ half.1 <dbl> 0, 0, 0, 0, 0, 0...
## $ half.2 <dbl> 1, 1, 1, 1, 1, 1...
## $ quarter.1 <dbl> 0, 0, 0, 0, 0, 0...
## $ quarter.2 <dbl> 0, 0, 0, 0, 0, 0...
## $ quarter.3 <dbl> 1, 1, 1, 1, 1, 1...
## $ quarter.4 <dbl> 0, 0, 0, 0, 0, 0...
## $ month.1 <dbl> 0, 0, 0, 0, 0, 0...
## $ month.2 <dbl> 0, 0, 0, 0, 0, 0...
## $ month.3 <dbl> 0, 0, 0, 0, 0, 0...
## $ month.4 <dbl> 0, 0, 0, 0, 0, 0...
## $ month.5 <dbl> 0, 0, 0, 0, 0, 0...
## $ month.6 <dbl> 0, 0, 0, 0, 0, 0...
## $ month.7 <dbl> 1, 1, 1, 0, 0, 0...
## $ month.8 <dbl> 0, 0, 0, 1, 1, 1...
## $ month.9 <dbl> 0, 0, 0, 0, 0, 0...
## $ month.10 <dbl> 0, 0, 0, 0, 0, 0...
## $ month.11 <dbl> 0, 0, 0, 0, 0, 0...
## $ month.12 <dbl> 0, 0, 0, 0, 0, 0...
## $ wday.lbl.Friday <dbl> 0, 0, 0, 0, 1, 0...
## $ wday.lbl.Monday <dbl> 1, 0, 0, 0, 0, 0...
## $ wday.lbl.Saturday <dbl> 0, 0, 0, 0, 0, 1...
## $ wday.lbl.Sunday <dbl> 0, 0, 0, 0, 0, 0...
## $ wday.lbl.Thursday <dbl> 0, 0, 0, 1, 0, 0...
## $ wday.lbl.Tuesday <dbl> 0, 1, 0, 0, 0, 0...
## $ wday.lbl.Wednesday <dbl> 0, 0, 1, 0, 0, 0...
## $ mweek.1 <dbl> 0, 0, 0, 0, 1, 1...
## $ mweek.2 <dbl> 0, 0, 0, 0, 0, 0...
## $ mweek.3 <dbl> 0, 0, 0, 0, 0, 0...
## $ mweek.4 <dbl> 0, 0, 0, 0, 0, 0...
## $ mweek.5 <dbl> 1, 1, 1, 1, 0, 0...
## $ mweek.6 <dbl> 0, 0, 0, 0, 0, 0...
## $ `tmin_bands.02_-25to00` <dbl> 0, 0, 0, 0, 0, 0...
## $ tmin_bands.03_00to25 <dbl> 0, 0, 0, 0, 0, 0...
## $ tmin_bands.04_25to50 <dbl> 0, 0, 0, 0, 0, 0...
## $ tmin_bands.05_50to75 <dbl> 1, 1, 1, 1, 1, 1...
## $ tmin_bands.06_75to100 <dbl> 0, 0, 0, 0, 0, 0...
## $ tmax_bands.02_00to25 <dbl> 0, 0, 0, 0, 0, 0...
## $ tmax_bands.03_25to50 <dbl> 0, 0, 0, 0, 0, 0...
## $ tmax_bands.04_50to75 <dbl> 0, 1, 1, 0, 0, 0...
## $ tmax_bands.05_75to100 <dbl> 1, 0, 0, 1, 1, 1...
## $ `tmin_bands_l7.02_-25to00` <dbl> 0, 0, 0, 0, 0, 0...
## $ tmin_bands_l7.03_00to25 <dbl> 0, 0, 0, 0, 0, 0...
## $ tmin_bands_l7.04_25to50 <dbl> 0, 0, 0, 0, 0, 0...
## $ tmin_bands_l7.05_50to75 <dbl> 1, 1, 1, 1, 1, 1...
## $ tmin_bands_l7.06_75to100 <dbl> 0, 0, 0, 0, 0, 0...
## $ tmax_bands_l7.02_00to25 <dbl> 0, 0, 0, 0, 0, 0...
## $ tmax_bands_l7.03_25to50 <dbl> 0, 0, 0, 0, 0, 0...
## $ tmax_bands_l7.04_50to75 <dbl> 0, 1, 1, 0, 0, 0...
## $ tmax_bands_l7.05_75to100 <dbl> 1, 0, 0, 1, 1, 1...
## $ divvy_pt5mi_trip_cnt_cus_l7 <dbl> 0, 0, 0, 0, 0, 0...
## $ divvy_pt5mi_triptime_mean_cus_l7 <dbl> 0, 0, 0, 0, 0, 0...
## $ divvy_pt5mi_trip_cnt_dep_l7 <dbl> 0, 0, 0, 0, 0, 0...
## $ divvy_pt5mi_triptime_mean_dep_l7 <dbl> 0, 0, 0, 0, 0, 0...
## $ divvy_pt5mi_trip_cnt_sub_l7 <dbl> 0, 0, 0, 0, 0, 0...
## $ divvy_pt5mi_triptime_mean_sub_l7 <dbl> 0, 0, 0, 0, 0, 0...
## $ divvy_all_trip_cnt_cus_l7 <dbl> 1705, 1492, 1472...
## $ divvy_all_triptime_mean_cus_l7 <dbl> 2165.246, 1798.0...
## $ divvy_all_triptime_med_cus_l7 <dbl> 1441.0, 1273.5, ...
## $ divvy_all_trip_cnt_dep_l7 <dbl> 0, 0, 0, 0, 0, 0...
## $ divvy_all_triptime_mean_dep_l7 <dbl> 0, 0, 0, 0, 0, 0...
## $ divvy_all_triptime_med_dep_l7 <dbl> 0, 0, 0, 0, 0, 0...
## $ divvy_all_trip_cnt_sub_l7 <dbl> 1029, 1179, 1191...
## $ divvy_all_triptime_mean_sub_l7 <dbl> 819.4402, 745.83...
## $ divvy_all_triptime_med_sub_l7 <dbl> 635.0, 614.0, 62...
## $ divvy_mindist_trip_cnt_cus_l7 <dbl> 0, 0, 0, 0, 0, 0...
## $ divvy_mindist_trip_cnt_dep_l7 <dbl> 0, 0, 0, 0, 0, 0...
## $ divvy_mindist_trip_cnt_sub_l7 <dbl> 0, 0, 0, 0, 0, 0...
## $ divvy_mindist_triptime_mean_cus_l7 <dbl> 0.00, 0.00, 0.00...
## $ divvy_mindist_triptime_mean_dep_l7 <dbl> 0, 0, 0, 0, 0, 0...
## $ divvy_mindist_triptime_mean_sub_l7 <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name_l7.--Not_Holiday--` <dbl> 1, 1, 1, 1, 1, 1...
## $ `holiday_name_l7.Casimir Pulaski Day` <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name_l7.Christmas Day` <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name_l7.Christmas Day (in lieu)` <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name_l7.Columbus Day` <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name_l7.Day after Christmas` <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name_l7.Day after Thanksgiving` <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name_l7.Election Day` <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name_l7.Father's Day` <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name_l7.Independence Day` <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name_l7.Independence Day (observed)` <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name_l7.Labor Day` <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name_l7.Lincoln's Birthday` <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name_l7.Martin Luther King Jr. Day` <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name_l7.Memorial Day` <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name_l7.Mother's Day` <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name_l7.New Years Day` <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name_l7.New Years Day Holiday` <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name_l7.Presidents' Day` <dbl> 0, 0, 0, 0, 0, 0...
## $ holiday_name_l7.Thanksgiving <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name_l7.Veterans Day` <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name_l7.Veterans Day (observed)` <dbl> 0, 0, 0, 0, 0, 0...
## $ el_rides_l07 <dbl> 1140, 1043, 1081...
## $ el_rides_l14 <dbl> 1167, 1093, 1102...
## $ el_rides_l21 <dbl> 1063, 1065, 1127...
## $ el_rides_l28 <dbl> 1179, 1048, 1177...
## $ el_rides_ma07 <dbl> 956.2857, 964.14...
## $ el_rides_ma14 <dbl> 970.8571, 971.21...
## $ el_rides_ma21 <dbl> 990.2381, 991.80...
## $ el_rides_ma28 <dbl> 968.5000, 970.28...
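Now I reduce the number of variables by using caret's near-zero-variance filter (caret::nearZeroVar, via caret::preProcess with method = "nzv") and its correlation filter (caret::preProcess with method = "corr"). The two filters are applied in separate steps because caret will not handle a variable with zero standard deviation.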
First, I use caret::nearZeroVar to remove variables with “near zero variance”.
# Fit a near-zero-variance ("nzv") pre-processing filter independently for
# every element of DV_train_data (one data set per station id).
DV_nzv_list <- map(
  DV_train_data,
  function(station_df) preProcess(station_df, method = "nzv")
)

# Apply each fitted filter to the same data set it was fitted on; the fitted
# objects and the data sets are paired by position.
DV_nzv_predict <- map2(
  DV_nzv_list,
  DV_train_data,
  ~ predict(.x, .y)
)

# Cache the filtered data so the step does not have to be re-run.
saveRDS(
  object = DV_nzv_predict,
  file = paste0(wd, "/Data/Interim/", "DV_nzv_predict.Rds")
)

# To restore the cached result instead of recomputing it:
# DV_nzv_predict <-
#   readRDS(paste0(wd, "/Data/Interim/", "DV_nzv_predict.Rds"))
# Show the (rows, columns) shape of every station's data before any filtering.
message("before reduction")
## before reduction
map(DV_train_data, dim)
## $`40600`
## [1] 905 134
##
## $`41140`
## [1] 905 134
##
## $`40120`
## [1] 905 134
##
## $`40910`
## [1] 875 134
##
## $`40380`
## [1] 905 134
##
## $`41660`
## [1] 905 134
# Inspect the columns of one example station (id 41140) before filtering.
glimpse(DV_train_data[["41140"]])
## Observations: 905
## Variables: 134
## $ el_rides <dbl> 1084, 1098, 1019...
## $ el_date <date> 2013-07-29, 201...
## $ el_stop_id <dbl> 41140, 41140, 41...
## $ data_use_el_stop_id <chr> "01_train", "01_...
## $ divvy_pt5mi_stn_cnt <dbl> 0, 0, 0, 0, 0, 0...
## $ divvy_mindist_miles <dbl> 1.486693, 1.4866...
## $ `holiday_name.--Not_Holiday--` <dbl> 1, 1, 1, 1, 1, 1...
## $ `holiday_name.Casimir Pulaski Day` <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name.Christmas Day` <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name.Christmas Day (in lieu)` <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name.Columbus Day` <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name.Day after Christmas` <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name.Day after Thanksgiving` <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name.Election Day` <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name.Father's Day` <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name.Independence Day` <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name.Independence Day (observed)` <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name.Labor Day` <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name.Lincoln's Birthday` <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name.Martin Luther King Jr. Day` <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name.Memorial Day` <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name.Mother's Day` <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name.New Years Day` <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name.New Years Day Holiday` <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name.Presidents' Day` <dbl> 0, 0, 0, 0, 0, 0...
## $ holiday_name.Thanksgiving <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name.Veterans Day` <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name.Veterans Day (observed)` <dbl> 0, 0, 0, 0, 0, 0...
## $ year.2013 <dbl> 1, 1, 1, 1, 1, 1...
## $ year.2014 <dbl> 0, 0, 0, 0, 0, 0...
## $ year.2015 <dbl> 0, 0, 0, 0, 0, 0...
## $ year.2016 <dbl> 0, 0, 0, 0, 0, 0...
## $ year.2017 <dbl> 0, 0, 0, 0, 0, 0...
## $ year.2018 <dbl> 0, 0, 0, 0, 0, 0...
## $ half.1 <dbl> 0, 0, 0, 0, 0, 0...
## $ half.2 <dbl> 1, 1, 1, 1, 1, 1...
## $ quarter.1 <dbl> 0, 0, 0, 0, 0, 0...
## $ quarter.2 <dbl> 0, 0, 0, 0, 0, 0...
## $ quarter.3 <dbl> 1, 1, 1, 1, 1, 1...
## $ quarter.4 <dbl> 0, 0, 0, 0, 0, 0...
## $ month.1 <dbl> 0, 0, 0, 0, 0, 0...
## $ month.2 <dbl> 0, 0, 0, 0, 0, 0...
## $ month.3 <dbl> 0, 0, 0, 0, 0, 0...
## $ month.4 <dbl> 0, 0, 0, 0, 0, 0...
## $ month.5 <dbl> 0, 0, 0, 0, 0, 0...
## $ month.6 <dbl> 0, 0, 0, 0, 0, 0...
## $ month.7 <dbl> 1, 1, 1, 0, 0, 0...
## $ month.8 <dbl> 0, 0, 0, 1, 1, 1...
## $ month.9 <dbl> 0, 0, 0, 0, 0, 0...
## $ month.10 <dbl> 0, 0, 0, 0, 0, 0...
## $ month.11 <dbl> 0, 0, 0, 0, 0, 0...
## $ month.12 <dbl> 0, 0, 0, 0, 0, 0...
## $ wday.lbl.Friday <dbl> 0, 0, 0, 0, 1, 0...
## $ wday.lbl.Monday <dbl> 1, 0, 0, 0, 0, 0...
## $ wday.lbl.Saturday <dbl> 0, 0, 0, 0, 0, 1...
## $ wday.lbl.Sunday <dbl> 0, 0, 0, 0, 0, 0...
## $ wday.lbl.Thursday <dbl> 0, 0, 0, 1, 0, 0...
## $ wday.lbl.Tuesday <dbl> 0, 1, 0, 0, 0, 0...
## $ wday.lbl.Wednesday <dbl> 0, 0, 1, 0, 0, 0...
## $ mweek.1 <dbl> 0, 0, 0, 0, 1, 1...
## $ mweek.2 <dbl> 0, 0, 0, 0, 0, 0...
## $ mweek.3 <dbl> 0, 0, 0, 0, 0, 0...
## $ mweek.4 <dbl> 0, 0, 0, 0, 0, 0...
## $ mweek.5 <dbl> 1, 1, 1, 1, 0, 0...
## $ mweek.6 <dbl> 0, 0, 0, 0, 0, 0...
## $ `tmin_bands.02_-25to00` <dbl> 0, 0, 0, 0, 0, 0...
## $ tmin_bands.03_00to25 <dbl> 0, 0, 0, 0, 0, 0...
## $ tmin_bands.04_25to50 <dbl> 0, 0, 0, 0, 0, 0...
## $ tmin_bands.05_50to75 <dbl> 1, 1, 1, 1, 1, 1...
## $ tmin_bands.06_75to100 <dbl> 0, 0, 0, 0, 0, 0...
## $ tmax_bands.02_00to25 <dbl> 0, 0, 0, 0, 0, 0...
## $ tmax_bands.03_25to50 <dbl> 0, 0, 0, 0, 0, 0...
## $ tmax_bands.04_50to75 <dbl> 0, 1, 1, 0, 0, 0...
## $ tmax_bands.05_75to100 <dbl> 1, 0, 0, 1, 1, 1...
## $ `tmin_bands_l7.02_-25to00` <dbl> 0, 0, 0, 0, 0, 0...
## $ tmin_bands_l7.03_00to25 <dbl> 0, 0, 0, 0, 0, 0...
## $ tmin_bands_l7.04_25to50 <dbl> 0, 0, 0, 0, 0, 0...
## $ tmin_bands_l7.05_50to75 <dbl> 1, 1, 1, 1, 1, 1...
## $ tmin_bands_l7.06_75to100 <dbl> 0, 0, 0, 0, 0, 0...
## $ tmax_bands_l7.02_00to25 <dbl> 0, 0, 0, 0, 0, 0...
## $ tmax_bands_l7.03_25to50 <dbl> 0, 0, 0, 0, 0, 0...
## $ tmax_bands_l7.04_50to75 <dbl> 0, 1, 1, 0, 0, 0...
## $ tmax_bands_l7.05_75to100 <dbl> 1, 0, 0, 1, 1, 1...
## $ divvy_pt5mi_trip_cnt_cus_l7 <dbl> 0, 0, 0, 0, 0, 0...
## $ divvy_pt5mi_triptime_mean_cus_l7 <dbl> 0, 0, 0, 0, 0, 0...
## $ divvy_pt5mi_trip_cnt_dep_l7 <dbl> 0, 0, 0, 0, 0, 0...
## $ divvy_pt5mi_triptime_mean_dep_l7 <dbl> 0, 0, 0, 0, 0, 0...
## $ divvy_pt5mi_trip_cnt_sub_l7 <dbl> 0, 0, 0, 0, 0, 0...
## $ divvy_pt5mi_triptime_mean_sub_l7 <dbl> 0, 0, 0, 0, 0, 0...
## $ divvy_all_trip_cnt_cus_l7 <dbl> 1705, 1492, 1472...
## $ divvy_all_triptime_mean_cus_l7 <dbl> 2165.246, 1798.0...
## $ divvy_all_triptime_med_cus_l7 <dbl> 1441.0, 1273.5, ...
## $ divvy_all_trip_cnt_dep_l7 <dbl> 0, 0, 0, 0, 0, 0...
## $ divvy_all_triptime_mean_dep_l7 <dbl> 0, 0, 0, 0, 0, 0...
## $ divvy_all_triptime_med_dep_l7 <dbl> 0, 0, 0, 0, 0, 0...
## $ divvy_all_trip_cnt_sub_l7 <dbl> 1029, 1179, 1191...
## $ divvy_all_triptime_mean_sub_l7 <dbl> 819.4402, 745.83...
## $ divvy_all_triptime_med_sub_l7 <dbl> 635.0, 614.0, 62...
## $ divvy_mindist_trip_cnt_cus_l7 <dbl> 0, 0, 0, 0, 0, 0...
## $ divvy_mindist_trip_cnt_dep_l7 <dbl> 0, 0, 0, 0, 0, 0...
## $ divvy_mindist_trip_cnt_sub_l7 <dbl> 0, 0, 0, 0, 0, 0...
## $ divvy_mindist_triptime_mean_cus_l7 <dbl> 0.00, 0.00, 0.00...
## $ divvy_mindist_triptime_mean_dep_l7 <dbl> 0, 0, 0, 0, 0, 0...
## $ divvy_mindist_triptime_mean_sub_l7 <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name_l7.--Not_Holiday--` <dbl> 1, 1, 1, 1, 1, 1...
## $ `holiday_name_l7.Casimir Pulaski Day` <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name_l7.Christmas Day` <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name_l7.Christmas Day (in lieu)` <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name_l7.Columbus Day` <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name_l7.Day after Christmas` <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name_l7.Day after Thanksgiving` <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name_l7.Election Day` <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name_l7.Father's Day` <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name_l7.Independence Day` <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name_l7.Independence Day (observed)` <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name_l7.Labor Day` <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name_l7.Lincoln's Birthday` <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name_l7.Martin Luther King Jr. Day` <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name_l7.Memorial Day` <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name_l7.Mother's Day` <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name_l7.New Years Day` <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name_l7.New Years Day Holiday` <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name_l7.Presidents' Day` <dbl> 0, 0, 0, 0, 0, 0...
## $ holiday_name_l7.Thanksgiving <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name_l7.Veterans Day` <dbl> 0, 0, 0, 0, 0, 0...
## $ `holiday_name_l7.Veterans Day (observed)` <dbl> 0, 0, 0, 0, 0, 0...
## $ el_rides_l07 <dbl> 1140, 1043, 1081...
## $ el_rides_l14 <dbl> 1167, 1093, 1102...
## $ el_rides_l21 <dbl> 1063, 1065, 1127...
## $ el_rides_l28 <dbl> 1179, 1048, 1177...
## $ el_rides_ma07 <dbl> 956.2857, 964.14...
## $ el_rides_ma14 <dbl> 970.8571, 971.21...
## $ el_rides_ma21 <dbl> 990.2381, 991.80...
## $ el_rides_ma28 <dbl> 968.5000, 970.28...
# Show the shape of every station's data once the nzv filter has been applied.
message("after near-zero variable reduction")
## after near-zero variable reduction
map(DV_nzv_predict, dim)
## $`40600`
## [1] 905 69
##
## $`41140`
## [1] 905 70
##
## $`40120`
## [1] 905 72
##
## $`40910`
## [1] 875 71
##
## $`40380`
## [1] 905 70
##
## $`41660`
## [1] 905 72
# Inspect which columns survived the nzv filter for the example station 41140.
glimpse(DV_nzv_predict[["41140"]])
## Observations: 905
## Variables: 70
## $ el_rides <dbl> 1084, 1098, 1019, 1204, 115...
## $ el_date <date> 2013-07-29, 2013-07-30, 20...
## $ data_use_el_stop_id <chr> "01_train", "01_train", "01...
## $ divvy_pt5mi_stn_cnt <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ divvy_mindist_miles <dbl> 1.486693, 1.486693, 1.48669...
## $ year.2013 <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
## $ year.2014 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ year.2015 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ half.1 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ half.2 <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
## $ quarter.1 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ quarter.2 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ quarter.3 <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
## $ quarter.4 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ month.1 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ month.2 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ month.3 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ month.4 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ month.5 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ month.6 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ month.7 <dbl> 1, 1, 1, 0, 0, 0, 0, 0, 0, ...
## $ month.8 <dbl> 0, 0, 0, 1, 1, 1, 1, 1, 1, ...
## $ month.9 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ month.10 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ month.11 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ month.12 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ wday.lbl.Friday <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, ...
## $ wday.lbl.Monday <dbl> 1, 0, 0, 0, 0, 0, 0, 1, 0, ...
## $ wday.lbl.Saturday <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, ...
## $ wday.lbl.Sunday <dbl> 0, 0, 0, 0, 0, 0, 1, 0, 0, ...
## $ wday.lbl.Thursday <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, ...
## $ wday.lbl.Tuesday <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 1, ...
## $ wday.lbl.Wednesday <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, ...
## $ mweek.1 <dbl> 0, 0, 0, 0, 1, 1, 1, 0, 0, ...
## $ mweek.2 <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 1, ...
## $ mweek.3 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ mweek.4 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ mweek.5 <dbl> 1, 1, 1, 1, 0, 0, 0, 0, 0, ...
## $ tmin_bands.03_00to25 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ tmin_bands.04_25to50 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ tmin_bands.05_50to75 <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
## $ tmax_bands.02_00to25 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ tmax_bands.03_25to50 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ tmax_bands.04_50to75 <dbl> 0, 1, 1, 0, 0, 0, 0, 0, 0, ...
## $ tmax_bands.05_75to100 <dbl> 1, 0, 0, 1, 1, 1, 1, 1, 1, ...
## $ tmin_bands_l7.03_00to25 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ tmin_bands_l7.04_25to50 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ tmin_bands_l7.05_50to75 <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
## $ tmax_bands_l7.02_00to25 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ tmax_bands_l7.03_25to50 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ tmax_bands_l7.04_50to75 <dbl> 0, 1, 1, 0, 0, 0, 0, 0, 0, ...
## $ tmax_bands_l7.05_75to100 <dbl> 1, 0, 0, 1, 1, 1, 1, 1, 1, ...
## $ divvy_all_trip_cnt_cus_l7 <dbl> 1705, 1492, 1472, 1585, 116...
## $ divvy_all_triptime_mean_cus_l7 <dbl> 2165.246, 1798.073, 1826.37...
## $ divvy_all_triptime_med_cus_l7 <dbl> 1441.0, 1273.5, 1217.5, 127...
## $ divvy_all_trip_cnt_sub_l7 <dbl> 1029, 1179, 1191, 1225, 991...
## $ divvy_all_triptime_mean_sub_l7 <dbl> 819.4402, 745.8338, 760.988...
## $ divvy_all_triptime_med_sub_l7 <dbl> 635.0, 614.0, 620.0, 617.0,...
## $ divvy_mindist_trip_cnt_cus_l7 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ divvy_mindist_trip_cnt_sub_l7 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ divvy_mindist_triptime_mean_cus_l7 <dbl> 0.00, 0.00, 0.00, 0.00, 0.0...
## $ divvy_mindist_triptime_mean_sub_l7 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ el_rides_l07 <dbl> 1140, 1043, 1081, 1084, 109...
## $ el_rides_l14 <dbl> 1167, 1093, 1102, 1051, 110...
## $ el_rides_l21 <dbl> 1063, 1065, 1127, 1150, 121...
## $ el_rides_l28 <dbl> 1179, 1048, 1177, 687, 951,...
## $ el_rides_ma07 <dbl> 956.2857, 964.1429, 955.285...
## $ el_rides_ma14 <dbl> 970.8571, 971.2143, 965.285...
## $ el_rides_ma21 <dbl> 990.2381, 991.8095, 986.666...
## $ el_rides_ma28 <dbl> 968.5000, 970.2857, 964.642...
# The fitted nzv filters are not referenced again in this section; drop them.
rm(list = "DV_nzv_list")
Next, I use caret::preProcess with method = "corr" to remove highly correlated variables.
# Fit a high-correlation ("corr") pre-processing filter independently for
# every nzv-filtered data set (one per station id).
DV_corr_list <- map(
  DV_nzv_predict,
  function(station_df) preProcess(station_df, method = "corr")
)

# Apply each fitted filter to the same nzv-filtered data set it was fitted on;
# the fitted objects and the data sets are paired by position.
DV_corr_predict <- map2(
  DV_corr_list,
  DV_nzv_predict,
  ~ predict(.x, .y)
)

# Cache the filtered data so the step does not have to be re-run.
saveRDS(
  object = DV_corr_predict,
  file = paste0(wd, "/Data/Interim/", "DV_corr_predict.Rds")
)

# To restore the cached result instead of recomputing it:
# DV_corr_predict <-
#   readRDS(paste0(wd, "/Data/Interim/", "DV_corr_predict.Rds"))
# Show the shape of every station's data going into the correlation filter.
message("before corr reduction")
## before corr reduction
map(DV_nzv_predict, dim)
## $`40600`
## [1] 905 69
##
## $`41140`
## [1] 905 70
##
## $`40120`
## [1] 905 72
##
## $`40910`
## [1] 875 71
##
## $`40380`
## [1] 905 70
##
## $`41660`
## [1] 905 72
# Inspect the example station 41140 as it enters the correlation filter.
glimpse(DV_nzv_predict[["41140"]])
## Observations: 905
## Variables: 70
## $ el_rides <dbl> 1084, 1098, 1019, 1204, 115...
## $ el_date <date> 2013-07-29, 2013-07-30, 20...
## $ data_use_el_stop_id <chr> "01_train", "01_train", "01...
## $ divvy_pt5mi_stn_cnt <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ divvy_mindist_miles <dbl> 1.486693, 1.486693, 1.48669...
## $ year.2013 <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
## $ year.2014 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ year.2015 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ half.1 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ half.2 <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
## $ quarter.1 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ quarter.2 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ quarter.3 <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
## $ quarter.4 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ month.1 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ month.2 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ month.3 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ month.4 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ month.5 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ month.6 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ month.7 <dbl> 1, 1, 1, 0, 0, 0, 0, 0, 0, ...
## $ month.8 <dbl> 0, 0, 0, 1, 1, 1, 1, 1, 1, ...
## $ month.9 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ month.10 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ month.11 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ month.12 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ wday.lbl.Friday <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, ...
## $ wday.lbl.Monday <dbl> 1, 0, 0, 0, 0, 0, 0, 1, 0, ...
## $ wday.lbl.Saturday <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, ...
## $ wday.lbl.Sunday <dbl> 0, 0, 0, 0, 0, 0, 1, 0, 0, ...
## $ wday.lbl.Thursday <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, ...
## $ wday.lbl.Tuesday <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 1, ...
## $ wday.lbl.Wednesday <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, ...
## $ mweek.1 <dbl> 0, 0, 0, 0, 1, 1, 1, 0, 0, ...
## $ mweek.2 <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 1, ...
## $ mweek.3 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ mweek.4 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ mweek.5 <dbl> 1, 1, 1, 1, 0, 0, 0, 0, 0, ...
## $ tmin_bands.03_00to25 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ tmin_bands.04_25to50 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ tmin_bands.05_50to75 <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
## $ tmax_bands.02_00to25 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ tmax_bands.03_25to50 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ tmax_bands.04_50to75 <dbl> 0, 1, 1, 0, 0, 0, 0, 0, 0, ...
## $ tmax_bands.05_75to100 <dbl> 1, 0, 0, 1, 1, 1, 1, 1, 1, ...
## $ tmin_bands_l7.03_00to25 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ tmin_bands_l7.04_25to50 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ tmin_bands_l7.05_50to75 <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
## $ tmax_bands_l7.02_00to25 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ tmax_bands_l7.03_25to50 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ tmax_bands_l7.04_50to75 <dbl> 0, 1, 1, 0, 0, 0, 0, 0, 0, ...
## $ tmax_bands_l7.05_75to100 <dbl> 1, 0, 0, 1, 1, 1, 1, 1, 1, ...
## $ divvy_all_trip_cnt_cus_l7 <dbl> 1705, 1492, 1472, 1585, 116...
## $ divvy_all_triptime_mean_cus_l7 <dbl> 2165.246, 1798.073, 1826.37...
## $ divvy_all_triptime_med_cus_l7 <dbl> 1441.0, 1273.5, 1217.5, 127...
## $ divvy_all_trip_cnt_sub_l7 <dbl> 1029, 1179, 1191, 1225, 991...
## $ divvy_all_triptime_mean_sub_l7 <dbl> 819.4402, 745.8338, 760.988...
## $ divvy_all_triptime_med_sub_l7 <dbl> 635.0, 614.0, 620.0, 617.0,...
## $ divvy_mindist_trip_cnt_cus_l7 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ divvy_mindist_trip_cnt_sub_l7 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ divvy_mindist_triptime_mean_cus_l7 <dbl> 0.00, 0.00, 0.00, 0.00, 0.0...
## $ divvy_mindist_triptime_mean_sub_l7 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ el_rides_l07 <dbl> 1140, 1043, 1081, 1084, 109...
## $ el_rides_l14 <dbl> 1167, 1093, 1102, 1051, 110...
## $ el_rides_l21 <dbl> 1063, 1065, 1127, 1150, 121...
## $ el_rides_l28 <dbl> 1179, 1048, 1177, 687, 951,...
## $ el_rides_ma07 <dbl> 956.2857, 964.1429, 955.285...
## $ el_rides_ma14 <dbl> 970.8571, 971.2143, 965.285...
## $ el_rides_ma21 <dbl> 990.2381, 991.8095, 986.666...
## $ el_rides_ma28 <dbl> 968.5000, 970.2857, 964.642...
# Show the shape of every station's data after the correlation filter.
message("after corr variable reduction")
## after corr variable reduction
map(DV_corr_predict, dim)
## $`40600`
## [1] 905 58
##
## $`41140`
## [1] 905 58
##
## $`40120`
## [1] 905 57
##
## $`40910`
## [1] 875 59
##
## $`40380`
## [1] 905 59
##
## $`41660`
## [1] 905 60
# Inspect which columns survived the correlation filter for station 41140.
glimpse(DV_corr_predict[["41140"]])
## Observations: 905
## Variables: 58
## $ el_rides <dbl> 1084, 1098, 1019, 1204, 115...
## $ el_date <date> 2013-07-29, 2013-07-30, 20...
## $ data_use_el_stop_id <chr> "01_train", "01_train", "01...
## $ divvy_pt5mi_stn_cnt <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ year.2013 <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
## $ year.2014 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ year.2015 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ half.2 <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
## $ quarter.1 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ quarter.2 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ quarter.3 <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
## $ quarter.4 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ month.1 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ month.2 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ month.3 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ month.4 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ month.5 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ month.6 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ month.7 <dbl> 1, 1, 1, 0, 0, 0, 0, 0, 0, ...
## $ month.8 <dbl> 0, 0, 0, 1, 1, 1, 1, 1, 1, ...
## $ month.9 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ month.10 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ month.11 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ month.12 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ wday.lbl.Friday <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, ...
## $ wday.lbl.Monday <dbl> 1, 0, 0, 0, 0, 0, 0, 1, 0, ...
## $ wday.lbl.Saturday <dbl> 0, 0, 0, 0, 0, 1, 0, 0, 0, ...
## $ wday.lbl.Sunday <dbl> 0, 0, 0, 0, 0, 0, 1, 0, 0, ...
## $ wday.lbl.Thursday <dbl> 0, 0, 0, 1, 0, 0, 0, 0, 0, ...
## $ wday.lbl.Tuesday <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 1, ...
## $ wday.lbl.Wednesday <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, ...
## $ mweek.1 <dbl> 0, 0, 0, 0, 1, 1, 1, 0, 0, ...
## $ mweek.2 <dbl> 0, 0, 0, 0, 0, 0, 0, 1, 1, ...
## $ mweek.3 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ mweek.4 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ mweek.5 <dbl> 1, 1, 1, 1, 0, 0, 0, 0, 0, ...
## $ tmax_bands.04_50to75 <dbl> 0, 1, 1, 0, 0, 0, 0, 0, 0, ...
## $ tmin_bands_l7.03_00to25 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ tmin_bands_l7.04_25to50 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ tmin_bands_l7.05_50to75 <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
## $ tmax_bands_l7.02_00to25 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ tmax_bands_l7.03_25to50 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ tmax_bands_l7.05_75to100 <dbl> 1, 0, 0, 1, 1, 1, 1, 1, 1, ...
## $ divvy_all_trip_cnt_cus_l7 <dbl> 1705, 1492, 1472, 1585, 116...
## $ divvy_all_triptime_mean_cus_l7 <dbl> 2165.246, 1798.073, 1826.37...
## $ divvy_all_triptime_med_cus_l7 <dbl> 1441.0, 1273.5, 1217.5, 127...
## $ divvy_all_trip_cnt_sub_l7 <dbl> 1029, 1179, 1191, 1225, 991...
## $ divvy_all_triptime_mean_sub_l7 <dbl> 819.4402, 745.8338, 760.988...
## $ divvy_all_triptime_med_sub_l7 <dbl> 635.0, 614.0, 620.0, 617.0,...
## $ divvy_mindist_trip_cnt_cus_l7 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ divvy_mindist_trip_cnt_sub_l7 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ divvy_mindist_triptime_mean_cus_l7 <dbl> 0.00, 0.00, 0.00, 0.00, 0.0...
## $ divvy_mindist_triptime_mean_sub_l7 <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
## $ el_rides_l07 <dbl> 1140, 1043, 1081, 1084, 109...
## $ el_rides_l14 <dbl> 1167, 1093, 1102, 1051, 110...
## $ el_rides_l21 <dbl> 1063, 1065, 1127, 1150, 121...
## $ el_rides_l28 <dbl> 1179, 1048, 1177, 687, 951,...
## $ el_rides_ma07 <dbl> 956.2857, 964.1429, 955.285...
# The fitted corr filters are not referenced again in this section; drop them.
rm(list = "DV_corr_list")
Modeling parameters used in multiple models.
# Window sizes (in days) shared by several models.
# NOTE: round() rounds halves to the even digit, so 547.5 -> 548 while
# 182.5 -> 182; the resulting values are 578 and 212.

# Training window: 1.5 years + 30 days of data (the extra 30 days are, per the
# original note, needed for the LSTM Keras modeling).
period_train <- round(365 * 1.5) + 30

# Test window: 0.5 years + 30 days of data (even though only 14 days ahead are
# actually predicted); also sized for the LSTM Keras modeling.
period_test <- round(365 * 0.5) + 30

# Spacing between resampling splits; per the original note this gives 13
# evenly distributed splits (presumably for the 905-row stations -- TODO
# confirm against the resampling call that consumes it).
skip_span <- 8
# Persist each shared modeling parameter to its own .Rds file under
# <wd>/Data/Interim/ so it can be reloaded later without re-deriving it.
# The commented readRDS() calls are the matching reload statements.

saveRDS(
  object = period_train,
  file = paste0(wd, "/Data/Interim/", "period_train.Rds")
)
# period_train <-
#   readRDS(paste0(wd, "/Data/Interim/", "period_train.Rds"))

saveRDS(
  object = period_test,
  file = paste0(wd, "/Data/Interim/", "period_test.Rds")
)
# period_test <-
#   readRDS(paste0(wd, "/Data/Interim/", "period_test.Rds"))

saveRDS(
  object = skip_span,
  file = paste0(wd, "/Data/Interim/", "skip_span.Rds")
)
# skip_span <-
#   readRDS(paste0(wd, "/Data/Interim/", "skip_span.Rds"))