Load the relevant libraries.
# rm(list = ls())
# .rs.restartR()
# data manipulation
library("plyr")
library("tidyverse")
library("magrittr")
library("data.table")
library("lubridate")
library("sqldf")
# time series specific packages
library("timetk")
library("zoo")
library("tibbletime")
# modeling
library("fpp2")
library("prophet")
library("caret")
library("randomForest")
library("xgboost")
library("h2o")
library("keras")
# use_session_with_seed(123456789) # setting the seed to obtain reproducible results
# see https://keras.rstudio.com/articles/faq.html#how-can-i-obtain-reproducible-results-using-keras-during-development and https://cran.r-project.org/web/packages/keras/vignettes/faq.html
# can also re-enable gpu and parallel processing by using: use_session_with_seed(42, disable_gpu = FALSE, disable_parallel_cpu = FALSE)
# other
library("geosphere") # specific for distance calculations from lat-lon pairs
library("naniar") # inspecting missing data
library("rlang") # building functions
library("recipes") # used in Keras modeling to design matrices
library("rsample") # rolling samples for validation stats
library("tfruns") # used in Keras modeling for trainin runs
library("stringr") # string manipulation
library("ggplot2") # viz
library("sweep") # more easily pull out model statistics
library("yardstick") # easily calculate accuracy stats
library("doParallel") # parallel processing
Session Info.
sessionInfo()
## R version 3.5.1 (2018-07-02)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS High Sierra 10.13.6
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] parallel stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] doParallel_1.0.14 iterators_1.0.10 foreach_1.4.4
## [4] yardstick_0.0.2 sweep_0.2.1.1 tfruns_1.4
## [7] rsample_0.0.3 recipes_0.1.4 rlang_0.3.0.1
## [10] naniar_0.4.1 geosphere_1.5-7 keras_2.2.4
## [13] h2o_3.20.0.8 xgboost_0.71.2 randomForest_4.6-14
## [16] caret_6.0-81 lattice_0.20-38 prophet_0.3.0.1
## [19] Rcpp_1.0.0 fpp2_2.3 expsmooth_2.3
## [22] fma_2.3 forecast_8.4 tibbletime_0.1.1
## [25] zoo_1.8-4 timetk_0.1.1.1 sqldf_0.4-11
## [28] RSQLite_2.1.1 gsubfn_0.7 proto_1.0.0
## [31] lubridate_1.7.4 data.table_1.11.8 magrittr_1.5
## [34] forcats_0.3.0 stringr_1.3.1 dplyr_0.7.8
## [37] purrr_0.2.5 readr_1.2.1 tidyr_0.8.2
## [40] tibble_1.4.2 ggplot2_3.1.0 tidyverse_1.2.1
## [43] plyr_1.8.4
##
## loaded via a namespace (and not attached):
## [1] colorspace_1.3-2 class_7.3-14 visdat_0.5.1
## [4] rprojroot_1.3-2 base64enc_0.1-3 rstudioapi_0.8
## [7] rstan_2.18.2 bit64_0.9-7 prodlim_2018.04.18
## [10] xml2_1.2.0 codetools_0.2-15 splines_3.5.1
## [13] knitr_1.20 zeallot_0.1.0 jsonlite_1.5
## [16] pROC_1.13.0 broom_0.5.0 compiler_3.5.1
## [19] httr_1.3.1 backports_1.1.2 assertthat_0.2.0
## [22] Matrix_1.2-15 lazyeval_0.2.1 cli_1.0.1
## [25] htmltools_0.3.6 prettyunits_1.0.2 tools_3.5.1
## [28] bindrcpp_0.2.2 gtable_0.2.0 glue_1.3.0
## [31] reshape2_1.4.3 cellranger_1.1.0 fracdiff_1.4-2
## [34] urca_1.3-0 debugme_1.1.0 nlme_3.1-137
## [37] lmtest_0.9-36 timeDate_3043.102 gower_0.1.2
## [40] ps_1.2.1 rvest_0.3.2 MASS_7.3-51.1
## [43] scales_1.0.0 ipred_0.9-8 hms_0.4.2
## [46] inline_0.3.15 yaml_2.2.0 quantmod_0.4-13
## [49] curl_3.2 reticulate_1.10 memoise_1.1.0
## [52] gridExtra_2.3 loo_2.0.0 StanHeaders_2.18.0
## [55] uroot_2.0-9 rpart_4.1-13 stringi_1.2.4
## [58] tensorflow_1.10 tseries_0.10-46 TTR_0.23-4
## [61] pkgbuild_1.0.2 lava_1.6.4 chron_2.3-53
## [64] bitops_1.0-6 pkgconfig_2.0.2 matrixStats_0.54.0
## [67] evaluate_0.12 bindr_0.1.1 bit_1.1-14
## [70] processx_3.2.0 tidyselect_0.2.5 R6_2.3.0
## [73] generics_0.0.2 DBI_1.0.0 whisker_0.3-2
## [76] pillar_1.3.0 haven_2.0.0 withr_2.1.2
## [79] xts_0.11-2 sp_1.3-1 RCurl_1.95-4.11
## [82] survival_2.43-3 nnet_7.3-12 modelr_0.1.2
## [85] crayon_1.3.4 rmarkdown_1.10 grid_3.5.1
## [88] readxl_1.1.0 blob_1.1.1 callr_3.0.0
## [91] ModelMetrics_1.2.2 digest_0.6.18 stats4_3.5.1
## [94] munsell_0.5.0 tcltk_3.5.1 quadprog_1.5-5
Setup the root directory.
Setting wd as the working directory.
wd <- getwd()
wd
## [1] "/Users/mdturse/Desktop/Analytics/Chicago_El_Divvy"
NOTE: all_split_rmse_mape the output produced in Step 07
all_split_rmse_mape <-
readRDS(paste0(wd,
"/Models/",
"all_split_rmse_mape.Rds"
)
)
Plot interpolation and extrapolation over time, by model.
plot_interp_extrap <-
all_split_rmse_mape %>%
map(~ select(.x,
arima.interpolation,
arima.extrapolation,
arima_xreg.interpolation,
arima_xreg.extrapolation,
prophet.interpolation,
prophet.extrapolation,
prophet_hol.interpolation,
prophet_hol.extrapolation,
rf.interpolation,
rf.extrapolation,
xgb.interpolation,
xgb.extrapolation,
h2o.interpolation,
h2o.extrapolation,
h2o.limvars.interpolation,
h2o.limvars.extrapolation,
keras.interpolation_rmse,
keras.extrapolation_rmse,
start_date,
el_stop_id
) %>%
as.data.frame %>%
rename(h2o_limvars.interpolation = h2o.limvars.interpolation,
h2o_limvars.extrapolation = h2o.limvars.extrapolation,
keras.interpolation = keras.interpolation_rmse,
keras.extrapolation = keras.extrapolation_rmse
) %>%
gather(error, RMSE, -start_date, -el_stop_id) %>%
separate(error,
c("model", "type"),
sep = "\\.",
remove = FALSE
) %>%
ggplot(aes(x = start_date,
y = RMSE,
# col = error
color = model
)
) +
facet_wrap(~ type,
scales = "free"
) +
geom_point() +
geom_line() +
ggtitle(label = .$el_stop_id#,
# subtitle = "ARIMA-based model"
) +
theme_bw() +
theme(legend.position = "bottom")
)
plot_interp_extrap
## $`40600`
##
## $`41140`
##
## $`40120`
##
## $`40910`
##
## $`40380`
##
## $`41660`
For each el_stop_id, plot the median RMSE for each model. First I munge the data.
median_RMSE_by_model <-
pmap(.l = list(a = all_split_rmse_mape),
.f = function(a) {
med.arima = median(a$arima.extrapolation)
med.arima_xreg = median(a$arima_xreg.extrapolation)
med.prophet = median(a$prophet.extrapolation)
med.prophet_hol = median(a$prophet_hol.extrapolation)
med.rf = median(a$rf.extrapolation)
med.xgb = median(a$xgb.extrapolation)
med.h2o = median(a$h2o.extrapolation)
med.h2o_limvars = median(a$h2o.limvars.extrapolation)
med.keras = median(a$keras.extrapolation_rmse)
med_extrap_all_models =
data.frame(el_stop_id = unique(a$el_stop_id),
arima = med.arima,
arima_xreg = med.arima_xreg,
prophet = med.prophet,
prophet_hol = med.prophet_hol,
rf = med.rf,
xgb = med.xgb,
h2o = med.h2o,
h2o_limvars = med.h2o_limvars,
keras = med.keras
)
return(med_extrap_all_models)
}
) %>%
bind_rows()
## Warning in bind_rows_(x, .id): Unequal factor levels: coercing to character
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
# View(median_RMSE_by_model)
Now I create the plot itself.
median_RMSE_by_model %>%
gather(key = "model",
value = "RMSE_med",
-el_stop_id
) %>%
ggplot(aes(x = model,
y = RMSE_med,
fill = model
)
) +
geom_col() +
geom_text(aes(label = format(round(RMSE_med, 1),
1
)
),
size = 3,
hjust = 1
) +
facet_wrap(~ el_stop_id,
ncol = 2,
scales = "free"
) +
coord_flip() +
theme_bw() +
theme(legend.position = "none") +
NULL
rm(median_RMSE_by_model)