Setup

Load the relevant libraries.

# rm(list = ls())
# .rs.restartR()


# data manipulation
library("plyr")
library("tidyverse")
library("magrittr")
library("data.table")
library("lubridate")
library("sqldf")


# time series specific packages
library("timetk")
library("zoo")
library("tibbletime")


# modeling
library("fpp2")
library("prophet")
library("caret")
library("randomForest")
library("xgboost")
library("h2o")
library("keras")
# use_session_with_seed(123456789) # setting the seed to obtain reproducible results
# see https://keras.rstudio.com/articles/faq.html#how-can-i-obtain-reproducible-results-using-keras-during-development and https://cran.r-project.org/web/packages/keras/vignettes/faq.html
# can also re-enable gpu and parallel processing by using:  use_session_with_seed(42, disable_gpu = FALSE, disable_parallel_cpu = FALSE)



# other
library("geosphere")          # specific for distance calculations from lat-lon pairs
library("naniar")             # inspecting missing data
library("rlang")              # building functions
library("recipes")            # used in Keras modeling to design matrices
library("rsample")            # rolling samples for validation stats
library("tfruns")             # used in Keras modeling for trainin runs
library("stringr")            # string manipulation
library("ggplot2")            # viz
library("sweep")              # more easily pull out model statistics
library("yardstick")          # easily calculate accuracy stats
library("doParallel")         # parallel processing

Session Info.

sessionInfo()
## R version 3.5.1 (2018-07-02)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS High Sierra 10.13.6
## 
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] parallel  stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] doParallel_1.0.14   iterators_1.0.10    foreach_1.4.4      
##  [4] yardstick_0.0.2     sweep_0.2.1.1       tfruns_1.4         
##  [7] rsample_0.0.3       recipes_0.1.4       rlang_0.3.0.1      
## [10] naniar_0.4.1        geosphere_1.5-7     keras_2.2.4        
## [13] h2o_3.20.0.8        xgboost_0.71.2      randomForest_4.6-14
## [16] caret_6.0-81        lattice_0.20-38     prophet_0.3.0.1    
## [19] Rcpp_1.0.0          fpp2_2.3            expsmooth_2.3      
## [22] fma_2.3             forecast_8.4        tibbletime_0.1.1   
## [25] zoo_1.8-4           timetk_0.1.1.1      sqldf_0.4-11       
## [28] RSQLite_2.1.1       gsubfn_0.7          proto_1.0.0        
## [31] lubridate_1.7.4     data.table_1.11.8   magrittr_1.5       
## [34] forcats_0.3.0       stringr_1.3.1       dplyr_0.7.8        
## [37] purrr_0.2.5         readr_1.2.1         tidyr_0.8.2        
## [40] tibble_1.4.2        ggplot2_3.1.0       tidyverse_1.2.1    
## [43] plyr_1.8.4         
## 
## loaded via a namespace (and not attached):
##  [1] colorspace_1.3-2   class_7.3-14       visdat_0.5.1      
##  [4] rprojroot_1.3-2    base64enc_0.1-3    rstudioapi_0.8    
##  [7] rstan_2.18.2       bit64_0.9-7        prodlim_2018.04.18
## [10] xml2_1.2.0         codetools_0.2-15   splines_3.5.1     
## [13] knitr_1.20         zeallot_0.1.0      jsonlite_1.5      
## [16] pROC_1.13.0        broom_0.5.0        compiler_3.5.1    
## [19] httr_1.3.1         backports_1.1.2    assertthat_0.2.0  
## [22] Matrix_1.2-15      lazyeval_0.2.1     cli_1.0.1         
## [25] htmltools_0.3.6    prettyunits_1.0.2  tools_3.5.1       
## [28] bindrcpp_0.2.2     gtable_0.2.0       glue_1.3.0        
## [31] reshape2_1.4.3     cellranger_1.1.0   fracdiff_1.4-2    
## [34] urca_1.3-0         debugme_1.1.0      nlme_3.1-137      
## [37] lmtest_0.9-36      timeDate_3043.102  gower_0.1.2       
## [40] ps_1.2.1           rvest_0.3.2        MASS_7.3-51.1     
## [43] scales_1.0.0       ipred_0.9-8        hms_0.4.2         
## [46] inline_0.3.15      yaml_2.2.0         quantmod_0.4-13   
## [49] curl_3.2           reticulate_1.10    memoise_1.1.0     
## [52] gridExtra_2.3      loo_2.0.0          StanHeaders_2.18.0
## [55] uroot_2.0-9        rpart_4.1-13       stringi_1.2.4     
## [58] tensorflow_1.10    tseries_0.10-46    TTR_0.23-4        
## [61] pkgbuild_1.0.2     lava_1.6.4         chron_2.3-53      
## [64] bitops_1.0-6       pkgconfig_2.0.2    matrixStats_0.54.0
## [67] evaluate_0.12      bindr_0.1.1        bit_1.1-14        
## [70] processx_3.2.0     tidyselect_0.2.5   R6_2.3.0          
## [73] generics_0.0.2     DBI_1.0.0          whisker_0.3-2     
## [76] pillar_1.3.0       haven_2.0.0        withr_2.1.2       
## [79] xts_0.11-2         sp_1.3-1           RCurl_1.95-4.11   
## [82] survival_2.43-3    nnet_7.3-12        modelr_0.1.2      
## [85] crayon_1.3.4       rmarkdown_1.10     grid_3.5.1        
## [88] readxl_1.1.0       blob_1.1.1         callr_3.0.0       
## [91] ModelMetrics_1.2.2 digest_0.6.18      stats4_3.5.1      
## [94] munsell_0.5.0      tcltk_3.5.1        quadprog_1.5-5

Setup the root directory.

Setting wd as the working directory.

wd <- getwd()

wd
## [1] "/Users/mdturse/Desktop/Analytics/Chicago_El_Divvy"

Visualize the Accuracy Measures

NOTE: all_split_rmse_mape the output produced in Step 07

all_split_rmse_mape <-
  readRDS(paste0(wd,
                 "/Models/",
                 "all_split_rmse_mape.Rds"
                 )
          )

Plot interpolation and extrapolation over time, by model.

plot_interp_extrap <-
  all_split_rmse_mape %>% 
  map(~ select(.x,
               arima.interpolation,
               arima.extrapolation,
               arima_xreg.interpolation,
               arima_xreg.extrapolation,
               prophet.interpolation,
               prophet.extrapolation,
               prophet_hol.interpolation,
               prophet_hol.extrapolation,
               rf.interpolation,
               rf.extrapolation,
               xgb.interpolation,
               xgb.extrapolation,
               h2o.interpolation,
               h2o.extrapolation,
               h2o.limvars.interpolation,
               h2o.limvars.extrapolation,
               keras.interpolation_rmse,
               keras.extrapolation_rmse,
               start_date,
               el_stop_id
               ) %>%
        as.data.frame %>% 
        rename(h2o_limvars.interpolation = h2o.limvars.interpolation,
               h2o_limvars.extrapolation = h2o.limvars.extrapolation,
               keras.interpolation = keras.interpolation_rmse,
               keras.extrapolation = keras.extrapolation_rmse
               ) %>% 
        gather(error, RMSE, -start_date, -el_stop_id) %>% 
        separate(error,
                 c("model", "type"),
                 sep = "\\.",
                 remove = FALSE
                 ) %>% 
        ggplot(aes(x = start_date,
                   y = RMSE,
                   # col = error
                   color = model
                   )
               ) +
        facet_wrap(~ type,
                   scales = "free"
                   ) +
        geom_point() +
        geom_line() +
        ggtitle(label = .$el_stop_id#,
                # subtitle = "ARIMA-based model"
                ) +
        theme_bw() +
        theme(legend.position = "bottom")
      )

plot_interp_extrap
## $`40600`

## 
## $`41140`

## 
## $`40120`

## 
## $`40910`

## 
## $`40380`

## 
## $`41660`

For each el_stop_id, plot the median RMSE for each model. First I munge the data.

median_RMSE_by_model <-
  pmap(.l = list(a = all_split_rmse_mape),
       .f = function(a) {
         med.arima = median(a$arima.extrapolation)
         med.arima_xreg = median(a$arima_xreg.extrapolation)
         med.prophet = median(a$prophet.extrapolation)
         med.prophet_hol = median(a$prophet_hol.extrapolation)
         med.rf = median(a$rf.extrapolation)
         med.xgb = median(a$xgb.extrapolation)
         med.h2o = median(a$h2o.extrapolation)
         med.h2o_limvars = median(a$h2o.limvars.extrapolation)
         med.keras = median(a$keras.extrapolation_rmse)
         
         med_extrap_all_models =
           data.frame(el_stop_id = unique(a$el_stop_id),
                      arima = med.arima,
                      arima_xreg = med.arima_xreg,
                      prophet = med.prophet,
                      prophet_hol = med.prophet_hol,
                      rf = med.rf,
                      xgb = med.xgb,
                      h2o = med.h2o,
                      h2o_limvars = med.h2o_limvars,
                      keras = med.keras
                      )
         
         return(med_extrap_all_models)
         }
       ) %>% 
  bind_rows()
## Warning in bind_rows_(x, .id): Unequal factor levels: coercing to character
## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector

## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector

## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector

## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector

## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector

## Warning in bind_rows_(x, .id): binding character and factor vector,
## coercing into character vector
# View(median_RMSE_by_model)

Now I create the plot itself.

median_RMSE_by_model %>% 
  gather(key = "model",
         value = "RMSE_med",
         -el_stop_id
         ) %>% 
  ggplot(aes(x = model,
             y = RMSE_med,
             fill = model
             )
         ) +
  geom_col() +
  geom_text(aes(label = format(round(RMSE_med, 1),
                               1
                               )
                ),
            size = 3,
            hjust = 1
            ) +
  facet_wrap(~ el_stop_id,
             ncol = 2,
             scales = "free"
             ) +
  coord_flip() +
  theme_bw() +
  theme(legend.position = "none") +
  NULL

rm(median_RMSE_by_model)