Setup

Load the relevant libraries.

# rm(list = ls())
# .rs.restartR()


# data manipulation
library("plyr")
library("tidyverse")
library("magrittr")
library("data.table")
library("lubridate")
library("sqldf")


# time series specific packages
library("timetk")
library("zoo")
library("tibbletime")


# modeling
library("fpp2")
library("prophet")
library("caret")
library("randomForest")
library("xgboost")
library("h2o")
library("keras")
# use_session_with_seed(123456789) # setting the seed to obtain reproducible results
# see https://keras.rstudio.com/articles/faq.html#how-can-i-obtain-reproducible-results-using-keras-during-development and https://cran.r-project.org/web/packages/keras/vignettes/faq.html
# can also re-enable gpu and parallel processing by using:  use_session_with_seed(42, disable_gpu = FALSE, disable_parallel_cpu = FALSE)



# other
library("geosphere")          # specific for distance calculations from lat-lon pairs
library("naniar")             # inspecting missing data
library("rlang")              # building functions
library("recipes")            # used in Keras modeling to design matrices
library("rsample")            # rolling samples for validation stats
library("tfruns")             # used in Keras modeling for trainin runs
library("stringr")            # string manipulation
library("ggplot2")            # viz
library("sweep")              # more easily pull out model statistics
library("yardstick")          # easily calculate accuracy stats
library("doParallel")         # parallel processing

Session Info.

sessionInfo()
## R version 3.5.1 (2018-07-02)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS High Sierra 10.13.6
## 
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRlapack.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] parallel  stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] doParallel_1.0.14   iterators_1.0.10    foreach_1.4.4      
##  [4] yardstick_0.0.2     sweep_0.2.1.1       tfruns_1.4         
##  [7] rsample_0.0.3       recipes_0.1.4       rlang_0.3.0.1      
## [10] naniar_0.4.1        geosphere_1.5-7     keras_2.2.4        
## [13] h2o_3.20.0.8        xgboost_0.71.2      randomForest_4.6-14
## [16] caret_6.0-81        lattice_0.20-38     prophet_0.3.0.1    
## [19] Rcpp_1.0.0          fpp2_2.3            expsmooth_2.3      
## [22] fma_2.3             forecast_8.4        tibbletime_0.1.1   
## [25] zoo_1.8-4           timetk_0.1.1.1      sqldf_0.4-11       
## [28] RSQLite_2.1.1       gsubfn_0.7          proto_1.0.0        
## [31] lubridate_1.7.4     data.table_1.11.8   magrittr_1.5       
## [34] forcats_0.3.0       stringr_1.3.1       dplyr_0.7.8        
## [37] purrr_0.2.5         readr_1.2.1         tidyr_0.8.2        
## [40] tibble_1.4.2        ggplot2_3.1.0       tidyverse_1.2.1    
## [43] plyr_1.8.4         
## 
## loaded via a namespace (and not attached):
##  [1] colorspace_1.3-2   class_7.3-14       visdat_0.5.1      
##  [4] rprojroot_1.3-2    base64enc_0.1-3    rstudioapi_0.8    
##  [7] rstan_2.18.2       bit64_0.9-7        prodlim_2018.04.18
## [10] xml2_1.2.0         codetools_0.2-15   splines_3.5.1     
## [13] knitr_1.20         zeallot_0.1.0      jsonlite_1.5      
## [16] pROC_1.13.0        broom_0.5.0        compiler_3.5.1    
## [19] httr_1.3.1         backports_1.1.2    assertthat_0.2.0  
## [22] Matrix_1.2-15      lazyeval_0.2.1     cli_1.0.1         
## [25] htmltools_0.3.6    prettyunits_1.0.2  tools_3.5.1       
## [28] bindrcpp_0.2.2     gtable_0.2.0       glue_1.3.0        
## [31] reshape2_1.4.3     cellranger_1.1.0   fracdiff_1.4-2    
## [34] urca_1.3-0         debugme_1.1.0      nlme_3.1-137      
## [37] lmtest_0.9-36      timeDate_3043.102  gower_0.1.2       
## [40] ps_1.2.1           rvest_0.3.2        MASS_7.3-51.1     
## [43] scales_1.0.0       ipred_0.9-8        hms_0.4.2         
## [46] inline_0.3.15      yaml_2.2.0         quantmod_0.4-13   
## [49] curl_3.2           reticulate_1.10    memoise_1.1.0     
## [52] gridExtra_2.3      loo_2.0.0          StanHeaders_2.18.0
## [55] uroot_2.0-9        rpart_4.1-13       stringi_1.2.4     
## [58] tensorflow_1.10    tseries_0.10-46    TTR_0.23-4        
## [61] pkgbuild_1.0.2     lava_1.6.4         chron_2.3-53      
## [64] bitops_1.0-6       pkgconfig_2.0.2    matrixStats_0.54.0
## [67] evaluate_0.12      bindr_0.1.1        bit_1.1-14        
## [70] processx_3.2.0     tidyselect_0.2.5   R6_2.3.0          
## [73] generics_0.0.1     DBI_1.0.0          whisker_0.3-2     
## [76] pillar_1.3.0       haven_2.0.0        withr_2.1.2       
## [79] xts_0.11-2         sp_1.3-1           RCurl_1.95-4.11   
## [82] survival_2.43-3    nnet_7.3-12        modelr_0.1.2      
## [85] crayon_1.3.4       rmarkdown_1.10     grid_3.5.1        
## [88] readxl_1.1.0       blob_1.1.1         callr_3.0.0       
## [91] ModelMetrics_1.2.2 digest_0.6.18      stats4_3.5.1      
## [94] munsell_0.5.0      tcltk_3.5.1        quadprog_1.5-5

Setup the root directory.

Setting wd as the working directory.

wd <- getwd()

wd
## [1] "/Users/mdturse/Desktop/Analytics/Chicago_El_Divvy"

Obtain Data

Note that the raw data were obtained and transformed into .Rds files in separate .R scripts. For full details, see https://github.com/supermdat/Chicago_El_Divvy.

Import the relevant .Rds files.

rds_file_paths <-
  list.files(path = paste0(wd,
                           "/Data/Processed"
                           ),
             pattern = "\\.*.Rds",
             full.names = TRUE
             )

rds_file_paths
## [1] "/Users/mdturse/Desktop/Analytics/Chicago_El_Divvy/Data/Processed/data_divvy_trips.Rds"            
## [2] "/Users/mdturse/Desktop/Analytics/Chicago_El_Divvy/Data/Processed/data_el_format_vars.Rds"         
## [3] "/Users/mdturse/Desktop/Analytics/Chicago_El_Divvy/Data/Processed/data_el_stations_format_vars.Rds"
## [4] "/Users/mdturse/Desktop/Analytics/Chicago_El_Divvy/Data/Processed/data_holidays_format_vars.Rds"   
## [5] "/Users/mdturse/Desktop/Analytics/Chicago_El_Divvy/Data/Processed/data_weather_format_vars.Rds"    
## [6] "/Users/mdturse/Desktop/Analytics/Chicago_El_Divvy/Data/Processed/data_weather.Rds"                
## [7] "/Users/mdturse/Desktop/Analytics/Chicago_El_Divvy/Data/Processed/divvy_data_stations.Rds"
rds_file_names <-
  str_replace(string = rds_file_paths,
              pattern = ".*/",
              replacement = ""
              )

# not importing the intermediate weather data
index_not_weather <-
  str_detect(string = rds_file_paths, pattern = "^(?!.*data_weather.Rds)")

rds_files <-
  rds_file_paths[index_not_weather] %>% 
  map(~ readRDS(file = .x)
      )

names(rds_files) <- rds_file_names[index_not_weather]


pmap(.l = list(a = rds_files,
               b = names(rds_files)
               ),
     .f = function(a, b) {
       message(b)
       
       str(a)
       }
     )
## data_divvy_trips.Rds
## Classes 'data.table' and 'data.frame':   13822258 obs. of  12 variables:
##  $ trip_id          : int  3940 4095 4113 4118 4119 4134 4162 4192 4216 4255 ...
##  $ bikeid           : int  914 480 711 480 711 145 711 303 907 907 ...
##  $ tripduration     : int  31177 301 140 316 87 11674 15758 60 171 22549 ...
##  $ from_station_id  : int  91 85 88 85 88 17 88 28 45 45 ...
##  $ from_station_name: Factor w/ 663 levels "2112 W Peterson Ave",..: 155 408 394 408 394 654 394 346 404 404 ...
##  $ to_station_id    : int  48 85 88 28 88 61 34 28 90 54 ...
##  $ to_station_name  : Factor w/ 663 levels "2112 W Peterson Ave",..: 345 408 394 346 394 657 94 346 414 444 ...
##  $ usertype         : Factor w/ 3 levels "Customer","Dependent",..: 3 3 3 1 3 3 3 3 3 3 ...
##  $ gender           : Factor w/ 3 levels "","Female","Male": 3 3 3 NA 3 3 3 3 3 3 ...
##  $ birthyear        : int  1982 1982 1982 NA 1982 1978 1982 1982 1982 1982 ...
##  $ start_dt         : POSIXct, format: "2013-06-27 01:06:00" "2013-06-27 12:06:00" ...
##  $ end_dt           : POSIXct, format: "2013-06-27 09:46:00" "2013-06-27 12:11:00" ...
##  - attr(*, ".internal.selfref")=<externalptr> 
##  - attr(*, "sorted")= chr  "trip_id" "start_dt"
## data_el_format_vars.Rds
## Classes 'data.table' and 'data.frame':   897032 obs. of  5 variables:
##  $ date       : Date, format: "2001-01-01" "2001-01-01" ...
##  $ daytype    : Factor w/ 3 levels "A","U","W": 2 2 2 2 2 2 2 2 2 2 ...
##  $ rides      : num  290 633 483 374 804 ...
##  $ station_id : num  40010 40020 40030 40040 40050 ...
##  $ stationname: Factor w/ 148 levels "18th","35-Bronzeville-IIT",..: 23 70 120 122 52 26 78 130 49 109 ...
##  - attr(*, ".internal.selfref")=<externalptr> 
##  - attr(*, "sorted")= chr  "date" "station_id"
## data_el_stations_format_vars.Rds
## Classes 'data.table' and 'data.frame':   300 obs. of  20 variables:
##  $ ada                     : logi  FALSE FALSE TRUE TRUE TRUE TRUE ...
##  $ blue                    : logi  TRUE TRUE FALSE FALSE FALSE FALSE ...
##  $ brn                     : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ direction_id            : Factor w/ 4 levels "E","N","S","W": 1 4 1 4 1 4 2 3 4 2 ...
##  $ g                       : logi  FALSE FALSE TRUE TRUE TRUE TRUE ...
##  $ location.type           : Factor w/ 1 level "Point": 1 1 1 1 1 1 1 1 1 1 ...
##  $ map_id                  : int  40010 40010 40020 40020 40030 40030 40040 40040 40480 40050 ...
##  $ o                       : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ p                       : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ pexp                    : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ pnk                     : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ red                     : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ station_descriptive_name: Factor w/ 145 levels "18th (Pink Line)",..: 24 24 73 73 119 119 122 122 43 55 ...
##  $ station_name            : Factor w/ 109 levels "18th","35th-Bronzeville-IIT",..: 20 20 52 52 89 89 90 90 30 38 ...
##  $ stop_id                 : int  30001 30002 30003 30004 30005 30006 30007 30008 30009 30010 ...
##  $ stop_name               : logi  NA NA NA NA NA NA ...
##  $ y                       : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ unnest_id               : int  35 34 117 116 210 211 212 213 65 86 ...
##  $ lat                     : num  41.9 41.9 41.9 41.9 41.9 ...
##  $ lon                     : num  -87.8 -87.8 -87.8 -87.8 -87.7 ...
##  - attr(*, ".internal.selfref")=<externalptr> 
##  - attr(*, "sorted")= chr "stop_id"
## data_holidays_format_vars.Rds
## Classes 'data.table' and 'data.frame':   129 obs. of  3 variables:
##  $ date           : Date, format: "2010-01-01" "2010-01-18" ...
##  $ holiday_name   : Factor w/ 24 levels "Casimir Pulaski Day",..: 18 14 21 16 15 10 12 5 23 22 ...
##  $ holiday_comment: chr  "" "Third Monday in January" "Third Monday in February" "2nd Sunday in May. Not a public holiday." ...
##  - attr(*, ".internal.selfref")=<externalptr> 
##  - attr(*, "sorted")= chr "date"
## data_weather_format_vars.Rds
## Classes 'data.table' and 'data.frame':   2922 obs. of  12 variables:
##  $ snwd     : num  2 2 2 2 2 2 6 6 5 5 ...
##  $ snow     : num  0 0 0 0 0 0 4.1 1.8 0 0 ...
##  $ station  : chr  "USC00111577" "USC00111577" "USC00111577" "USC00111577" ...
##  $ name     : chr  "CHICAGO MIDWAY AIRPORT 3 SW, IL US" "CHICAGO MIDWAY AIRPORT 3 SW, IL US" "CHICAGO MIDWAY AIRPORT 3 SW, IL US" "CHICAGO MIDWAY AIRPORT 3 SW, IL US" ...
##  $ latitude : num  41.7 41.7 41.7 41.7 41.7 ...
##  $ longitude: num  -87.8 -87.8 -87.8 -87.8 -87.8 ...
##  $ elevation: num  189 189 189 189 189 189 189 189 189 189 ...
##  $ date     : Date, format: "2010-01-01" "2010-01-02" ...
##  $ prcp     : num  0 0 0 0 0 0 0.32 0.11 0 0 ...
##  $ tmax     : int  18 12 19 19 23 23 21 28 24 19 ...
##  $ tmin     : int  5 3 1 9 15 8 15 15 9 0 ...
##  $ tobs     : int  11 6 12 16 15 16 20 15 10 19 ...
##  - attr(*, ".internal.selfref")=<externalptr> 
##  - attr(*, "sorted")= chr "date"
## divvy_data_stations.Rds
## Classes 'data.table' and 'data.frame':   4238 obs. of  12 variables:
##  $ file_og           : Factor w/ 8 levels "Divvy_Stations_2013.csv",..: 3 4 5 6 7 8 3 4 5 6 ...
##  $ file_og_start_date: Date, format: "2015-01-01" "2016-01-01" ...
##  $ file_og_end_date  : Date, format: "2015-12-31" "2016-06-30" ...
##  $ id                : num  2 2 2 2 2 2 3 3 3 3 ...
##  $ name              : Factor w/ 661 levels "2112 W Peterson Ave",..: 402 60 402 402 60 60 518 518 518 518 ...
##  $ latitude          : num  41.9 41.9 41.9 41.9 41.9 ...
##  $ longitude         : num  -87.6 -87.6 -87.6 -87.6 -87.6 ...
##  $ dpcapacity        : num  35 35 35 35 27 27 31 31 31 31 ...
##  $ landmark          : int  541 NA NA NA NA NA 544 NA NA NA ...
##  $ online_date       : Date, format: NA "2015-05-08" ...
##  $ city              : chr  NA NA NA NA ...
##  $ V8                : logi  NA NA NA NA NA NA ...
##  - attr(*, ".internal.selfref")=<externalptr> 
##  - attr(*, "sorted")= chr  "id" "file_og_start_date"
## $data_divvy_trips.Rds
## NULL
## 
## $data_el_format_vars.Rds
## NULL
## 
## $data_el_stations_format_vars.Rds
## NULL
## 
## $data_holidays_format_vars.Rds
## NULL
## 
## $data_weather_format_vars.Rds
## NULL
## 
## $divvy_data_stations.Rds
## NULL
rm(rds_file_names, rds_file_paths, index_not_weather)

Feature Engineering

Distance Between El and Divvy

I think the proximity of a close Divvy station could affect El ridership, so here, I’ll calculate: 1. The distance between El and Divvy stations; and 2. The number of Divvy stations that are “close” to an El station

“Crow Flies” Distance

First, I just get get the necessary data sets. Note 1: As data for Divvy stations regularly includes multiple latitude and longitude values for the same station, I will simply take the average of these values. Note 2: As data for Divvy stations regularly include multiple values for when a station began (online_date), I will simply take the oldest of these values. Note 3: The first Divvy station and trip occurred in June 2013, so to have complete data, I will begin analyses from July 2013 onward.

el_stations_latlon <-
  rds_files$data_el_stations_format_vars.Rds %>% 
  select(#stop_id,
         map_id,
         lat,
         lon
         ) %>% 
  distinct() %>% 
  rename(el_stop_id = map_id,
         el_lat = lat,
         el_lon = lon
         ) %>% 
  mutate(temp_col_for_joining = 1)

message("el_stations_latlon")
## el_stations_latlon
str(el_stations_latlon)
## 'data.frame':    144 obs. of  4 variables:
##  $ el_stop_id          : int  40010 40020 40030 40040 40480 40050 40060 40070 40080 40090 ...
##  $ el_lat              : num  41.9 41.9 41.9 41.9 41.9 ...
##  $ el_lon              : num  -87.8 -87.8 -87.7 -87.6 -87.7 ...
##  $ temp_col_for_joining: num  1 1 1 1 1 1 1 1 1 1 ...
# View(el_stations_latlon)


divvy_stations_latlon <-
  rds_files$divvy_data_stations.Rds %>% 
  group_by(id) %>% 
  summarise(divvy_lat = mean(latitude, na.rm = TRUE),
            divvy_lon = mean(longitude, na.rm = TRUE)
            ) %>% 
  ungroup() %>% 
  arrange(id)

# View(divvy_stations_latlon)
# View(rds_files$divvy_data_stations.Rds)

divvy_stations_online_date <-
  rds_files$divvy_data_stations.Rds %>% 
  group_by(id) %>% 
  summarise(divvy_online_date_min = min(online_date, na.rm = TRUE)
            ) %>% 
  ungroup() %>% 
  arrange(id)

# View(divvy_stations_online_date)

divvy_stations_latlon_online <-
  divvy_stations_latlon %>% 
  left_join(y = divvy_stations_online_date,
            by = "id"
            ) %>% 
  rename(divvy_station_id = id) %>% 
  mutate(temp_col_for_joining = 1)

# View(divvy_stations_latlon_online)
message("divvy_stations_latlon_online")
## divvy_stations_latlon_online
summary(divvy_stations_latlon_online)
##  divvy_station_id   divvy_lat       divvy_lon      divvy_online_date_min
##  Min.   :  2.0    Min.   :41.74   Min.   :-87.80   Min.   :2013-06-10   
##  1st Qu.:163.2    1st Qu.:41.85   1st Qu.:-87.68   1st Qu.:2013-07-19   
##  Median :315.5    Median :41.89   Median :-87.65   Median :2013-09-20   
##  Mean   :317.4    Mean   :41.89   Mean   :-87.66   Mean   :2014-07-01   
##  3rd Qu.:474.8    3rd Qu.:41.93   3rd Qu.:-87.63   3rd Qu.:2015-02-10   
##  Max.   :626.0    Max.   :42.06   Max.   :-87.55   Max.   :2017-12-21   
##  temp_col_for_joining
##  Min.   :1           
##  1st Qu.:1           
##  Median :1           
##  Mean   :1           
##  3rd Qu.:1           
##  Max.   :1
message("divvy_stations_latlon_online$divvy_online_date_min")
## divvy_stations_latlon_online$divvy_online_date_min
summary(divvy_stations_latlon_online$divvy_online_date_min)
##         Min.      1st Qu.       Median         Mean      3rd Qu. 
## "2013-06-10" "2013-07-19" "2013-09-20" "2014-07-01" "2015-02-10" 
##         Max. 
## "2017-12-21"
message("rds_files$data_divvy_trips.Rds$start_dt")
## rds_files$data_divvy_trips.Rds$start_dt
summary(rds_files$data_divvy_trips.Rds$start_dt)
##                  Min.               1st Qu.                Median 
## "2013-06-27 01:06:00" "2015-04-10 07:03:00" "2016-04-17 21:57:00" 
##                  Mean               3rd Qu.                  Max. 
## "2016-02-02 02:34:14" "2017-03-21 09:45:18" "2017-12-31 23:58:00"
rm(divvy_stations_latlon, divvy_stations_online_date)

Now I simply get the unique combinations of El stations and Divvy stations into one dataset.

stations_el_divvy_all <-
  el_stations_latlon %>% 
  inner_join(y = divvy_stations_latlon_online,
             by = "temp_col_for_joining"
             ) %>% 
  arrange(el_stop_id,
          divvy_station_id
          ) %>% 
  select(-temp_col_for_joining)

str(stations_el_divvy_all)
## 'data.frame':    84384 obs. of  7 variables:
##  $ el_stop_id           : int  40010 40010 40010 40010 40010 40010 40010 40010 40010 40010 ...
##  $ el_lat               : num  41.9 41.9 41.9 41.9 41.9 ...
##  $ el_lon               : num  -87.8 -87.8 -87.8 -87.8 -87.8 ...
##  $ divvy_station_id     : num  2 3 4 5 6 7 9 11 12 13 ...
##  $ divvy_lat            : num  41.9 41.9 41.9 41.9 41.9 ...
##  $ divvy_lon            : num  -87.6 -87.6 -87.6 -87.6 -87.6 ...
##  $ divvy_online_date_min: Date, format: "2013-06-10" "2013-06-10" ...
# View(stations_el_divvy_all %>% head(1000))


# rm(el_stations_latlon)
rm(divvy_stations_latlon_online)

Here I calculate the Haversine (“crow flies”) distance. This is not exactly the same as the walking distance (e.g., from Google Maps), but it should be similar, and this calculation avoids having to pay for using the Google Distance Matrix API).

# user  system elapsed 
#  32.793   3.027  36.038
system.time(
distance_el_divvy_stations <-
  stations_el_divvy_all %>% 
  mutate(dist_miles = by(data = stations_el_divvy_all,
                         INDICES = 1:nrow(stations_el_divvy_all),
                         FUN = function(row) {
                           distHaversine(p1 = c(row$el_lon, row$el_lat),
                                         p2 = c(row$divvy_lon, row$divvy_lat)
                                         )
                           }
                         ) / 1609.344, # meters in 1 mile
         dist_miles = as.numeric(dist_miles), # this is done to force the variable to be numeric instead of 'by' numeric
         temp_col_for_joining = 1
         ) %>% 
  as.data.table() %>% 
  setkey(temp_col_for_joining,
         el_stop_id,
         divvy_online_date_min
         # dist_miles
         )
)
##    user  system elapsed 
##  28.627   2.194  30.920
str(distance_el_divvy_stations)
## Classes 'data.table' and 'data.frame':   84384 obs. of  9 variables:
##  $ el_stop_id           : int  40010 40010 40010 40010 40010 40010 40010 40010 40010 40010 ...
##  $ el_lat               : num  41.9 41.9 41.9 41.9 41.9 ...
##  $ el_lon               : num  -87.8 -87.8 -87.8 -87.8 -87.8 ...
##  $ divvy_station_id     : num  2 3 4 5 6 81 46 74 36 75 ...
##  $ divvy_lat            : num  41.9 41.9 41.9 41.9 41.9 ...
##  $ divvy_lon            : num  -87.6 -87.6 -87.6 -87.6 -87.6 ...
##  $ divvy_online_date_min: Date, format: "2013-06-10" "2013-06-10" ...
##  $ dist_miles           : num  7.95 8.32 8.48 7.68 8.5 ...
##  $ temp_col_for_joining : num  1 1 1 1 1 1 1 1 1 1 ...
##  - attr(*, ".internal.selfref")=<externalptr> 
##  - attr(*, "sorted")= chr  "temp_col_for_joining" "el_stop_id" "divvy_online_date_min"
summary(distance_el_divvy_stations)
##    el_stop_id        el_lat          el_lon       divvy_station_id
##  Min.   :40010   Min.   :41.72   Min.   :-87.90   Min.   :  2.0   
##  1st Qu.:40388   1st Qu.:41.86   1st Qu.:-87.71   1st Qu.:163.0   
##  Median :40785   Median :41.89   Median :-87.67   Median :315.5   
##  Mean   :40790   Mean   :41.90   Mean   :-87.68   Mean   :317.4   
##  3rd Qu.:41182   3rd Qu.:41.95   3rd Qu.:-87.63   3rd Qu.:475.0   
##  Max.   :41700   Max.   :42.07   Max.   :-87.61   Max.   :626.0   
##    divvy_lat       divvy_lon      divvy_online_date_min
##  Min.   :41.74   Min.   :-87.80   Min.   :2013-06-10   
##  1st Qu.:41.85   1st Qu.:-87.68   1st Qu.:2013-07-19   
##  Median :41.89   Median :-87.65   Median :2013-09-20   
##  Mean   :41.89   Mean   :-87.66   Mean   :2014-07-01   
##  3rd Qu.:41.93   3rd Qu.:-87.63   3rd Qu.:2015-02-10   
##  Max.   :42.06   Max.   :-87.55   Max.   :2017-12-21   
##    dist_miles        temp_col_for_joining
##  Min.   : 0.006815   Min.   :1           
##  1st Qu.: 3.552514   1st Qu.:1           
##  Median : 6.096796   Median :1           
##  Mean   : 6.682220   Mean   :1           
##  3rd Qu.: 9.044057   3rd Qu.:1           
##  Max.   :24.302634   Max.   :1
# View(head(distance_el_divvy_stations, 1000))

rm(stations_el_divvy_all)

Calculate a feature counting the number of Divvy Stations within X miles of every El Station (on every day)

As some Divvy stations came online (came into use) on diffrent dates, here I manipulate the data to ensure combinations of El-Divvy stations are only counted after the Divvy station is in operation.

First, I create the dataframe of all dates used in the analyses (2013-07-01 to 2018-01-01).

el_dates <-
  data.frame("el_date" = seq(from = ymd("2013-07-01"),
                             to = ymd("2018-01-01"),
                             by ="day"
                             )
             ) %>% 
  mutate(temp_col_for_joining = 1) %>% 
  as.data.table() %>% 
  setkey(temp_col_for_joining,
         el_date
         )

message("el_dates")
## el_dates
str(el_dates)
## Classes 'data.table' and 'data.frame':   1646 obs. of  2 variables:
##  $ el_date             : Date, format: "2013-07-01" "2013-07-02" ...
##  $ temp_col_for_joining: num  1 1 1 1 1 1 1 1 1 1 ...
##  - attr(*, ".internal.selfref")=<externalptr> 
##  - attr(*, "sorted")= chr  "temp_col_for_joining" "el_date"

Now, for those instances when the el_station–divvy_station distance is 0.5 miles or below, I create a dataset that incorporates these combinations with all el_dates dates (i.e., all the dates in the analyses).

el_divvy_distance_setting <- 0.5

dist_miles_pt5 <-
  distance_el_divvy_stations[dist_miles <= el_divvy_distance_setting]

# View(dist_miles_pt5)
rm(el_divvy_distance_setting)


# user  system elapsed 
#   0.413   0.082   0.541
system.time(
  full_dates_dt <-
    merge(x = el_dates,
          # y = distance_el_divvy_stations,
          y = dist_miles_pt5,
          by.x = "temp_col_for_joining",
          by.y = "temp_col_for_joining",
          all = TRUE,
          allow.cartesian = TRUE
          ) %>% 
    setkey(el_stop_id,
           el_date,
           divvy_online_date_min
           )
  )
##    user  system elapsed 
##   0.503   0.127   0.458
message("full_dates_dt")
## full_dates_dt
str(full_dates_dt)
## Classes 'data.table' and 'data.frame':   1654230 obs. of  10 variables:
##  $ temp_col_for_joining : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ el_date              : Date, format: "2013-07-01" "2013-07-02" ...
##  $ el_stop_id           : int  40010 40010 40010 40010 40010 40010 40010 40010 40010 40010 ...
##  $ el_lat               : num  41.9 41.9 41.9 41.9 41.9 ...
##  $ el_lon               : num  -87.8 -87.8 -87.8 -87.8 -87.8 ...
##  $ divvy_station_id     : num  618 618 618 618 618 618 618 618 618 618 ...
##  $ divvy_lat            : num  41.9 41.9 41.9 41.9 41.9 ...
##  $ divvy_lon            : num  -87.8 -87.8 -87.8 -87.8 -87.8 ...
##  $ divvy_online_date_min: Date, format: "2016-06-23" "2016-06-23" ...
##  $ dist_miles           : num  0.107 0.107 0.107 0.107 0.107 ...
##  - attr(*, "sorted")= chr  "el_stop_id" "el_date" "divvy_online_date_min"
##  - attr(*, ".internal.selfref")=<externalptr>
# View(full_dates_dt %>% tail(10000))
rm(dist_miles_pt5)

Now I limit the dataset to only those el_dates that appear AFTER divvy_online_date_min (the date the Divvy station was made available).

within_pt5miles_after_online <-
  full_dates_dt[ , c("temp_col_for_joining",
                     "el_lat",
                     "el_lon",
                     "divvy_lat",
                     "divvy_lon"
                     ) := NULL
                 ][el_date >= divvy_online_date_min
                   ][order(el_date,
                           el_stop_id,
                           dist_miles
                           )
                     ]

message("within_pt5miles_after_online")
## within_pt5miles_after_online
str(within_pt5miles_after_online)
## Classes 'data.table' and 'data.frame':   1461999 obs. of  5 variables:
##  $ el_date              : Date, format: "2013-07-01" "2013-07-01" ...
##  $ el_stop_id           : int  40040 40040 40040 40040 40040 40040 40040 40040 40040 40040 ...
##  $ divvy_station_id     : num  40 36 37 50 89 49 33 75 18 68 ...
##  $ divvy_online_date_min: Date, format: "2013-06-25" "2013-06-15" ...
##  $ dist_miles           : num  0.1 0.108 0.208 0.252 0.259 ...
##  - attr(*, ".internal.selfref")=<externalptr>
# View(within_pt5miles_after_online %>% filter(el_stop_id == 40090))
rm(full_dates_dt)

Here, for each el_stop_id and el_date, I count the number of instances - which are the number of Divvy stations within 0.5 miles of the El station on that day

within_pt5miles_cnts_el_stop_date <-
  within_pt5miles_after_online %>% 
  count(el_stop_id,
        el_date
        ) %>% 
  rename(num_divvy_stns_pt5_miles = n)

message("within_pt5miles_cnts_el_stop_date")
## within_pt5miles_cnts_el_stop_date
str(within_pt5miles_cnts_el_stop_date)
## Classes 'tbl_df', 'tbl' and 'data.frame':    165485 obs. of  3 variables:
##  $ el_stop_id              : int  40010 40010 40010 40010 40010 40010 40010 40010 40010 40010 ...
##  $ el_date                 : Date, format: "2016-06-23" "2016-06-24" ...
##  $ num_divvy_stns_pt5_miles: int  1 1 1 1 1 1 1 1 1 1 ...
##  - attr(*, ".internal.selfref")=<externalptr>
# View(within_pt5miles_cnts_el_stop_date %>% filter(el_stop_id == 40090))
# rm(within_pt5miles_after_online)

For each unique combination of el_stations and el_dates, now I add in the number of Divvy stations within 0.5 miles (as of that date). This will be a variable used when modeling/forecasting.

el_stations_all_dates <-
  el_dates %>% 
  inner_join(y = el_stations_latlon %>% select(el_stop_id, temp_col_for_joining),
             by = "temp_col_for_joining"
             ) %>% 
  arrange(el_stop_id,
          el_date
          ) %>% 
  select(el_stop_id,
         el_date
         )

message("el_stations_all_dates")
## el_stations_all_dates
str(el_stations_all_dates)
## 'data.frame':    237024 obs. of  2 variables:
##  $ el_stop_id: int  40010 40010 40010 40010 40010 40010 40010 40010 40010 40010 ...
##  $ el_date   : Date, format: "2013-07-01" "2013-07-02" ...
# View(el_stations_all_dates %>% head(1000))
rm(el_dates, el_stations_latlon)



el_stns_full_dates_within_pt5 <-
  el_stations_all_dates %>% 
  left_join(y = within_pt5miles_cnts_el_stop_date,
            by = c("el_stop_id" = "el_stop_id",
                   "el_date" = "el_date"
                   )
            ) %>% 
  left_join(y = within_pt5miles_after_online,
            by = c("el_stop_id" = "el_stop_id",
                   "el_date" = "el_date"
                   )
            ) %>% 
  mutate(num_divvy_stns_pt5_miles = if_else(is.na(num_divvy_stns_pt5_miles),
                                            as.integer(0),
                                            num_divvy_stns_pt5_miles
                                            )
         ) %>% 
  as.data.table() %>% 
  setkey(el_stop_id,
         el_date
         )

message("el_stns_full_dates_within_pt5")
## el_stns_full_dates_within_pt5
str(el_stns_full_dates_within_pt5)
## Classes 'data.table' and 'data.frame':   1562084 obs. of  6 variables:
##  $ el_stop_id              : int  40010 40010 40010 40010 40010 40010 40010 40010 40010 40010 ...
##  $ el_date                 : Date, format: "2013-07-01" "2013-07-02" ...
##  $ num_divvy_stns_pt5_miles: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_station_id        : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ divvy_online_date_min   : Date, format: NA NA ...
##  $ dist_miles              : num  NA NA NA NA NA NA NA NA NA NA ...
##  - attr(*, ".internal.selfref")=<externalptr> 
##  - attr(*, "sorted")= chr  "el_stop_id" "el_date"
# View(el_stns_full_dates_within_pt5 %>% filter(el_stop_id == 40090))
rm(within_pt5miles_cnts_el_stop_date)


## Save the original data to the proper folder
saveRDS(el_stns_full_dates_within_pt5,
        paste0(wd,
               "/Data/Interim/",
               "el_stns_full_dates_within_pt5.Rds"
               )
        )


# confirming the steps above worked
message("distance_el_divvy_stations")
## distance_el_divvy_stations
str(distance_el_divvy_stations)
## Classes 'data.table' and 'data.frame':   84384 obs. of  9 variables:
##  $ el_stop_id           : int  40010 40010 40010 40010 40010 40010 40010 40010 40010 40010 ...
##  $ el_lat               : num  41.9 41.9 41.9 41.9 41.9 ...
##  $ el_lon               : num  -87.8 -87.8 -87.8 -87.8 -87.8 ...
##  $ divvy_station_id     : num  2 3 4 5 6 81 46 74 36 75 ...
##  $ divvy_lat            : num  41.9 41.9 41.9 41.9 41.9 ...
##  $ divvy_lon            : num  -87.6 -87.6 -87.6 -87.6 -87.6 ...
##  $ divvy_online_date_min: Date, format: "2013-06-10" "2013-06-10" ...
##  $ dist_miles           : num  7.95 8.32 8.48 7.68 8.5 ...
##  $ temp_col_for_joining : num  1 1 1 1 1 1 1 1 1 1 ...
##  - attr(*, ".internal.selfref")=<externalptr> 
##  - attr(*, "sorted")= chr  "temp_col_for_joining" "el_stop_id" "divvy_online_date_min"
# View(distance_el_divvy_stations[el_stop_id == 40090 &
#                                   dist_miles <= 0.5
#                                 ][order(divvy_online_date_min)]
#      )


rm(within_pt5miles_after_online)

Calculate a feature calculating the closest Divvy station to each El station (on every day)

Note: This must be done for each combination of date and El station as Divvy stations are only online (made available) after certain dates.

Because the combination of data leads to a very large data file, I use purrr:pmap to do the joining separatey for each el_stop_id, and then combine the results into a single data.table.

Here I create the data.tables to use - duplicate variables for el_date2 and divvy_online_date_min are created for the purpose of merging the datasets.

message("el_stations_all_dates")
## el_stations_all_dates
str(el_stations_all_dates)
## 'data.frame':    237024 obs. of  2 variables:
##  $ el_stop_id: int  40010 40010 40010 40010 40010 40010 40010 40010 40010 40010 ...
##  $ el_date   : Date, format: "2013-07-01" "2013-07-02" ...
message("distance_el_divvy_stations")
## distance_el_divvy_stations
str(distance_el_divvy_stations)
## Classes 'data.table' and 'data.frame':   84384 obs. of  9 variables:
##  $ el_stop_id           : int  40010 40010 40010 40010 40010 40010 40010 40010 40010 40010 ...
##  $ el_lat               : num  41.9 41.9 41.9 41.9 41.9 ...
##  $ el_lon               : num  -87.8 -87.8 -87.8 -87.8 -87.8 ...
##  $ divvy_station_id     : num  2 3 4 5 6 81 46 74 36 75 ...
##  $ divvy_lat            : num  41.9 41.9 41.9 41.9 41.9 ...
##  $ divvy_lon            : num  -87.6 -87.6 -87.6 -87.6 -87.6 ...
##  $ divvy_online_date_min: Date, format: "2013-06-10" "2013-06-10" ...
##  $ dist_miles           : num  7.95 8.32 8.48 7.68 8.5 ...
##  $ temp_col_for_joining : num  1 1 1 1 1 1 1 1 1 1 ...
##  - attr(*, ".internal.selfref")=<externalptr> 
##  - attr(*, "sorted")= chr  "temp_col_for_joining" "el_stop_id" "divvy_online_date_min"
el_stations_all_dates_dt <-
  el_stations_all_dates %>% 
  mutate(el_date2 = el_date) %>% 
  as.data.table() %>% 
  setkey(el_stop_id,
         el_date2
         )

distance_el_divvy_stations_dt <-
  distance_el_divvy_stations[ , divvy_online_date_min2 := divvy_online_date_min] %>% 
  setkey(el_stop_id,
         divvy_online_date_min2
         )


## Save the original data to the proper folder
saveRDS(distance_el_divvy_stations_dt,
        paste0(wd,
               "/Data/Interim/",
               "distance_el_divvy_stations_dt.Rds"
               )
        )


message("el_stations_all_dates_dt")
## el_stations_all_dates_dt
str(el_stations_all_dates_dt)
## Classes 'data.table' and 'data.frame':   237024 obs. of  3 variables:
##  $ el_stop_id: int  40010 40010 40010 40010 40010 40010 40010 40010 40010 40010 ...
##  $ el_date   : Date, format: "2013-07-01" "2013-07-02" ...
##  $ el_date2  : Date, format: "2013-07-01" "2013-07-02" ...
##  - attr(*, ".internal.selfref")=<externalptr> 
##  - attr(*, "sorted")= chr  "el_stop_id" "el_date2"
message("distance_el_divvy_stations_dt")
## distance_el_divvy_stations_dt
str(distance_el_divvy_stations_dt)
## Classes 'data.table' and 'data.frame':   84384 obs. of  10 variables:
##  $ el_stop_id            : int  40010 40010 40010 40010 40010 40010 40010 40010 40010 40010 ...
##  $ el_lat                : num  41.9 41.9 41.9 41.9 41.9 ...
##  $ el_lon                : num  -87.8 -87.8 -87.8 -87.8 -87.8 ...
##  $ divvy_station_id      : num  2 3 4 5 6 81 46 74 36 75 ...
##  $ divvy_lat             : num  41.9 41.9 41.9 41.9 41.9 ...
##  $ divvy_lon             : num  -87.6 -87.6 -87.6 -87.6 -87.6 ...
##  $ divvy_online_date_min : Date, format: "2013-06-10" "2013-06-10" ...
##  $ dist_miles            : num  7.95 8.32 8.48 7.68 8.5 ...
##  $ temp_col_for_joining  : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ divvy_online_date_min2: Date, format: "2013-06-10" "2013-06-10" ...
##  - attr(*, ".internal.selfref")=<externalptr> 
##  - attr(*, "sorted")= chr  "el_stop_id" "divvy_online_date_min2"
rm(el_stations_all_dates, distance_el_divvy_stations)

Now, using purrr:pmap I perform the merge for each value of el_stop_id, then I put the results together into a single data.table.

## create the distinct values for el_stop_id
el_stns_dstnct <-
  el_stations_all_dates_dt %>% 
  select(el_stop_id) %>% 
  distinct() %>% 
  pull()


## the merge itself
# user  system elapsed 
# 108.265   4.020 113.568
system.time(
pmap_merge <-
  pmap(.l = list(a = el_stns_dstnct),
       .f = function(a) {
         # create small data.tables
         sngle_el_stn_all_dates = el_stations_all_dates_dt[el_stop_id == a]
         sngle_dstnc_el_divvy_stns = distance_el_divvy_stations_dt[el_stop_id == a]
         
         # the merge
         full_dt =
           sngle_dstnc_el_divvy_stns[sngle_el_stn_all_dates,
                                     on = .(el_stop_id = el_stop_id,
                                            divvy_online_date_min2 <= el_date2
                                            ),
                                     allow.cartesian = TRUE
                                     ]
         
         min_dt =
           full_dt %>% 
           group_by(el_stop_id,
                    el_date
                    ) %>% 
           top_n(n = -1, wt = dist_miles) %>% 
           ungroup() %>% 
           rename(min_dist_miles = dist_miles) %>% 
           select(el_stop_id,
                  el_date,
                  min_dist_miles,
                  divvy_station_id
                  ) %>% 
           as.data.table() %>% 
           setkey(el_stop_id,
                  el_date
                  )
         
         return(min_dt)
         }
       )
)
##    user  system elapsed 
## 107.347   6.872 119.636
## create the single data.table
el_stns_full_dates_miles_to_closest_divvy <-
  bind_rows(pmap_merge) %>% 
  distinct() %>% 
  setkey(el_stop_id,
         el_date
         )

str(el_stns_full_dates_miles_to_closest_divvy)
## Classes 'data.table' and 'data.frame':   235378 obs. of  4 variables:
##  $ el_stop_id      : int  40010 40010 40010 40010 40010 40010 40010 40010 40010 40010 ...
##  $ el_date         : Date, format: "2013-07-01" "2013-07-02" ...
##  $ min_dist_miles  : num  5.75 5.75 5.75 5.75 5.75 ...
##  $ divvy_station_id: num  9 9 9 9 9 9 9 9 9 9 ...
##  - attr(*, "sorted")= chr  "el_stop_id" "el_date"
# View(el_stns_full_dates_miles_to_closest_divvy[el_stop_id == 40090])


## Save the original data to the proper folder
saveRDS(el_stns_full_dates_miles_to_closest_divvy,
        paste0(wd,
               "/Data/Interim/",
               "el_stns_full_dates_miles_to_closest_divvy.Rds"
               )
        )


rm(el_stns_dstnct, pmap_merge, el_stations_all_dates_dt)

Munge Divvy trips data into something more usable

Inspecting divvy trips by day. First, looking at groupings of the “from” station to the “to” station. These data are interesting, but look too infrequent to be very useful. Also, I don’t believe the analyses at this level will vary greatly on where the trip was going to.

message("rds_files$data_divvy_trips.Rds")
## rds_files$data_divvy_trips.Rds
str(rds_files$data_divvy_trips.Rds)
## Classes 'data.table' and 'data.frame':   13822258 obs. of  12 variables:
##  $ trip_id          : int  3940 4095 4113 4118 4119 4134 4162 4192 4216 4255 ...
##  $ bikeid           : int  914 480 711 480 711 145 711 303 907 907 ...
##  $ tripduration     : int  31177 301 140 316 87 11674 15758 60 171 22549 ...
##  $ from_station_id  : int  91 85 88 85 88 17 88 28 45 45 ...
##  $ from_station_name: Factor w/ 663 levels "2112 W Peterson Ave",..: 155 408 394 408 394 654 394 346 404 404 ...
##  $ to_station_id    : int  48 85 88 28 88 61 34 28 90 54 ...
##  $ to_station_name  : Factor w/ 663 levels "2112 W Peterson Ave",..: 345 408 394 346 394 657 94 346 414 444 ...
##  $ usertype         : Factor w/ 3 levels "Customer","Dependent",..: 3 3 3 1 3 3 3 3 3 3 ...
##  $ gender           : Factor w/ 3 levels "","Female","Male": 3 3 3 NA 3 3 3 3 3 3 ...
##  $ birthyear        : int  1982 1982 1982 NA 1982 1978 1982 1982 1982 1982 ...
##  $ start_dt         : POSIXct, format: "2013-06-27 01:06:00" "2013-06-27 12:06:00" ...
##  $ end_dt           : POSIXct, format: "2013-06-27 09:46:00" "2013-06-27 12:11:00" ...
##  - attr(*, ".internal.selfref")=<externalptr> 
##  - attr(*, "sorted")= chr  "trip_id" "start_dt"
divvy_trips_by_day_from_to <-
  rds_files$data_divvy_trips.Rds[ , start_date:= as_date(start_dt)
                                 ][ ,
                                   .(trips_cnt = .N,
                                     # trip_duration_mean = mean(tripduration, na.rm = TRUE),
                                     # trip_duration_med = median(tripduration, na.rm = TRUE),
                                     trip_duration_sum = sum(tripduration)
                                     ),
                                   keyby = c("start_date",
                                             "usertype",
                                             "from_station_id",
                                             "to_station_id"
                                             )
                                   ]

message("divvy_trips_by_day_from_to")
## divvy_trips_by_day_from_to
str(divvy_trips_by_day_from_to)
## Classes 'data.table' and 'data.frame':   9080673 obs. of  6 variables:
##  $ start_date       : Date, format: "2013-06-27" "2013-06-27" ...
##  $ usertype         : Factor w/ 3 levels "Customer","Dependent",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ from_station_id  : int  17 19 19 20 24 32 32 32 34 37 ...
##  $ to_station_id    : int  61 19 55 46 24 15 19 32 34 37 ...
##  $ trips_cnt        : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ trip_duration_sum: int  730 123 852 1712 3482 423 433 64 2285 3419 ...
##  - attr(*, "sorted")= chr  "start_date" "usertype" "from_station_id" "to_station_id"
##  - attr(*, ".internal.selfref")=<externalptr>
# View(tail(divvy_trips_by_day_from_to, 1000))


rm(divvy_trips_by_day_from_to)

So here, I create counts, and tripduration stats based on the start_date and the from_station_id.

divvy_trips_by_day_from_l <-
  rds_files$data_divvy_trips.Rds[ , start_date:= as_date(start_dt)
                                 ][ ,
                                   .(trip_cnt__dayfrom = .N,
                                     # trip_duration_mean__dayfrom =
                                     #   mean(tripduration, na.rm = TRUE),
                                     # trip_duration_med__dayfrom =
                                     #   median(tripduration, na.rm = TRUE),
                                     trip_duration_sum__dayfrom =
                                       sum(tripduration)
                                     ),
                                   keyby = c("start_date",
                                             "usertype",
                                             "from_station_id"
                                             )
                                   ]

message("divvy_trips_by_day_from_l")
## divvy_trips_by_day_from_l
str(divvy_trips_by_day_from_l)
## Classes 'data.table' and 'data.frame':   925604 obs. of  5 variables:
##  $ start_date                : Date, format: "2013-06-27" "2013-06-27" ...
##  $ usertype                  : Factor w/ 3 levels "Customer","Dependent",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ from_station_id           : int  17 19 20 24 32 34 37 44 51 52 ...
##  $ trip_cnt__dayfrom         : int  1 2 1 1 3 1 2 2 1 2 ...
##  $ trip_duration_sum__dayfrom: int  730 975 1712 3482 920 2285 4513 4674 653 1814 ...
##  - attr(*, "sorted")= chr  "start_date" "usertype" "from_station_id"
##  - attr(*, ".internal.selfref")=<externalptr>
# View(divvy_trips_by_day_from_l[from_station_id == 238])

Munge the data into a “wider” format based on the usertype value.

divvy_trips_by_day_from <-
  divvy_trips_by_day_from_l %>% 
  gather(key = variable,
         value = value,
         trip_cnt__dayfrom:trip_duration_sum__dayfrom
         ) %>% 
  unite(col = temp,
        usertype,
        variable,
        sep = "_"
        ) %>%
  spread(key = temp,
         value = value,
         fill = 0
         ) %>% 
  as.data.table() %>% 
  setkey(start_date,
         from_station_id
         )

message("divvy_trips_by_day_from")
## divvy_trips_by_day_from
str(divvy_trips_by_day_from)
## Classes 'data.table' and 'data.frame':   605030 obs. of  8 variables:
##  $ start_date                           : Date, format: "2013-06-27" "2013-06-27" ...
##  $ from_station_id                      : int  17 19 20 24 28 31 32 34 36 37 ...
##  $ Customer_trip_cnt__dayfrom           : num  1 2 1 1 0 0 3 1 0 2 ...
##  $ Customer_trip_duration_sum__dayfrom  : num  730 975 1712 3482 0 ...
##  $ Dependent_trip_cnt__dayfrom          : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Dependent_trip_duration_sum__dayfrom : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Subscriber_trip_cnt__dayfrom         : num  3 0 0 0 1 1 1 1 1 2 ...
##  $ Subscriber_trip_duration_sum__dayfrom: num  12651 0 0 0 60 ...
##  - attr(*, ".internal.selfref")=<externalptr> 
##  - attr(*, "sorted")= chr  "start_date" "from_station_id"
# View(divvy_trips_by_day_from[from_station_id == 238])


## Save the data to the proper folder
saveRDS(divvy_trips_by_day_from,
        paste0(wd,
               "/Data/Interim/",
               "divvy_trips_by_day_from.Rds"
               )
        )


## Quick plot inspection
frequently_used_station <-
  divvy_trips_by_day_from %>% 
  top_n(n = 1, wt = Subscriber_trip_cnt__dayfrom) %>% 
  pull(from_station_id)

divvy_trips_by_day_from[from_station_id == frequently_used_station] %>% 
  ggplot(aes(x = start_date,
             y = Subscriber_trip_cnt__dayfrom
             )
         ) +
  geom_line()

rm(frequently_used_station, divvy_trips_by_day_from_l)

And now I create counts, and tripduration stats based on the start_date (i.e., without considering from_station_id).

divvy_trips_by_day_l <-
  rds_files$data_divvy_trips.Rds[ , start_date:= as_date(start_dt)
                                 ][ ,
                                   .(trip_cnt__day = .N,
                                     trip_duration_mean__day =
                                       mean(tripduration, na.rm = TRUE),
                                     trip_duration_med__day =
                                       median(tripduration, na.rm = TRUE)
                                     ),
                                   keyby = c("start_date",
                                             "usertype"
                                             )
                                   ]

message("divvy_trips_by_day_l")
## divvy_trips_by_day_l
str(divvy_trips_by_day_l)
## Classes 'data.table' and 'data.frame':   3398 obs. of  5 variables:
##  $ start_date             : Date, format: "2013-06-27" "2013-06-27" ...
##  $ usertype               : Factor w/ 3 levels "Customer","Dependent",..: 1 3 1 3 1 3 1 3 1 3 ...
##  $ trip_cnt__day          : int  36 59 500 397 996 205 1588 224 1022 537 ...
##  $ trip_duration_mean__day: num  1644 5258 2546 1245 1965 ...
##  $ trip_duration_med__day : num  1121 735 1400 821 1316 ...
##  - attr(*, "sorted")= chr  "start_date" "usertype"
##  - attr(*, ".internal.selfref")=<externalptr>
# View(divvy_trips_by_day_l)

As above, here I create a wider dataset vased on the usertype value.

divvy_trips_by_day <-
  divvy_trips_by_day_l %>% 
  gather(key = variable,
         value = value,
         trip_cnt__day:trip_duration_med__day
         ) %>% 
  unite(col = temp,
        usertype,
        variable,
        sep = "_"
        ) %>%
  spread(key = temp,
         value = value,
         fill = 0
         ) %>% 
  as.data.table() %>% 
  setkey(start_date)

message("divvy_trips_by_day")
## divvy_trips_by_day
str(divvy_trips_by_day)
## Classes 'data.table' and 'data.frame':   1647 obs. of  10 variables:
##  $ start_date                        : Date, format: "2013-06-27" "2013-06-28" ...
##  $ Customer_trip_cnt__day            : num  36 500 996 1588 1022 ...
##  $ Customer_trip_duration_mean__day  : num  1644 2546 1965 2417 2109 ...
##  $ Customer_trip_duration_med__day   : num  1121 1400 1316 1430 1262 ...
##  $ Dependent_trip_cnt__day           : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Dependent_trip_duration_mean__day : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Dependent_trip_duration_med__day  : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Subscriber_trip_cnt__day          : num  59 397 205 224 537 509 528 304 392 260 ...
##  $ Subscriber_trip_duration_mean__day: num  5258 1245 1431 888 1050 ...
##  $ Subscriber_trip_duration_med__day : num  735 821 769 582 639 ...
##  - attr(*, ".internal.selfref")=<externalptr> 
##  - attr(*, "sorted")= chr "start_date"
# View(divvy_trips_by_day)


## Save the data to the proper folder
saveRDS(divvy_trips_by_day,
        paste0(wd,
               "/Data/Interim/",
               "divvy_trips_by_day.Rds"
               )
        )


## Quick plot inspection
divvy_trips_by_day %>% 
  ggplot(aes(x = start_date,
             y = Subscriber_trip_cnt__day
             )
         ) +
  geom_line()

rm(divvy_trips_by_day_l)

New Features

Within 0.5 Miles, Create Station Counts, Trip Counts, and Trip Time Statistics

Original merge of the datasets.

message("el_stns_full_dates_within_pt5")
## el_stns_full_dates_within_pt5
str(el_stns_full_dates_within_pt5)
## Classes 'data.table' and 'data.frame':   1562084 obs. of  6 variables:
##  $ el_stop_id              : int  40010 40010 40010 40010 40010 40010 40010 40010 40010 40010 ...
##  $ el_date                 : Date, format: "2013-07-01" "2013-07-02" ...
##  $ num_divvy_stns_pt5_miles: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_station_id        : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ divvy_online_date_min   : Date, format: NA NA ...
##  $ dist_miles              : num  NA NA NA NA NA NA NA NA NA NA ...
##  - attr(*, ".internal.selfref")=<externalptr> 
##  - attr(*, "sorted")= chr  "el_stop_id" "el_date"
message("divvy_trips_by_day_from")
## divvy_trips_by_day_from
str(divvy_trips_by_day_from)
## Classes 'data.table' and 'data.frame':   605030 obs. of  8 variables:
##  $ start_date                           : Date, format: "2013-06-27" "2013-06-27" ...
##  $ from_station_id                      : int  17 19 20 24 28 31 32 34 36 37 ...
##  $ Customer_trip_cnt__dayfrom           : num  1 2 1 1 0 0 3 1 0 2 ...
##  $ Customer_trip_duration_sum__dayfrom  : num  730 975 1712 3482 0 ...
##  $ Dependent_trip_cnt__dayfrom          : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Dependent_trip_duration_sum__dayfrom : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Subscriber_trip_cnt__dayfrom         : num  3 0 0 0 1 1 1 1 1 2 ...
##  $ Subscriber_trip_duration_sum__dayfrom: num  12651 0 0 0 60 ...
##  - attr(*, ".internal.selfref")=<externalptr> 
##  - attr(*, "sorted")= chr  "start_date" "from_station_id"
##  - attr(*, "index")= int 
##   ..- attr(*, "__from_station_id")= int  174362 174743 175114 175469 175844 176231 176619 177023 177438 177869 ...
# View(divvy_trips_by_day_from[from_station_id == 238])


the_merge <-
  merge(x = el_stns_full_dates_within_pt5,
        y = divvy_trips_by_day_from,
        by.x = c("el_date", "divvy_station_id"),
        by.y = c("start_date", "from_station_id"),
        all.x = TRUE
        )

message("the_merge")
## the_merge
str(the_merge)
## Classes 'data.table' and 'data.frame':   1562084 obs. of  12 variables:
##  $ el_date                              : Date, format: "2013-07-01" "2013-07-01" ...
##  $ divvy_station_id                     : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ el_stop_id                           : int  40010 40020 40030 40050 40060 40080 40090 40100 40130 40140 ...
##  $ num_divvy_stns_pt5_miles             : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_online_date_min                : Date, format: NA NA ...
##  $ dist_miles                           : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ Customer_trip_cnt__dayfrom           : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ Customer_trip_duration_sum__dayfrom  : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ Dependent_trip_cnt__dayfrom          : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ Dependent_trip_duration_sum__dayfrom : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ Subscriber_trip_cnt__dayfrom         : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ Subscriber_trip_duration_sum__dayfrom: num  NA NA NA NA NA NA NA NA NA NA ...
##  - attr(*, ".internal.selfref")=<externalptr> 
##  - attr(*, "sorted")= chr  "el_date" "divvy_station_id"
# View(the_merge[el_stop_id == 40090])

Creating summed variables grouped by el_date, el_stop_id, and num_divvy_stns_pt5_miles

new_vars_by <-
  the_merge[ ,
            .(c_tripcnt__dayfrom = sum(Customer_trip_cnt__dayfrom),
              c_tripduration_sum__dayfrom = sum(Customer_trip_duration_sum__dayfrom),
              d_tripcnt__dayfrom = sum(Dependent_trip_cnt__dayfrom),
              d_tripduration_sum__dayfrom = sum(Dependent_trip_duration_sum__dayfrom),
              s_tripcnt__dayfrom = sum(Subscriber_trip_cnt__dayfrom),
              s_tripduration_sum__dayfrom = sum(Subscriber_trip_duration_sum__dayfrom)
              ),
            keyby = c("el_date", "el_stop_id", "num_divvy_stns_pt5_miles")
            ]

str(new_vars_by)
## Classes 'data.table' and 'data.frame':   235378 obs. of  9 variables:
##  $ el_date                    : Date, format: "2013-07-01" "2013-07-01" ...
##  $ el_stop_id                 : int  40010 40020 40030 40040 40050 40060 40070 40080 40090 40100 ...
##  $ num_divvy_stns_pt5_miles   : int  0 0 0 16 0 0 17 0 0 0 ...
##  $ c_tripcnt__dayfrom         : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ c_tripduration_sum__dayfrom: num  NA NA NA NA NA NA NA NA NA NA ...
##  $ d_tripcnt__dayfrom         : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ d_tripduration_sum__dayfrom: num  NA NA NA NA NA NA NA NA NA NA ...
##  $ s_tripcnt__dayfrom         : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ s_tripduration_sum__dayfrom: num  NA NA NA NA NA NA NA NA NA NA ...
##  - attr(*, "sorted")= chr  "el_date" "el_stop_id" "num_divvy_stns_pt5_miles"
##  - attr(*, ".internal.selfref")=<externalptr>

Creating a new variable for the mean trip duration, and if any new variable is NA, this means no trips, so change NA to 0 (zero).

new_vars_1 <-
  new_vars_by[ ,
              `:=` (c_tripduration_mean__dayfrom =
                      c_tripduration_sum__dayfrom / c_tripcnt__dayfrom,
                    d_tripduration_mean__dayfrom =
                      d_tripduration_sum__dayfrom / d_tripcnt__dayfrom,
                    s_tripduration_mean__dayfrom =
                      s_tripduration_sum__dayfrom / s_tripcnt__dayfrom
                    )
              ]

message("new_vars_1")
## new_vars_1
str(new_vars_1)
## Classes 'data.table' and 'data.frame':   235378 obs. of  12 variables:
##  $ el_date                     : Date, format: "2013-07-01" "2013-07-01" ...
##  $ el_stop_id                  : int  40010 40020 40030 40040 40050 40060 40070 40080 40090 40100 ...
##  $ num_divvy_stns_pt5_miles    : int  0 0 0 16 0 0 17 0 0 0 ...
##  $ c_tripcnt__dayfrom          : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ c_tripduration_sum__dayfrom : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ d_tripcnt__dayfrom          : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ d_tripduration_sum__dayfrom : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ s_tripcnt__dayfrom          : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ s_tripduration_sum__dayfrom : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ c_tripduration_mean__dayfrom: num  NA NA NA NA NA NA NA NA NA NA ...
##  $ d_tripduration_mean__dayfrom: num  NA NA NA NA NA NA NA NA NA NA ...
##  $ s_tripduration_mean__dayfrom: num  NA NA NA NA NA NA NA NA NA NA ...
##  - attr(*, "sorted")= chr  "el_date" "el_stop_id" "num_divvy_stns_pt5_miles"
##  - attr(*, ".internal.selfref")=<externalptr>
rm(new_vars_by)



new_vars_2 <-
  new_vars_1[ ,
             `:=` (Customer_tripcnt__dayfrom =
                     if_else(num_divvy_stns_pt5_miles >= 0 &
                               is.na(c_tripcnt__dayfrom),
                             as.numeric(0),
                             c_tripcnt__dayfrom
                             ),
                   Customer_tripduration_mean__dayfrom =
                     if_else(num_divvy_stns_pt5_miles >= 0 &
                               is.na(c_tripduration_mean__dayfrom),
                             as.numeric(0),
                             c_tripduration_mean__dayfrom
                             ),
                   Dependent_tripcnt__dayfrom =
                     if_else(num_divvy_stns_pt5_miles >= 0 &
                               is.na(d_tripcnt__dayfrom),
                             as.numeric(0),
                             d_tripcnt__dayfrom
                             ),
                   Dependent_tripduration_mean__dayfrom =
                     if_else(num_divvy_stns_pt5_miles >= 0 &
                               is.na(d_tripduration_mean__dayfrom),
                             as.numeric(0),
                             d_tripduration_mean__dayfrom
                             ),
                   Subscriber_tripcnt__dayfrom =
                     if_else(num_divvy_stns_pt5_miles >= 0 &
                               is.na(s_tripcnt__dayfrom),
                             as.numeric(0),
                             s_tripcnt__dayfrom
                             ),
                   Subscriber_tripduration_mean__dayfrom =
                     if_else(num_divvy_stns_pt5_miles >= 0 &
                               is.na(s_tripduration_mean__dayfrom),
                             as.numeric(0),
                             s_tripduration_mean__dayfrom
                             )
                   )
             ]

message("new_vars_2")
## new_vars_2
str(new_vars_2)
## Classes 'data.table' and 'data.frame':   235378 obs. of  18 variables:
##  $ el_date                              : Date, format: "2013-07-01" "2013-07-01" ...
##  $ el_stop_id                           : int  40010 40020 40030 40040 40050 40060 40070 40080 40090 40100 ...
##  $ num_divvy_stns_pt5_miles             : int  0 0 0 16 0 0 17 0 0 0 ...
##  $ c_tripcnt__dayfrom                   : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ c_tripduration_sum__dayfrom          : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ d_tripcnt__dayfrom                   : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ d_tripduration_sum__dayfrom          : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ s_tripcnt__dayfrom                   : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ s_tripduration_sum__dayfrom          : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ c_tripduration_mean__dayfrom         : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ d_tripduration_mean__dayfrom         : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ s_tripduration_mean__dayfrom         : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ Customer_tripcnt__dayfrom            : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Customer_tripduration_mean__dayfrom  : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Dependent_tripcnt__dayfrom           : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Dependent_tripduration_mean__dayfrom : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Subscriber_tripcnt__dayfrom          : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Subscriber_tripduration_mean__dayfrom: num  0 0 0 0 0 0 0 0 0 0 ...
##  - attr(*, "sorted")= chr  "el_date" "el_stop_id" "num_divvy_stns_pt5_miles"
##  - attr(*, ".internal.selfref")=<externalptr>
rm(new_vars_1)

Remove no-longer-needed variables and rename the kept variables.

clean_vars <-
  new_vars_2[ , c("c_tripduration_sum__dayfrom",
                  "c_tripcnt__dayfrom",
                  "c_tripduration_mean__dayfrom",
                  "d_tripduration_sum__dayfrom",
                  "d_tripcnt__dayfrom",
                  "d_tripduration_mean__dayfrom",
                  "s_tripduration_sum__dayfrom",
                  "s_tripcnt__dayfrom",
                  "s_tripduration_mean__dayfrom"
                  ) := NULL
              ] %>% 
  rename(divvy_pt5mi_stn_cnt = num_divvy_stns_pt5_miles,
         divvy_pt5mi_trip_cnt_cus = Customer_tripcnt__dayfrom,
         divvy_pt5mi_triptime_mean_cus = Customer_tripduration_mean__dayfrom,
         divvy_pt5mi_trip_cnt_dep = Dependent_tripcnt__dayfrom,
         divvy_pt5mi_triptime_mean_dep = Dependent_tripduration_mean__dayfrom,
         divvy_pt5mi_trip_cnt_sub = Subscriber_tripcnt__dayfrom,
         divvy_pt5mi_triptime_mean_sub = Subscriber_tripduration_mean__dayfrom
         )

rm(new_vars_2)


# Confirm everything worked
# View(the_merge %>% filter(el_stop_id == 40090))
# View(clean_vars[el_stop_id == 40090])


message("clean_vars")
## clean_vars
str(clean_vars)
## Classes 'data.table' and 'data.frame':   235378 obs. of  9 variables:
##  $ el_date                      : Date, format: "2013-07-01" "2013-07-01" ...
##  $ el_stop_id                   : int  40010 40020 40030 40040 40050 40060 40070 40080 40090 40100 ...
##  $ divvy_pt5mi_stn_cnt          : int  0 0 0 16 0 0 17 0 0 0 ...
##  $ divvy_pt5mi_trip_cnt_cus     : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_pt5mi_triptime_mean_cus: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_pt5mi_trip_cnt_dep     : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_pt5mi_triptime_mean_dep: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_pt5mi_trip_cnt_sub     : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_pt5mi_triptime_mean_sub: num  0 0 0 0 0 0 0 0 0 0 ...
##  - attr(*, "sorted")= chr  "el_date" "el_stop_id" "num_divvy_stns_pt5_miles"
##  - attr(*, ".internal.selfref")=<externalptr>
message("el_stns_full_dates_within_pt5")
## el_stns_full_dates_within_pt5
str(el_stns_full_dates_within_pt5)
## Classes 'data.table' and 'data.frame':   1562084 obs. of  6 variables:
##  $ el_stop_id              : int  40010 40010 40010 40010 40010 40010 40010 40010 40010 40010 ...
##  $ el_date                 : Date, format: "2013-07-01" "2013-07-02" ...
##  $ num_divvy_stns_pt5_miles: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_station_id        : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ divvy_online_date_min   : Date, format: NA NA ...
##  $ dist_miles              : num  NA NA NA NA NA NA NA NA NA NA ...
##  - attr(*, ".internal.selfref")=<externalptr> 
##  - attr(*, "sorted")= chr  "el_stop_id" "el_date"
rm(el_stns_full_dates_within_pt5, the_merge)

Add in the general daily data (not by specific Divvy station) - i.e., the Divvy rides data for each day

## initial merge
base_merge <-
  merge(x = clean_vars,
        y = divvy_trips_by_day,
        by.x = c("el_date"),
        by.y = c("start_date"),
        all.x = TRUE
        )

rm(clean_vars)


## change NA to zero
new_vars <-
  base_merge[ ,
             `:=` (divvy_all_trip_cnt_cus =
                     if_else(is.na(Customer_trip_cnt__day),
                             as.numeric(0),
                             Customer_trip_cnt__day
                             ),
                   divvy_all_triptime_mean_cus =
                     if_else(is.na(Customer_trip_duration_mean__day),
                             as.numeric(0),
                             Customer_trip_duration_mean__day
                             ),
                   divvy_all_triptime_med_cus =
                     if_else(is.na(Customer_trip_duration_med__day),
                             as.numeric(0),
                             Customer_trip_duration_med__day
                             ),
                   divvy_all_trip_cnt_dep =
                     if_else(is.na(Dependent_trip_cnt__day),
                             as.numeric(0),
                             Dependent_trip_cnt__day
                             ),
                   divvy_all_triptime_mean_dep =
                     if_else(is.na(Dependent_trip_duration_mean__day),
                             as.numeric(0),
                             Dependent_trip_duration_mean__day
                             ),
                   divvy_all_triptime_med_dep =
                     if_else(is.na(Dependent_trip_duration_med__day),
                             as.numeric(0),
                             Dependent_trip_duration_med__day
                             ),
                   divvy_all_trip_cnt_sub =
                     if_else(is.na(Subscriber_trip_cnt__day),
                             as.numeric(0),
                             Subscriber_trip_cnt__day
                             ),
                   divvy_all_triptime_mean_sub =
                     if_else(is.na(Subscriber_trip_duration_mean__day),
                             as.numeric(0),
                             Subscriber_trip_duration_mean__day
                             ),
                   divvy_all_triptime_med_sub =
                     if_else(is.na(Subscriber_trip_duration_med__day),
                             as.numeric(0),
                             Subscriber_trip_duration_med__day
                             )
                   )
             ]

rm(base_merge)
# str(new_vars)


## remove no-longer-needed variables
add_daily_data <-
  new_vars[ ,
           c("Customer_trip_cnt__day",
             "Customer_trip_duration_mean__day",
             "Customer_trip_duration_med__day",
             "Dependent_trip_cnt__day",
             "Dependent_trip_duration_mean__day",
             "Dependent_trip_duration_med__day",
             "Subscriber_trip_cnt__day",
             "Subscriber_trip_duration_mean__day",
             "Subscriber_trip_duration_med__day"
             ) := NULL] %>% 
  setkey(el_date,
         el_stop_id
         )

str(add_daily_data)
## Classes 'data.table' and 'data.frame':   235378 obs. of  18 variables:
##  $ el_date                      : Date, format: "2013-07-01" "2013-07-01" ...
##  $ el_stop_id                   : int  40010 40020 40030 40040 40050 40060 40070 40080 40090 40100 ...
##  $ divvy_pt5mi_stn_cnt          : int  0 0 0 16 0 0 17 0 0 0 ...
##  $ divvy_pt5mi_trip_cnt_cus     : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_pt5mi_triptime_mean_cus: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_pt5mi_trip_cnt_dep     : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_pt5mi_triptime_mean_dep: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_pt5mi_trip_cnt_sub     : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_pt5mi_triptime_mean_sub: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_all_trip_cnt_cus       : num  1022 1022 1022 1022 1022 ...
##  $ divvy_all_triptime_mean_cus  : num  2109 2109 2109 2109 2109 ...
##  $ divvy_all_triptime_med_cus   : num  1262 1262 1262 1262 1262 ...
##  $ divvy_all_trip_cnt_dep       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_all_triptime_mean_dep  : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_all_triptime_med_dep   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_all_trip_cnt_sub       : num  537 537 537 537 537 537 537 537 537 537 ...
##  $ divvy_all_triptime_mean_sub  : num  1050 1050 1050 1050 1050 ...
##  $ divvy_all_triptime_med_sub   : num  639 639 639 639 639 639 639 639 639 639 ...
##  - attr(*, "sorted")= chr  "el_date" "el_stop_id"
##  - attr(*, ".internal.selfref")=<externalptr>
# View(add_daily_data[el_stop_id == 40090])
# View(divvy_trips_by_day)


rm(new_vars)

Add data on trip counts and trip times for the closest Divvy station to each El station (on each day)

First merge daily data and data on the closest divvy stations.

merge_in_closest_divvy_stn <-
  merge(x = add_daily_data,
        y = el_stns_full_dates_miles_to_closest_divvy,
        by.x = c("el_date", "el_stop_id"),
        by.y = c("el_date", "el_stop_id"),
        all.x = TRUE
        )

message("add_daily_data")
## add_daily_data
str(add_daily_data)
## Classes 'data.table' and 'data.frame':   235378 obs. of  18 variables:
##  $ el_date                      : Date, format: "2013-07-01" "2013-07-01" ...
##  $ el_stop_id                   : int  40010 40020 40030 40040 40050 40060 40070 40080 40090 40100 ...
##  $ divvy_pt5mi_stn_cnt          : int  0 0 0 16 0 0 17 0 0 0 ...
##  $ divvy_pt5mi_trip_cnt_cus     : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_pt5mi_triptime_mean_cus: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_pt5mi_trip_cnt_dep     : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_pt5mi_triptime_mean_dep: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_pt5mi_trip_cnt_sub     : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_pt5mi_triptime_mean_sub: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_all_trip_cnt_cus       : num  1022 1022 1022 1022 1022 ...
##  $ divvy_all_triptime_mean_cus  : num  2109 2109 2109 2109 2109 ...
##  $ divvy_all_triptime_med_cus   : num  1262 1262 1262 1262 1262 ...
##  $ divvy_all_trip_cnt_dep       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_all_triptime_mean_dep  : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_all_triptime_med_dep   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_all_trip_cnt_sub       : num  537 537 537 537 537 537 537 537 537 537 ...
##  $ divvy_all_triptime_mean_sub  : num  1050 1050 1050 1050 1050 ...
##  $ divvy_all_triptime_med_sub   : num  639 639 639 639 639 639 639 639 639 639 ...
##  - attr(*, "sorted")= chr  "el_date" "el_stop_id"
##  - attr(*, ".internal.selfref")=<externalptr>
message("merge_in_closest_divvy_stn")
## merge_in_closest_divvy_stn
str(merge_in_closest_divvy_stn)
## Classes 'data.table' and 'data.frame':   235378 obs. of  20 variables:
##  $ el_date                      : Date, format: "2013-07-01" "2013-07-01" ...
##  $ el_stop_id                   : int  40010 40020 40030 40040 40050 40060 40070 40080 40090 40100 ...
##  $ divvy_pt5mi_stn_cnt          : int  0 0 0 16 0 0 17 0 0 0 ...
##  $ divvy_pt5mi_trip_cnt_cus     : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_pt5mi_triptime_mean_cus: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_pt5mi_trip_cnt_dep     : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_pt5mi_triptime_mean_dep: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_pt5mi_trip_cnt_sub     : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_pt5mi_triptime_mean_sub: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_all_trip_cnt_cus       : num  1022 1022 1022 1022 1022 ...
##  $ divvy_all_triptime_mean_cus  : num  2109 2109 2109 2109 2109 ...
##  $ divvy_all_triptime_med_cus   : num  1262 1262 1262 1262 1262 ...
##  $ divvy_all_trip_cnt_dep       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_all_triptime_mean_dep  : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_all_triptime_med_dep   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_all_trip_cnt_sub       : num  537 537 537 537 537 537 537 537 537 537 ...
##  $ divvy_all_triptime_mean_sub  : num  1050 1050 1050 1050 1050 ...
##  $ divvy_all_triptime_med_sub   : num  639 639 639 639 639 639 639 639 639 639 ...
##  $ min_dist_miles               : num  5.75 6.65 2.96 0.1 8.13 ...
##  $ divvy_station_id             : num  9 69 69 40 13 69 37 13 13 13 ...
##  - attr(*, ".internal.selfref")=<externalptr> 
##  - attr(*, "sorted")= chr  "el_date" "el_stop_id"
rm(add_daily_data)

Next merge in the data on Divvy stats by day and by Divvy station.

el_stns_full_dates_daily_stats_all <-
  merge(x = merge_in_closest_divvy_stn,
        y = divvy_trips_by_day_from,
        by.x = c("el_date", "divvy_station_id"),
        by.y = c("start_date", "from_station_id"),
        all.x = TRUE
        ) %>% 
  # rename the variables for consistency
  rename(divvy_mindist_miles = min_dist_miles,
         divvy_mindist_trip_cnt_cus = Customer_trip_cnt__dayfrom,
         divvy_mindist_triptime_sum_cus = Customer_trip_duration_sum__dayfrom,
         divvy_mindist_trip_cnt_dep = Dependent_trip_cnt__dayfrom,
         divvy_mindist_triptime_sum_dep = Dependent_trip_duration_sum__dayfrom,
         divvy_mindist_trip_cnt_sub = Subscriber_trip_cnt__dayfrom,
         divvy_mindist_triptime_sum_sub = Subscriber_trip_duration_sum__dayfrom
         ) %>% 
  mutate(divvy_mindist_trip_cnt_cus =
           if_else(is.na(divvy_mindist_trip_cnt_cus),
                   as.numeric(0),
                   divvy_mindist_trip_cnt_cus
                   ),
         divvy_mindist_triptime_mean_cus =
           if_else(is.na(divvy_mindist_triptime_sum_cus / divvy_mindist_trip_cnt_cus),
                   as.numeric(0),
                   divvy_mindist_triptime_sum_cus / divvy_mindist_trip_cnt_cus
                   ),
         divvy_mindist_trip_cnt_dep =
           if_else(is.na(divvy_mindist_trip_cnt_dep),
                   as.numeric(0),
                   divvy_mindist_trip_cnt_dep
                   ),
         divvy_mindist_triptime_mean_dep =
           if_else(is.na(divvy_mindist_triptime_sum_dep / divvy_mindist_trip_cnt_dep),
                   as.numeric(0),
                   divvy_mindist_triptime_sum_dep / divvy_mindist_trip_cnt_dep
                   ),
         divvy_mindist_trip_cnt_sub =
           if_else(is.na(divvy_mindist_trip_cnt_sub),
                   as.numeric(0),
                   divvy_mindist_trip_cnt_sub
                   ),
         divvy_mindist_triptime_mean_sub =
           if_else(is.na(divvy_mindist_triptime_sum_sub / divvy_mindist_trip_cnt_sub),
                   as.numeric(0),
                   divvy_mindist_triptime_sum_sub / divvy_mindist_trip_cnt_sub
                   )
         ) %>% 
  select(-divvy_mindist_triptime_sum_cus,
         -divvy_mindist_triptime_sum_dep,
         -divvy_mindist_triptime_sum_sub,
         -divvy_station_id
         ) %>% 
  as.data.table() %>% 
  setkey(el_date,
         el_stop_id
         )


message("merge_in_closest_divvy_stn")
## merge_in_closest_divvy_stn
str(merge_in_closest_divvy_stn)
## Classes 'data.table' and 'data.frame':   235378 obs. of  20 variables:
##  $ el_date                      : Date, format: "2013-07-01" "2013-07-01" ...
##  $ el_stop_id                   : int  40010 40020 40030 40040 40050 40060 40070 40080 40090 40100 ...
##  $ divvy_pt5mi_stn_cnt          : int  0 0 0 16 0 0 17 0 0 0 ...
##  $ divvy_pt5mi_trip_cnt_cus     : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_pt5mi_triptime_mean_cus: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_pt5mi_trip_cnt_dep     : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_pt5mi_triptime_mean_dep: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_pt5mi_trip_cnt_sub     : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_pt5mi_triptime_mean_sub: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_all_trip_cnt_cus       : num  1022 1022 1022 1022 1022 ...
##  $ divvy_all_triptime_mean_cus  : num  2109 2109 2109 2109 2109 ...
##  $ divvy_all_triptime_med_cus   : num  1262 1262 1262 1262 1262 ...
##  $ divvy_all_trip_cnt_dep       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_all_triptime_mean_dep  : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_all_triptime_med_dep   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_all_trip_cnt_sub       : num  537 537 537 537 537 537 537 537 537 537 ...
##  $ divvy_all_triptime_mean_sub  : num  1050 1050 1050 1050 1050 ...
##  $ divvy_all_triptime_med_sub   : num  639 639 639 639 639 639 639 639 639 639 ...
##  $ min_dist_miles               : num  5.75 6.65 2.96 0.1 8.13 ...
##  $ divvy_station_id             : num  9 69 69 40 13 69 37 13 13 13 ...
##  - attr(*, ".internal.selfref")=<externalptr> 
##  - attr(*, "sorted")= chr  "el_date" "el_stop_id"
message("el_stns_full_dates_daily_stats_all")
## el_stns_full_dates_daily_stats_all
str(el_stns_full_dates_daily_stats_all)
## Classes 'data.table' and 'data.frame':   235378 obs. of  25 variables:
##  $ el_date                        : Date, format: "2013-07-01" "2013-07-01" ...
##  $ el_stop_id                     : int  40010 40020 40030 40040 40050 40060 40070 40080 40090 40100 ...
##  $ divvy_pt5mi_stn_cnt            : int  0 0 0 16 0 0 17 0 0 0 ...
##  $ divvy_pt5mi_trip_cnt_cus       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_pt5mi_triptime_mean_cus  : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_pt5mi_trip_cnt_dep       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_pt5mi_triptime_mean_dep  : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_pt5mi_trip_cnt_sub       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_pt5mi_triptime_mean_sub  : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_all_trip_cnt_cus         : num  1022 1022 1022 1022 1022 ...
##  $ divvy_all_triptime_mean_cus    : num  2109 2109 2109 2109 2109 ...
##  $ divvy_all_triptime_med_cus     : num  1262 1262 1262 1262 1262 ...
##  $ divvy_all_trip_cnt_dep         : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_all_triptime_mean_dep    : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_all_triptime_med_dep     : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_all_trip_cnt_sub         : num  537 537 537 537 537 537 537 537 537 537 ...
##  $ divvy_all_triptime_mean_sub    : num  1050 1050 1050 1050 1050 ...
##  $ divvy_all_triptime_med_sub     : num  639 639 639 639 639 639 639 639 639 639 ...
##  $ divvy_mindist_miles            : num  5.75 6.65 2.96 0.1 8.13 ...
##  $ divvy_mindist_trip_cnt_cus     : num  0 10 10 0 3 10 11 3 3 3 ...
##  $ divvy_mindist_trip_cnt_dep     : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_mindist_trip_cnt_sub     : num  0 8 8 0 7 8 19 7 7 7 ...
##  $ divvy_mindist_triptime_mean_cus: num  0 2399 2399 0 1413 ...
##  $ divvy_mindist_triptime_mean_dep: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_mindist_triptime_mean_sub: num  0 822 822 0 859 ...
##  - attr(*, ".internal.selfref")=<externalptr> 
##  - attr(*, "sorted")= chr  "el_date" "el_stop_id"
# View(el_stns_full_dates_daily_stats_all[el_stop_id == 40090])

rm(merge_in_closest_divvy_stn, el_stns_full_dates_miles_to_closest_divvy)


## Save the data to the proper folder
saveRDS(el_stns_full_dates_daily_stats_all,
        paste0(wd,
               "/Data/Interim/",
               "el_stns_full_dates_daily_stats_all.Rds"
               )
        )

Create the full dataset by merging in the additional data

Merge in data on el rides.

message("el_stns_full_dates_daily_stats_all")
## el_stns_full_dates_daily_stats_all
str(el_stns_full_dates_daily_stats_all)
## Classes 'data.table' and 'data.frame':   235378 obs. of  25 variables:
##  $ el_date                        : Date, format: "2013-07-01" "2013-07-01" ...
##  $ el_stop_id                     : int  40010 40020 40030 40040 40050 40060 40070 40080 40090 40100 ...
##  $ divvy_pt5mi_stn_cnt            : int  0 0 0 16 0 0 17 0 0 0 ...
##  $ divvy_pt5mi_trip_cnt_cus       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_pt5mi_triptime_mean_cus  : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_pt5mi_trip_cnt_dep       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_pt5mi_triptime_mean_dep  : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_pt5mi_trip_cnt_sub       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_pt5mi_triptime_mean_sub  : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_all_trip_cnt_cus         : num  1022 1022 1022 1022 1022 ...
##  $ divvy_all_triptime_mean_cus    : num  2109 2109 2109 2109 2109 ...
##  $ divvy_all_triptime_med_cus     : num  1262 1262 1262 1262 1262 ...
##  $ divvy_all_trip_cnt_dep         : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_all_triptime_mean_dep    : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_all_triptime_med_dep     : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_all_trip_cnt_sub         : num  537 537 537 537 537 537 537 537 537 537 ...
##  $ divvy_all_triptime_mean_sub    : num  1050 1050 1050 1050 1050 ...
##  $ divvy_all_triptime_med_sub     : num  639 639 639 639 639 639 639 639 639 639 ...
##  $ divvy_mindist_miles            : num  5.75 6.65 2.96 0.1 8.13 ...
##  $ divvy_mindist_trip_cnt_cus     : num  0 10 10 0 3 10 11 3 3 3 ...
##  $ divvy_mindist_trip_cnt_dep     : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_mindist_trip_cnt_sub     : num  0 8 8 0 7 8 19 7 7 7 ...
##  $ divvy_mindist_triptime_mean_cus: num  0 2399 2399 0 1413 ...
##  $ divvy_mindist_triptime_mean_dep: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_mindist_triptime_mean_sub: num  0 822 822 0 859 ...
##  - attr(*, ".internal.selfref")=<externalptr> 
##  - attr(*, "sorted")= chr  "el_date" "el_stop_id"
message("rds_files$data_el_format_vars.Rds")
## rds_files$data_el_format_vars.Rds
str(rds_files$data_el_format_vars.Rds)
## Classes 'data.table' and 'data.frame':   897032 obs. of  5 variables:
##  $ date       : Date, format: "2001-01-01" "2001-01-01" ...
##  $ daytype    : Factor w/ 3 levels "A","U","W": 2 2 2 2 2 2 2 2 2 2 ...
##  $ rides      : num  290 633 483 374 804 ...
##  $ station_id : num  40010 40020 40030 40040 40050 ...
##  $ stationname: Factor w/ 148 levels "18th","35-Bronzeville-IIT",..: 23 70 120 122 52 26 78 130 49 109 ...
##  - attr(*, ".internal.selfref")=<externalptr> 
##  - attr(*, "sorted")= chr  "date" "station_id"
## Quick plot inspection
freqently_used_el <-
  rds_files$data_el_format_vars.Rds %>% 
  top_n(n = 1, w = rides) %>% 
  pull(station_id)
  

rds_files$data_el_format_vars.Rds[station_id == freqently_used_el &
                                    date >= as_date("2013-07-01")
                                  ] %>% 
  ggplot(aes(x = date,
             y = rides
             )
         ) +
  geom_line()

## The merge
add_el_rides <-
  merge(x = el_stns_full_dates_daily_stats_all,
        y = rds_files$data_el_format_vars.Rds,
        by.x = c("el_date", "el_stop_id"),
        by.y = c("date", "station_id"),
        all.x = TRUE
        ) %>% 
  rename(el_rides = rides) %>% 
  select(-daytype,
         -stationname
         )

message("el_stns_full_dates_daily_stats_all")
## el_stns_full_dates_daily_stats_all
dim(el_stns_full_dates_daily_stats_all)
## [1] 235378     25
message("add_el_rides")
## add_el_rides
dim(add_el_rides)
## [1] 235378     26
str(add_el_rides)
## Classes 'data.table' and 'data.frame':   235378 obs. of  26 variables:
##  $ el_date                        : Date, format: "2013-07-01" "2013-07-01" ...
##  $ el_stop_id                     : num  40010 40020 40030 40040 40050 ...
##  $ divvy_pt5mi_stn_cnt            : int  0 0 0 16 0 0 17 0 0 0 ...
##  $ divvy_pt5mi_trip_cnt_cus       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_pt5mi_triptime_mean_cus  : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_pt5mi_trip_cnt_dep       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_pt5mi_triptime_mean_dep  : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_pt5mi_trip_cnt_sub       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_pt5mi_triptime_mean_sub  : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_all_trip_cnt_cus         : num  1022 1022 1022 1022 1022 ...
##  $ divvy_all_triptime_mean_cus    : num  2109 2109 2109 2109 2109 ...
##  $ divvy_all_triptime_med_cus     : num  1262 1262 1262 1262 1262 ...
##  $ divvy_all_trip_cnt_dep         : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_all_triptime_mean_dep    : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_all_triptime_med_dep     : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_all_trip_cnt_sub         : num  537 537 537 537 537 537 537 537 537 537 ...
##  $ divvy_all_triptime_mean_sub    : num  1050 1050 1050 1050 1050 ...
##  $ divvy_all_triptime_med_sub     : num  639 639 639 639 639 639 639 639 639 639 ...
##  $ divvy_mindist_miles            : num  5.75 6.65 2.96 0.1 8.13 ...
##  $ divvy_mindist_trip_cnt_cus     : num  0 10 10 0 3 10 11 3 3 3 ...
##  $ divvy_mindist_trip_cnt_dep     : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_mindist_trip_cnt_sub     : num  0 8 8 0 7 8 19 7 7 7 ...
##  $ divvy_mindist_triptime_mean_cus: num  0 2399 2399 0 1413 ...
##  $ divvy_mindist_triptime_mean_dep: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_mindist_triptime_mean_sub: num  0 822 822 0 859 ...
##  $ el_rides                       : num  2058 3942 2070 8671 3703 ...
##  - attr(*, ".internal.selfref")=<externalptr> 
##  - attr(*, "sorted")= chr  "el_date" "el_stop_id"
rm(freqently_used_el, el_stns_full_dates_daily_stats_all)

Merge in data on holidays.

message("rds_files$data_holidays_format_vars.Rds")
## rds_files$data_holidays_format_vars.Rds
str(rds_files$data_holidays_format_vars.Rds)
## Classes 'data.table' and 'data.frame':   129 obs. of  3 variables:
##  $ date           : Date, format: "2010-01-01" "2010-01-18" ...
##  $ holiday_name   : Factor w/ 24 levels "Casimir Pulaski Day",..: 18 14 21 16 15 10 12 5 23 22 ...
##  $ holiday_comment: chr  "" "Third Monday in January" "Third Monday in February" "2nd Sunday in May. Not a public holiday." ...
##  - attr(*, ".internal.selfref")=<externalptr> 
##  - attr(*, "sorted")= chr "date"
## rds_files$data_holidays_format_vars.Rds has one duplicate date, so remove that first
dupe_dates <-
  rds_files$data_holidays_format_vars.Rds %>% 
  count(date) %>% 
  arrange(desc(n)) %>% 
  filter(n != 1) %>% 
  pull(date)

remove_dupe <-
  rds_files$data_holidays_format_vars.Rds %>% 
  filter(date == as_date(dupe_dates)
         ) %>% 
  rowid_to_column() %>% 
  filter(rowid != 1) %>% 
  mutate(filter_temp = paste(date,
                             holiday_name,
                             holiday_comment,
                             sep = "_"
                             )
         )
## Warning: `list_len()` is soft-deprecated as of rlang 0.2.0.
## Please use `new_list()` instead
## This warning is displayed once per session.
holiday_dedupe <-
  rds_files$data_holidays_format_vars.Rds %>% 
  mutate(filter_temp = paste(date,
                             holiday_name,
                             holiday_comment,
                             sep = "_"
                             )
         ) %>% 
  filter(filter_temp != remove_dupe$filter_temp) %>% 
  select(-filter_temp)

message("holiday_dedupe")
## holiday_dedupe
str(holiday_dedupe)
## 'data.frame':    128 obs. of  3 variables:
##  $ date           : Date, format: "2010-01-01" "2010-01-18" ...
##  $ holiday_name   : Factor w/ 24 levels "Casimir Pulaski Day",..: 18 14 21 16 15 10 12 5 23 22 ...
##  $ holiday_comment: chr  "" "Third Monday in January" "Third Monday in February" "2nd Sunday in May. Not a public holiday." ...
## merge itself
base <-
  merge(x = add_el_rides,
        y = holiday_dedupe,
        by.x = "el_date",
        by.y = "date",
        all.x = TRUE
        )


## add 1/0 holiday indicator
add_holidays <-
  base[ , holiday := if_else(is.na(holiday_name), FALSE, TRUE)]


message("add_el_rides")
## add_el_rides
dim(add_el_rides)
## [1] 235378     26
message("add_holidays")
## add_holidays
dim(add_holidays)
## [1] 235378     29
str(add_holidays)
## Classes 'data.table' and 'data.frame':   235378 obs. of  29 variables:
##  $ el_date                        : Date, format: "2013-07-01" "2013-07-01" ...
##  $ el_stop_id                     : num  40010 40020 40030 40040 40050 ...
##  $ divvy_pt5mi_stn_cnt            : int  0 0 0 16 0 0 17 0 0 0 ...
##  $ divvy_pt5mi_trip_cnt_cus       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_pt5mi_triptime_mean_cus  : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_pt5mi_trip_cnt_dep       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_pt5mi_triptime_mean_dep  : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_pt5mi_trip_cnt_sub       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_pt5mi_triptime_mean_sub  : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_all_trip_cnt_cus         : num  1022 1022 1022 1022 1022 ...
##  $ divvy_all_triptime_mean_cus    : num  2109 2109 2109 2109 2109 ...
##  $ divvy_all_triptime_med_cus     : num  1262 1262 1262 1262 1262 ...
##  $ divvy_all_trip_cnt_dep         : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_all_triptime_mean_dep    : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_all_triptime_med_dep     : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_all_trip_cnt_sub         : num  537 537 537 537 537 537 537 537 537 537 ...
##  $ divvy_all_triptime_mean_sub    : num  1050 1050 1050 1050 1050 ...
##  $ divvy_all_triptime_med_sub     : num  639 639 639 639 639 639 639 639 639 639 ...
##  $ divvy_mindist_miles            : num  5.75 6.65 2.96 0.1 8.13 ...
##  $ divvy_mindist_trip_cnt_cus     : num  0 10 10 0 3 10 11 3 3 3 ...
##  $ divvy_mindist_trip_cnt_dep     : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_mindist_trip_cnt_sub     : num  0 8 8 0 7 8 19 7 7 7 ...
##  $ divvy_mindist_triptime_mean_cus: num  0 2399 2399 0 1413 ...
##  $ divvy_mindist_triptime_mean_dep: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_mindist_triptime_mean_sub: num  0 822 822 0 859 ...
##  $ el_rides                       : num  2058 3942 2070 8671 3703 ...
##  $ holiday_name                   : Factor w/ 24 levels "Casimir Pulaski Day",..: NA NA NA NA NA NA NA NA NA NA ...
##  $ holiday_comment                : chr  NA NA NA NA ...
##  $ holiday                        : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
##  - attr(*, ".internal.selfref")=<externalptr> 
##  - attr(*, "sorted")= chr "el_date"
rm(dupe_dates, remove_dupe, base, holiday_dedupe, add_el_rides)

Merge in data on weather.

message("rds_files$data_weather_format_vars.Rds")
## rds_files$data_weather_format_vars.Rds
str(rds_files$data_weather_format_vars.Rds)
## Classes 'data.table' and 'data.frame':   2922 obs. of  12 variables:
##  $ snwd     : num  2 2 2 2 2 2 6 6 5 5 ...
##  $ snow     : num  0 0 0 0 0 0 4.1 1.8 0 0 ...
##  $ station  : chr  "USC00111577" "USC00111577" "USC00111577" "USC00111577" ...
##  $ name     : chr  "CHICAGO MIDWAY AIRPORT 3 SW, IL US" "CHICAGO MIDWAY AIRPORT 3 SW, IL US" "CHICAGO MIDWAY AIRPORT 3 SW, IL US" "CHICAGO MIDWAY AIRPORT 3 SW, IL US" ...
##  $ latitude : num  41.7 41.7 41.7 41.7 41.7 ...
##  $ longitude: num  -87.8 -87.8 -87.8 -87.8 -87.8 ...
##  $ elevation: num  189 189 189 189 189 189 189 189 189 189 ...
##  $ date     : Date, format: "2010-01-01" "2010-01-02" ...
##  $ prcp     : num  0 0 0 0 0 0 0.32 0.11 0 0 ...
##  $ tmax     : int  18 12 19 19 23 23 21 28 24 19 ...
##  $ tmin     : int  5 3 1 9 15 8 15 15 9 0 ...
##  $ tobs     : int  11 6 12 16 15 16 20 15 10 19 ...
##  - attr(*, ".internal.selfref")=<externalptr> 
##  - attr(*, "sorted")= chr "date"
## The merge
add_weather <-
  merge(x = add_holidays,
        y = rds_files$data_weather_format_vars.Rds,
        by.x = "el_date",
        by.y = "date",
        all.x = TRUE
        ) %>% 
  select(-station,
         -latitude,
         -longitude,
         -elevation,
         -name
         ) %>% 
  setkey(el_date,
         el_stop_id
         )


message("add_holidays")
## add_holidays
dim(add_holidays)
## [1] 235378     29
message("add_weather")
## add_weather
dim(add_weather)
## [1] 235378     35
str(add_weather)
## Classes 'data.table' and 'data.frame':   235378 obs. of  35 variables:
##  $ el_date                        : Date, format: "2013-07-01" "2013-07-01" ...
##  $ el_stop_id                     : num  40010 40020 40030 40040 40050 ...
##  $ divvy_pt5mi_stn_cnt            : int  0 0 0 16 0 0 17 0 0 0 ...
##  $ divvy_pt5mi_trip_cnt_cus       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_pt5mi_triptime_mean_cus  : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_pt5mi_trip_cnt_dep       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_pt5mi_triptime_mean_dep  : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_pt5mi_trip_cnt_sub       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_pt5mi_triptime_mean_sub  : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_all_trip_cnt_cus         : num  1022 1022 1022 1022 1022 ...
##  $ divvy_all_triptime_mean_cus    : num  2109 2109 2109 2109 2109 ...
##  $ divvy_all_triptime_med_cus     : num  1262 1262 1262 1262 1262 ...
##  $ divvy_all_trip_cnt_dep         : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_all_triptime_mean_dep    : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_all_triptime_med_dep     : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_all_trip_cnt_sub         : num  537 537 537 537 537 537 537 537 537 537 ...
##  $ divvy_all_triptime_mean_sub    : num  1050 1050 1050 1050 1050 ...
##  $ divvy_all_triptime_med_sub     : num  639 639 639 639 639 639 639 639 639 639 ...
##  $ divvy_mindist_miles            : num  5.75 6.65 2.96 0.1 8.13 ...
##  $ divvy_mindist_trip_cnt_cus     : num  0 10 10 0 3 10 11 3 3 3 ...
##  $ divvy_mindist_trip_cnt_dep     : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_mindist_trip_cnt_sub     : num  0 8 8 0 7 8 19 7 7 7 ...
##  $ divvy_mindist_triptime_mean_cus: num  0 2399 2399 0 1413 ...
##  $ divvy_mindist_triptime_mean_dep: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_mindist_triptime_mean_sub: num  0 822 822 0 859 ...
##  $ el_rides                       : num  2058 3942 2070 8671 3703 ...
##  $ holiday_name                   : Factor w/ 24 levels "Casimir Pulaski Day",..: NA NA NA NA NA NA NA NA NA NA ...
##  $ holiday_comment                : chr  NA NA NA NA ...
##  $ holiday                        : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ snwd                           : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ snow                           : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ prcp                           : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ tmax                           : int  75 75 75 75 75 75 75 75 75 75 ...
##  $ tmin                           : int  60 60 60 60 60 60 60 60 60 60 ...
##  $ tobs                           : int  67 67 67 67 67 67 67 67 67 67 ...
##  - attr(*, "sorted")= chr  "el_date" "el_stop_id"
##  - attr(*, ".internal.selfref")=<externalptr>
rm(add_holidays)
rm(distance_el_divvy_stations_dt, divvy_trips_by_day, divvy_trips_by_day_from)
# rm(rds_files)

Create time-based variables, and add them in.

Here I use timetk::tk_augment_timeseries_signature to create the “time variables” (e.g., month, week, etc.), and I turn integer variables into factors to facilitate modeling later on.

date_min <- min(add_weather$el_date)
date_max <- max(add_weather$el_date)

dates_dt <- 
  data.frame(base_date = as_date(date_min:date_max)
             ) %>%  
  tk_augment_timeseries_signature() %>% 
  select(-diff,
         -index.num,
         -year.iso,
         -month.xts,
         -month.lbl,
         -hour,
         -minute,
         -second,
         -hour12,
         -am.pm,
         -wday,
         -wday.xts,
         -mday,
         -qday,
         -yday,
         -week2,
         -week3,
         -week4,
         -mday7,
         -week,
         -week.iso
         ) %>% 
  mutate_if(is.integer, factor) %>% 
  as.data.table() %>% 
  setkey(base_date)
  

str(dates_dt)
## Classes 'data.table' and 'data.frame':   1646 obs. of  8 variables:
##  $ base_date: Date, format: "2013-07-01" "2013-07-02" ...
##  $ year     : Factor w/ 6 levels "2013","2014",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ half     : Factor w/ 2 levels "1","2": 2 2 2 2 2 2 2 2 2 2 ...
##  $ quarter  : Factor w/ 4 levels "1","2","3","4": 3 3 3 3 3 3 3 3 3 3 ...
##  $ month    : Factor w/ 12 levels "1","2","3","4",..: 7 7 7 7 7 7 7 7 7 7 ...
##  $ day      : Factor w/ 31 levels "1","2","3","4",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ wday.lbl : Ord.factor w/ 7 levels "Sunday"<"Monday"<..: 2 3 4 5 6 7 1 2 3 4 ...
##  $ mweek    : Factor w/ 6 levels "1","2","3","4",..: 6 1 1 1 1 1 1 2 2 2 ...
##  - attr(*, ".internal.selfref")=<externalptr> 
##  - attr(*, "sorted")= chr "base_date"
rm(date_min, date_max)

Then I simply merge the datasets.

add_time <-
  merge(x = add_weather,
        y = dates_dt,
        by.x = "el_date",
        by.y = "base_date",
        all.x = TRUE
        ) %>% 
  setkey(el_stop_id,
         el_date)

dim(add_weather)
## [1] 235378     35
dim(add_time)
## [1] 235378     42
str(add_time)
## Classes 'data.table' and 'data.frame':   235378 obs. of  42 variables:
##  $ el_date                        : Date, format: "2013-07-01" "2013-07-02" ...
##  $ el_stop_id                     : num  40010 40010 40010 40010 40010 ...
##  $ divvy_pt5mi_stn_cnt            : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_pt5mi_trip_cnt_cus       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_pt5mi_triptime_mean_cus  : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_pt5mi_trip_cnt_dep       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_pt5mi_triptime_mean_dep  : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_pt5mi_trip_cnt_sub       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_pt5mi_triptime_mean_sub  : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_all_trip_cnt_cus         : num  1022 599 479 2652 2071 ...
##  $ divvy_all_triptime_mean_cus    : num  2109 2071 2292 2597 2492 ...
##  $ divvy_all_triptime_med_cus     : num  1262 1197 1143 1469 1559 ...
##  $ divvy_all_trip_cnt_dep         : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_all_triptime_mean_dep    : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_all_triptime_med_dep     : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_all_trip_cnt_sub         : num  537 509 528 304 392 260 286 503 702 751 ...
##  $ divvy_all_triptime_mean_sub    : num  1050 1230 958 1009 844 ...
##  $ divvy_all_triptime_med_sub     : num  639 584 620 804 724 ...
##  $ divvy_mindist_miles            : num  5.75 5.75 5.75 5.75 5.75 ...
##  $ divvy_mindist_trip_cnt_cus     : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_mindist_trip_cnt_dep     : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_mindist_trip_cnt_sub     : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_mindist_triptime_mean_cus: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_mindist_triptime_mean_dep: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_mindist_triptime_mean_sub: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ el_rides                       : num  2058 2002 2050 1045 1546 ...
##  $ holiday_name                   : Factor w/ 24 levels "Casimir Pulaski Day",..: NA NA NA 10 NA NA NA NA NA NA ...
##  $ holiday_comment                : chr  NA NA NA "" ...
##  $ holiday                        : logi  FALSE FALSE FALSE TRUE FALSE FALSE ...
##  $ snwd                           : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ snow                           : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ prcp                           : num  0 0 0.02 0 0 0 0 1.32 0.05 0 ...
##  $ tmax                           : int  75 68 74 84 87 82 89 87 85 87 ...
##  $ tmin                           : int  60 61 61 60 62 71 70 70 75 65 ...
##  $ tobs                           : int  67 61 61 71 74 73 79 78 80 65 ...
##  $ year                           : Factor w/ 6 levels "2013","2014",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ half                           : Factor w/ 2 levels "1","2": 2 2 2 2 2 2 2 2 2 2 ...
##  $ quarter                        : Factor w/ 4 levels "1","2","3","4": 3 3 3 3 3 3 3 3 3 3 ...
##  $ month                          : Factor w/ 12 levels "1","2","3","4",..: 7 7 7 7 7 7 7 7 7 7 ...
##  $ day                            : Factor w/ 31 levels "1","2","3","4",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ wday.lbl                       : Ord.factor w/ 7 levels "Sunday"<"Monday"<..: 2 3 4 5 6 7 1 2 3 4 ...
##  $ mweek                          : Factor w/ 6 levels "1","2","3","4",..: 6 1 1 1 1 1 1 2 2 2 ...
##  - attr(*, "sorted")= chr  "el_stop_id" "el_date"
##  - attr(*, ".internal.selfref")=<externalptr>
rm(add_weather, dates_dt)

Inspect Missing Values

summary(add_time)
##     el_date             el_stop_id    divvy_pt5mi_stn_cnt
##  Min.   :2013-07-01   Min.   :40010   Min.   : 0.000     
##  1st Qu.:2014-08-16   1st Qu.:40380   1st Qu.: 0.000     
##  Median :2015-10-01   Median :40780   Median : 3.000     
##  Mean   :2015-10-01   Mean   :40786   Mean   : 6.211     
##  3rd Qu.:2016-11-16   3rd Qu.:41180   3rd Qu.: 9.000     
##  Max.   :2018-01-01   Max.   :41700   Max.   :29.000     
##                                                          
##  divvy_pt5mi_trip_cnt_cus divvy_pt5mi_triptime_mean_cus
##  Min.   :   0.00          Min.   :    0.0              
##  1st Qu.:   0.00          1st Qu.:    0.0              
##  Median :   0.00          Median :    0.0              
##  Mean   :  28.94          Mean   :  609.4              
##  3rd Qu.:   4.00          3rd Qu.: 1123.2              
##  Max.   :2772.00          Max.   :85578.0              
##                                                        
##  divvy_pt5mi_trip_cnt_dep divvy_pt5mi_triptime_mean_dep
##  Min.   :0.0000000        Min.   :   0.0000            
##  1st Qu.:0.0000000        1st Qu.:   0.0000            
##  Median :0.0000000        Median :   0.0000            
##  Mean   :0.0006883        Mean   :   0.4439            
##  3rd Qu.:0.0000000        3rd Qu.:   0.0000            
##  Max.   :3.0000000        Max.   :2080.0000            
##                                                        
##  divvy_pt5mi_trip_cnt_sub divvy_pt5mi_triptime_mean_sub
##  Min.   :   0             Min.   :    0.0              
##  1st Qu.:   0             1st Qu.:    0.0              
##  Median :   0             Median :    0.0              
##  Mean   : 109             Mean   :  296.0              
##  3rd Qu.:  40             3rd Qu.:  638.4              
##  Max.   :3423             Max.   :75696.0              
##                                                        
##  divvy_all_trip_cnt_cus divvy_all_triptime_mean_cus
##  Min.   :    0          Min.   :   0               
##  1st Qu.:  206          1st Qu.:1494               
##  Median : 1306          Median :1690               
##  Mean   : 2290          Mean   :1659               
##  3rd Qu.: 3305          3rd Qu.:1835               
##  Max.   :18280          Max.   :6984               
##                                                    
##  divvy_all_triptime_med_cus divvy_all_trip_cnt_dep
##  Min.   :   0               Min.   :0.0000        
##  1st Qu.:1076               1st Qu.:0.0000        
##  Median :1235               Median :0.0000        
##  Mean   :1179               Mean   :0.1154        
##  3rd Qu.:1326               3rd Qu.:0.0000        
##  Max.   :2155               Max.   :7.0000        
##                                                   
##  divvy_all_triptime_mean_dep divvy_all_triptime_med_dep
##  Min.   :   0.00             Min.   :   0.00           
##  1st Qu.:   0.00             1st Qu.:   0.00           
##  Median :   0.00             Median :   0.00           
##  Mean   :  44.53             Mean   :  43.86           
##  3rd Qu.:   0.00             3rd Qu.:   0.00           
##  Max.   :1938.50             Max.   :1938.50           
##                                                        
##  divvy_all_trip_cnt_sub divvy_all_triptime_mean_sub
##  Min.   :    0          Min.   :   0.0             
##  1st Qu.: 2604          1st Qu.: 631.3             
##  Median : 5484          Median : 695.0             
##  Mean   : 6104          Mean   : 699.7             
##  3rd Qu.: 8973          3rd Qu.: 751.6             
##  Max.   :16243          Max.   :2001.9             
##                                                    
##  divvy_all_triptime_med_sub divvy_mindist_miles divvy_mindist_trip_cnt_cus
##  Min.   :  0.0              Min.   : 0.006815   Min.   :  0.000           
##  1st Qu.:491.0              1st Qu.: 0.034447   1st Qu.:  0.000           
##  Median :550.0              Median : 0.070045   Median :  0.000           
##  Mean   :555.8              Mean   : 0.912961   Mean   :  3.026           
##  3rd Qu.:610.0              3rd Qu.: 0.999441   3rd Qu.:  2.000           
##  Max.   :939.0              Max.   :12.575219   Max.   :228.000           
##                                                                           
##  divvy_mindist_trip_cnt_dep divvy_mindist_trip_cnt_sub
##  Min.   :0.0000000          Min.   :  0.00            
##  1st Qu.:0.0000000          1st Qu.:  0.00            
##  Median :0.0000000          Median :  3.00            
##  Mean   :0.0001529          Mean   : 11.31            
##  3rd Qu.:0.0000000          3rd Qu.: 13.00            
##  Max.   :2.0000000          Max.   :234.00            
##                                                       
##  divvy_mindist_triptime_mean_cus divvy_mindist_triptime_mean_dep
##  Min.   :    0.0                 Min.   :   0.000               
##  1st Qu.:    0.0                 1st Qu.:   0.000               
##  Median :    0.0                 Median :   0.000               
##  Mean   :  832.5                 Mean   :   0.114               
##  3rd Qu.: 1157.0                 3rd Qu.:   0.000               
##  Max.   :86224.0                 Max.   :1877.000               
##                                                                 
##  divvy_mindist_triptime_mean_sub    el_rides    
##  Min.   :    0.0                 Min.   :    0  
##  1st Qu.:    0.0                 1st Qu.: 1278  
##  Median :  540.9                 Median : 2606  
##  Mean   :  556.4                 Mean   : 3681  
##  3rd Qu.:  741.0                 3rd Qu.: 4686  
##  Max.   :81045.0                 Max.   :36323  
##                                  NA's   :2311   
##                  holiday_name    holiday_comment     holiday       
##  Columbus Day          :   715   Length:235378      Mode :logical  
##  Day after Thanksgiving:   715   Class :character   FALSE:225368   
##  Labor Day             :   715   Mode  :character   TRUE :10010    
##  Thanksgiving          :   715                                     
##  Christmas Day         :   572                                     
##  (Other)               :  6578                                     
##  NA's                  :225368                                     
##       snwd              snow              prcp             tmax      
##  Min.   : 0.0000   Min.   : 0.0000   Min.   :0.0000   Min.   : 0.00  
##  1st Qu.: 0.0000   1st Qu.: 0.0000   1st Qu.:0.0000   1st Qu.:44.00  
##  Median : 0.0000   Median : 0.0000   Median :0.0000   Median :64.00  
##  Mean   : 0.6072   Mean   : 0.1182   Mean   :0.1231   Mean   :61.09  
##  3rd Qu.: 0.0000   3rd Qu.: 0.0000   3rd Qu.:0.0500   3rd Qu.:80.00  
##  Max.   :17.0000   Max.   :17.2000   Max.   :5.1100   Max.   :97.00  
##  NA's   :572       NA's   :429       NA's   :143      NA's   :143    
##       tmin             tobs         year       half       quarter  
##  Min.   :-15.00   Min.   :-10.0   2013:26312   1:103818   1:51766  
##  1st Qu.: 31.00   1st Qu.: 36.0   2014:52195   2:131560   2:52052  
##  Median : 46.00   Median : 52.0   2015:52195              3:65780  
##  Mean   : 45.03   Mean   : 50.4   2016:52338              4:65780  
##  3rd Qu.: 62.00   3rd Qu.: 67.0   2017:52195                       
##  Max.   : 79.00   Max.   : 84.0   2018:  143                       
##  NA's   :143      NA's   :143                                      
##      month             day              wday.lbl     mweek    
##  7      : 22165   1      :  7865   Sunday   :33605   1:31603  
##  8      : 22165   2      :  7722   Monday   :33748   2:54054  
##  10     : 22165   3      :  7722   Tuesday  :33605   3:54054  
##  12     : 22165   4      :  7722   Wednesday:33605   4:54054  
##  9      : 21450   5      :  7722   Thursday :33605   5:39325  
##  11     : 21450   6      :  7722   Friday   :33605   6: 2288  
##  (Other):103818   (Other):188903   Saturday :33605
miss_var_summary(add_time)
# View(miss_var_summary(add_time))

El Rides

I believe NA values for el_rides are for dates when a particular station was out of service (e.g., a station was out of service for a period of time, or only came into existence after a certain date). For modeling, I therefore limit the dataset to only those instances where el_rides is greater than zero. NOTE: For time series methods requiring equal spaced events, this “could” present an issue.

Showing an example of missing el_rides values

el_stop_na <-
  add_time %>% 
  filter(is.na(el_rides)
         ) %>% 
  select(el_stop_id) %>% 
  arrange(el_stop_id) %>% 
  distinct() %>% 
  top_n(n = 2, wt = el_stop_id) %>% 
  pull(el_stop_id)


# View(
  add_time %>% 
    filter(el_stop_id %in% el_stop_na) %>% 
    select(el_stop_id,
           el_date,
           el_rides
           ) %>% 
    arrange(el_stop_id,
            el_date
            ) %>% 
    split(.$el_stop_id) %>% 
    map(~ head(.x, 100)
        )
## $`41690`
##     el_stop_id    el_date el_rides
## 1        41690 2013-07-01       NA
## 2        41690 2013-07-02       NA
## 3        41690 2013-07-03       NA
## 4        41690 2013-07-04       NA
## 5        41690 2013-07-05       NA
## 6        41690 2013-07-06       NA
## 7        41690 2013-07-07       NA
## 8        41690 2013-07-08       NA
## 9        41690 2013-07-09       NA
## 10       41690 2013-07-10       NA
## 11       41690 2013-07-11       NA
## 12       41690 2013-07-12       NA
## 13       41690 2013-07-13       NA
## 14       41690 2013-07-14       NA
## 15       41690 2013-07-15       NA
## 16       41690 2013-07-16       NA
## 17       41690 2013-07-17       NA
## 18       41690 2013-07-18       NA
## 19       41690 2013-07-19       NA
## 20       41690 2013-07-20       NA
## 21       41690 2013-07-21       NA
## 22       41690 2013-07-22       NA
## 23       41690 2013-07-23       NA
## 24       41690 2013-07-24       NA
## 25       41690 2013-07-25       NA
## 26       41690 2013-07-26       NA
## 27       41690 2013-07-27       NA
## 28       41690 2013-07-28       NA
## 29       41690 2013-07-29       NA
## 30       41690 2013-07-30       NA
## 31       41690 2013-07-31       NA
## 32       41690 2013-08-01       NA
## 33       41690 2013-08-02       NA
## 34       41690 2013-08-03       NA
## 35       41690 2013-08-04       NA
## 36       41690 2013-08-05       NA
## 37       41690 2013-08-06       NA
## 38       41690 2013-08-07       NA
## 39       41690 2013-08-08       NA
## 40       41690 2013-08-09       NA
## 41       41690 2013-08-10       NA
## 42       41690 2013-08-11       NA
## 43       41690 2013-08-12       NA
## 44       41690 2013-08-13       NA
## 45       41690 2013-08-14       NA
## 46       41690 2013-08-15       NA
## 47       41690 2013-08-16       NA
## 48       41690 2013-08-17       NA
## 49       41690 2013-08-18       NA
## 50       41690 2013-08-19       NA
## 51       41690 2013-08-20       NA
## 52       41690 2013-08-21       NA
## 53       41690 2013-08-22       NA
## 54       41690 2013-08-23       NA
## 55       41690 2013-08-24       NA
## 56       41690 2013-08-25       NA
## 57       41690 2013-08-26       NA
## 58       41690 2013-08-27       NA
## 59       41690 2013-08-28       NA
## 60       41690 2013-08-29       NA
## 61       41690 2013-08-30       NA
## 62       41690 2013-08-31       NA
## 63       41690 2013-09-01       NA
## 64       41690 2013-09-02       NA
## 65       41690 2013-09-03       NA
## 66       41690 2013-09-04       NA
## 67       41690 2013-09-05       NA
## 68       41690 2013-09-06       NA
## 69       41690 2013-09-07       NA
## 70       41690 2013-09-08       NA
## 71       41690 2013-09-09       NA
## 72       41690 2013-09-10       NA
## 73       41690 2013-09-11       NA
## 74       41690 2013-09-12       NA
## 75       41690 2013-09-13       NA
## 76       41690 2013-09-14       NA
## 77       41690 2013-09-15       NA
## 78       41690 2013-09-16       NA
## 79       41690 2013-09-17       NA
## 80       41690 2013-09-18       NA
## 81       41690 2013-09-19       NA
## 82       41690 2013-09-20       NA
## 83       41690 2013-09-21       NA
## 84       41690 2013-09-22       NA
## 85       41690 2013-09-23       NA
## 86       41690 2013-09-24       NA
## 87       41690 2013-09-25       NA
## 88       41690 2013-09-26       NA
## 89       41690 2013-09-27       NA
## 90       41690 2013-09-28       NA
## 91       41690 2013-09-29       NA
## 92       41690 2013-09-30       NA
## 93       41690 2013-10-01       NA
## 94       41690 2013-10-02       NA
## 95       41690 2013-10-03       NA
## 96       41690 2013-10-04       NA
## 97       41690 2013-10-05       NA
## 98       41690 2013-10-06       NA
## 99       41690 2013-10-07       NA
## 100      41690 2013-10-08       NA
## 
## $`41700`
##      el_stop_id    el_date el_rides
## 1647      41700 2013-07-01       NA
## 1648      41700 2013-07-02       NA
## 1649      41700 2013-07-03       NA
## 1650      41700 2013-07-04       NA
## 1651      41700 2013-07-05       NA
## 1652      41700 2013-07-06       NA
## 1653      41700 2013-07-07       NA
## 1654      41700 2013-07-08       NA
## 1655      41700 2013-07-09       NA
## 1656      41700 2013-07-10       NA
## 1657      41700 2013-07-11       NA
## 1658      41700 2013-07-12       NA
## 1659      41700 2013-07-13       NA
## 1660      41700 2013-07-14       NA
## 1661      41700 2013-07-15       NA
## 1662      41700 2013-07-16       NA
## 1663      41700 2013-07-17       NA
## 1664      41700 2013-07-18       NA
## 1665      41700 2013-07-19       NA
## 1666      41700 2013-07-20       NA
## 1667      41700 2013-07-21       NA
## 1668      41700 2013-07-22       NA
## 1669      41700 2013-07-23       NA
## 1670      41700 2013-07-24       NA
## 1671      41700 2013-07-25       NA
## 1672      41700 2013-07-26       NA
## 1673      41700 2013-07-27       NA
## 1674      41700 2013-07-28       NA
## 1675      41700 2013-07-29       NA
## 1676      41700 2013-07-30       NA
## 1677      41700 2013-07-31       NA
## 1678      41700 2013-08-01       NA
## 1679      41700 2013-08-02       NA
## 1680      41700 2013-08-03       NA
## 1681      41700 2013-08-04       NA
## 1682      41700 2013-08-05       NA
## 1683      41700 2013-08-06       NA
## 1684      41700 2013-08-07       NA
## 1685      41700 2013-08-08       NA
## 1686      41700 2013-08-09       NA
## 1687      41700 2013-08-10       NA
## 1688      41700 2013-08-11       NA
## 1689      41700 2013-08-12       NA
## 1690      41700 2013-08-13       NA
## 1691      41700 2013-08-14       NA
## 1692      41700 2013-08-15       NA
## 1693      41700 2013-08-16       NA
## 1694      41700 2013-08-17       NA
## 1695      41700 2013-08-18       NA
## 1696      41700 2013-08-19       NA
## 1697      41700 2013-08-20       NA
## 1698      41700 2013-08-21       NA
## 1699      41700 2013-08-22       NA
## 1700      41700 2013-08-23       NA
## 1701      41700 2013-08-24       NA
## 1702      41700 2013-08-25       NA
## 1703      41700 2013-08-26       NA
## 1704      41700 2013-08-27       NA
## 1705      41700 2013-08-28       NA
## 1706      41700 2013-08-29       NA
## 1707      41700 2013-08-30       NA
## 1708      41700 2013-08-31       NA
## 1709      41700 2013-09-01       NA
## 1710      41700 2013-09-02       NA
## 1711      41700 2013-09-03       NA
## 1712      41700 2013-09-04       NA
## 1713      41700 2013-09-05       NA
## 1714      41700 2013-09-06       NA
## 1715      41700 2013-09-07       NA
## 1716      41700 2013-09-08       NA
## 1717      41700 2013-09-09       NA
## 1718      41700 2013-09-10       NA
## 1719      41700 2013-09-11       NA
## 1720      41700 2013-09-12       NA
## 1721      41700 2013-09-13       NA
## 1722      41700 2013-09-14       NA
## 1723      41700 2013-09-15       NA
## 1724      41700 2013-09-16       NA
## 1725      41700 2013-09-17       NA
## 1726      41700 2013-09-18       NA
## 1727      41700 2013-09-19       NA
## 1728      41700 2013-09-20       NA
## 1729      41700 2013-09-21       NA
## 1730      41700 2013-09-22       NA
## 1731      41700 2013-09-23       NA
## 1732      41700 2013-09-24       NA
## 1733      41700 2013-09-25       NA
## 1734      41700 2013-09-26       NA
## 1735      41700 2013-09-27       NA
## 1736      41700 2013-09-28       NA
## 1737      41700 2013-09-29       NA
## 1738      41700 2013-09-30       NA
## 1739      41700 2013-10-01       NA
## 1740      41700 2013-10-02       NA
## 1741      41700 2013-10-03       NA
## 1742      41700 2013-10-04       NA
## 1743      41700 2013-10-05       NA
## 1744      41700 2013-10-06       NA
## 1745      41700 2013-10-07       NA
## 1746      41700 2013-10-08       NA
  # )

rm(el_stop_na)

Filter the data for el_rides greater than zero.

el_rides_nozero_noNA <-
  add_time %>% 
  filter(el_rides >= 0)


message("rows removed")
## rows removed
nrow(add_time) - nrow(el_rides_nozero_noNA)
## [1] 2311
message("el_rides_nozero_noNA")
## el_rides_nozero_noNA
str(el_rides_nozero_noNA)
## 'data.frame':    233067 obs. of  42 variables:
##  $ el_date                        : Date, format: "2013-07-01" "2013-07-02" ...
##  $ el_stop_id                     : num  40010 40010 40010 40010 40010 ...
##  $ divvy_pt5mi_stn_cnt            : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_pt5mi_trip_cnt_cus       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_pt5mi_triptime_mean_cus  : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_pt5mi_trip_cnt_dep       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_pt5mi_triptime_mean_dep  : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_pt5mi_trip_cnt_sub       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_pt5mi_triptime_mean_sub  : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_all_trip_cnt_cus         : num  1022 599 479 2652 2071 ...
##  $ divvy_all_triptime_mean_cus    : num  2109 2071 2292 2597 2492 ...
##  $ divvy_all_triptime_med_cus     : num  1262 1197 1143 1469 1559 ...
##  $ divvy_all_trip_cnt_dep         : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_all_triptime_mean_dep    : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_all_triptime_med_dep     : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_all_trip_cnt_sub         : num  537 509 528 304 392 260 286 503 702 751 ...
##  $ divvy_all_triptime_mean_sub    : num  1050 1230 958 1009 844 ...
##  $ divvy_all_triptime_med_sub     : num  639 584 620 804 724 ...
##  $ divvy_mindist_miles            : num  5.75 5.75 5.75 5.75 5.75 ...
##  $ divvy_mindist_trip_cnt_cus     : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_mindist_trip_cnt_dep     : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_mindist_trip_cnt_sub     : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_mindist_triptime_mean_cus: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_mindist_triptime_mean_dep: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_mindist_triptime_mean_sub: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ el_rides                       : num  2058 2002 2050 1045 1546 ...
##  $ holiday_name                   : Factor w/ 24 levels "Casimir Pulaski Day",..: NA NA NA 10 NA NA NA NA NA NA ...
##  $ holiday_comment                : chr  NA NA NA "" ...
##  $ holiday                        : logi  FALSE FALSE FALSE TRUE FALSE FALSE ...
##  $ snwd                           : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ snow                           : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ prcp                           : num  0 0 0.02 0 0 0 0 1.32 0.05 0 ...
##  $ tmax                           : int  75 68 74 84 87 82 89 87 85 87 ...
##  $ tmin                           : int  60 61 61 60 62 71 70 70 75 65 ...
##  $ tobs                           : int  67 61 61 71 74 73 79 78 80 65 ...
##  $ year                           : Factor w/ 6 levels "2013","2014",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ half                           : Factor w/ 2 levels "1","2": 2 2 2 2 2 2 2 2 2 2 ...
##  $ quarter                        : Factor w/ 4 levels "1","2","3","4": 3 3 3 3 3 3 3 3 3 3 ...
##  $ month                          : Factor w/ 12 levels "1","2","3","4",..: 7 7 7 7 7 7 7 7 7 7 ...
##  $ day                            : Factor w/ 31 levels "1","2","3","4",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ wday.lbl                       : Ord.factor w/ 7 levels "Sunday"<"Monday"<..: 2 3 4 5 6 7 1 2 3 4 ...
##  $ mweek                          : Factor w/ 6 levels "1","2","3","4",..: 6 1 1 1 1 1 1 2 2 2 ...
##  - attr(*, "sorted")= chr  "el_stop_id" "el_date"
##  - attr(*, ".internal.selfref")=<externalptr>
summary(el_rides_nozero_noNA)
##     el_date             el_stop_id    divvy_pt5mi_stn_cnt
##  Min.   :2013-07-01   Min.   :40010   Min.   : 0.0       
##  1st Qu.:2014-08-20   1st Qu.:40380   1st Qu.: 0.0       
##  Median :2015-10-05   Median :40770   Median : 3.0       
##  Mean   :2015-10-04   Mean   :40778   Mean   : 6.1       
##  3rd Qu.:2016-11-18   3rd Qu.:41170   3rd Qu.: 8.0       
##  Max.   :2018-01-01   Max.   :41700   Max.   :29.0       
##                                                          
##  divvy_pt5mi_trip_cnt_cus divvy_pt5mi_triptime_mean_cus
##  Min.   :   0.00          Min.   :    0.0              
##  1st Qu.:   0.00          1st Qu.:    0.0              
##  Median :   0.00          Median :    0.0              
##  Mean   :  27.94          Mean   :  607.4              
##  3rd Qu.:   4.00          3rd Qu.: 1114.8              
##  Max.   :2772.00          Max.   :85578.0              
##                                                        
##  divvy_pt5mi_trip_cnt_dep divvy_pt5mi_triptime_mean_dep
##  Min.   :0.0000000        Min.   :   0.0000            
##  1st Qu.:0.0000000        1st Qu.:   0.0000            
##  Median :0.0000000        Median :   0.0000            
##  Mean   :0.0006951        Mean   :   0.4483            
##  3rd Qu.:0.0000000        3rd Qu.:   0.0000            
##  Max.   :3.0000000        Max.   :2080.0000            
##                                                        
##  divvy_pt5mi_trip_cnt_sub divvy_pt5mi_triptime_mean_sub
##  Min.   :   0.0           Min.   :    0.0              
##  1st Qu.:   0.0           1st Qu.:    0.0              
##  Median :   0.0           Median :    0.0              
##  Mean   : 106.9           Mean   :  295.5              
##  3rd Qu.:  39.0           3rd Qu.:  637.6              
##  Max.   :3423.0           Max.   :75696.0              
##                                                        
##  divvy_all_trip_cnt_cus divvy_all_triptime_mean_cus
##  Min.   :    0          Min.   :   0               
##  1st Qu.:  206          1st Qu.:1494               
##  Median : 1305          Median :1690               
##  Mean   : 2289          Mean   :1659               
##  3rd Qu.: 3305          3rd Qu.:1835               
##  Max.   :18280          Max.   :6984               
##                                                    
##  divvy_all_triptime_med_cus divvy_all_trip_cnt_dep
##  Min.   :   0               Min.   :0.0000        
##  1st Qu.:1076               1st Qu.:0.0000        
##  Median :1237               Median :0.0000        
##  Mean   :1179               Mean   :0.1158        
##  3rd Qu.:1326               3rd Qu.:0.0000        
##  Max.   :2155               Max.   :7.0000        
##                                                   
##  divvy_all_triptime_mean_dep divvy_all_triptime_med_dep
##  Min.   :   0.00             Min.   :   0.00           
##  1st Qu.:   0.00             1st Qu.:   0.00           
##  Median :   0.00             Median :   0.00           
##  Mean   :  44.66             Mean   :  43.99           
##  3rd Qu.:   0.00             3rd Qu.:   0.00           
##  Max.   :1938.50             Max.   :1938.50           
##                                                        
##  divvy_all_trip_cnt_sub divvy_all_triptime_mean_sub
##  Min.   :    0          Min.   :   0.0             
##  1st Qu.: 2617          1st Qu.: 631.0             
##  Median : 5512          Median : 694.9             
##  Mean   : 6115          Mean   : 699.5             
##  3rd Qu.: 8990          3rd Qu.: 751.5             
##  Max.   :16243          Max.   :2001.9             
##                                                    
##  divvy_all_triptime_med_sub divvy_mindist_miles divvy_mindist_trip_cnt_cus
##  Min.   :  0.0              Min.   : 0.006815   Min.   :  0.000           
##  1st Qu.:491.0              1st Qu.: 0.034447   1st Qu.:  0.000           
##  Median :550.0              Median : 0.070045   Median :  0.000           
##  Mean   :555.8              Mean   : 0.919726   Mean   :  2.881           
##  3rd Qu.:610.0              3rd Qu.: 1.033721   3rd Qu.:  2.000           
##  Max.   :939.0              Max.   :12.575219   Max.   :228.000           
##                                                                           
##  divvy_mindist_trip_cnt_dep divvy_mindist_trip_cnt_sub
##  Min.   :0.0000000          Min.   :  0.00            
##  1st Qu.:0.0000000          1st Qu.:  0.00            
##  Median :0.0000000          Median :  3.00            
##  Mean   :0.0001545          Mean   : 11.14            
##  3rd Qu.:0.0000000          3rd Qu.: 13.00            
##  Max.   :2.0000000          Max.   :234.00            
##                                                       
##  divvy_mindist_triptime_mean_cus divvy_mindist_triptime_mean_dep
##  Min.   :    0.0                 Min.   :   0.0000              
##  1st Qu.:    0.0                 1st Qu.:   0.0000              
##  Median :    0.0                 Median :   0.0000              
##  Mean   :  826.2                 Mean   :   0.1151              
##  3rd Qu.: 1144.6                 3rd Qu.:   0.0000              
##  Max.   :86224.0                 Max.   :1877.0000              
##                                                                 
##  divvy_mindist_triptime_mean_sub    el_rides    
##  Min.   :    0.0                 Min.   :    0  
##  1st Qu.:    0.0                 1st Qu.: 1278  
##  Median :  539.1                 Median : 2606  
##  Mean   :  555.4                 Mean   : 3681  
##  3rd Qu.:  740.8                 3rd Qu.: 4686  
##  Max.   :81045.0                 Max.   :36323  
##                                                 
##                  holiday_name    holiday_comment     holiday       
##  Columbus Day          :   709   Length:233067      Mode :logical  
##  Day after Thanksgiving:   709   Class :character   FALSE:223153   
##  Thanksgiving          :   709   Mode  :character   TRUE :9914     
##  Labor Day             :   700                                     
##  Christmas Day         :   567                                     
##  (Other)               :  6520                                     
##  NA's                  :223153                                     
##       snwd              snow              prcp             tmax      
##  Min.   : 0.0000   Min.   : 0.0000   Min.   :0.0000   Min.   : 0.00  
##  1st Qu.: 0.0000   1st Qu.: 0.0000   1st Qu.:0.0000   1st Qu.:44.00  
##  Median : 0.0000   Median : 0.0000   Median :0.0000   Median :64.00  
##  Mean   : 0.6069   Mean   : 0.1182   Mean   :0.1231   Mean   :61.08  
##  3rd Qu.: 0.0000   3rd Qu.: 0.0000   3rd Qu.:0.0500   3rd Qu.:80.00  
##  Max.   :17.0000   Max.   :17.2000   Max.   :5.1100   Max.   :97.00  
##  NA's   :569       NA's   :426       NA's   :143      NA's   :143    
##       tmin             tobs          year       half       quarter  
##  Min.   :-15.00   Min.   :-10.00   2013:25674   1:102912   1:51315  
##  1st Qu.: 31.00   1st Qu.: 36.00   2014:51465   2:130155   2:51597  
##  Median : 46.00   Median : 52.00   2015:51830              3:64927  
##  Mean   : 45.02   Mean   : 50.39   2016:51972              4:65228  
##  3rd Qu.: 62.00   3rd Qu.: 67.00   2017:51983                       
##  Max.   : 79.00   Max.   : 84.00   2018:  143                       
##  NA's   :143      NA's   :143                                       
##      month             day              wday.lbl     mweek    
##  8      : 21979   1      :  7789   Sunday   :33269   1:31264  
##  10     : 21979   2      :  7646   Monday   :33410   2:53522  
##  12     : 21979   3      :  7646   Tuesday  :33277   3:53522  
##  7      : 21948   4      :  7646   Wednesday:33277   4:53522  
##  11     : 21270   5      :  7646   Thursday :33278   5:38968  
##  9      : 21000   6      :  7646   Friday   :33278   6: 2269  
##  (Other):102912   (Other):187048   Saturday :33278
miss_var_summary(el_rides_nozero_noNA)
# View(miss_var_summary(el_rides_nozero_noNA))


rm(add_time)

Weather Data

Data were included for el_rides for dates up to, and including, 2018-01-01, but the Weather data stops at 2017-12-31. So I can simply remove data after 2017-12-31 (which removes the bulk of the NA Weather data).

el_dates_no_weather <-
  el_rides_nozero_noNA %>% 
  filter(is.na(prcp)
         ) %>% 
  select(el_date) %>% 
  distinct() %>% 
  pull(el_date)


weather_20171231 <-
  el_rides_nozero_noNA %>% 
  filter(el_date < as_date(el_dates_no_weather)
         )


message("rows removed")
## rows removed
nrow(el_rides_nozero_noNA) - nrow(weather_20171231)
## [1] 143
message("weather_20171231")
## weather_20171231
# str(weather_20171231)
# summary(weather_20171231)

miss_var_summary(weather_20171231)
# View(miss_var_summary(weather_20171231))


rm(el_dates_no_weather, el_rides_nozero_noNA)

As snwd and snow appear to be sporadically (and rarely) missing (equipment failure?), here I simply fill in the NA values with the value from the previous day.

weather_20171231 %>% 
  filter(is.na(snwd) | is.na(snow)
         ) %>% 
  select(el_date, snwd, snow) %>% 
  distinct()
data_no_missing <-
  weather_20171231 %>% 
  mutate(snwd = na.locf(snwd),
         snow = na.locf(snow)
         ) %>% 
  as.data.table() %>% 
  setkey(el_date,
         el_stop_id
         )


message("data_no_missing")
## data_no_missing
str(data_no_missing)
## Classes 'data.table' and 'data.frame':   232924 obs. of  42 variables:
##  $ el_date                        : Date, format: "2013-07-01" "2013-07-01" ...
##  $ el_stop_id                     : num  40010 40020 40030 40040 40050 ...
##  $ divvy_pt5mi_stn_cnt            : int  0 0 0 16 0 0 17 0 0 0 ...
##  $ divvy_pt5mi_trip_cnt_cus       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_pt5mi_triptime_mean_cus  : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_pt5mi_trip_cnt_dep       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_pt5mi_triptime_mean_dep  : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_pt5mi_trip_cnt_sub       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_pt5mi_triptime_mean_sub  : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_all_trip_cnt_cus         : num  1022 1022 1022 1022 1022 ...
##  $ divvy_all_triptime_mean_cus    : num  2109 2109 2109 2109 2109 ...
##  $ divvy_all_triptime_med_cus     : num  1262 1262 1262 1262 1262 ...
##  $ divvy_all_trip_cnt_dep         : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_all_triptime_mean_dep    : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_all_triptime_med_dep     : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_all_trip_cnt_sub         : num  537 537 537 537 537 537 537 537 537 537 ...
##  $ divvy_all_triptime_mean_sub    : num  1050 1050 1050 1050 1050 ...
##  $ divvy_all_triptime_med_sub     : num  639 639 639 639 639 639 639 639 639 639 ...
##  $ divvy_mindist_miles            : num  5.75 6.65 2.96 0.1 8.13 ...
##  $ divvy_mindist_trip_cnt_cus     : num  0 10 10 0 3 10 11 3 3 3 ...
##  $ divvy_mindist_trip_cnt_dep     : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_mindist_trip_cnt_sub     : num  0 8 8 0 7 8 19 7 7 7 ...
##  $ divvy_mindist_triptime_mean_cus: num  0 2399 2399 0 1413 ...
##  $ divvy_mindist_triptime_mean_dep: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_mindist_triptime_mean_sub: num  0 822 822 0 859 ...
##  $ el_rides                       : num  2058 3942 2070 8671 3703 ...
##  $ holiday_name                   : Factor w/ 24 levels "Casimir Pulaski Day",..: NA NA NA NA NA NA NA NA NA NA ...
##  $ holiday_comment                : chr  NA NA NA NA ...
##  $ holiday                        : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ snwd                           : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ snow                           : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ prcp                           : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ tmax                           : int  75 75 75 75 75 75 75 75 75 75 ...
##  $ tmin                           : int  60 60 60 60 60 60 60 60 60 60 ...
##  $ tobs                           : int  67 67 67 67 67 67 67 67 67 67 ...
##  $ year                           : Factor w/ 6 levels "2013","2014",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ half                           : Factor w/ 2 levels "1","2": 2 2 2 2 2 2 2 2 2 2 ...
##  $ quarter                        : Factor w/ 4 levels "1","2","3","4": 3 3 3 3 3 3 3 3 3 3 ...
##  $ month                          : Factor w/ 12 levels "1","2","3","4",..: 7 7 7 7 7 7 7 7 7 7 ...
##  $ day                            : Factor w/ 31 levels "1","2","3","4",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ wday.lbl                       : Ord.factor w/ 7 levels "Sunday"<"Monday"<..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ mweek                          : Factor w/ 6 levels "1","2","3","4",..: 6 6 6 6 6 6 6 6 6 6 ...
##  - attr(*, ".internal.selfref")=<externalptr> 
##  - attr(*, "sorted")= chr  "el_date" "el_stop_id"
summary(data_no_missing)
##     el_date             el_stop_id    divvy_pt5mi_stn_cnt
##  Min.   :2013-07-01   Min.   :40010   Min.   : 0.000     
##  1st Qu.:2014-08-19   1st Qu.:40380   1st Qu.: 0.000     
##  Median :2015-10-04   Median :40770   Median : 3.000     
##  Mean   :2015-10-03   Mean   :40778   Mean   : 6.099     
##  3rd Qu.:2016-11-17   3rd Qu.:41170   3rd Qu.: 8.000     
##  Max.   :2017-12-31   Max.   :41700   Max.   :29.000     
##                                                          
##  divvy_pt5mi_trip_cnt_cus divvy_pt5mi_triptime_mean_cus
##  Min.   :   0.00          Min.   :    0.0              
##  1st Qu.:   0.00          1st Qu.:    0.0              
##  Median :   0.00          Median :    0.0              
##  Mean   :  27.96          Mean   :  607.8              
##  3rd Qu.:   4.00          3rd Qu.: 1115.5              
##  Max.   :2772.00          Max.   :85578.0              
##                                                        
##  divvy_pt5mi_trip_cnt_dep divvy_pt5mi_triptime_mean_dep
##  Min.   :0.0000000        Min.   :   0.0000            
##  1st Qu.:0.0000000        1st Qu.:   0.0000            
##  Median :0.0000000        Median :   0.0000            
##  Mean   :0.0006955        Mean   :   0.4485            
##  3rd Qu.:0.0000000        3rd Qu.:   0.0000            
##  Max.   :3.0000000        Max.   :2080.0000            
##                                                        
##  divvy_pt5mi_trip_cnt_sub divvy_pt5mi_triptime_mean_sub
##  Min.   :   0.0           Min.   :    0.0              
##  1st Qu.:   0.0           1st Qu.:    0.0              
##  Median :   0.0           Median :    0.0              
##  Mean   : 106.9           Mean   :  295.7              
##  3rd Qu.:  39.0           3rd Qu.:  637.7              
##  Max.   :3423.0           Max.   :75696.0              
##                                                        
##  divvy_all_trip_cnt_cus divvy_all_triptime_mean_cus
##  Min.   :    0          Min.   :   0               
##  1st Qu.:  206          1st Qu.:1494               
##  Median : 1305          Median :1691               
##  Mean   : 2291          Mean   :1660               
##  3rd Qu.: 3305          3rd Qu.:1836               
##  Max.   :18280          Max.   :6984               
##                                                    
##  divvy_all_triptime_med_cus divvy_all_trip_cnt_dep
##  Min.   :   0               Min.   :0.0000        
##  1st Qu.:1076               1st Qu.:0.0000        
##  Median :1237               Median :0.0000        
##  Mean   :1180               Mean   :0.1158        
##  3rd Qu.:1326               3rd Qu.:0.0000        
##  Max.   :2155               Max.   :7.0000        
##                                                   
##  divvy_all_triptime_mean_dep divvy_all_triptime_med_dep
##  Min.   :   0.00             Min.   :   0.00           
##  1st Qu.:   0.00             1st Qu.:   0.00           
##  Median :   0.00             Median :   0.00           
##  Mean   :  44.69             Mean   :  44.02           
##  3rd Qu.:   0.00             3rd Qu.:   0.00           
##  Max.   :1938.50             Max.   :1938.50           
##                                                        
##  divvy_all_trip_cnt_sub divvy_all_triptime_mean_sub
##  Min.   :    0          Min.   :   0.0             
##  1st Qu.: 2621          1st Qu.: 631.3             
##  Median : 5512          Median : 694.9             
##  Mean   : 6119          Mean   : 700.0             
##  3rd Qu.: 9001          3rd Qu.: 751.6             
##  Max.   :16243          Max.   :2001.9             
##                                                    
##  divvy_all_triptime_med_sub divvy_mindist_miles divvy_mindist_trip_cnt_cus
##  Min.   :  0.0              Min.   : 0.006815   Min.   :  0.000           
##  1st Qu.:491.0              1st Qu.: 0.034447   1st Qu.:  0.000           
##  Median :550.0              Median : 0.070045   Median :  0.000           
##  Mean   :556.1              Mean   : 0.920075   Mean   :  2.883           
##  3rd Qu.:610.0              3rd Qu.: 1.035528   3rd Qu.:  2.000           
##  Max.   :939.0              Max.   :12.575219   Max.   :228.000           
##                                                                           
##  divvy_mindist_trip_cnt_dep divvy_mindist_trip_cnt_sub
##  Min.   :0.0000000          Min.   :  0.00            
##  1st Qu.:0.0000000          1st Qu.:  0.00            
##  Median :0.0000000          Median :  3.00            
##  Mean   :0.0001546          Mean   : 11.15            
##  3rd Qu.:0.0000000          3rd Qu.: 13.00            
##  Max.   :2.0000000          Max.   :234.00            
##                                                       
##  divvy_mindist_triptime_mean_cus divvy_mindist_triptime_mean_dep
##  Min.   :    0.0                 Min.   :   0.0000              
##  1st Qu.:    0.0                 1st Qu.:   0.0000              
##  Median :    0.0                 Median :   0.0000              
##  Mean   :  826.8                 Mean   :   0.1152              
##  3rd Qu.: 1145.2                 3rd Qu.:   0.0000              
##  Max.   :86224.0                 Max.   :1877.0000              
##                                                                 
##  divvy_mindist_triptime_mean_sub    el_rides    
##  Min.   :    0.0                 Min.   :    0  
##  1st Qu.:    0.0                 1st Qu.: 1279  
##  Median :  539.5                 Median : 2608  
##  Mean   :  555.7                 Mean   : 3683  
##  3rd Qu.:  741.0                 3rd Qu.: 4687  
##  Max.   :81045.0                 Max.   :36323  
##                                                 
##                  holiday_name    holiday_comment     holiday       
##  Columbus Day          :   709   Length:232924      Mode :logical  
##  Day after Thanksgiving:   709   Class :character   FALSE:223153   
##  Thanksgiving          :   709   Mode  :character   TRUE :9771     
##  Labor Day             :   700                                     
##  Christmas Day         :   567                                     
##  (Other)               :  6377                                     
##  NA's                  :223153                                     
##       snwd              snow             prcp             tmax      
##  Min.   : 0.0000   Min.   : 0.000   Min.   :0.0000   Min.   : 0.00  
##  1st Qu.: 0.0000   1st Qu.: 0.000   1st Qu.:0.0000   1st Qu.:44.00  
##  Median : 0.0000   Median : 0.000   Median :0.0000   Median :64.00  
##  Mean   : 0.6058   Mean   : 0.118   Mean   :0.1231   Mean   :61.08  
##  3rd Qu.: 0.0000   3rd Qu.: 0.000   3rd Qu.:0.0500   3rd Qu.:80.00  
##  Max.   :17.0000   Max.   :17.200   Max.   :5.1100   Max.   :97.00  
##                                                                     
##       tmin             tobs          year       half       quarter  
##  Min.   :-15.00   Min.   :-10.00   2013:25674   1:102769   1:51172  
##  1st Qu.: 31.00   1st Qu.: 36.00   2014:51465   2:130155   2:51597  
##  Median : 46.00   Median : 52.00   2015:51830              3:64927  
##  Mean   : 45.02   Mean   : 50.39   2016:51972              4:65228  
##  3rd Qu.: 62.00   3rd Qu.: 67.00   2017:51983                       
##  Max.   : 79.00   Max.   : 84.00   2018:    0                       
##                                                                     
##      month             day              wday.lbl     mweek    
##  8      : 21979   1      :  7646   Sunday   :33269   1:31264  
##  10     : 21979   2      :  7646   Monday   :33267   2:53522  
##  12     : 21979   3      :  7646   Tuesday  :33277   3:53522  
##  7      : 21948   4      :  7646   Wednesday:33277   4:53522  
##  11     : 21270   5      :  7646   Thursday :33278   5:38968  
##  9      : 21000   6      :  7646   Friday   :33278   6: 2126  
##  (Other):102769   (Other):187048   Saturday :33278
miss_var_summary(data_no_missing)
# View(miss_var_summary(data_no_missing))


## Save the  data to the proper folder
saveRDS(data_no_missing,
        paste0(wd,
               "/Data/Interim/",
               "data_no_missing.Rds"
               )
        )


rm(weather_20171231)

As the prediction forcast is 7 days in the future, I don’t think we’ll be able to accuratley know the temperature variables tmin and tmax within one degree of accuracy. So here, I simply test if knowing within X degrees of accuracy is viable.

Visually, it looks like 25 degrees may be a good range (this is a lot wider than I epected!).

func_temp_linegraph <-
  function(date_1, date_2) {
    data_no_missing %>% 
      select(el_date,
             tmin,
             tmax
             ) %>% 
      distinct() %>% 
      mutate(wk_start = floor_date(x = el_date,
                                   unit = "week"#,
                                   # week_start = getOption("lubridate.week.start", 7)
                                   )
             ) %>% 
      filter(dplyr::between(wk_start,
                            as_date(date_1),
                            as_date(date_2)
                            )
             ) %>% 
      gather(key = temp_type,
             value = degrees_f,
             tmin:tmax
             ) %>% 
      ggplot(aes(x = el_date,
                 y = degrees_f,
                 color = temp_type
                 )
             ) +
      geom_line() +
      facet_wrap(~ wk_start,
                 scales = "free",
                 ncol = 4
                 ) +
      theme_minimal() +
      NULL
  }

func_temp_linegraph(date_1 = "2013-06-30", date_2 = "2013-09-30")

func_temp_linegraph(date_1 = "2013-09-30", date_2 = "2013-12-31")

func_temp_linegraph(date_1 = "2013-12-31", date_2 = "2014-03-30")

func_temp_linegraph(date_1 = "2014-04-30", date_2 = "2014-06-30")

func_temp_linegraph(date_1 = "2014-06-30", date_2 = "2014-09-30")

rm(func_temp_linegraph)

Formally testing accuracy for rounding to nearest 10 degrees with a lag of 7 days.

min(data_no_missing$tmin)
## [1] -15
max(data_no_missing$tmin)
## [1] 79
min(data_no_missing$tmax)
## [1] 0
max(data_no_missing$tmax)
## [1] 97
temp_round <-
  data_no_missing %>% 
  select(el_date,
         tmin,
         tmax
         ) %>% 
  distinct() %>% 
  mutate(#tmin_rnd10 = floor(tmin / 10) * 10,
         #tmax_rnd10 = floor(tmax / 10) * 10,
         tmin_rnd10 = round(tmin, -1),
         tmax_rnd10 = round(tmax, -1),
         tmin_rnd10_l7 = dplyr::lag(x = tmin_rnd10, n = 7),
         tmax_rnd10_l7 = dplyr::lag(x = tmax_rnd10, n = 7)
         ) %>% 
  mutate_at(vars(tmin_rnd10:tmax_rnd10_l7), factor)

message("temp_round")
## temp_round
str(temp_round)
## 'data.frame':    1645 obs. of  7 variables:
##  $ el_date      : Date, format: "2013-07-01" "2013-07-02" ...
##  $ tmin         : int  60 61 61 60 62 71 70 70 75 65 ...
##  $ tmax         : int  75 68 74 84 87 82 89 87 85 87 ...
##  $ tmin_rnd10   : Factor w/ 11 levels "-20","-10","0",..: 9 9 9 9 9 10 10 10 11 9 ...
##  $ tmax_rnd10   : Factor w/ 11 levels "0","10","20",..: 9 8 8 9 10 9 10 10 9 10 ...
##  $ tmin_rnd10_l7: Factor w/ 11 levels "-20","-10","0",..: NA NA NA NA NA NA NA 9 9 9 ...
##  $ tmax_rnd10_l7: Factor w/ 11 levels "0","10","20",..: NA NA NA NA NA NA NA 9 8 8 ...
# View(temp_round)


message("tmin accuracy round")
## tmin accuracy round
temp_round %>% 
  select(tmin_rnd10,
         tmin_rnd10_l7
         ) %>% 
  filter(!is.na(tmin_rnd10_l7)) %>% 
  rename(obs = tmin_rnd10,
         pred = tmin_rnd10_l7
         ) %>% 
  multiClassSummary(lev = levels(temp_round$tmin_rnd10)
                    )
##               Accuracy                  Kappa                Mean_F1 
##             0.34493284             0.23010401                    NaN 
##       Mean_Sensitivity       Mean_Specificity    Mean_Pos_Pred_Value 
##             0.22074711             0.93040788             0.22147284 
##    Mean_Neg_Pred_Value         Mean_Precision            Mean_Recall 
##             0.93043743             0.22147284             0.22074711 
##    Mean_Detection_Rate Mean_Balanced_Accuracy 
##             0.03135753             0.57557749
message("tmax accuracy round")
## tmax accuracy round
temp_round %>% 
  select(tmax_rnd10,
         tmax_rnd10_l7
         ) %>% 
  filter(!is.na(tmax_rnd10_l7)) %>% 
  rename(obs = tmax_rnd10,
         pred = tmax_rnd10_l7
         ) %>% 
  multiClassSummary(lev = levels(temp_round$tmax_rnd10)
                    )
##               Accuracy                  Kappa                Mean_F1 
##             0.30402930             0.18944154                    NaN 
##       Mean_Sensitivity       Mean_Specificity    Mean_Pos_Pred_Value 
##             0.20356895             0.92668665             0.20371247 
##    Mean_Neg_Pred_Value         Mean_Precision            Mean_Recall 
##             0.92670573             0.20371247             0.20356895 
##    Mean_Detection_Rate Mean_Balanced_Accuracy 
##             0.02763903             0.56512780

Formally testing accuracy for a band of 25 degrees with a lag of 7 days.

temp_bands <-
  data_no_missing %>% 
  select(el_date,
         tmin,
         tmax
         ) %>% 
  distinct() %>% 
  mutate(tmin_bands = case_when(tmin < -25 ~ "01_blw_-25",
                                -25 <= tmin & tmin < 0 ~ "02_-25to00",
                                0 <= tmin & tmin < 25 ~ "03_00to25",
                                25 <= tmin & tmin < 50 ~ "04_25to50",
                                50 <= tmin & tmin < 75 ~ "05_50to75",
                                75 <= tmin & tmin < 100 ~ "06_75to100",
                                100 <= tmin ~ "07_abv_100"
                                ) %>% as.factor(),
         tmax_bands = case_when(tmax < 0 ~ "01_blw_00",
                                0 <= tmax & tmax < 25 ~ "02_00to25",
                                25 <= tmax & tmax < 50 ~ "03_25to50",
                                50 <= tmax & tmax < 75 ~ "04_50to75",
                                75 <= tmax & tmax < 100 ~ "05_75to100",
                                100 <= tmax ~ "06_abv_100"
                                ) %>% as.factor(),
         tmin_bands_l7 = dplyr::lag(tmin_bands, n = 7),
         tmax_bands_l7 = dplyr::lag(tmax_bands, n = 7)
         )

message("temp_bands")
## temp_bands
str(temp_bands)
## 'data.frame':    1645 obs. of  7 variables:
##  $ el_date      : Date, format: "2013-07-01" "2013-07-02" ...
##  $ tmin         : int  60 61 61 60 62 71 70 70 75 65 ...
##  $ tmax         : int  75 68 74 84 87 82 89 87 85 87 ...
##  $ tmin_bands   : Factor w/ 5 levels "02_-25to00","03_00to25",..: 4 4 4 4 4 4 4 4 5 4 ...
##  $ tmax_bands   : Factor w/ 4 levels "02_00to25","03_25to50",..: 4 3 3 4 4 4 4 4 4 4 ...
##  $ tmin_bands_l7: Factor w/ 5 levels "02_-25to00","03_00to25",..: NA NA NA NA NA NA NA 4 4 4 ...
##  $ tmax_bands_l7: Factor w/ 4 levels "02_00to25","03_25to50",..: NA NA NA NA NA NA NA 4 3 3 ...
# View(temp_bands)


message("tmin accuracy bands")
## tmin accuracy bands
temp_bands %>% 
  select(tmin_bands,
         tmin_bands_l7
         ) %>% 
  filter(!is.na(tmin_bands_l7)) %>% 
  rename(obs = tmin_bands,
         pred = tmin_bands_l7
         ) %>% 
  multiClassSummary(lev = levels(temp_bands$tmin_bands)
                    )
##               Accuracy                  Kappa                Mean_F1 
##              0.6800977              0.4966502                    NaN 
##       Mean_Sensitivity       Mean_Specificity    Mean_Pos_Pred_Value 
##              0.4007143              0.9039017              0.4009996 
##    Mean_Neg_Pred_Value         Mean_Precision            Mean_Recall 
##              0.9043280              0.4009996              0.4007143 
##    Mean_Detection_Rate Mean_Balanced_Accuracy 
##              0.1360195              0.6523080
message("tmax accuracy bands")
## tmax accuracy bands
temp_bands %>% 
  select(tmax_bands,
         tmax_bands_l7
         ) %>% 
  filter(!is.na(tmax_bands_l7)) %>% 
  rename(obs = tmax_bands,
         pred = tmax_bands_l7
         ) %>% 
  multiClassSummary(lev = levels(temp_bands$tmax_bands)
                    )
##               Accuracy                  Kappa                Mean_F1 
##              0.6196581              0.4519100              0.5473956 
##       Mean_Sensitivity       Mean_Specificity    Mean_Pos_Pred_Value 
##              0.5457315              0.8644321              0.5492901 
##    Mean_Neg_Pred_Value         Mean_Precision            Mean_Recall 
##              0.8647268              0.5492901              0.5457315 
##    Mean_Detection_Rate Mean_Balanced_Accuracy 
##              0.1549145              0.7050818

As the 25 degree band does appear more accurate (although less accurate than expected), here I merely formalize the inclusion of the temperature bands.

# create bands of temp
temp_bands_25F <-
  data_no_missing[ ,
                  `:=` (tmin_bands = case_when(tmin < -25 ~ "01_blw_-25",
                                               -25 <= tmin & tmin < 0 ~ "02_-25to00",
                                               0 <= tmin & tmin < 25 ~ "03_00to25",
                                               25 <= tmin & tmin < 50 ~ "04_25to50",
                                               50 <= tmin & tmin < 75 ~ "05_50to75",
                                               75 <= tmin & tmin < 100 ~ "06_75to100",
                                               100 <= tmin ~ "07_abv_100"
                                               ) %>% 
                          as.factor(),
                        tmax_bands = case_when(tmax < 0 ~ "01_blw_00",
                                               0 <= tmax & tmax < 25 ~ "02_00to25",
                                               25 <= tmax & tmax < 50 ~ "03_25to50",
                                               50 <= tmax & tmax < 75 ~ "04_50to75",
                                               75 <= tmax & tmax < 100 ~ "05_75to100",
                                               100 <= tmax ~ "06_abv_100"
                                               ) %>% 
                          as.factor()
                        )
                  ]

# create lags at 7 days
temp_bands_25F <-
  temp_bands_25F[ ,
                 `:=` (tmin_bands_l7 = dplyr::lag(tmin_bands, n = 7),
                       tmax_bands_l7 = dplyr::lag(tmax_bands, n = 7)
                       )
                 ]

# remove no-longer-needed vars
temp_bands_25F <-
  temp_bands_25F[ ,
                 c("tmin", "tmax", "tobs") := NULL
                 ]


str(temp_bands_25F)
## Classes 'data.table' and 'data.frame':   232924 obs. of  43 variables:
##  $ el_date                        : Date, format: "2013-07-01" "2013-07-01" ...
##  $ el_stop_id                     : num  40010 40020 40030 40040 40050 ...
##  $ divvy_pt5mi_stn_cnt            : int  0 0 0 16 0 0 17 0 0 0 ...
##  $ divvy_pt5mi_trip_cnt_cus       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_pt5mi_triptime_mean_cus  : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_pt5mi_trip_cnt_dep       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_pt5mi_triptime_mean_dep  : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_pt5mi_trip_cnt_sub       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_pt5mi_triptime_mean_sub  : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_all_trip_cnt_cus         : num  1022 1022 1022 1022 1022 ...
##  $ divvy_all_triptime_mean_cus    : num  2109 2109 2109 2109 2109 ...
##  $ divvy_all_triptime_med_cus     : num  1262 1262 1262 1262 1262 ...
##  $ divvy_all_trip_cnt_dep         : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_all_triptime_mean_dep    : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_all_triptime_med_dep     : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_all_trip_cnt_sub         : num  537 537 537 537 537 537 537 537 537 537 ...
##  $ divvy_all_triptime_mean_sub    : num  1050 1050 1050 1050 1050 ...
##  $ divvy_all_triptime_med_sub     : num  639 639 639 639 639 639 639 639 639 639 ...
##  $ divvy_mindist_miles            : num  5.75 6.65 2.96 0.1 8.13 ...
##  $ divvy_mindist_trip_cnt_cus     : num  0 10 10 0 3 10 11 3 3 3 ...
##  $ divvy_mindist_trip_cnt_dep     : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_mindist_trip_cnt_sub     : num  0 8 8 0 7 8 19 7 7 7 ...
##  $ divvy_mindist_triptime_mean_cus: num  0 2399 2399 0 1413 ...
##  $ divvy_mindist_triptime_mean_dep: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_mindist_triptime_mean_sub: num  0 822 822 0 859 ...
##  $ el_rides                       : num  2058 3942 2070 8671 3703 ...
##  $ holiday_name                   : Factor w/ 24 levels "Casimir Pulaski Day",..: NA NA NA NA NA NA NA NA NA NA ...
##  $ holiday_comment                : chr  NA NA NA NA ...
##  $ holiday                        : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ snwd                           : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ snow                           : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ prcp                           : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ year                           : Factor w/ 6 levels "2013","2014",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ half                           : Factor w/ 2 levels "1","2": 2 2 2 2 2 2 2 2 2 2 ...
##  $ quarter                        : Factor w/ 4 levels "1","2","3","4": 3 3 3 3 3 3 3 3 3 3 ...
##  $ month                          : Factor w/ 12 levels "1","2","3","4",..: 7 7 7 7 7 7 7 7 7 7 ...
##  $ day                            : Factor w/ 31 levels "1","2","3","4",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ wday.lbl                       : Ord.factor w/ 7 levels "Sunday"<"Monday"<..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ mweek                          : Factor w/ 6 levels "1","2","3","4",..: 6 6 6 6 6 6 6 6 6 6 ...
##  $ tmin_bands                     : Factor w/ 5 levels "02_-25to00","03_00to25",..: 4 4 4 4 4 4 4 4 4 4 ...
##  $ tmax_bands                     : Factor w/ 4 levels "02_00to25","03_25to50",..: 4 4 4 4 4 4 4 4 4 4 ...
##  $ tmin_bands_l7                  : Factor w/ 5 levels "02_-25to00","03_00to25",..: NA NA NA NA NA NA NA 4 4 4 ...
##  $ tmax_bands_l7                  : Factor w/ 4 levels "02_00to25","03_25to50",..: NA NA NA NA NA NA NA 4 4 4 ...
##  - attr(*, ".internal.selfref")=<externalptr> 
##  - attr(*, "sorted")= chr  "el_date" "el_stop_id"
rm(data_no_missing, temp_bands, temp_round)

Visually, investigating precipitation-related variables (prcp, snow, and snwd).

func_prcp_linegraph <-
  function(date_1, date_2) {
    temp_bands_25F %>% 
      select(el_date,
             snwd,
             snow,
             prcp
             ) %>% 
      distinct() %>% 
      mutate(wk_start = floor_date(x = el_date,
                                   unit = "week"
                                   )
             ) %>% 
      filter(dplyr::between(wk_start,
                            as_date(date_1),
                            as_date(date_2)
                            )
             ) %>% 
      gather(key = metric,
             value = value,
             snwd:prcp
             ) %>% 
      ggplot(aes(x = el_date,
                 y = value,
                 color = metric
                 )
             ) +
      geom_line() +
      # coord_flip() +
      facet_wrap(vars(metric, wk_start),
                 scales = "free",
                 ncol = 4#,
                 # labeller = "label_both"
                 ) +
      theme_minimal() +
      theme(legend.position = "none") +
      NULL
    }


func_prcp_linegraph(date_1 = "2015-02-01", date_2 = "2015-02-28")

func_prcp_linegraph(date_1 = "2015-04-01", date_2 = "2015-04-30")

func_prcp_linegraph(date_1 = "2015-06-01", date_2 = "2015-06-30")

func_prcp_linegraph(date_1 = "2015-08-01", date_2 = "2015-08-31")

func_prcp_linegraph(date_1 = "2015-10-01", date_2 = "2015-10-31")

func_prcp_linegraph(date_1 = "2015-12-01", date_2 = "2015-12-31")

rm(func_prcp_linegraph)

The precipitation variables very likely do have predictive power, but being able to accurately predict them 7 days in advance is difficult. As there doesn’t appear to be any easily/quickly identifiable pattern to produce seven-day-forward estimates for these variables, I merely delete them.

remove_prcp_vars <-
  temp_bands_25F[ ,
             c("prcp", "snow", "snwd") := NULL
             ]


## Save the data to the proper folder
saveRDS(remove_prcp_vars,
        paste0(wd,
               "/Data/Interim/",
               "remove_prcp_vars.Rds"
               )
        )

# remove_prcp_vars <-
#   readRDS(paste0(wd,
#                  "/Data/Interim/",
#                  "remove_prcp_vars.Rds"
#                  )
#           )


str(remove_prcp_vars)
## Classes 'data.table' and 'data.frame':   232924 obs. of  40 variables:
##  $ el_date                        : Date, format: "2013-07-01" "2013-07-01" ...
##  $ el_stop_id                     : num  40010 40020 40030 40040 40050 ...
##  $ divvy_pt5mi_stn_cnt            : int  0 0 0 16 0 0 17 0 0 0 ...
##  $ divvy_pt5mi_trip_cnt_cus       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_pt5mi_triptime_mean_cus  : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_pt5mi_trip_cnt_dep       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_pt5mi_triptime_mean_dep  : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_pt5mi_trip_cnt_sub       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_pt5mi_triptime_mean_sub  : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_all_trip_cnt_cus         : num  1022 1022 1022 1022 1022 ...
##  $ divvy_all_triptime_mean_cus    : num  2109 2109 2109 2109 2109 ...
##  $ divvy_all_triptime_med_cus     : num  1262 1262 1262 1262 1262 ...
##  $ divvy_all_trip_cnt_dep         : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_all_triptime_mean_dep    : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_all_triptime_med_dep     : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_all_trip_cnt_sub         : num  537 537 537 537 537 537 537 537 537 537 ...
##  $ divvy_all_triptime_mean_sub    : num  1050 1050 1050 1050 1050 ...
##  $ divvy_all_triptime_med_sub     : num  639 639 639 639 639 639 639 639 639 639 ...
##  $ divvy_mindist_miles            : num  5.75 6.65 2.96 0.1 8.13 ...
##  $ divvy_mindist_trip_cnt_cus     : num  0 10 10 0 3 10 11 3 3 3 ...
##  $ divvy_mindist_trip_cnt_dep     : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_mindist_trip_cnt_sub     : num  0 8 8 0 7 8 19 7 7 7 ...
##  $ divvy_mindist_triptime_mean_cus: num  0 2399 2399 0 1413 ...
##  $ divvy_mindist_triptime_mean_dep: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_mindist_triptime_mean_sub: num  0 822 822 0 859 ...
##  $ el_rides                       : num  2058 3942 2070 8671 3703 ...
##  $ holiday_name                   : Factor w/ 24 levels "Casimir Pulaski Day",..: NA NA NA NA NA NA NA NA NA NA ...
##  $ holiday_comment                : chr  NA NA NA NA ...
##  $ holiday                        : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ year                           : Factor w/ 6 levels "2013","2014",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ half                           : Factor w/ 2 levels "1","2": 2 2 2 2 2 2 2 2 2 2 ...
##  $ quarter                        : Factor w/ 4 levels "1","2","3","4": 3 3 3 3 3 3 3 3 3 3 ...
##  $ month                          : Factor w/ 12 levels "1","2","3","4",..: 7 7 7 7 7 7 7 7 7 7 ...
##  $ day                            : Factor w/ 31 levels "1","2","3","4",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ wday.lbl                       : Ord.factor w/ 7 levels "Sunday"<"Monday"<..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ mweek                          : Factor w/ 6 levels "1","2","3","4",..: 6 6 6 6 6 6 6 6 6 6 ...
##  $ tmin_bands                     : Factor w/ 5 levels "02_-25to00","03_00to25",..: 4 4 4 4 4 4 4 4 4 4 ...
##  $ tmax_bands                     : Factor w/ 4 levels "02_00to25","03_25to50",..: 4 4 4 4 4 4 4 4 4 4 ...
##  $ tmin_bands_l7                  : Factor w/ 5 levels "02_-25to00","03_00to25",..: NA NA NA NA NA NA NA 4 4 4 ...
##  $ tmax_bands_l7                  : Factor w/ 4 levels "02_00to25","03_25to50",..: NA NA NA NA NA NA NA 4 4 4 ...
##  - attr(*, ".internal.selfref")=<externalptr> 
##  - attr(*, "sorted")= chr  "el_date" "el_stop_id"
rm(temp_bands_25F)

Divvy Data

As the desire is to predict el_rides for each level of el_stop_id for 7 days in the future. As a proof of concept, the remainder of the analyses will be done for 6 values of el_stop_id: the two most frequently used el stations (by el ridership), the two least frquently used el stations, and two el stations in the middle. This is necessary to do before feature engineering the divvy data, as the divvy data are specific to each value of el_stop_id.

First, I just create a datset ranking el_stop_id by total el rides.

el_stop_id_rank <-
  remove_prcp_vars %>% 
  group_by(el_stop_id) %>% 
  summarise(el_rides_sum = sum(el_rides),
            date_min = min(el_date),
            date_max = max(el_date)
            ) %>% 
  arrange(el_rides_sum) %>% 
  mutate(days_in_use = date_max - date_min,
         row_num = row_number(el_rides_sum)
         )

So now I can just get the top, bottom, and middle to values for el_station_id.

# top
el_stop_id_top2 <-
  el_stop_id_rank %>% 
  top_n(n = 2, wt = el_rides_sum) %>% 
  pull(el_stop_id)


# bottom
el_stop_id_bot2 <-
  el_stop_id_rank %>% 
  top_n(n = -2, wt = el_rides_sum) %>% 
  pull(el_stop_id)


# middle
mid_row_num <-
  round(max(el_stop_id_rank$row_num) / 2, digits = 0)

el_stop_id_mid2 <-
  el_stop_id_rank %>% 
  filter(row_num %in% c(mid_row_num, mid_row_num + 1)
         ) %>% 
  pull(el_stop_id)

rm(mid_row_num)


# put everything together
el_stop_id_6_values <-
  c(el_stop_id_bot2, el_stop_id_mid2, el_stop_id_top2)


rm(el_stop_id_rank, el_stop_id_bot2, el_stop_id_mid2, el_stop_id_top2)

Now I can properly limit the data to the relevant el_stop_id values, and take 7-day lags of the relevant divvy variables.

# limit the data to the six values of interest
six_stations_data <-
  el_stop_id_6_values %>% 
  map(~ filter(remove_prcp_vars,
               el_stop_id == .x
               ) %>% 
        arrange(el_date)
      )

six_stations_data %>% 
  map(~ dim(.x))
## [[1]]
## [1] 1645   40
## 
## [[2]]
## [1] 1645   40
## 
## [[3]]
## [1] 1645   40
## 
## [[4]]
## [1] 1615   40
## 
## [[5]]
## [1] 1645   40
## 
## [[6]]
## [1] 1645   40
# get the proper lags
regex_criteria <- "_cus$|_dep$|sub$"

divvy_vars_lag7 <-
  six_stations_data %>% 
  map(~ select(.x,
               matches(regex_criteria)
               ) %>% 
        mutate_all(funs(l7 = dplyr::lag(., n = 7)
                        )
                   )
      )

names(divvy_vars_lag7) <- el_stop_id_6_values


pmap(.l = list(a = divvy_vars_lag7,
               b = names(divvy_vars_lag7)
               ),
     .f = function(a, b) {
       message(b)
       
       str(a)
       }
     )
## 40600
## 'data.frame':    1645 obs. of  42 variables:
##  $ divvy_pt5mi_trip_cnt_cus          : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_pt5mi_triptime_mean_cus     : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_pt5mi_trip_cnt_dep          : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_pt5mi_triptime_mean_dep     : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_pt5mi_trip_cnt_sub          : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_pt5mi_triptime_mean_sub     : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_all_trip_cnt_cus            : num  1022 599 479 2652 2071 ...
##  $ divvy_all_triptime_mean_cus       : num  2109 2071 2292 2597 2492 ...
##  $ divvy_all_triptime_med_cus        : num  1262 1197 1143 1469 1559 ...
##  $ divvy_all_trip_cnt_dep            : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_all_triptime_mean_dep       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_all_triptime_med_dep        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_all_trip_cnt_sub            : num  537 509 528 304 392 260 286 503 702 751 ...
##  $ divvy_all_triptime_mean_sub       : num  1050 1230 958 1009 844 ...
##  $ divvy_all_triptime_med_sub        : num  639 584 620 804 724 ...
##  $ divvy_mindist_trip_cnt_cus        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_mindist_trip_cnt_dep        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_mindist_trip_cnt_sub        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_mindist_triptime_mean_cus   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_mindist_triptime_mean_dep   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_mindist_triptime_mean_sub   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_pt5mi_trip_cnt_cus_l7       : num  NA NA NA NA NA NA NA 0 0 0 ...
##  $ divvy_pt5mi_triptime_mean_cus_l7  : num  NA NA NA NA NA NA NA 0 0 0 ...
##  $ divvy_pt5mi_trip_cnt_dep_l7       : num  NA NA NA NA NA NA NA 0 0 0 ...
##  $ divvy_pt5mi_triptime_mean_dep_l7  : num  NA NA NA NA NA NA NA 0 0 0 ...
##  $ divvy_pt5mi_trip_cnt_sub_l7       : num  NA NA NA NA NA NA NA 0 0 0 ...
##  $ divvy_pt5mi_triptime_mean_sub_l7  : num  NA NA NA NA NA NA NA 0 0 0 ...
##  $ divvy_all_trip_cnt_cus_l7         : num  NA NA NA NA NA ...
##  $ divvy_all_triptime_mean_cus_l7    : num  NA NA NA NA NA ...
##  $ divvy_all_triptime_med_cus_l7     : num  NA NA NA NA NA ...
##  $ divvy_all_trip_cnt_dep_l7         : num  NA NA NA NA NA NA NA 0 0 0 ...
##  $ divvy_all_triptime_mean_dep_l7    : num  NA NA NA NA NA NA NA 0 0 0 ...
##  $ divvy_all_triptime_med_dep_l7     : num  NA NA NA NA NA NA NA 0 0 0 ...
##  $ divvy_all_trip_cnt_sub_l7         : num  NA NA NA NA NA NA NA 537 509 528 ...
##  $ divvy_all_triptime_mean_sub_l7    : num  NA NA NA NA NA ...
##  $ divvy_all_triptime_med_sub_l7     : num  NA NA NA NA NA ...
##  $ divvy_mindist_trip_cnt_cus_l7     : num  NA NA NA NA NA NA NA 0 0 0 ...
##  $ divvy_mindist_trip_cnt_dep_l7     : num  NA NA NA NA NA NA NA 0 0 0 ...
##  $ divvy_mindist_trip_cnt_sub_l7     : num  NA NA NA NA NA NA NA 0 0 0 ...
##  $ divvy_mindist_triptime_mean_cus_l7: num  NA NA NA NA NA NA NA 0 0 0 ...
##  $ divvy_mindist_triptime_mean_dep_l7: num  NA NA NA NA NA NA NA 0 0 0 ...
##  $ divvy_mindist_triptime_mean_sub_l7: num  NA NA NA NA NA NA NA 0 0 0 ...
## 41140
## 'data.frame':    1645 obs. of  42 variables:
##  $ divvy_pt5mi_trip_cnt_cus          : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_pt5mi_triptime_mean_cus     : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_pt5mi_trip_cnt_dep          : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_pt5mi_triptime_mean_dep     : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_pt5mi_trip_cnt_sub          : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_pt5mi_triptime_mean_sub     : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_all_trip_cnt_cus            : num  1022 599 479 2652 2071 ...
##  $ divvy_all_triptime_mean_cus       : num  2109 2071 2292 2597 2492 ...
##  $ divvy_all_triptime_med_cus        : num  1262 1197 1143 1469 1559 ...
##  $ divvy_all_trip_cnt_dep            : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_all_triptime_mean_dep       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_all_triptime_med_dep        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_all_trip_cnt_sub            : num  537 509 528 304 392 260 286 503 702 751 ...
##  $ divvy_all_triptime_mean_sub       : num  1050 1230 958 1009 844 ...
##  $ divvy_all_triptime_med_sub        : num  639 584 620 804 724 ...
##  $ divvy_mindist_trip_cnt_cus        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_mindist_trip_cnt_dep        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_mindist_trip_cnt_sub        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_mindist_triptime_mean_cus   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_mindist_triptime_mean_dep   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_mindist_triptime_mean_sub   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_pt5mi_trip_cnt_cus_l7       : num  NA NA NA NA NA NA NA 0 0 0 ...
##  $ divvy_pt5mi_triptime_mean_cus_l7  : num  NA NA NA NA NA NA NA 0 0 0 ...
##  $ divvy_pt5mi_trip_cnt_dep_l7       : num  NA NA NA NA NA NA NA 0 0 0 ...
##  $ divvy_pt5mi_triptime_mean_dep_l7  : num  NA NA NA NA NA NA NA 0 0 0 ...
##  $ divvy_pt5mi_trip_cnt_sub_l7       : num  NA NA NA NA NA NA NA 0 0 0 ...
##  $ divvy_pt5mi_triptime_mean_sub_l7  : num  NA NA NA NA NA NA NA 0 0 0 ...
##  $ divvy_all_trip_cnt_cus_l7         : num  NA NA NA NA NA ...
##  $ divvy_all_triptime_mean_cus_l7    : num  NA NA NA NA NA ...
##  $ divvy_all_triptime_med_cus_l7     : num  NA NA NA NA NA ...
##  $ divvy_all_trip_cnt_dep_l7         : num  NA NA NA NA NA NA NA 0 0 0 ...
##  $ divvy_all_triptime_mean_dep_l7    : num  NA NA NA NA NA NA NA 0 0 0 ...
##  $ divvy_all_triptime_med_dep_l7     : num  NA NA NA NA NA NA NA 0 0 0 ...
##  $ divvy_all_trip_cnt_sub_l7         : num  NA NA NA NA NA NA NA 537 509 528 ...
##  $ divvy_all_triptime_mean_sub_l7    : num  NA NA NA NA NA ...
##  $ divvy_all_triptime_med_sub_l7     : num  NA NA NA NA NA ...
##  $ divvy_mindist_trip_cnt_cus_l7     : num  NA NA NA NA NA NA NA 0 0 0 ...
##  $ divvy_mindist_trip_cnt_dep_l7     : num  NA NA NA NA NA NA NA 0 0 0 ...
##  $ divvy_mindist_trip_cnt_sub_l7     : num  NA NA NA NA NA NA NA 0 0 0 ...
##  $ divvy_mindist_triptime_mean_cus_l7: num  NA NA NA NA NA NA NA 0 0 0 ...
##  $ divvy_mindist_triptime_mean_dep_l7: num  NA NA NA NA NA NA NA 0 0 0 ...
##  $ divvy_mindist_triptime_mean_sub_l7: num  NA NA NA NA NA NA NA 0 0 0 ...
## 40120
## 'data.frame':    1645 obs. of  42 variables:
##  $ divvy_pt5mi_trip_cnt_cus          : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_pt5mi_triptime_mean_cus     : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_pt5mi_trip_cnt_dep          : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_pt5mi_triptime_mean_dep     : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_pt5mi_trip_cnt_sub          : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_pt5mi_triptime_mean_sub     : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_all_trip_cnt_cus            : num  1022 599 479 2652 2071 ...
##  $ divvy_all_triptime_mean_cus       : num  2109 2071 2292 2597 2492 ...
##  $ divvy_all_triptime_med_cus        : num  1262 1197 1143 1469 1559 ...
##  $ divvy_all_trip_cnt_dep            : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_all_triptime_mean_dep       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_all_triptime_med_dep        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_all_trip_cnt_sub            : num  537 509 528 304 392 260 286 503 702 751 ...
##  $ divvy_all_triptime_mean_sub       : num  1050 1230 958 1009 844 ...
##  $ divvy_all_triptime_med_sub        : num  639 584 620 804 724 ...
##  $ divvy_mindist_trip_cnt_cus        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_mindist_trip_cnt_dep        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_mindist_trip_cnt_sub        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_mindist_triptime_mean_cus   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_mindist_triptime_mean_dep   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_mindist_triptime_mean_sub   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_pt5mi_trip_cnt_cus_l7       : num  NA NA NA NA NA NA NA 0 0 0 ...
##  $ divvy_pt5mi_triptime_mean_cus_l7  : num  NA NA NA NA NA NA NA 0 0 0 ...
##  $ divvy_pt5mi_trip_cnt_dep_l7       : num  NA NA NA NA NA NA NA 0 0 0 ...
##  $ divvy_pt5mi_triptime_mean_dep_l7  : num  NA NA NA NA NA NA NA 0 0 0 ...
##  $ divvy_pt5mi_trip_cnt_sub_l7       : num  NA NA NA NA NA NA NA 0 0 0 ...
##  $ divvy_pt5mi_triptime_mean_sub_l7  : num  NA NA NA NA NA NA NA 0 0 0 ...
##  $ divvy_all_trip_cnt_cus_l7         : num  NA NA NA NA NA ...
##  $ divvy_all_triptime_mean_cus_l7    : num  NA NA NA NA NA ...
##  $ divvy_all_triptime_med_cus_l7     : num  NA NA NA NA NA ...
##  $ divvy_all_trip_cnt_dep_l7         : num  NA NA NA NA NA NA NA 0 0 0 ...
##  $ divvy_all_triptime_mean_dep_l7    : num  NA NA NA NA NA NA NA 0 0 0 ...
##  $ divvy_all_triptime_med_dep_l7     : num  NA NA NA NA NA NA NA 0 0 0 ...
##  $ divvy_all_trip_cnt_sub_l7         : num  NA NA NA NA NA NA NA 537 509 528 ...
##  $ divvy_all_triptime_mean_sub_l7    : num  NA NA NA NA NA ...
##  $ divvy_all_triptime_med_sub_l7     : num  NA NA NA NA NA ...
##  $ divvy_mindist_trip_cnt_cus_l7     : num  NA NA NA NA NA NA NA 0 0 0 ...
##  $ divvy_mindist_trip_cnt_dep_l7     : num  NA NA NA NA NA NA NA 0 0 0 ...
##  $ divvy_mindist_trip_cnt_sub_l7     : num  NA NA NA NA NA NA NA 0 0 0 ...
##  $ divvy_mindist_triptime_mean_cus_l7: num  NA NA NA NA NA NA NA 0 0 0 ...
##  $ divvy_mindist_triptime_mean_dep_l7: num  NA NA NA NA NA NA NA 0 0 0 ...
##  $ divvy_mindist_triptime_mean_sub_l7: num  NA NA NA NA NA NA NA 0 0 0 ...
## 40910
## 'data.frame':    1615 obs. of  42 variables:
##  $ divvy_pt5mi_trip_cnt_cus          : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_pt5mi_triptime_mean_cus     : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_pt5mi_trip_cnt_dep          : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_pt5mi_triptime_mean_dep     : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_pt5mi_trip_cnt_sub          : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_pt5mi_triptime_mean_sub     : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_all_trip_cnt_cus            : num  1022 599 479 2652 2071 ...
##  $ divvy_all_triptime_mean_cus       : num  2109 2071 2292 2597 2492 ...
##  $ divvy_all_triptime_med_cus        : num  1262 1197 1143 1469 1559 ...
##  $ divvy_all_trip_cnt_dep            : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_all_triptime_mean_dep       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_all_triptime_med_dep        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_all_trip_cnt_sub            : num  537 509 528 304 392 260 286 503 702 751 ...
##  $ divvy_all_triptime_mean_sub       : num  1050 1230 958 1009 844 ...
##  $ divvy_all_triptime_med_sub        : num  639 584 620 804 724 ...
##  $ divvy_mindist_trip_cnt_cus        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_mindist_trip_cnt_dep        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_mindist_trip_cnt_sub        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_mindist_triptime_mean_cus   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_mindist_triptime_mean_dep   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_mindist_triptime_mean_sub   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_pt5mi_trip_cnt_cus_l7       : num  NA NA NA NA NA NA NA 0 0 0 ...
##  $ divvy_pt5mi_triptime_mean_cus_l7  : num  NA NA NA NA NA NA NA 0 0 0 ...
##  $ divvy_pt5mi_trip_cnt_dep_l7       : num  NA NA NA NA NA NA NA 0 0 0 ...
##  $ divvy_pt5mi_triptime_mean_dep_l7  : num  NA NA NA NA NA NA NA 0 0 0 ...
##  $ divvy_pt5mi_trip_cnt_sub_l7       : num  NA NA NA NA NA NA NA 0 0 0 ...
##  $ divvy_pt5mi_triptime_mean_sub_l7  : num  NA NA NA NA NA NA NA 0 0 0 ...
##  $ divvy_all_trip_cnt_cus_l7         : num  NA NA NA NA NA ...
##  $ divvy_all_triptime_mean_cus_l7    : num  NA NA NA NA NA ...
##  $ divvy_all_triptime_med_cus_l7     : num  NA NA NA NA NA ...
##  $ divvy_all_trip_cnt_dep_l7         : num  NA NA NA NA NA NA NA 0 0 0 ...
##  $ divvy_all_triptime_mean_dep_l7    : num  NA NA NA NA NA NA NA 0 0 0 ...
##  $ divvy_all_triptime_med_dep_l7     : num  NA NA NA NA NA NA NA 0 0 0 ...
##  $ divvy_all_trip_cnt_sub_l7         : num  NA NA NA NA NA NA NA 537 509 528 ...
##  $ divvy_all_triptime_mean_sub_l7    : num  NA NA NA NA NA ...
##  $ divvy_all_triptime_med_sub_l7     : num  NA NA NA NA NA ...
##  $ divvy_mindist_trip_cnt_cus_l7     : num  NA NA NA NA NA NA NA 0 0 0 ...
##  $ divvy_mindist_trip_cnt_dep_l7     : num  NA NA NA NA NA NA NA 0 0 0 ...
##  $ divvy_mindist_trip_cnt_sub_l7     : num  NA NA NA NA NA NA NA 0 0 0 ...
##  $ divvy_mindist_triptime_mean_cus_l7: num  NA NA NA NA NA NA NA 0 0 0 ...
##  $ divvy_mindist_triptime_mean_dep_l7: num  NA NA NA NA NA NA NA 0 0 0 ...
##  $ divvy_mindist_triptime_mean_sub_l7: num  NA NA NA NA NA NA NA 0 0 0 ...
## 40380
## 'data.frame':    1645 obs. of  42 variables:
##  $ divvy_pt5mi_trip_cnt_cus          : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_pt5mi_triptime_mean_cus     : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_pt5mi_trip_cnt_dep          : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_pt5mi_triptime_mean_dep     : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_pt5mi_trip_cnt_sub          : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_pt5mi_triptime_mean_sub     : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_all_trip_cnt_cus            : num  1022 599 479 2652 2071 ...
##  $ divvy_all_triptime_mean_cus       : num  2109 2071 2292 2597 2492 ...
##  $ divvy_all_triptime_med_cus        : num  1262 1197 1143 1469 1559 ...
##  $ divvy_all_trip_cnt_dep            : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_all_triptime_mean_dep       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_all_triptime_med_dep        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_all_trip_cnt_sub            : num  537 509 528 304 392 260 286 503 702 751 ...
##  $ divvy_all_triptime_mean_sub       : num  1050 1230 958 1009 844 ...
##  $ divvy_all_triptime_med_sub        : num  639 584 620 804 724 ...
##  $ divvy_mindist_trip_cnt_cus        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_mindist_trip_cnt_dep        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_mindist_trip_cnt_sub        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_mindist_triptime_mean_cus   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_mindist_triptime_mean_dep   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_mindist_triptime_mean_sub   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_pt5mi_trip_cnt_cus_l7       : num  NA NA NA NA NA NA NA 0 0 0 ...
##  $ divvy_pt5mi_triptime_mean_cus_l7  : num  NA NA NA NA NA NA NA 0 0 0 ...
##  $ divvy_pt5mi_trip_cnt_dep_l7       : num  NA NA NA NA NA NA NA 0 0 0 ...
##  $ divvy_pt5mi_triptime_mean_dep_l7  : num  NA NA NA NA NA NA NA 0 0 0 ...
##  $ divvy_pt5mi_trip_cnt_sub_l7       : num  NA NA NA NA NA NA NA 0 0 0 ...
##  $ divvy_pt5mi_triptime_mean_sub_l7  : num  NA NA NA NA NA NA NA 0 0 0 ...
##  $ divvy_all_trip_cnt_cus_l7         : num  NA NA NA NA NA ...
##  $ divvy_all_triptime_mean_cus_l7    : num  NA NA NA NA NA ...
##  $ divvy_all_triptime_med_cus_l7     : num  NA NA NA NA NA ...
##  $ divvy_all_trip_cnt_dep_l7         : num  NA NA NA NA NA NA NA 0 0 0 ...
##  $ divvy_all_triptime_mean_dep_l7    : num  NA NA NA NA NA NA NA 0 0 0 ...
##  $ divvy_all_triptime_med_dep_l7     : num  NA NA NA NA NA NA NA 0 0 0 ...
##  $ divvy_all_trip_cnt_sub_l7         : num  NA NA NA NA NA NA NA 537 509 528 ...
##  $ divvy_all_triptime_mean_sub_l7    : num  NA NA NA NA NA ...
##  $ divvy_all_triptime_med_sub_l7     : num  NA NA NA NA NA ...
##  $ divvy_mindist_trip_cnt_cus_l7     : num  NA NA NA NA NA NA NA 0 0 0 ...
##  $ divvy_mindist_trip_cnt_dep_l7     : num  NA NA NA NA NA NA NA 0 0 0 ...
##  $ divvy_mindist_trip_cnt_sub_l7     : num  NA NA NA NA NA NA NA 0 0 0 ...
##  $ divvy_mindist_triptime_mean_cus_l7: num  NA NA NA NA NA NA NA 0 0 0 ...
##  $ divvy_mindist_triptime_mean_dep_l7: num  NA NA NA NA NA NA NA 0 0 0 ...
##  $ divvy_mindist_triptime_mean_sub_l7: num  NA NA NA NA NA NA NA 0 0 0 ...
## 41660
## 'data.frame':    1645 obs. of  42 variables:
##  $ divvy_pt5mi_trip_cnt_cus          : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_pt5mi_triptime_mean_cus     : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_pt5mi_trip_cnt_dep          : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_pt5mi_triptime_mean_dep     : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_pt5mi_trip_cnt_sub          : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_pt5mi_triptime_mean_sub     : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_all_trip_cnt_cus            : num  1022 599 479 2652 2071 ...
##  $ divvy_all_triptime_mean_cus       : num  2109 2071 2292 2597 2492 ...
##  $ divvy_all_triptime_med_cus        : num  1262 1197 1143 1469 1559 ...
##  $ divvy_all_trip_cnt_dep            : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_all_triptime_mean_dep       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_all_triptime_med_dep        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_all_trip_cnt_sub            : num  537 509 528 304 392 260 286 503 702 751 ...
##  $ divvy_all_triptime_mean_sub       : num  1050 1230 958 1009 844 ...
##  $ divvy_all_triptime_med_sub        : num  639 584 620 804 724 ...
##  $ divvy_mindist_trip_cnt_cus        : num  16 15 14 52 53 49 21 11 14 16 ...
##  $ divvy_mindist_trip_cnt_dep        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_mindist_trip_cnt_sub        : num  8 8 6 5 4 3 7 11 13 15 ...
##  $ divvy_mindist_triptime_mean_cus   : num  1132 1033 1607 2678 3116 ...
##  $ divvy_mindist_triptime_mean_dep   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_mindist_triptime_mean_sub   : num  399 592 710 646 692 ...
##  $ divvy_pt5mi_trip_cnt_cus_l7       : num  NA NA NA NA NA NA NA 0 0 0 ...
##  $ divvy_pt5mi_triptime_mean_cus_l7  : num  NA NA NA NA NA NA NA 0 0 0 ...
##  $ divvy_pt5mi_trip_cnt_dep_l7       : num  NA NA NA NA NA NA NA 0 0 0 ...
##  $ divvy_pt5mi_triptime_mean_dep_l7  : num  NA NA NA NA NA NA NA 0 0 0 ...
##  $ divvy_pt5mi_trip_cnt_sub_l7       : num  NA NA NA NA NA NA NA 0 0 0 ...
##  $ divvy_pt5mi_triptime_mean_sub_l7  : num  NA NA NA NA NA NA NA 0 0 0 ...
##  $ divvy_all_trip_cnt_cus_l7         : num  NA NA NA NA NA ...
##  $ divvy_all_triptime_mean_cus_l7    : num  NA NA NA NA NA ...
##  $ divvy_all_triptime_med_cus_l7     : num  NA NA NA NA NA ...
##  $ divvy_all_trip_cnt_dep_l7         : num  NA NA NA NA NA NA NA 0 0 0 ...
##  $ divvy_all_triptime_mean_dep_l7    : num  NA NA NA NA NA NA NA 0 0 0 ...
##  $ divvy_all_triptime_med_dep_l7     : num  NA NA NA NA NA NA NA 0 0 0 ...
##  $ divvy_all_trip_cnt_sub_l7         : num  NA NA NA NA NA NA NA 537 509 528 ...
##  $ divvy_all_triptime_mean_sub_l7    : num  NA NA NA NA NA ...
##  $ divvy_all_triptime_med_sub_l7     : num  NA NA NA NA NA ...
##  $ divvy_mindist_trip_cnt_cus_l7     : num  NA NA NA NA NA NA NA 16 15 14 ...
##  $ divvy_mindist_trip_cnt_dep_l7     : num  NA NA NA NA NA NA NA 0 0 0 ...
##  $ divvy_mindist_trip_cnt_sub_l7     : num  NA NA NA NA NA NA NA 8 8 6 ...
##  $ divvy_mindist_triptime_mean_cus_l7: num  NA NA NA NA NA ...
##  $ divvy_mindist_triptime_mean_dep_l7: num  NA NA NA NA NA NA NA 0 0 0 ...
##  $ divvy_mindist_triptime_mean_sub_l7: num  NA NA NA NA NA ...
## $`40600`
## NULL
## 
## $`41140`
## NULL
## 
## $`40120`
## NULL
## 
## $`40910`
## NULL
## 
## $`40380`
## NULL
## 
## $`41660`
## NULL
# divvy_vars_lag7 %>% map(~ View(.x))

As a check, here I investigate just how accurate using a 7-day lag is for each variable.

First I’ll create the function func_sum_stats_by_col to calculate the accuracy statistics for each variable-pair (e.g., divvy_pt5mi_trip_cnt_cus and divvy_pt5mi_trip_cnt_cus_l7).

func_sum_stats_by_col <-
  function(data) {
    dat = data[8:nrow(data), ]
    
    preProcValues = preProcess(dat, method = c("center", "scale"))
    dat_cent_scale = predict(preProcValues, dat)
    
    col_pos = seq(1:21)
    
    sum_stats =
      pmap(.l = list(a = col_pos),
           .f = function(a) {
             
             colnames(dat_cent_scale)[a] = "obs"
             colnames(dat_cent_scale)[a + 21] = "pred"
             
             stats = defaultSummary(dat_cent_scale)
             
             return(stats)
             }
           )
    
    return(sum_stats)
    }

Now I simply use the func_sum_stats_by_col function for data on each value of el_stop_id, then turn the results into a dataframe for easier understanding.

acrcy_stats_lags_df <-
  pmap(.l = list(a = divvy_vars_lag7),
       .f = function(a) {
         res = func_sum_stats_by_col(a)
         
         names(res) = names(a)[1:21]
         
         res_df = bind_rows(res) %>% 
           t() %>% 
           as.data.frame() %>% 
           rename(RMSE = V1,
                  rSqrd = V2,
                  MAE = V3
                  ) %>% 
           rownames_to_column(var = "var_og")
         
         return(res_df)
         }
       )

# View(acrcy_stats_lags_df$`40600`)


rm(func_sum_stats_by_col)

Now I just put all of the six datasets together as one, and then calculate some summary statistics.

Overall, using a lag of 7 has varying results depending on the variable. For example, using the median of the six MAE values, divvy_pt5mi_triptime_mean_cus is fairly accurate, whereas divvy_all_triptime_mean_cus. Interestingly, all of the values for ...pt5mi... are the most accurate. Hopefully, this accuracy will be sufficient for predicing el_rides.

acrcy_stats_lag_1df <-
  bind_rows(acrcy_stats_lags_df,
            .id = "el_stop_id"
            ) %>% 
  arrange(var_og,
          MAE,
          el_stop_id
          )

# View(acrcy_stats_lag_1df)


acrcy_stats_sum_stats <-
  acrcy_stats_lag_1df %>% 
  group_by(var_og) %>% 
  summarise(rmse_mean = mean(RMSE, na.rm = TRUE),
            rmse_med = median(RMSE, na.rm = TRUE),
            rsqrd_mean = mean(rSqrd, na.rm = TRUE),
            rsqrd_med = median(rSqrd, na.rm = TRUE),
            mae_mean = mean(MAE, na.rm = TRUE),
            mae_med = median(MAE, na.rm = TRUE)
            )

acrcy_stats_sum_stats %>% 
  arrange(mae_med)
# View(acrcy_stats_sum_stats %>% 
#        arrange(mae_med)
#      )


rm(acrcy_stats_lags_df, acrcy_stats_lag_1df, acrcy_stats_sum_stats)

Now I simply add the lagged variables back into the “main” datasets in place of the un-lagged variables.

# get the lagged variables
divvy_vars_lag7_lag_vars <-
  divvy_vars_lag7 %>% 
  map(~ select(.x,
               matches("_l7$")
               )
      )


# then bind them to the "main" data
six_stations_divvy_data_lags <-
  pmap(.l = list(a = six_stations_data,
                 b = divvy_vars_lag7_lag_vars
                 ),
       .f = function(a, b) {
         new_dat = a %>% 
           select(-matches(regex_criteria)
                  ) %>% 
           bind_cols(b)
         
         return(new_dat)
         }
       )

names(six_stations_divvy_data_lags) <- names(divvy_vars_lag7_lag_vars)


ncol(remove_prcp_vars)
## [1] 40
ncol(six_stations_divvy_data_lags$`41140`)
## [1] 40
colnames(six_stations_divvy_data_lags$`41140`)
##  [1] "el_date"                           
##  [2] "el_stop_id"                        
##  [3] "divvy_pt5mi_stn_cnt"               
##  [4] "divvy_mindist_miles"               
##  [5] "el_rides"                          
##  [6] "holiday_name"                      
##  [7] "holiday_comment"                   
##  [8] "holiday"                           
##  [9] "year"                              
## [10] "half"                              
## [11] "quarter"                           
## [12] "month"                             
## [13] "day"                               
## [14] "wday.lbl"                          
## [15] "mweek"                             
## [16] "tmin_bands"                        
## [17] "tmax_bands"                        
## [18] "tmin_bands_l7"                     
## [19] "tmax_bands_l7"                     
## [20] "divvy_pt5mi_trip_cnt_cus_l7"       
## [21] "divvy_pt5mi_triptime_mean_cus_l7"  
## [22] "divvy_pt5mi_trip_cnt_dep_l7"       
## [23] "divvy_pt5mi_triptime_mean_dep_l7"  
## [24] "divvy_pt5mi_trip_cnt_sub_l7"       
## [25] "divvy_pt5mi_triptime_mean_sub_l7"  
## [26] "divvy_all_trip_cnt_cus_l7"         
## [27] "divvy_all_triptime_mean_cus_l7"    
## [28] "divvy_all_triptime_med_cus_l7"     
## [29] "divvy_all_trip_cnt_dep_l7"         
## [30] "divvy_all_triptime_mean_dep_l7"    
## [31] "divvy_all_triptime_med_dep_l7"     
## [32] "divvy_all_trip_cnt_sub_l7"         
## [33] "divvy_all_triptime_mean_sub_l7"    
## [34] "divvy_all_triptime_med_sub_l7"     
## [35] "divvy_mindist_trip_cnt_cus_l7"     
## [36] "divvy_mindist_trip_cnt_dep_l7"     
## [37] "divvy_mindist_trip_cnt_sub_l7"     
## [38] "divvy_mindist_triptime_mean_cus_l7"
## [39] "divvy_mindist_triptime_mean_dep_l7"
## [40] "divvy_mindist_triptime_mean_sub_l7"
# View(six_stations_divvy_data_lags$`41140`)


rm(remove_prcp_vars, divvy_vars_lag7, divvy_vars_lag7_lag_vars, six_stations_data, regex_criteria)

Holidays

Here I simply take the 7-day lag of the holiday-related variables, and turn relace NA values in holiday_name with a factor level for “not a holiday”.

six_stations_all_lags <-
  pmap(.l = list(a = six_stations_divvy_data_lags),
       .f = function(a) {
         dat = a %>% 
           mutate(holiday_name = if_else(is.na(holiday_name),
                                         "--Not_Holiday--",
                                         as.character(holiday_name)
                                         ) %>% factor,
                  holiday_name_l7 = dplyr::lag(holiday_name, n = 7),
                  holiday_comment_l7 = dplyr::lag(holiday_comment, n = 7),
                  holiday_l7 = dplyr::lag(holiday, n = 7)
                   )
         
         # dat = dat[8:nrow(dat), ]
         
         return(dat)
         }
      )


str(six_stations_all_lags$`41140`)
## 'data.frame':    1645 obs. of  43 variables:
##  $ el_date                           : Date, format: "2013-07-01" "2013-07-02" ...
##  $ el_stop_id                        : num  41140 41140 41140 41140 41140 ...
##  $ divvy_pt5mi_stn_cnt               : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_mindist_miles               : num  1.49 1.49 1.49 1.49 1.49 ...
##  $ el_rides                          : num  1179 1048 1177 687 951 ...
##  $ holiday_name                      : Factor w/ 22 levels "--Not_Holiday--",..: 1 1 1 10 1 1 1 1 1 1 ...
##  $ holiday_comment                   : chr  NA NA NA "" ...
##  $ holiday                           : logi  FALSE FALSE FALSE TRUE FALSE FALSE ...
##  $ year                              : Factor w/ 6 levels "2013","2014",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ half                              : Factor w/ 2 levels "1","2": 2 2 2 2 2 2 2 2 2 2 ...
##  $ quarter                           : Factor w/ 4 levels "1","2","3","4": 3 3 3 3 3 3 3 3 3 3 ...
##  $ month                             : Factor w/ 12 levels "1","2","3","4",..: 7 7 7 7 7 7 7 7 7 7 ...
##  $ day                               : Factor w/ 31 levels "1","2","3","4",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ wday.lbl                          : Ord.factor w/ 7 levels "Sunday"<"Monday"<..: 2 3 4 5 6 7 1 2 3 4 ...
##  $ mweek                             : Factor w/ 6 levels "1","2","3","4",..: 6 1 1 1 1 1 1 2 2 2 ...
##  $ tmin_bands                        : Factor w/ 5 levels "02_-25to00","03_00to25",..: 4 4 4 4 4 4 4 4 5 4 ...
##  $ tmax_bands                        : Factor w/ 4 levels "02_00to25","03_25to50",..: 4 3 3 4 4 4 4 4 4 4 ...
##  $ tmin_bands_l7                     : Factor w/ 5 levels "02_-25to00","03_00to25",..: 4 4 4 4 4 4 4 4 5 4 ...
##  $ tmax_bands_l7                     : Factor w/ 4 levels "02_00to25","03_25to50",..: 4 3 3 4 4 4 4 4 4 4 ...
##  $ divvy_pt5mi_trip_cnt_cus_l7       : num  NA NA NA NA NA NA NA 0 0 0 ...
##  $ divvy_pt5mi_triptime_mean_cus_l7  : num  NA NA NA NA NA NA NA 0 0 0 ...
##  $ divvy_pt5mi_trip_cnt_dep_l7       : num  NA NA NA NA NA NA NA 0 0 0 ...
##  $ divvy_pt5mi_triptime_mean_dep_l7  : num  NA NA NA NA NA NA NA 0 0 0 ...
##  $ divvy_pt5mi_trip_cnt_sub_l7       : num  NA NA NA NA NA NA NA 0 0 0 ...
##  $ divvy_pt5mi_triptime_mean_sub_l7  : num  NA NA NA NA NA NA NA 0 0 0 ...
##  $ divvy_all_trip_cnt_cus_l7         : num  NA NA NA NA NA ...
##  $ divvy_all_triptime_mean_cus_l7    : num  NA NA NA NA NA ...
##  $ divvy_all_triptime_med_cus_l7     : num  NA NA NA NA NA ...
##  $ divvy_all_trip_cnt_dep_l7         : num  NA NA NA NA NA NA NA 0 0 0 ...
##  $ divvy_all_triptime_mean_dep_l7    : num  NA NA NA NA NA NA NA 0 0 0 ...
##  $ divvy_all_triptime_med_dep_l7     : num  NA NA NA NA NA NA NA 0 0 0 ...
##  $ divvy_all_trip_cnt_sub_l7         : num  NA NA NA NA NA NA NA 537 509 528 ...
##  $ divvy_all_triptime_mean_sub_l7    : num  NA NA NA NA NA ...
##  $ divvy_all_triptime_med_sub_l7     : num  NA NA NA NA NA ...
##  $ divvy_mindist_trip_cnt_cus_l7     : num  NA NA NA NA NA NA NA 0 0 0 ...
##  $ divvy_mindist_trip_cnt_dep_l7     : num  NA NA NA NA NA NA NA 0 0 0 ...
##  $ divvy_mindist_trip_cnt_sub_l7     : num  NA NA NA NA NA NA NA 0 0 0 ...
##  $ divvy_mindist_triptime_mean_cus_l7: num  NA NA NA NA NA NA NA 0 0 0 ...
##  $ divvy_mindist_triptime_mean_dep_l7: num  NA NA NA NA NA NA NA 0 0 0 ...
##  $ divvy_mindist_triptime_mean_sub_l7: num  NA NA NA NA NA NA NA 0 0 0 ...
##  $ holiday_name_l7                   : Factor w/ 22 levels "--Not_Holiday--",..: NA NA NA NA NA NA NA 1 1 1 ...
##  $ holiday_comment_l7                : chr  NA NA NA NA ...
##  $ holiday_l7                        : logi  NA NA NA NA NA NA ...
# View(six_stations_all_lags$`41140`)


rm(six_stations_divvy_data_lags, el_stop_id_6_values)

El Rides

Determining the optimal number of lags of el_ridesto include (based on the ACF).

# function to measure ACF for each lag
func_tidy_acf <- function(data, value, lags = 0:(7 * 8) # 7 days * 8 weeks as the default
                     ) {
  value_expr = enquo(value)
  
  acf_values = data %>% 
    pull(!!value_expr) %>% 
    acf(lag.max = tail(lags, 1),
        plot = FALSE
        ) %>% 
    .$acf %>% 
    .[,,1]
  
  ret = tibble(acf = acf_values) %>% 
    rowid_to_column(var = "lag") %>% 
    mutate(lag = lag - 1) %>% 
    filter(lag %in% lags)
  
  return(ret)
}



max_lag <- 7 * 52 * 2 # 7 days * 52 weeks * 2 years

pmap(.l = list(a = six_stations_all_lags,
               b = names(six_stations_all_lags)
               ),
     .f = function(a, b) {
       func_tidy_acf(a,
                     value = el_rides,
                     lags = 0:max_lag
                     ) %>% 
         # limited in this way as lags of 7n appear to be the most relevant
         filter(lag == 0 |
                  lag %% 7 == 0
                ) %>%
         mutate(url = b)
       }
     )
## $`40600`
## # A tibble: 105 x 3
##      lag   acf url  
##    <dbl> <dbl> <chr>
##  1     0 1     40600
##  2     7 0.846 40600
##  3    14 0.826 40600
##  4    21 0.806 40600
##  5    28 0.805 40600
##  6    35 0.797 40600
##  7    42 0.780 40600
##  8    49 0.757 40600
##  9    56 0.746 40600
## 10    63 0.729 40600
## # ... with 95 more rows
## 
## $`41140`
## # A tibble: 105 x 3
##      lag   acf url  
##    <dbl> <dbl> <chr>
##  1     0 1.000 41140
##  2     7 0.877 41140
##  3    14 0.850 41140
##  4    21 0.819 41140
##  5    28 0.797 41140
##  6    35 0.777 41140
##  7    42 0.741 41140
##  8    49 0.704 41140
##  9    56 0.676 41140
## 10    63 0.635 41140
## # ... with 95 more rows
## 
## $`40120`
## # A tibble: 105 x 3
##      lag   acf url  
##    <dbl> <dbl> <chr>
##  1     0 1     40120
##  2     7 0.857 40120
##  3    14 0.833 40120
##  4    21 0.812 40120
##  5    28 0.816 40120
##  6    35 0.811 40120
##  7    42 0.785 40120
##  8    49 0.768 40120
##  9    56 0.752 40120
## 10    63 0.738 40120
## # ... with 95 more rows
## 
## $`40910`
## # A tibble: 105 x 3
##      lag   acf url  
##    <dbl> <dbl> <chr>
##  1     0 1     40910
##  2     7 0.851 40910
##  3    14 0.793 40910
##  4    21 0.734 40910
##  5    28 0.691 40910
##  6    35 0.640 40910
##  7    42 0.592 40910
##  8    49 0.535 40910
##  9    56 0.482 40910
## 10    63 0.427 40910
## # ... with 95 more rows
## 
## $`40380`
## # A tibble: 105 x 3
##      lag   acf url  
##    <dbl> <dbl> <chr>
##  1     0 1     40380
##  2     7 0.874 40380
##  3    14 0.853 40380
##  4    21 0.841 40380
##  5    28 0.854 40380
##  6    35 0.854 40380
##  7    42 0.830 40380
##  8    49 0.823 40380
##  9    56 0.815 40380
## 10    63 0.810 40380
## # ... with 95 more rows
## 
## $`41660`
## # A tibble: 105 x 3
##      lag   acf url  
##    <dbl> <dbl> <chr>
##  1     0 1     41660
##  2     7 0.810 41660
##  3    14 0.783 41660
##  4    21 0.764 41660
##  5    28 0.783 41660
##  6    35 0.776 41660
##  7    42 0.754 41660
##  8    49 0.744 41660
##  9    56 0.727 41660
## 10    63 0.720 41660
## # ... with 95 more rows
pmap(.l = list(a = six_stations_all_lags,
               b = names(six_stations_all_lags)
               ),
     .f = function(a, b) {
       func_tidy_acf(a,
                     value = el_rides,
                     lags = 0:max_lag
                     ) %>% 
         # limited in this way as lags of 7n appear to be the most relevant
         filter(lag == 0 |
                  lag %% 7 == 0
                ) %>%
         ggplot(aes(x = lag,
                    y = acf
                    )
                ) +
         geom_segment(aes(xend = lag,
                          yend = 0
                          ),
                      color = "blue"
                      ) +
         geom_vline(xintercept = 7,
                    size = 2,
                    color = "red"
                    ) +
         annotate("text", label = "1 Week Mark",
                  x = 18,
                  y = 0.9,
                  color = "red",
                  size = 3,
                  hjust = 0
                  ) +
         labs(title = paste0("el_stop_id: ", b),
              subtitle = "ACF (lags of 7): el_rides") +
         theme_minimal() +
         NULL
       }
     )
## $`40600`

## 
## $`41140`

## 
## $`40120`

## 
## $`40910`

## 
## $`40380`

## 
## $`41660`

optimal_lag_setting <-
  six_stations_all_lags %>% 
  map(~ func_tidy_acf(.x,
                      value = el_rides,
                      lags = 0:max_lag
                      ) %>% 
        filter(lag != 0) %>% 
        top_n(n = 4, wt = acf) %>% 
        arrange(desc(acf)) %>% 
        # filter(acf == max(acf)
        #        ) %>% 
        pull(lag)
      )

optimal_lag_setting
## $`40600`
## [1]  7 14 21 28
## 
## $`41140`
## [1]  7 14 21 28
## 
## $`40120`
## [1]  7 14 28 21
## 
## $`40910`
## [1]  7 14 21  1
## 
## $`40380`
## [1]  7 28 35 14
## 
## $`41660`
## [1]  7 14 28 35
rm(max_lag, optimal_lag_setting, func_tidy_acf)

The optimal lags are slightly different for each el_stop_id, but adding lags of 7, 14, 21, and 28 covers almost all of the top for lags for each el_stop_id value.

el_rides_lags <-
  six_stations_all_lags %>% 
  map(~ mutate(.x,
               el_rides_l07 = dplyr::lag(el_rides, n = 7),
               el_rides_l14 = dplyr::lag(el_rides, n = 14),
               el_rides_l21 = dplyr::lag(el_rides, n = 21),
               el_rides_l28 = dplyr::lag(el_rides, n = 28),
               )
      )

message("el_rides_lags")
## el_rides_lags
str(el_rides_lags$`41140`)
## 'data.frame':    1645 obs. of  47 variables:
##  $ el_date                           : Date, format: "2013-07-01" "2013-07-02" ...
##  $ el_stop_id                        : num  41140 41140 41140 41140 41140 ...
##  $ divvy_pt5mi_stn_cnt               : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_mindist_miles               : num  1.49 1.49 1.49 1.49 1.49 ...
##  $ el_rides                          : num  1179 1048 1177 687 951 ...
##  $ holiday_name                      : Factor w/ 22 levels "--Not_Holiday--",..: 1 1 1 10 1 1 1 1 1 1 ...
##  $ holiday_comment                   : chr  NA NA NA "" ...
##  $ holiday                           : logi  FALSE FALSE FALSE TRUE FALSE FALSE ...
##  $ year                              : Factor w/ 6 levels "2013","2014",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ half                              : Factor w/ 2 levels "1","2": 2 2 2 2 2 2 2 2 2 2 ...
##  $ quarter                           : Factor w/ 4 levels "1","2","3","4": 3 3 3 3 3 3 3 3 3 3 ...
##  $ month                             : Factor w/ 12 levels "1","2","3","4",..: 7 7 7 7 7 7 7 7 7 7 ...
##  $ day                               : Factor w/ 31 levels "1","2","3","4",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ wday.lbl                          : Ord.factor w/ 7 levels "Sunday"<"Monday"<..: 2 3 4 5 6 7 1 2 3 4 ...
##  $ mweek                             : Factor w/ 6 levels "1","2","3","4",..: 6 1 1 1 1 1 1 2 2 2 ...
##  $ tmin_bands                        : Factor w/ 5 levels "02_-25to00","03_00to25",..: 4 4 4 4 4 4 4 4 5 4 ...
##  $ tmax_bands                        : Factor w/ 4 levels "02_00to25","03_25to50",..: 4 3 3 4 4 4 4 4 4 4 ...
##  $ tmin_bands_l7                     : Factor w/ 5 levels "02_-25to00","03_00to25",..: 4 4 4 4 4 4 4 4 5 4 ...
##  $ tmax_bands_l7                     : Factor w/ 4 levels "02_00to25","03_25to50",..: 4 3 3 4 4 4 4 4 4 4 ...
##  $ divvy_pt5mi_trip_cnt_cus_l7       : num  NA NA NA NA NA NA NA 0 0 0 ...
##  $ divvy_pt5mi_triptime_mean_cus_l7  : num  NA NA NA NA NA NA NA 0 0 0 ...
##  $ divvy_pt5mi_trip_cnt_dep_l7       : num  NA NA NA NA NA NA NA 0 0 0 ...
##  $ divvy_pt5mi_triptime_mean_dep_l7  : num  NA NA NA NA NA NA NA 0 0 0 ...
##  $ divvy_pt5mi_trip_cnt_sub_l7       : num  NA NA NA NA NA NA NA 0 0 0 ...
##  $ divvy_pt5mi_triptime_mean_sub_l7  : num  NA NA NA NA NA NA NA 0 0 0 ...
##  $ divvy_all_trip_cnt_cus_l7         : num  NA NA NA NA NA ...
##  $ divvy_all_triptime_mean_cus_l7    : num  NA NA NA NA NA ...
##  $ divvy_all_triptime_med_cus_l7     : num  NA NA NA NA NA ...
##  $ divvy_all_trip_cnt_dep_l7         : num  NA NA NA NA NA NA NA 0 0 0 ...
##  $ divvy_all_triptime_mean_dep_l7    : num  NA NA NA NA NA NA NA 0 0 0 ...
##  $ divvy_all_triptime_med_dep_l7     : num  NA NA NA NA NA NA NA 0 0 0 ...
##  $ divvy_all_trip_cnt_sub_l7         : num  NA NA NA NA NA NA NA 537 509 528 ...
##  $ divvy_all_triptime_mean_sub_l7    : num  NA NA NA NA NA ...
##  $ divvy_all_triptime_med_sub_l7     : num  NA NA NA NA NA ...
##  $ divvy_mindist_trip_cnt_cus_l7     : num  NA NA NA NA NA NA NA 0 0 0 ...
##  $ divvy_mindist_trip_cnt_dep_l7     : num  NA NA NA NA NA NA NA 0 0 0 ...
##  $ divvy_mindist_trip_cnt_sub_l7     : num  NA NA NA NA NA NA NA 0 0 0 ...
##  $ divvy_mindist_triptime_mean_cus_l7: num  NA NA NA NA NA NA NA 0 0 0 ...
##  $ divvy_mindist_triptime_mean_dep_l7: num  NA NA NA NA NA NA NA 0 0 0 ...
##  $ divvy_mindist_triptime_mean_sub_l7: num  NA NA NA NA NA NA NA 0 0 0 ...
##  $ holiday_name_l7                   : Factor w/ 22 levels "--Not_Holiday--",..: NA NA NA NA NA NA NA 1 1 1 ...
##  $ holiday_comment_l7                : chr  NA NA NA NA ...
##  $ holiday_l7                        : logi  NA NA NA NA NA NA ...
##  $ el_rides_l07                      : num  NA NA NA NA NA ...
##  $ el_rides_l14                      : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ el_rides_l21                      : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ el_rides_l28                      : num  NA NA NA NA NA NA NA NA NA NA ...
# View(el_rides_lags$`41140`)


rm(six_stations_all_lags)

Similarly, here I’ll take rolling averages of el_rides with windows of 7, 14, 21, and 28 days.

el_rides_ma_values <-
  el_rides_lags %>% 
  map(~ mutate(.x,
               el_rides_ma07 =
                 rollapply(data = el_rides,
                           width = 7,
                           mean,
                           na.rm = TRUE,
                           align = "right",
                           partial = FALSE,
                           fill = NA
                           ),
               el_rides_ma14 =
                 rollapply(data = el_rides,
                           width = 14,
                           mean,
                           na.rm = TRUE,
                           align = "right",
                           partial = FALSE,
                           fill = NA
                           ),
               el_rides_ma21 =
                 rollapply(data = el_rides,
                           width = 21,
                           mean,
                           na.rm = TRUE,
                           align = "right",
                           partial = FALSE,
                           fill = NA
                           ),
               el_rides_ma28 =
                 rollapply(data = el_rides,
                           width = 28,
                           mean,
                           na.rm = TRUE,
                           align = "right",
                           partial = FALSE,
                           fill = NA
                           )
               )
      )

message("el_rides_ma_values")
## el_rides_ma_values
str(el_rides_ma_values$`41140`)
## 'data.frame':    1645 obs. of  51 variables:
##  $ el_date                           : Date, format: "2013-07-01" "2013-07-02" ...
##  $ el_stop_id                        : num  41140 41140 41140 41140 41140 ...
##  $ divvy_pt5mi_stn_cnt               : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_mindist_miles               : num  1.49 1.49 1.49 1.49 1.49 ...
##  $ el_rides                          : num  1179 1048 1177 687 951 ...
##  $ holiday_name                      : Factor w/ 22 levels "--Not_Holiday--",..: 1 1 1 10 1 1 1 1 1 1 ...
##  $ holiday_comment                   : chr  NA NA NA "" ...
##  $ holiday                           : logi  FALSE FALSE FALSE TRUE FALSE FALSE ...
##  $ year                              : Factor w/ 6 levels "2013","2014",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ half                              : Factor w/ 2 levels "1","2": 2 2 2 2 2 2 2 2 2 2 ...
##  $ quarter                           : Factor w/ 4 levels "1","2","3","4": 3 3 3 3 3 3 3 3 3 3 ...
##  $ month                             : Factor w/ 12 levels "1","2","3","4",..: 7 7 7 7 7 7 7 7 7 7 ...
##  $ day                               : Factor w/ 31 levels "1","2","3","4",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ wday.lbl                          : Ord.factor w/ 7 levels "Sunday"<"Monday"<..: 2 3 4 5 6 7 1 2 3 4 ...
##  $ mweek                             : Factor w/ 6 levels "1","2","3","4",..: 6 1 1 1 1 1 1 2 2 2 ...
##  $ tmin_bands                        : Factor w/ 5 levels "02_-25to00","03_00to25",..: 4 4 4 4 4 4 4 4 5 4 ...
##  $ tmax_bands                        : Factor w/ 4 levels "02_00to25","03_25to50",..: 4 3 3 4 4 4 4 4 4 4 ...
##  $ tmin_bands_l7                     : Factor w/ 5 levels "02_-25to00","03_00to25",..: 4 4 4 4 4 4 4 4 5 4 ...
##  $ tmax_bands_l7                     : Factor w/ 4 levels "02_00to25","03_25to50",..: 4 3 3 4 4 4 4 4 4 4 ...
##  $ divvy_pt5mi_trip_cnt_cus_l7       : num  NA NA NA NA NA NA NA 0 0 0 ...
##  $ divvy_pt5mi_triptime_mean_cus_l7  : num  NA NA NA NA NA NA NA 0 0 0 ...
##  $ divvy_pt5mi_trip_cnt_dep_l7       : num  NA NA NA NA NA NA NA 0 0 0 ...
##  $ divvy_pt5mi_triptime_mean_dep_l7  : num  NA NA NA NA NA NA NA 0 0 0 ...
##  $ divvy_pt5mi_trip_cnt_sub_l7       : num  NA NA NA NA NA NA NA 0 0 0 ...
##  $ divvy_pt5mi_triptime_mean_sub_l7  : num  NA NA NA NA NA NA NA 0 0 0 ...
##  $ divvy_all_trip_cnt_cus_l7         : num  NA NA NA NA NA ...
##  $ divvy_all_triptime_mean_cus_l7    : num  NA NA NA NA NA ...
##  $ divvy_all_triptime_med_cus_l7     : num  NA NA NA NA NA ...
##  $ divvy_all_trip_cnt_dep_l7         : num  NA NA NA NA NA NA NA 0 0 0 ...
##  $ divvy_all_triptime_mean_dep_l7    : num  NA NA NA NA NA NA NA 0 0 0 ...
##  $ divvy_all_triptime_med_dep_l7     : num  NA NA NA NA NA NA NA 0 0 0 ...
##  $ divvy_all_trip_cnt_sub_l7         : num  NA NA NA NA NA NA NA 537 509 528 ...
##  $ divvy_all_triptime_mean_sub_l7    : num  NA NA NA NA NA ...
##  $ divvy_all_triptime_med_sub_l7     : num  NA NA NA NA NA ...
##  $ divvy_mindist_trip_cnt_cus_l7     : num  NA NA NA NA NA NA NA 0 0 0 ...
##  $ divvy_mindist_trip_cnt_dep_l7     : num  NA NA NA NA NA NA NA 0 0 0 ...
##  $ divvy_mindist_trip_cnt_sub_l7     : num  NA NA NA NA NA NA NA 0 0 0 ...
##  $ divvy_mindist_triptime_mean_cus_l7: num  NA NA NA NA NA NA NA 0 0 0 ...
##  $ divvy_mindist_triptime_mean_dep_l7: num  NA NA NA NA NA NA NA 0 0 0 ...
##  $ divvy_mindist_triptime_mean_sub_l7: num  NA NA NA NA NA NA NA 0 0 0 ...
##  $ holiday_name_l7                   : Factor w/ 22 levels "--Not_Holiday--",..: NA NA NA NA NA NA NA 1 1 1 ...
##  $ holiday_comment_l7                : chr  NA NA NA NA ...
##  $ holiday_l7                        : logi  NA NA NA NA NA NA ...
##  $ el_rides_l07                      : num  NA NA NA NA NA ...
##  $ el_rides_l14                      : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ el_rides_l21                      : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ el_rides_l28                      : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ el_rides_ma07                     : num  NA NA NA NA NA ...
##  $ el_rides_ma14                     : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ el_rides_ma21                     : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ el_rides_ma28                     : num  NA NA NA NA NA NA NA NA NA NA ...
# View(el_rides_ma_values$`41140`)


rm(el_rides_lags)

As the lags and windows can only start computing values with the 29th value, here I will simply remove the first 28 rows of data (rather than imputing the lag/mov avg values).

remove_first_28na_rows <-
  pmap(.l = list(a = el_rides_ma_values),
       .f = function(a) {
         dat = a[29:nrow(a), ]
         
         return(dat)
         }
       )


message("remove_first_28na_rows")
## remove_first_28na_rows
str(remove_first_28na_rows$`41140`)
## 'data.frame':    1617 obs. of  51 variables:
##  $ el_date                           : Date, format: "2013-07-29" "2013-07-30" ...
##  $ el_stop_id                        : num  41140 41140 41140 41140 41140 ...
##  $ divvy_pt5mi_stn_cnt               : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_mindist_miles               : num  1.49 1.49 1.49 1.49 1.49 ...
##  $ el_rides                          : num  1084 1098 1019 1204 1159 ...
##  $ holiday_name                      : Factor w/ 22 levels "--Not_Holiday--",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ holiday_comment                   : chr  NA NA NA NA ...
##  $ holiday                           : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ year                              : Factor w/ 6 levels "2013","2014",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ half                              : Factor w/ 2 levels "1","2": 2 2 2 2 2 2 2 2 2 2 ...
##  $ quarter                           : Factor w/ 4 levels "1","2","3","4": 3 3 3 3 3 3 3 3 3 3 ...
##  $ month                             : Factor w/ 12 levels "1","2","3","4",..: 7 7 7 8 8 8 8 8 8 8 ...
##  $ day                               : Factor w/ 31 levels "1","2","3","4",..: 29 30 31 1 2 3 4 5 6 7 ...
##  $ wday.lbl                          : Ord.factor w/ 7 levels "Sunday"<"Monday"<..: 2 3 4 5 6 7 1 2 3 4 ...
##  $ mweek                             : Factor w/ 6 levels "1","2","3","4",..: 5 5 5 5 1 1 1 2 2 2 ...
##  $ tmin_bands                        : Factor w/ 5 levels "02_-25to00","03_00to25",..: 4 4 4 4 4 4 4 4 4 4 ...
##  $ tmax_bands                        : Factor w/ 4 levels "02_00to25","03_25to50",..: 4 3 3 4 4 4 4 4 4 4 ...
##  $ tmin_bands_l7                     : Factor w/ 5 levels "02_-25to00","03_00to25",..: 4 4 4 4 4 4 4 4 4 4 ...
##  $ tmax_bands_l7                     : Factor w/ 4 levels "02_00to25","03_25to50",..: 4 3 3 4 4 4 4 4 4 4 ...
##  $ divvy_pt5mi_trip_cnt_cus_l7       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_pt5mi_triptime_mean_cus_l7  : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_pt5mi_trip_cnt_dep_l7       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_pt5mi_triptime_mean_dep_l7  : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_pt5mi_trip_cnt_sub_l7       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_pt5mi_triptime_mean_sub_l7  : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_all_trip_cnt_cus_l7         : num  1705 1492 1472 1585 1166 ...
##  $ divvy_all_triptime_mean_cus_l7    : num  2165 1798 1826 1823 1828 ...
##  $ divvy_all_triptime_med_cus_l7     : num  1441 1274 1218 1278 1240 ...
##  $ divvy_all_trip_cnt_dep_l7         : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_all_triptime_mean_dep_l7    : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_all_triptime_med_dep_l7     : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_all_trip_cnt_sub_l7         : num  1029 1179 1191 1225 991 ...
##  $ divvy_all_triptime_mean_sub_l7    : num  819 746 761 845 704 ...
##  $ divvy_all_triptime_med_sub_l7     : num  635 614 620 617 597 ...
##  $ divvy_mindist_trip_cnt_cus_l7     : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_mindist_trip_cnt_dep_l7     : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_mindist_trip_cnt_sub_l7     : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_mindist_triptime_mean_cus_l7: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_mindist_triptime_mean_dep_l7: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ divvy_mindist_triptime_mean_sub_l7: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ holiday_name_l7                   : Factor w/ 22 levels "--Not_Holiday--",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ holiday_comment_l7                : chr  NA NA NA NA ...
##  $ holiday_l7                        : logi  FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ el_rides_l07                      : num  1140 1043 1081 1084 1094 ...
##  $ el_rides_l14                      : num  1167 1093 1102 1051 1101 ...
##  $ el_rides_l21                      : num  1063 1065 1127 1150 1210 ...
##  $ el_rides_l28                      : num  1179 1048 1177 687 951 ...
##  $ el_rides_ma07                     : num  956 964 955 972 982 ...
##  $ el_rides_ma14                     : num  971 971 965 976 980 ...
##  $ el_rides_ma21                     : num  990 992 987 989 987 ...
##  $ el_rides_ma28                     : num  968 970 965 983 991 ...
# View(remove_first_28na_rows$`41140`)


saveRDS(remove_first_28na_rows,
        paste0(wd,
               "/Data/Interim/",
               "remove_first_28na_rows.Rds"
               )
        )


# remove_first_28na_rows <-
#   readRDS(paste0(wd,
#                  "/Data/Interim/",
#                  "remove_first_28na_rows.Rds"
#                  )
#           )


remove_first_28na_rows %>% 
  map(~ #select(.x,
        #        -matches("holiday")
        #        ) %>% 
        # miss_var_summary()
        miss_var_summary(.x)
      )
## $`40600`
## # A tibble: 51 x 3
##    variable            n_miss pct_miss
##    <chr>                <int>    <dbl>
##  1 holiday_comment_l7    1550     95.9
##  2 holiday_comment       1549     95.8
##  3 el_date                  0      0  
##  4 el_stop_id               0      0  
##  5 divvy_pt5mi_stn_cnt      0      0  
##  6 divvy_mindist_miles      0      0  
##  7 el_rides                 0      0  
##  8 holiday_name             0      0  
##  9 holiday                  0      0  
## 10 year                     0      0  
## # ... with 41 more rows
## 
## $`41140`
## # A tibble: 51 x 3
##    variable            n_miss pct_miss
##    <chr>                <int>    <dbl>
##  1 holiday_comment_l7    1550     95.9
##  2 holiday_comment       1549     95.8
##  3 el_date                  0      0  
##  4 el_stop_id               0      0  
##  5 divvy_pt5mi_stn_cnt      0      0  
##  6 divvy_mindist_miles      0      0  
##  7 el_rides                 0      0  
##  8 holiday_name             0      0  
##  9 holiday                  0      0  
## 10 year                     0      0  
## # ... with 41 more rows
## 
## $`40120`
## # A tibble: 51 x 3
##    variable            n_miss pct_miss
##    <chr>                <int>    <dbl>
##  1 holiday_comment_l7    1550     95.9
##  2 holiday_comment       1549     95.8
##  3 el_date                  0      0  
##  4 el_stop_id               0      0  
##  5 divvy_pt5mi_stn_cnt      0      0  
##  6 divvy_mindist_miles      0      0  
##  7 el_rides                 0      0  
##  8 holiday_name             0      0  
##  9 holiday                  0      0  
## 10 year                     0      0  
## # ... with 41 more rows
## 
## $`40910`
## # A tibble: 51 x 3
##    variable            n_miss pct_miss
##    <chr>                <int>    <dbl>
##  1 holiday_comment_l7    1521     95.8
##  2 holiday_comment       1520     95.8
##  3 el_date                  0      0  
##  4 el_stop_id               0      0  
##  5 divvy_pt5mi_stn_cnt      0      0  
##  6 divvy_mindist_miles      0      0  
##  7 el_rides                 0      0  
##  8 holiday_name             0      0  
##  9 holiday                  0      0  
## 10 year                     0      0  
## # ... with 41 more rows
## 
## $`40380`
## # A tibble: 51 x 3
##    variable            n_miss pct_miss
##    <chr>                <int>    <dbl>
##  1 holiday_comment_l7    1550     95.9
##  2 holiday_comment       1549     95.8
##  3 el_date                  0      0  
##  4 el_stop_id               0      0  
##  5 divvy_pt5mi_stn_cnt      0      0  
##  6 divvy_mindist_miles      0      0  
##  7 el_rides                 0      0  
##  8 holiday_name             0      0  
##  9 holiday                  0      0  
## 10 year                     0      0  
## # ... with 41 more rows
## 
## $`41660`
## # A tibble: 51 x 3
##    variable            n_miss pct_miss
##    <chr>                <int>    <dbl>
##  1 holiday_comment_l7    1550     95.9
##  2 holiday_comment       1549     95.8
##  3 el_date                  0      0  
##  4 el_stop_id               0      0  
##  5 divvy_pt5mi_stn_cnt      0      0  
##  6 divvy_mindist_miles      0      0  
##  7 el_rides                 0      0  
##  8 holiday_name             0      0  
##  9 holiday                  0      0  
## 10 year                     0      0  
## # ... with 41 more rows
rm(el_rides_ma_values)