Chicago Crash Data Analysis

### Made some changes in variable selection. Otherwise complete replicate of
### https://juliasilge.com/blog/chicago-traffic-model/

library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.3     v purrr   0.3.4
## v tibble  3.1.0     v dplyr   1.0.2
## v tidyr   1.1.2     v stringr 1.4.0
## v readr   1.4.0     v forcats 0.5.0
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
library(RSocrata)
library(data.table)
## 
## Attaching package: 'data.table'
## The following objects are masked from 'package:lubridate':
## 
##     hour, isoweek, mday, minute, month, quarter, second, wday, week,
##     yday, year
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
## The following object is masked from 'package:purrr':
## 
##     transpose
library(themis)
## Loading required package: recipes
## 
## Attaching package: 'recipes'
## The following object is masked from 'package:stringr':
## 
##     fixed
## The following object is masked from 'package:stats':
## 
##     step
## Registered S3 methods overwritten by 'themis':
##   method                  from   
##   bake.step_downsample    recipes
##   bake.step_upsample      recipes
##   prep.step_downsample    recipes
##   prep.step_upsample      recipes
##   tidy.step_downsample    recipes
##   tidy.step_upsample      recipes
##   tunable.step_downsample recipes
##   tunable.step_upsample   recipes
## 
## Attaching package: 'themis'
## The following objects are masked from 'package:recipes':
## 
##     step_downsample, step_upsample
library(baguette)
## Loading required package: parsnip
### install.packages("baguette")
library(lubridate)
library(ggplot2)
theme_set(theme_bw(base_size = 16))

years_ago <- today() - years(4)
crash_url <- glue::glue("https://data.cityofchicago.org/Transportation/Traffic-Crashes-Crashes/85ca-t3if?$where=CRASH_DATE > '{years_ago}'")
crash_raw <- as_tibble(read.socrata(crash_url))
names(crash_raw)
##  [1] "crash_record_id"               "rd_no"                        
##  [3] "crash_date_est_i"              "crash_date"                   
##  [5] "posted_speed_limit"            "traffic_control_device"       
##  [7] "device_condition"              "weather_condition"            
##  [9] "lighting_condition"            "first_crash_type"             
## [11] "trafficway_type"               "lane_cnt"                     
## [13] "alignment"                     "roadway_surface_cond"         
## [15] "road_defect"                   "report_type"                  
## [17] "crash_type"                    "intersection_related_i"       
## [19] "private_property_i"            "hit_and_run_i"                
## [21] "damage"                        "date_police_notified"         
## [23] "prim_contributory_cause"       "sec_contributory_cause"       
## [25] "street_no"                     "street_direction"             
## [27] "street_name"                   "beat_of_occurrence"           
## [29] "photos_taken_i"                "statements_taken_i"           
## [31] "dooring_i"                     "work_zone_i"                  
## [33] "work_zone_type"                "workers_present_i"            
## [35] "num_units"                     "most_severe_injury"           
## [37] "injuries_total"                "injuries_fatal"               
## [39] "injuries_incapacitating"       "injuries_non_incapacitating"  
## [41] "injuries_reported_not_evident" "injuries_no_indication"       
## [43] "injuries_unknown"              "crash_hour"                   
## [45] "crash_day_of_week"             "crash_month"                  
## [47] "latitude"                      "longitude"                    
## [49] "location"
crash <- crash_raw %>%
  arrange(desc(crash_date)) %>%
  transmute(
    injuries = if_else(injuries_total > 0, "injuries", "none"),
    crash_date,
    crash_hour,
    report_type = if_else(report_type == "", "UNKNOWN", report_type),
    num_units,
    posted_speed_limit,
    weather_condition,
    lighting_condition,
    roadway_surface_cond,
    lighting_condition,
    alignment,
    crash_type,
    first_crash_type,
    trafficway_type,
    prim_contributory_cause,
    latitude, longitude
  ) %>%
  na.omit()


dim(crash)
## [1] 423119     16
head(crash)
## # A tibble: 6 x 16
##   injuries crash_date          crash_hour report_type num_units posted_speed_li~
##   <chr>    <dttm>                   <int> <chr>           <int>            <int>
## 1 none     2021-05-15 01:24:00          1 NOT ON SCE~         2               30
## 2 none     2021-05-15 01:18:00          1 ON SCENE            3               30
## 3 none     2021-05-15 00:28:00          0 ON SCENE            1               30
## 4 injuries 2021-05-15 00:24:00          0 ON SCENE            2               30
## 5 none     2021-05-14 23:10:00         23 NOT ON SCE~         2               30
## 6 none     2021-05-14 23:06:00         23 ON SCENE            2               30
## # ... with 10 more variables: weather_condition <chr>,
## #   lighting_condition <chr>, roadway_surface_cond <chr>, alignment <chr>,
## #   crash_type <chr>, first_crash_type <chr>, trafficway_type <chr>,
## #   prim_contributory_cause <chr>, latitude <dbl>, longitude <dbl>
crash$year= year(crash$crash_date)
table(crash$year)
## 
##   2017   2018   2019   2020   2021 
##  62047 118245 116574  91273  34980
crash= subset(crash, year==2018| year==2019| year==2020)
dim(crash)
## [1] 326092     17
theme_set(theme_bw(base_size = 18))
crash %>%
  mutate(crash_date = floor_date(crash_date, unit = "week")) %>%
  count(crash_date, injuries) %>%
  filter(
    crash_date != last(crash_date),
    crash_date != first(crash_date)
  ) %>%
  ggplot(aes(crash_date, n, color = injuries)) +
  geom_line(size = 1.5, alpha = 0.7) +
  scale_y_continuous(limits = (c(0, NA))) +
  labs(
    x = NULL, y = "Number of traffic crashes per week",
    color = "Injuries?"
  )

crash %>%
  filter(latitude > 0) %>%
  ggplot(aes(longitude, latitude, color = injuries)) +
  geom_point(size = 0.5, alpha = 0.4) +
  labs(color = NULL) +
  scale_color_manual(values = c("deeppink4", "gray80")) +
  coord_fixed()

names(crash)
##  [1] "injuries"                "crash_date"             
##  [3] "crash_hour"              "report_type"            
##  [5] "num_units"               "posted_speed_limit"     
##  [7] "weather_condition"       "lighting_condition"     
##  [9] "roadway_surface_cond"    "alignment"              
## [11] "crash_type"              "first_crash_type"       
## [13] "trafficway_type"         "prim_contributory_cause"
## [15] "latitude"                "longitude"              
## [17] "year"
crash= crash[, -c(2, 3, 4, 15:17)]
names(crash)
##  [1] "injuries"                "num_units"              
##  [3] "posted_speed_limit"      "weather_condition"      
##  [5] "lighting_condition"      "roadway_surface_cond"   
##  [7] "alignment"               "crash_type"             
##  [9] "first_crash_type"        "trafficway_type"        
## [11] "prim_contributory_cause"
library(tidymodels)
## -- Attaching packages -------------------------------------- tidymodels 0.1.2 --
## v broom     0.7.5     v rsample   0.0.9
## v dials     0.0.9     v tune      0.1.3
## v infer     0.5.4     v workflows 0.2.2
## v modeldata 0.1.0     v yardstick 0.0.8
## -- Conflicts ----------------------------------------- tidymodels_conflicts() --
## x data.table::between()     masks dplyr::between()
## x scales::discard()         masks purrr::discard()
## x dplyr::filter()           masks stats::filter()
## x data.table::first()       masks dplyr::first()
## x recipes::fixed()          masks stringr::fixed()
## x dplyr::lag()              masks stats::lag()
## x data.table::last()        masks dplyr::last()
## x yardstick::spec()         masks readr::spec()
## x recipes::step()           masks stats::step()
## x themis::step_downsample() masks recipes::step_downsample()
## x themis::step_upsample()   masks recipes::step_upsample()
## x data.table::transpose()   masks purrr::transpose()
set.seed(2021)
crash_split <- initial_split(crash, strata = injuries)
crash_train <- training(crash_split)
crash_test <- testing(crash_split)

set.seed(2020)
crash_folds <- vfold_cv(crash_train, strata = injuries)
crash_folds
## #  10-fold cross-validation using stratification 
## # A tibble: 10 x 2
##    splits                 id    
##    <list>                 <chr> 
##  1 <split [220112/24458]> Fold01
##  2 <split [220112/24458]> Fold02
##  3 <split [220113/24457]> Fold03
##  4 <split [220113/24457]> Fold04
##  5 <split [220113/24457]> Fold05
##  6 <split [220113/24457]> Fold06
##  7 <split [220113/24457]> Fold07
##  8 <split [220113/24457]> Fold08
##  9 <split [220114/24456]> Fold09
## 10 <split [220114/24456]> Fold10
crash_rec <- recipe(injuries ~ ., data = crash_train) %>%
  step_other(weather_condition, first_crash_type,
             trafficway_type, prim_contributory_cause,
             other = "OTHER"
  ) %>%
  step_downsample(injuries)

bag_spec <- bag_tree(min_n = 10) %>%
  set_engine("rpart", times = 25) %>%
  set_mode("classification")

crash_wf <- workflow() %>%
  add_recipe(crash_rec) %>%
  add_model(bag_spec)


doParallel::registerDoParallel()
crash_res <- fit_resamples(
  crash_wf,
  crash_folds,
  control = control_resamples(save_pred = TRUE)
)



crash_fit <- last_fit(crash_wf, crash_split)
collect_metrics(crash_fit)
## # A tibble: 2 x 4
##   .metric  .estimator .estimate .config             
##   <chr>    <chr>          <dbl> <chr>               
## 1 accuracy binary         0.878 Preprocessor1_Model1
## 2 roc_auc  binary         0.958 Preprocessor1_Model1
crash_imp <- crash_fit$.workflow[[1]] %>%
  pull_workflow_fit()

crash_imp$fit$imp %>%
  slice_max(value, n = 10) %>%
  ggplot(aes(value, fct_reorder(term, value))) +
  geom_col(alpha = 0.8, fill = "midnightblue") +
  labs(x = "Variable importance score", y = NULL)

collect_predictions(crash_fit) %>%
  roc_curve(injuries, .pred_injuries) %>%
  ggplot(aes(x = 1 - specificity, y = sensitivity)) +
  geom_line(size = 1.5, color = "midnightblue") +
  geom_abline(
    lty = 2, alpha = 0.5,
    color = "gray50",
    size = 1.2
  ) +
  coord_equal()