### Made some changes in variable selection. Otherwise complete replicate of
### https://juliasilge.com/blog/chicago-traffic-model/
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.3 v purrr 0.3.4
## v tibble 3.1.0 v dplyr 1.0.2
## v tidyr 1.1.2 v stringr 1.4.0
## v readr 1.4.0 v forcats 0.5.0
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(RSocrata)
library(data.table)
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:lubridate':
##
## hour, isoweek, mday, minute, month, quarter, second, wday, week,
## yday, year
## The following objects are masked from 'package:dplyr':
##
## between, first, last
## The following object is masked from 'package:purrr':
##
## transpose
## Loading required package: recipes
##
## Attaching package: 'recipes'
## The following object is masked from 'package:stringr':
##
## fixed
## The following object is masked from 'package:stats':
##
## step
## Registered S3 methods overwritten by 'themis':
## method from
## bake.step_downsample recipes
## bake.step_upsample recipes
## prep.step_downsample recipes
## prep.step_upsample recipes
## tidy.step_downsample recipes
## tidy.step_upsample recipes
## tunable.step_downsample recipes
## tunable.step_upsample recipes
##
## Attaching package: 'themis'
## The following objects are masked from 'package:recipes':
##
## step_downsample, step_upsample
## Loading required package: parsnip
### install.packages("baguette")
library(lubridate)
library(ggplot2)
theme_set(theme_bw(base_size = 16))
years_ago <- today() - years(4)
crash_url <- glue::glue("https://data.cityofchicago.org/Transportation/Traffic-Crashes-Crashes/85ca-t3if?$where=CRASH_DATE > '{years_ago}'")
crash_raw <- as_tibble(read.socrata(crash_url))
names(crash_raw)
## [1] "crash_record_id" "rd_no"
## [3] "crash_date_est_i" "crash_date"
## [5] "posted_speed_limit" "traffic_control_device"
## [7] "device_condition" "weather_condition"
## [9] "lighting_condition" "first_crash_type"
## [11] "trafficway_type" "lane_cnt"
## [13] "alignment" "roadway_surface_cond"
## [15] "road_defect" "report_type"
## [17] "crash_type" "intersection_related_i"
## [19] "private_property_i" "hit_and_run_i"
## [21] "damage" "date_police_notified"
## [23] "prim_contributory_cause" "sec_contributory_cause"
## [25] "street_no" "street_direction"
## [27] "street_name" "beat_of_occurrence"
## [29] "photos_taken_i" "statements_taken_i"
## [31] "dooring_i" "work_zone_i"
## [33] "work_zone_type" "workers_present_i"
## [35] "num_units" "most_severe_injury"
## [37] "injuries_total" "injuries_fatal"
## [39] "injuries_incapacitating" "injuries_non_incapacitating"
## [41] "injuries_reported_not_evident" "injuries_no_indication"
## [43] "injuries_unknown" "crash_hour"
## [45] "crash_day_of_week" "crash_month"
## [47] "latitude" "longitude"
## [49] "location"
crash <- crash_raw %>%
arrange(desc(crash_date)) %>%
transmute(
injuries = if_else(injuries_total > 0, "injuries", "none"),
crash_date,
crash_hour,
report_type = if_else(report_type == "", "UNKNOWN", report_type),
num_units,
posted_speed_limit,
weather_condition,
lighting_condition,
roadway_surface_cond,
lighting_condition,
alignment,
crash_type,
first_crash_type,
trafficway_type,
prim_contributory_cause,
latitude, longitude
) %>%
na.omit()
dim(crash)
## [1] 423119 16
## # A tibble: 6 x 16
## injuries crash_date crash_hour report_type num_units posted_speed_li~
## <chr> <dttm> <int> <chr> <int> <int>
## 1 none 2021-05-15 01:24:00 1 NOT ON SCE~ 2 30
## 2 none 2021-05-15 01:18:00 1 ON SCENE 3 30
## 3 none 2021-05-15 00:28:00 0 ON SCENE 1 30
## 4 injuries 2021-05-15 00:24:00 0 ON SCENE 2 30
## 5 none 2021-05-14 23:10:00 23 NOT ON SCE~ 2 30
## 6 none 2021-05-14 23:06:00 23 ON SCENE 2 30
## # ... with 10 more variables: weather_condition <chr>,
## # lighting_condition <chr>, roadway_surface_cond <chr>, alignment <chr>,
## # crash_type <chr>, first_crash_type <chr>, trafficway_type <chr>,
## # prim_contributory_cause <chr>, latitude <dbl>, longitude <dbl>
crash$year= year(crash$crash_date)
table(crash$year)
##
## 2017 2018 2019 2020 2021
## 62047 118245 116574 91273 34980
crash= subset(crash, year==2018| year==2019| year==2020)
dim(crash)
## [1] 326092 17
theme_set(theme_bw(base_size = 18))
crash %>%
mutate(crash_date = floor_date(crash_date, unit = "week")) %>%
count(crash_date, injuries) %>%
filter(
crash_date != last(crash_date),
crash_date != first(crash_date)
) %>%
ggplot(aes(crash_date, n, color = injuries)) +
geom_line(size = 1.5, alpha = 0.7) +
scale_y_continuous(limits = (c(0, NA))) +
labs(
x = NULL, y = "Number of traffic crashes per week",
color = "Injuries?"
)

crash %>%
filter(latitude > 0) %>%
ggplot(aes(longitude, latitude, color = injuries)) +
geom_point(size = 0.5, alpha = 0.4) +
labs(color = NULL) +
scale_color_manual(values = c("deeppink4", "gray80")) +
coord_fixed()

## [1] "injuries" "crash_date"
## [3] "crash_hour" "report_type"
## [5] "num_units" "posted_speed_limit"
## [7] "weather_condition" "lighting_condition"
## [9] "roadway_surface_cond" "alignment"
## [11] "crash_type" "first_crash_type"
## [13] "trafficway_type" "prim_contributory_cause"
## [15] "latitude" "longitude"
## [17] "year"
crash= crash[, -c(2, 3, 4, 15:17)]
names(crash)
## [1] "injuries" "num_units"
## [3] "posted_speed_limit" "weather_condition"
## [5] "lighting_condition" "roadway_surface_cond"
## [7] "alignment" "crash_type"
## [9] "first_crash_type" "trafficway_type"
## [11] "prim_contributory_cause"
## -- Attaching packages -------------------------------------- tidymodels 0.1.2 --
## v broom 0.7.5 v rsample 0.0.9
## v dials 0.0.9 v tune 0.1.3
## v infer 0.5.4 v workflows 0.2.2
## v modeldata 0.1.0 v yardstick 0.0.8
## -- Conflicts ----------------------------------------- tidymodels_conflicts() --
## x data.table::between() masks dplyr::between()
## x scales::discard() masks purrr::discard()
## x dplyr::filter() masks stats::filter()
## x data.table::first() masks dplyr::first()
## x recipes::fixed() masks stringr::fixed()
## x dplyr::lag() masks stats::lag()
## x data.table::last() masks dplyr::last()
## x yardstick::spec() masks readr::spec()
## x recipes::step() masks stats::step()
## x themis::step_downsample() masks recipes::step_downsample()
## x themis::step_upsample() masks recipes::step_upsample()
## x data.table::transpose() masks purrr::transpose()
set.seed(2021)
crash_split <- initial_split(crash, strata = injuries)
crash_train <- training(crash_split)
crash_test <- testing(crash_split)
set.seed(2020)
crash_folds <- vfold_cv(crash_train, strata = injuries)
crash_folds
## # 10-fold cross-validation using stratification
## # A tibble: 10 x 2
## splits id
## <list> <chr>
## 1 <split [220112/24458]> Fold01
## 2 <split [220112/24458]> Fold02
## 3 <split [220113/24457]> Fold03
## 4 <split [220113/24457]> Fold04
## 5 <split [220113/24457]> Fold05
## 6 <split [220113/24457]> Fold06
## 7 <split [220113/24457]> Fold07
## 8 <split [220113/24457]> Fold08
## 9 <split [220114/24456]> Fold09
## 10 <split [220114/24456]> Fold10
crash_rec <- recipe(injuries ~ ., data = crash_train) %>%
step_other(weather_condition, first_crash_type,
trafficway_type, prim_contributory_cause,
other = "OTHER"
) %>%
step_downsample(injuries)
bag_spec <- bag_tree(min_n = 10) %>%
set_engine("rpart", times = 25) %>%
set_mode("classification")
crash_wf <- workflow() %>%
add_recipe(crash_rec) %>%
add_model(bag_spec)
doParallel::registerDoParallel()
crash_res <- fit_resamples(
crash_wf,
crash_folds,
control = control_resamples(save_pred = TRUE)
)
crash_fit <- last_fit(crash_wf, crash_split)
collect_metrics(crash_fit)
## # A tibble: 2 x 4
## .metric .estimator .estimate .config
## <chr> <chr> <dbl> <chr>
## 1 accuracy binary 0.878 Preprocessor1_Model1
## 2 roc_auc binary 0.958 Preprocessor1_Model1
crash_imp <- crash_fit$.workflow[[1]] %>%
pull_workflow_fit()
crash_imp$fit$imp %>%
slice_max(value, n = 10) %>%
ggplot(aes(value, fct_reorder(term, value))) +
geom_col(alpha = 0.8, fill = "midnightblue") +
labs(x = "Variable importance score", y = NULL)

collect_predictions(crash_fit) %>%
roc_curve(injuries, .pred_injuries) %>%
ggplot(aes(x = 1 - specificity, y = sensitivity)) +
geom_line(size = 1.5, color = "midnightblue") +
geom_abline(
lty = 2, alpha = 0.5,
color = "gray50",
size = 1.2
) +
coord_equal()
