data <- getURL("https://raw.githubusercontent.com/Handicappr/Rstudio_test_project/main/fig_data2.csv")
figs <- read_csv(data) %>%
as_tibble() %>%
rename(n = `n`) %>%
mutate(eff_target = if_else(eff_target==1, "Top", "NoTop")) %>%
# Remove columns that can't be used in model
select(-target, -D1, -D2, -eff_TV, -top, -eff_top, -fv, -DST, -seed, -E, -E4, -TE, -split_800, -split_801, -split_802, -split_803, -split_804, -split_805, -RST, -X) %>%
rename(target = eff_target) %>%
#change character data to factors
mutate_if(is.character, factor) %>%
mutate(race_date = mdy(race_date)) %>%
mutate(target = factor(target, c("Top", "NoTop")))
# Change Top to first level
# levels(figs$target)
skim(figs)
| Name | figs |
| Number of rows | 35617 |
| Number of columns | 60 |
| _______________________ | |
| Column type frequency: | |
| Date | 1 |
| factor | 14 |
| numeric | 45 |
| ________________________ | |
| Group variables | None |
Variable type: Date
| skim_variable | n_missing | complete_rate | min | max | median | n_unique |
|---|---|---|---|---|---|---|
| race_date | 0 | 1 | 2016-04-13 | 2021-12-20 | 2019-10-31 | 1433 |
Variable type: factor
| skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
|---|---|---|---|---|---|
| target | 0 | 1 | FALSE | 2 | NoT: 27061, Top: 8556 |
| horse | 0 | 1 | FALSE | 3387 | CHA: 57, VIC: 57, STA: 54, EL : 53 |
| surface_cond | 0 | 1 | FALSE | 13 | fst: 16366, frm: 10368, wsm: 2913, vsl: 1771 |
| precip | 0 | 1 | FALSE | 3 | cle: 34259, rai: 1278, sno: 80 |
| wind | 0 | 1 | FALSE | 3 | cal: 31586, hvy: 2581, vhv: 1450 |
| gender | 0 | 1 | FALSE | 2 | Mal: 24217, Fem: 11400 |
| distance | 0 | 1 | FALSE | 22 | 8.0: 9316, 6.0: 7624, 8.5: 4621, 7.0: 3011 |
| jky | 0 | 1 | FALSE | 729 | I O: 1034, L S: 930, M F: 915, J O: 879 |
| trk_code | 0 | 1 | FALSE | 145 | GP: 8459, AQU: 4858, BEL: 3366, SA: 2892 |
| surf | 0 | 1 | FALSE | 5 | Dir: 21482, Tur: 11757, Syn: 1271, Inn: 1038 |
| s_cond | 0 | 1 | FALSE | 8 | Fst: 18433, Frm: 10362, Gd: 3275, Sly: 2180 |
| form_cycle | 0 | 1 | FALSE | 24 | AX: 6600, CX: 3762, BX: 3536, A1: 3387 |
| race_type | 0 | 1 | FALSE | 2 | Spr: 18816, Rou: 16801 |
| trk | 0 | 1 | FALSE | 130 | GP: 8377, AQ: 4769, BE: 3359, SA: 2887 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| n | 0 | 1 | 9.86 | 8.51 | 1.00 | 3.00 | 7.00 | 14.00 | 57.00 | ▇▂▁▁▁ |
| cls_val | 0 | 1 | 1.99 | 0.89 | 1.00 | 1.15 | 1.62 | 3.00 | 12.59 | ▇▁▁▁▁ |
| cls_mv | 0 | 1 | 0.07 | 0.68 | -11.49 | 0.00 | 0.00 | 0.00 | 11.39 | ▁▁▇▁▁ |
| yr | 0 | 1 | 2019.22 | 0.99 | 2016.00 | 2019.00 | 2019.00 | 2020.00 | 2021.00 | ▁▃▇▇▂ |
| age | 0 | 1 | 3.64 | 1.30 | 1.00 | 3.00 | 3.00 | 4.00 | 11.00 | ▇▆▁▁▁ |
| ftl | 0 | 1 | 0.08 | 0.27 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| clm_by_hot | 0 | 1 | -0.93 | 0.34 | -1.00 | -1.00 | -1.00 | -1.00 | 1.00 | ▇▁▁▁▁ |
| start | 0 | 1 | 0.03 | 0.09 | 0.00 | 0.00 | 0.00 | 0.00 | 0.90 | ▇▁▁▁▁ |
| bled | 0 | 1 | 0.00 | 0.02 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| off_turf | 0 | 1 | 0.05 | 0.21 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| bfnr | 0 | 1 | 0.01 | 0.11 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| lame | 0 | 1 | 0.00 | 0.02 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| lsx | 0 | 1 | 0.00 | 0.09 | -1.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▁▁▇▁▁ |
| blnkrs | 0 | 1 | 0.04 | 0.27 | -1.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▁▁▇▁▁ |
| age_wks | 0 | 1 | 204.45 | 65.52 | 37.00 | 154.00 | 190.00 | 240.00 | 559.00 | ▂▇▂▁▁ |
| wt | 2 | 1 | 120.32 | 2.70 | 108.00 | 119.00 | 120.00 | 122.00 | 143.00 | ▁▇▂▁▁ |
| off_odds | 0 | 1 | 12.17 | 17.09 | 0.11 | 2.50 | 6.00 | 13.00 | 99.00 | ▇▁▁▁▁ |
| fld | 2 | 1 | 8.39 | 2.21 | 2.00 | 7.00 | 8.00 | 10.00 | 30.00 | ▅▇▁▁▁ |
| L3 | 0 | 1 | 18.01 | 7.03 | 1.75 | 13.33 | 16.92 | 21.42 | 99.00 | ▇▃▁▁▁ |
| L5 | 0 | 1 | 18.26 | 6.77 | 1.75 | 13.70 | 17.25 | 21.62 | 99.00 | ▇▃▁▁▁ |
| L7 | 0 | 1 | 18.43 | 6.66 | 1.75 | 13.96 | 17.43 | 21.75 | 99.00 | ▇▃▁▁▁ |
| rest | 0 | 1 | 43.85 | 53.38 | 0.00 | 21.00 | 28.00 | 43.00 | 1081.00 | ▇▁▁▁▁ |
| avg_rest | 0 | 1 | 39.75 | 21.66 | 7.25 | 25.67 | 34.70 | 46.78 | 489.00 | ▇▁▁▁▁ |
| efforts_last90 | 0 | 1 | 0.90 | 0.95 | 0.00 | 0.00 | 1.00 | 1.00 | 8.00 | ▇▂▁▁▁ |
| Lag1 | 0 | 1 | 0.21 | 0.40 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▂ |
| Lag2 | 0 | 1 | 0.19 | 0.39 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▂ |
| Lag3 | 0 | 1 | 0.17 | 0.37 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▂ |
| Lag4 | 0 | 1 | 0.15 | 0.36 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▂ |
| Lag5 | 0 | 1 | 0.14 | 0.34 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| LP1 | 0 | 1 | -1.91 | 2.27 | -5.00 | -5.00 | -2.00 | 0.00 | 1.00 | ▆▂▃▂▇ |
| LP2 | 0 | 1 | -1.91 | 2.16 | -5.00 | -4.00 | -2.00 | 0.00 | 1.00 | ▆▂▆▂▇ |
| LP3 | 0 | 1 | -1.92 | 2.05 | -5.00 | -4.00 | -2.00 | 0.00 | 1.00 | ▆▂▇▂▇ |
| LP4 | 0 | 1 | -1.92 | 1.95 | -5.00 | -3.00 | -2.00 | 0.00 | 1.00 | ▅▁▇▁▆ |
| LP5 | 0 | 1 | -1.93 | 1.85 | -5.00 | -3.00 | -2.00 | 0.00 | 1.00 | ▃▁▇▁▅ |
| pp_id | 0 | 1 | 27487.78 | 16975.26 | 1.00 | 12623.00 | 26040.00 | 41459.00 | 59178.00 | ▇▇▆▇▆ |
| won | 0 | 1 | 0.19 | 0.39 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▂ |
| splt_1 | 0 | 1 | 4.48 | 6.00 | 0.00 | 0.00 | 2.89 | 6.79 | 51.00 | ▇▁▁▁▁ |
| splt_2 | 0 | 1 | 2.36 | 3.93 | 0.00 | 0.00 | 0.00 | 3.81 | 35.91 | ▇▁▁▁▁ |
| splt_3 | 0 | 1 | 2.60 | 5.16 | 0.00 | 0.00 | 0.00 | 3.57 | 45.30 | ▇▁▁▁▁ |
| splt_4 | 0 | 1 | 3.53 | 5.16 | 0.00 | 0.00 | 1.23 | 5.27 | 46.29 | ▇▁▁▁▁ |
| splt_5 | 0 | 1 | 1.34 | 2.75 | 0.00 | 0.00 | 0.00 | 1.93 | 27.99 | ▇▁▁▁▁ |
| splt_6 | 0 | 1 | 2.85 | 4.09 | 0.00 | 0.00 | 0.00 | 3.97 | 30.87 | ▇▂▁▁▁ |
| splt_7 | 0 | 1 | 4.68 | 5.87 | 0.00 | 0.00 | 2.90 | 7.42 | 37.86 | ▇▂▁▁▁ |
| splt_8 | 0 | 1 | 11.88 | 9.05 | 0.00 | 3.93 | 10.63 | 17.73 | 50.57 | ▇▆▂▁▁ |
| splt_9 | 0 | 1 | 10.57 | 8.58 | 0.00 | 2.97 | 9.35 | 16.01 | 47.05 | ▇▅▂▁▁ |
set.seed(123)
figs_split <- initial_split(figs, strata = target)
figs_train <- training(figs_split)
figs_test <- testing(figs_split)
figs_split
## <Analysis/Assess/Total>
## <26713/8904/35617>
set.seed(234)
figs_boot <- bootstraps(figs_train)
figs_boot
## # Bootstrap sampling
## # A tibble: 25 x 2
## splits id
## <list> <chr>
## 1 <split [26713/9810]> Bootstrap01
## 2 <split [26713/9822]> Bootstrap02
## 3 <split [26713/9857]> Bootstrap03
## 4 <split [26713/9783]> Bootstrap04
## 5 <split [26713/9855]> Bootstrap05
## 6 <split [26713/9837]> Bootstrap06
## 7 <split [26713/9817]> Bootstrap07
## 8 <split [26713/9801]> Bootstrap08
## 9 <split [26713/9788]> Bootstrap09
## 10 <split [26713/9846]> Bootstrap10
## # … with 15 more rows
glm_spec <- logistic_reg() %>%
set_engine("glm")
glm_spec
## Logistic Regression Model Specification (classification)
##
## Computational engine: glm
figs_rec <- recipe(target ~ ., data=figs_train) %>%
update_role(horse, new_role = "ID") %>%
update_role(pp_id, new_role = "ID") %>%
update_role(won, new_role = "ID") %>%
step_date(race_date, features = c("month", "year")) %>%
step_other(jky , threshold = 0.015) %>%
step_other(trk_code , threshold = 0.02) %>%
step_other(distance, threshold = 0.02) %>%
step_dummy(distance) %>%
step_dummy(gender) %>%
step_dummy(trk_code) %>%
step_dummy(jky) %>%
step_dummy(form_cycle) %>%
step_dummy(surf) %>%
step_dummy(wind) %>%
step_dummy(precip) %>%
step_dummy(s_cond) %>%
step_dummy(race_type) %>%
step_dummy(surface_cond) %>%
step_dummy(race_date_month) %>%
step_rm(yr, trk) %>%
step_rm(race_date, n) %>%
step_zv(all_predictors(), -all_outcomes()) %>%
step_corr(all_numeric(), -all_outcomes()) %>%
themis::step_downsample(target)
figs_prep <- prep(figs_rec)
juiced <- juice(figs_prep)
skim(juiced)
| Name | juiced |
| Number of rows | 12834 |
| Number of columns | 140 |
| _______________________ | |
| Column type frequency: | |
| factor | 2 |
| numeric | 138 |
| ________________________ | |
| Group variables | None |
Variable type: factor
| skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
|---|---|---|---|---|---|
| horse | 0 | 1 | FALSE | 2955 | PLA: 22, RAD: 20, HON: 19, SAL: 19 |
| target | 0 | 1 | FALSE | 2 | Top: 6417, NoT: 6417 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| cls_val | 0 | 1 | 1.97 | 0.89 | 1.00 | 1.15 | 1.50 | 3.00 | 12.59 | ▇▁▁▁▁ |
| cls_mv | 0 | 1 | 0.10 | 0.70 | -7.51 | 0.00 | 0.00 | 0.00 | 11.39 | ▁▇▂▁▁ |
| ftl | 0 | 1 | 0.07 | 0.25 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| clm_by_hot | 0 | 1 | -0.93 | 0.32 | -1.00 | -1.00 | -1.00 | -1.00 | 1.00 | ▇▁▁▁▁ |
| start | 0 | 1 | 0.02 | 0.09 | 0.00 | 0.00 | 0.00 | 0.00 | 0.90 | ▇▁▁▁▁ |
| bled | 0 | 1 | 0.00 | 0.01 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| off_turf | 0 | 1 | 0.04 | 0.20 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| bfnr | 0 | 1 | 0.01 | 0.09 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| lame | 0 | 1 | 0.00 | 0.02 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| lsx | 0 | 1 | 0.00 | 0.10 | -1.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▁▁▇▁▁ |
| blnkrs | 0 | 1 | 0.04 | 0.27 | -1.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▁▁▇▁▁ |
| age_wks | 0 | 1 | 198.15 | 62.81 | 44.00 | 151.00 | 183.00 | 230.00 | 557.00 | ▃▇▂▁▁ |
| wt | 1 | 1 | 120.31 | 2.70 | 108.00 | 118.00 | 120.00 | 122.00 | 140.00 | ▁▇▆▁▁ |
| off_odds | 0 | 1 | 11.92 | 16.87 | 0.11 | 2.50 | 5.00 | 13.00 | 99.00 | ▇▁▁▁▁ |
| fld | 1 | 1 | 8.42 | 2.20 | 2.00 | 7.00 | 8.00 | 10.00 | 30.00 | ▅▇▁▁▁ |
| L7 | 0 | 1 | 18.65 | 6.66 | 3.25 | 14.14 | 17.62 | 22.05 | 99.00 | ▇▂▁▁▁ |
| rest | 0 | 1 | 45.27 | 53.38 | 0.00 | 21.00 | 29.00 | 46.00 | 730.00 | ▇▁▁▁▁ |
| avg_rest | 0 | 1 | 40.42 | 22.76 | 9.33 | 26.00 | 34.83 | 47.14 | 297.00 | ▇▁▁▁▁ |
| efforts_last90 | 0 | 1 | 0.98 | 0.94 | 0.00 | 0.00 | 1.00 | 2.00 | 7.00 | ▇▂▁▁▁ |
| Lag1 | 0 | 1 | 0.23 | 0.42 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▂ |
| Lag2 | 0 | 1 | 0.19 | 0.39 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▂ |
| Lag3 | 0 | 1 | 0.16 | 0.37 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▂ |
| Lag4 | 0 | 1 | 0.14 | 0.35 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▂ |
| Lag5 | 0 | 1 | 0.13 | 0.33 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| LP1 | 0 | 1 | -1.62 | 2.27 | -5.00 | -4.00 | -1.00 | 0.00 | 1.00 | ▅▁▂▁▇ |
| LP2 | 0 | 1 | -1.75 | 2.11 | -5.00 | -3.00 | -2.00 | 0.00 | 1.00 | ▅▁▆▂▇ |
| LP3 | 0 | 1 | -1.85 | 1.98 | -5.00 | -3.00 | -2.00 | 0.00 | 1.00 | ▅▁▇▁▇ |
| LP4 | 0 | 1 | -1.86 | 1.87 | -5.00 | -2.00 | -2.00 | 0.00 | 1.00 | ▃▁▇▁▅ |
| LP5 | 0 | 1 | -1.90 | 1.76 | -5.00 | -2.00 | -2.00 | -1.00 | 1.00 | ▃▁▇▁▃ |
| pp_id | 0 | 1 | 27262.91 | 16891.16 | 3.00 | 12649.50 | 25503.50 | 41077.50 | 59175.00 | ▇▇▆▇▅ |
| won | 0 | 1 | 0.24 | 0.43 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▂ |
| splt_1 | 0 | 1 | 4.36 | 5.76 | 0.00 | 0.00 | 2.93 | 6.71 | 51.00 | ▇▁▁▁▁ |
| splt_2 | 0 | 1 | 2.38 | 3.81 | 0.00 | 0.00 | 0.00 | 3.82 | 33.07 | ▇▁▁▁▁ |
| splt_3 | 0 | 1 | 2.31 | 4.87 | 0.00 | 0.00 | 0.00 | 2.92 | 45.30 | ▇▁▁▁▁ |
| splt_4 | 0 | 1 | 3.43 | 4.99 | 0.00 | 0.00 | 1.63 | 4.89 | 46.29 | ▇▁▁▁▁ |
| splt_5 | 0 | 1 | 1.31 | 2.69 | 0.00 | 0.00 | 0.00 | 1.96 | 26.99 | ▇▁▁▁▁ |
| splt_6 | 0 | 1 | 2.70 | 3.91 | 0.00 | 0.00 | 0.00 | 3.94 | 30.87 | ▇▁▁▁▁ |
| splt_7 | 0 | 1 | 4.45 | 5.63 | 0.00 | 0.00 | 2.90 | 6.87 | 37.86 | ▇▂▁▁▁ |
| splt_8 | 0 | 1 | 11.22 | 8.69 | 0.00 | 3.00 | 9.85 | 16.55 | 50.57 | ▇▅▂▁▁ |
| splt_9 | 0 | 1 | 10.09 | 8.22 | 0.00 | 2.96 | 8.38 | 14.97 | 43.48 | ▇▅▂▁▁ |
| race_date_year | 0 | 1 | 2019.20 | 1.00 | 2016.00 | 2019.00 | 2019.00 | 2020.00 | 2021.00 | ▁▃▇▇▂ |
| distance_X5.5F | 0 | 1 | 0.06 | 0.24 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| distance_X6.0F | 0 | 1 | 0.21 | 0.40 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▂ |
| distance_X6.5F | 0 | 1 | 0.08 | 0.27 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| distance_X7.0F | 0 | 1 | 0.09 | 0.28 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| distance_X8.0F | 0 | 1 | 0.27 | 0.44 | 0.00 | 0.00 | 0.00 | 1.00 | 1.00 | ▇▁▁▁▃ |
| distance_X8.5F | 0 | 1 | 0.14 | 0.34 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| distance_X9.0F | 0 | 1 | 0.05 | 0.21 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| distance_other | 0 | 1 | 0.06 | 0.23 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| gender_Male | 0 | 1 | 0.68 | 0.47 | 0.00 | 0.00 | 1.00 | 1.00 | 1.00 | ▃▁▁▁▇ |
| trk_code_BEL | 0 | 1 | 0.09 | 0.29 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| trk_code_CD | 0 | 1 | 0.05 | 0.21 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| trk_code_DMR | 0 | 1 | 0.04 | 0.20 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| trk_code_GP | 0 | 1 | 0.24 | 0.43 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▂ |
| trk_code_GPW | 0 | 1 | 0.03 | 0.18 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| trk_code_KEE | 0 | 1 | 0.02 | 0.15 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| trk_code_SA | 0 | 1 | 0.08 | 0.27 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| trk_code_SAR | 0 | 1 | 0.05 | 0.23 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| trk_code_WO | 0 | 1 | 0.03 | 0.16 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| trk_code_other | 0 | 1 | 0.22 | 0.42 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▂ |
| jky_E.Cancel | 0 | 1 | 0.02 | 0.13 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| jky_E.Jaramillo | 0 | 1 | 0.02 | 0.13 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| jky_E.Zayas | 0 | 1 | 0.02 | 0.15 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| jky_I.Ortiz..Jr. | 0 | 1 | 0.03 | 0.17 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| jky_J.Alvarado | 0 | 1 | 0.02 | 0.13 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| jky_J.Lezcano | 0 | 1 | 0.02 | 0.14 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| jky_J.Ortiz | 0 | 1 | 0.03 | 0.16 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| jky_J.Rosario | 0 | 1 | 0.02 | 0.14 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| jky_L.Reyes | 0 | 1 | 0.02 | 0.13 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| jky_L.Saez | 0 | 1 | 0.03 | 0.16 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| jky_M.Franco | 0 | 1 | 0.02 | 0.15 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| jky_M.Vasquez | 0 | 1 | 0.02 | 0.13 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| jky_P.Lopez | 0 | 1 | 0.02 | 0.12 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| jky_R.Maragh | 0 | 1 | 0.02 | 0.12 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| jky_T.Gaffalione | 0 | 1 | 0.02 | 0.15 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| jky_other | 0 | 1 | 0.67 | 0.47 | 0.00 | 0.00 | 1.00 | 1.00 | 1.00 | ▃▁▁▁▇ |
| form_cycle_A2 | 0 | 1 | 0.11 | 0.31 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| form_cycle_A3 | 0 | 1 | 0.07 | 0.26 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| form_cycle_AX | 0 | 1 | 0.18 | 0.39 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▂ |
| form_cycle_B1 | 0 | 1 | 0.06 | 0.25 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| form_cycle_B2 | 0 | 1 | 0.04 | 0.20 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| form_cycle_B3 | 0 | 1 | 0.03 | 0.17 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| form_cycle_BX | 0 | 1 | 0.09 | 0.28 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| form_cycle_C1 | 0 | 1 | 0.05 | 0.22 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| form_cycle_C2 | 0 | 1 | 0.04 | 0.18 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| form_cycle_C3 | 0 | 1 | 0.03 | 0.16 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| form_cycle_CX | 0 | 1 | 0.10 | 0.30 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| form_cycle_D1 | 0 | 1 | 0.02 | 0.16 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| form_cycle_D2 | 0 | 1 | 0.02 | 0.14 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| form_cycle_D3 | 0 | 1 | 0.01 | 0.12 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| form_cycle_DX | 0 | 1 | 0.04 | 0.20 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| form_cycle_E1 | 0 | 1 | 0.01 | 0.10 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| form_cycle_E2 | 0 | 1 | 0.01 | 0.09 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| form_cycle_E3 | 0 | 1 | 0.00 | 0.07 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| form_cycle_EX | 0 | 1 | 0.02 | 0.12 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| form_cycle_F1 | 0 | 1 | 0.00 | 0.06 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| form_cycle_F2 | 0 | 1 | 0.00 | 0.06 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| form_cycle_F3 | 0 | 1 | 0.00 | 0.04 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| form_cycle_FX | 0 | 1 | 0.00 | 0.07 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| surf_Inner.Dirt | 0 | 1 | 0.00 | 0.04 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| surf_Inner.Turf | 0 | 1 | 0.03 | 0.17 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| surf_Synth | 0 | 1 | 0.04 | 0.19 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| surf_Turf | 0 | 1 | 0.35 | 0.48 | 0.00 | 0.00 | 0.00 | 1.00 | 1.00 | ▇▁▁▁▅ |
| wind_hvy | 0 | 1 | 0.07 | 0.26 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| wind_vhvy | 0 | 1 | 0.04 | 0.19 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| precip_rain | 0 | 1 | 0.03 | 0.18 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| precip_snow | 0 | 1 | 0.00 | 0.04 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| s_cond_Fst | 0 | 1 | 0.51 | 0.50 | 0.00 | 0.00 | 1.00 | 1.00 | 1.00 | ▇▁▁▁▇ |
| s_cond_Gd | 0 | 1 | 0.09 | 0.29 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| s_cond_My | 0 | 1 | 0.02 | 0.14 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| s_cond_Sf | 0 | 1 | 0.00 | 0.07 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| s_cond_Sly | 0 | 1 | 0.06 | 0.23 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| s_cond_Trk_condit | 0 | 1 | 0.00 | 0.03 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| s_cond_Yl | 0 | 1 | 0.01 | 0.11 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| race_type_Sprint | 0 | 1 | 0.52 | 0.50 | 0.00 | 0.00 | 1.00 | 1.00 | 1.00 | ▇▁▁▁▇ |
| surface_cond_frz | 0 | 1 | 0.00 | 0.03 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| surface_cond_fst | 0 | 1 | 0.45 | 0.50 | 0.00 | 0.00 | 0.00 | 1.00 | 1.00 | ▇▁▁▁▆ |
| surface_cond_gd | 0 | 1 | 0.05 | 0.21 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| surface_cond_gd.slw | 0 | 1 | 0.03 | 0.17 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| surface_cond_hvy | 0 | 1 | 0.00 | 0.02 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| surface_cond_puaf | 0 | 1 | 0.01 | 0.08 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| surface_cond_sft | 0 | 1 | 0.01 | 0.09 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| surface_cond_vbw | 0 | 1 | 0.00 | 0.03 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| surface_cond_vslw | 0 | 1 | 0.05 | 0.21 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| surface_cond_wf | 0 | 1 | 0.01 | 0.09 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| surface_cond_wsm | 0 | 1 | 0.08 | 0.27 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| surface_cond_yld | 0 | 1 | 0.01 | 0.11 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| race_date_month_Feb | 0 | 1 | 0.09 | 0.29 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| race_date_month_Mar | 0 | 1 | 0.06 | 0.24 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| race_date_month_Apr | 0 | 1 | 0.05 | 0.22 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| race_date_month_May | 0 | 1 | 0.07 | 0.25 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| race_date_month_Jun | 0 | 1 | 0.08 | 0.27 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| race_date_month_Jul | 0 | 1 | 0.08 | 0.27 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| race_date_month_Aug | 0 | 1 | 0.09 | 0.29 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| race_date_month_Sep | 0 | 1 | 0.09 | 0.29 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| race_date_month_Oct | 0 | 1 | 0.09 | 0.29 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| race_date_month_Nov | 0 | 1 | 0.10 | 0.30 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| race_date_month_Dec | 0 | 1 | 0.09 | 0.28 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
figs_wf <- workflow() %>%
add_recipe(figs_rec)
members_metrics <- metric_set(roc_auc, accuracy, sensitivity, specificity)
doParallel::registerDoParallel()
glm_rs <- figs_wf %>%
add_model(glm_spec) %>%
fit_resamples(
resamples = figs_boot,
metrics = members_metrics,
control = control_resamples(save_pred = TRUE)
)
glm_rs
## # Resampling results
## # Bootstrap sampling
## # A tibble: 25 x 5
## splits id .metrics .notes .predictions
## <list> <chr> <list> <list> <list>
## 1 <split [26713/98… Bootstrap… <tibble [4 × 4… <tibble [2 ×… <tibble [9,810 × …
## 2 <split [26713/98… Bootstrap… <tibble [4 × 4… <tibble [1 ×… <tibble [9,822 × …
## 3 <split [26713/98… Bootstrap… <tibble [4 × 4… <tibble [1 ×… <tibble [9,857 × …
## 4 <split [26713/97… Bootstrap… <tibble [4 × 4… <tibble [2 ×… <tibble [9,783 × …
## 5 <split [26713/98… Bootstrap… <tibble [4 × 4… <tibble [1 ×… <tibble [9,855 × …
## 6 <split [26713/98… Bootstrap… <tibble [4 × 4… <tibble [1 ×… <tibble [9,837 × …
## 7 <split [26713/98… Bootstrap… <tibble [4 × 4… <tibble [1 ×… <tibble [9,817 × …
## 8 <split [26713/98… Bootstrap… <tibble [4 × 4… <tibble [1 ×… <tibble [9,801 × …
## 9 <split [26713/97… Bootstrap… <tibble [4 × 4… <tibble [1 ×… <tibble [9,788 × …
## 10 <split [26713/98… Bootstrap… <tibble [4 × 4… <tibble [1 ×… <tibble [9,846 × …
## # … with 15 more rows
collect_metrics(glm_rs)
## # A tibble: 4 x 6
## .metric .estimator mean n std_err .config
## <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 accuracy binary 0.679 25 0.000834 Preprocessor1_Model1
## 2 roc_auc binary 0.757 25 0.000891 Preprocessor1_Model1
## 3 sens binary 0.699 25 0.00219 Preprocessor1_Model1
## 4 spec binary 0.673 25 0.00133 Preprocessor1_Model1
glm_rs %>%
conf_mat_resampled()
## # A tibble: 4 x 3
## Prediction Truth Freq
## <fct> <fct> <dbl>
## 1 Top Top 1657.
## 2 Top NoTop 2435.
## 3 NoTop Top 715.
## 4 NoTop NoTop 5009.
glm_rs %>%
collect_predictions() %>%
group_by(id) %>%
roc_curve(target, .pred_Top) %>%
ggplot(aes(1-specificity, sensitivity, color=id)) +
geom_abline(lty=2, color="gray80", size= 1.5) +
geom_path(show.legend =FALSE, alpha = 0.6, size =1.2) +
coord_equal()
figs_final <- figs_wf %>%
add_model(glm_spec) %>%
last_fit(figs_split)
figs_final
## # Resampling results
## # Manual resampling
## # A tibble: 1 x 6
## splits id .metrics .notes .predictions .workflow
## <list> <chr> <list> <list> <list> <list>
## 1 <split [26713… train/test… <tibble [2 … <tibble [1… <tibble [8,904 … <workflo…
collect_metrics(figs_final)
## # A tibble: 2 x 4
## .metric .estimator .estimate .config
## <chr> <chr> <dbl> <chr>
## 1 accuracy binary 0.671 Preprocessor1_Model1
## 2 roc_auc binary 0.756 Preprocessor1_Model1
collect_predictions(figs_final) %>%
conf_mat(target, .pred_class)
## Truth
## Prediction Top NoTop
## Top 1519 2306
## NoTop 620 4458
options(scipen = 999)
figs_final$.workflow[[1]] %>%
tidy(exponentiate=TRUE) %>%
arrange(p.value) %>%
DT::datatable()
figs_final %>%
pull(.workflow) %>%
pluck(1) %>%
tidy() %>%
filter(term != "(Intercept)") %>%
ggplot(aes(estimate, fct_reorder(term, estimate))) +
geom_vline(xintercept = 0, color = "gray50", lty = 2, size = 1.2) +
geom_point(size = 2, color = "#85144B") +
labs(y = NULL, x = "Coefficent from logistic regression")