Get the data from Github

data <- getURL("https://raw.githubusercontent.com/Handicappr/Rstudio_test_project/main/fig_data2.csv")

figs <- read_csv(data) %>% 
  as_tibble() %>% 
  rename(n = `n`) %>% 
  mutate(eff_target = if_else(eff_target==1, "Top", "NoTop")) %>% 
  # Remove columns that can't be used in model
  select(-target, -D1, -D2, -eff_TV, -top, -eff_top, -fv, -DST, -seed, -E, -E4, -TE, -split_800, -split_801, -split_802, -split_803, -split_804, -split_805, -RST, -X) %>% 
  rename(target = eff_target) %>% 
  #change character data to factors
  mutate_if(is.character, factor) %>% 
  mutate(race_date = mdy(race_date)) %>% 
  mutate(target = factor(target, c("Top", "NoTop")))
  # Change Top to first level
  # levels(figs$target)

  
skim(figs)
Data summary
Name figs
Number of rows 35617
Number of columns 60
_______________________
Column type frequency:
Date 1
factor 14
numeric 45
________________________
Group variables None

Variable type: Date

skim_variable n_missing complete_rate min max median n_unique
race_date 0 1 2016-04-13 2021-12-20 2019-10-31 1433

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
target 0 1 FALSE 2 NoT: 27061, Top: 8556
horse 0 1 FALSE 3387 CHA: 57, VIC: 57, STA: 54, EL : 53
surface_cond 0 1 FALSE 13 fst: 16366, frm: 10368, wsm: 2913, vsl: 1771
precip 0 1 FALSE 3 cle: 34259, rai: 1278, sno: 80
wind 0 1 FALSE 3 cal: 31586, hvy: 2581, vhv: 1450
gender 0 1 FALSE 2 Mal: 24217, Fem: 11400
distance 0 1 FALSE 22 8.0: 9316, 6.0: 7624, 8.5: 4621, 7.0: 3011
jky 0 1 FALSE 729 I O: 1034, L S: 930, M F: 915, J O: 879
trk_code 0 1 FALSE 145 GP: 8459, AQU: 4858, BEL: 3366, SA: 2892
surf 0 1 FALSE 5 Dir: 21482, Tur: 11757, Syn: 1271, Inn: 1038
s_cond 0 1 FALSE 8 Fst: 18433, Frm: 10362, Gd: 3275, Sly: 2180
form_cycle 0 1 FALSE 24 AX: 6600, CX: 3762, BX: 3536, A1: 3387
race_type 0 1 FALSE 2 Spr: 18816, Rou: 16801
trk 0 1 FALSE 130 GP: 8377, AQ: 4769, BE: 3359, SA: 2887

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
n 0 1 9.86 8.51 1.00 3.00 7.00 14.00 57.00 ▇▂▁▁▁
cls_val 0 1 1.99 0.89 1.00 1.15 1.62 3.00 12.59 ▇▁▁▁▁
cls_mv 0 1 0.07 0.68 -11.49 0.00 0.00 0.00 11.39 ▁▁▇▁▁
yr 0 1 2019.22 0.99 2016.00 2019.00 2019.00 2020.00 2021.00 ▁▃▇▇▂
age 0 1 3.64 1.30 1.00 3.00 3.00 4.00 11.00 ▇▆▁▁▁
ftl 0 1 0.08 0.27 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
clm_by_hot 0 1 -0.93 0.34 -1.00 -1.00 -1.00 -1.00 1.00 ▇▁▁▁▁
start 0 1 0.03 0.09 0.00 0.00 0.00 0.00 0.90 ▇▁▁▁▁
bled 0 1 0.00 0.02 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
off_turf 0 1 0.05 0.21 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
bfnr 0 1 0.01 0.11 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
lame 0 1 0.00 0.02 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
lsx 0 1 0.00 0.09 -1.00 0.00 0.00 0.00 1.00 ▁▁▇▁▁
blnkrs 0 1 0.04 0.27 -1.00 0.00 0.00 0.00 1.00 ▁▁▇▁▁
age_wks 0 1 204.45 65.52 37.00 154.00 190.00 240.00 559.00 ▂▇▂▁▁
wt 2 1 120.32 2.70 108.00 119.00 120.00 122.00 143.00 ▁▇▂▁▁
off_odds 0 1 12.17 17.09 0.11 2.50 6.00 13.00 99.00 ▇▁▁▁▁
fld 2 1 8.39 2.21 2.00 7.00 8.00 10.00 30.00 ▅▇▁▁▁
L3 0 1 18.01 7.03 1.75 13.33 16.92 21.42 99.00 ▇▃▁▁▁
L5 0 1 18.26 6.77 1.75 13.70 17.25 21.62 99.00 ▇▃▁▁▁
L7 0 1 18.43 6.66 1.75 13.96 17.43 21.75 99.00 ▇▃▁▁▁
rest 0 1 43.85 53.38 0.00 21.00 28.00 43.00 1081.00 ▇▁▁▁▁
avg_rest 0 1 39.75 21.66 7.25 25.67 34.70 46.78 489.00 ▇▁▁▁▁
efforts_last90 0 1 0.90 0.95 0.00 0.00 1.00 1.00 8.00 ▇▂▁▁▁
Lag1 0 1 0.21 0.40 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▂
Lag2 0 1 0.19 0.39 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▂
Lag3 0 1 0.17 0.37 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▂
Lag4 0 1 0.15 0.36 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▂
Lag5 0 1 0.14 0.34 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
LP1 0 1 -1.91 2.27 -5.00 -5.00 -2.00 0.00 1.00 ▆▂▃▂▇
LP2 0 1 -1.91 2.16 -5.00 -4.00 -2.00 0.00 1.00 ▆▂▆▂▇
LP3 0 1 -1.92 2.05 -5.00 -4.00 -2.00 0.00 1.00 ▆▂▇▂▇
LP4 0 1 -1.92 1.95 -5.00 -3.00 -2.00 0.00 1.00 ▅▁▇▁▆
LP5 0 1 -1.93 1.85 -5.00 -3.00 -2.00 0.00 1.00 ▃▁▇▁▅
pp_id 0 1 27487.78 16975.26 1.00 12623.00 26040.00 41459.00 59178.00 ▇▇▆▇▆
won 0 1 0.19 0.39 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▂
splt_1 0 1 4.48 6.00 0.00 0.00 2.89 6.79 51.00 ▇▁▁▁▁
splt_2 0 1 2.36 3.93 0.00 0.00 0.00 3.81 35.91 ▇▁▁▁▁
splt_3 0 1 2.60 5.16 0.00 0.00 0.00 3.57 45.30 ▇▁▁▁▁
splt_4 0 1 3.53 5.16 0.00 0.00 1.23 5.27 46.29 ▇▁▁▁▁
splt_5 0 1 1.34 2.75 0.00 0.00 0.00 1.93 27.99 ▇▁▁▁▁
splt_6 0 1 2.85 4.09 0.00 0.00 0.00 3.97 30.87 ▇▂▁▁▁
splt_7 0 1 4.68 5.87 0.00 0.00 2.90 7.42 37.86 ▇▂▁▁▁
splt_8 0 1 11.88 9.05 0.00 3.93 10.63 17.73 50.57 ▇▆▂▁▁
splt_9 0 1 10.57 8.58 0.00 2.97 9.35 16.01 47.05 ▇▅▂▁▁

Build Model

We can start by splitting our data into training and testing sets.

set.seed(123)

figs_split <- initial_split(figs, strata = target)
figs_train <- training(figs_split)
figs_test <- testing(figs_split)


figs_split
## <Analysis/Assess/Total>
## <26713/8904/35617>

Create Bootstrap

Next, let’s create bootstrap resamples of the training data, to evaluate our models.

set.seed(234)
figs_boot <- bootstraps(figs_train)
figs_boot
## # Bootstrap sampling 
## # A tibble: 25 x 2
##    splits               id         
##    <list>               <chr>      
##  1 <split [26713/9810]> Bootstrap01
##  2 <split [26713/9822]> Bootstrap02
##  3 <split [26713/9857]> Bootstrap03
##  4 <split [26713/9783]> Bootstrap04
##  5 <split [26713/9855]> Bootstrap05
##  6 <split [26713/9837]> Bootstrap06
##  7 <split [26713/9817]> Bootstrap07
##  8 <split [26713/9801]> Bootstrap08
##  9 <split [26713/9788]> Bootstrap09
## 10 <split [26713/9846]> Bootstrap10
## # … with 15 more rows

Create Model Specification

We create a model specification utilizing the glm package

glm_spec <- logistic_reg() %>% 
  set_engine("glm")

glm_spec
## Logistic Regression Model Specification (classification)
## 
## Computational engine: glm

Create a Recipe

The recipe performs preprocessing necessary to fit a logistic regression model

figs_rec <- recipe(target ~ ., data=figs_train) %>%
  update_role(horse, new_role = "ID") %>% 
  update_role(pp_id, new_role = "ID") %>% 
  update_role(won, new_role = "ID") %>% 
  step_date(race_date, features = c("month", "year")) %>% 
  step_other(jky , threshold = 0.015) %>% 
  step_other(trk_code , threshold = 0.02) %>% 
  step_other(distance, threshold = 0.02) %>% 
  step_dummy(distance) %>% 
  step_dummy(gender) %>% 
  step_dummy(trk_code) %>% 
  step_dummy(jky) %>% 
  step_dummy(form_cycle) %>% 
  step_dummy(surf) %>% 
  step_dummy(wind) %>% 
  step_dummy(precip) %>% 
  step_dummy(s_cond) %>% 
  step_dummy(race_type) %>% 
  step_dummy(surface_cond) %>% 
  step_dummy(race_date_month) %>%
  step_rm(yr, trk) %>% 
  step_rm(race_date, n) %>% 
  step_zv(all_predictors(), -all_outcomes()) %>% 
  step_corr(all_numeric(), -all_outcomes()) %>% 
  themis::step_downsample(target)
  
  

figs_prep <- prep(figs_rec)

juiced <- juice(figs_prep) 

skim(juiced)
Data summary
Name juiced
Number of rows 12834
Number of columns 140
_______________________
Column type frequency:
factor 2
numeric 138
________________________
Group variables None

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
horse 0 1 FALSE 2955 PLA: 22, RAD: 20, HON: 19, SAL: 19
target 0 1 FALSE 2 Top: 6417, NoT: 6417

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
cls_val 0 1 1.97 0.89 1.00 1.15 1.50 3.00 12.59 ▇▁▁▁▁
cls_mv 0 1 0.10 0.70 -7.51 0.00 0.00 0.00 11.39 ▁▇▂▁▁
ftl 0 1 0.07 0.25 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
clm_by_hot 0 1 -0.93 0.32 -1.00 -1.00 -1.00 -1.00 1.00 ▇▁▁▁▁
start 0 1 0.02 0.09 0.00 0.00 0.00 0.00 0.90 ▇▁▁▁▁
bled 0 1 0.00 0.01 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
off_turf 0 1 0.04 0.20 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
bfnr 0 1 0.01 0.09 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
lame 0 1 0.00 0.02 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
lsx 0 1 0.00 0.10 -1.00 0.00 0.00 0.00 1.00 ▁▁▇▁▁
blnkrs 0 1 0.04 0.27 -1.00 0.00 0.00 0.00 1.00 ▁▁▇▁▁
age_wks 0 1 198.15 62.81 44.00 151.00 183.00 230.00 557.00 ▃▇▂▁▁
wt 1 1 120.31 2.70 108.00 118.00 120.00 122.00 140.00 ▁▇▆▁▁
off_odds 0 1 11.92 16.87 0.11 2.50 5.00 13.00 99.00 ▇▁▁▁▁
fld 1 1 8.42 2.20 2.00 7.00 8.00 10.00 30.00 ▅▇▁▁▁
L7 0 1 18.65 6.66 3.25 14.14 17.62 22.05 99.00 ▇▂▁▁▁
rest 0 1 45.27 53.38 0.00 21.00 29.00 46.00 730.00 ▇▁▁▁▁
avg_rest 0 1 40.42 22.76 9.33 26.00 34.83 47.14 297.00 ▇▁▁▁▁
efforts_last90 0 1 0.98 0.94 0.00 0.00 1.00 2.00 7.00 ▇▂▁▁▁
Lag1 0 1 0.23 0.42 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▂
Lag2 0 1 0.19 0.39 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▂
Lag3 0 1 0.16 0.37 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▂
Lag4 0 1 0.14 0.35 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▂
Lag5 0 1 0.13 0.33 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
LP1 0 1 -1.62 2.27 -5.00 -4.00 -1.00 0.00 1.00 ▅▁▂▁▇
LP2 0 1 -1.75 2.11 -5.00 -3.00 -2.00 0.00 1.00 ▅▁▆▂▇
LP3 0 1 -1.85 1.98 -5.00 -3.00 -2.00 0.00 1.00 ▅▁▇▁▇
LP4 0 1 -1.86 1.87 -5.00 -2.00 -2.00 0.00 1.00 ▃▁▇▁▅
LP5 0 1 -1.90 1.76 -5.00 -2.00 -2.00 -1.00 1.00 ▃▁▇▁▃
pp_id 0 1 27262.91 16891.16 3.00 12649.50 25503.50 41077.50 59175.00 ▇▇▆▇▅
won 0 1 0.24 0.43 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▂
splt_1 0 1 4.36 5.76 0.00 0.00 2.93 6.71 51.00 ▇▁▁▁▁
splt_2 0 1 2.38 3.81 0.00 0.00 0.00 3.82 33.07 ▇▁▁▁▁
splt_3 0 1 2.31 4.87 0.00 0.00 0.00 2.92 45.30 ▇▁▁▁▁
splt_4 0 1 3.43 4.99 0.00 0.00 1.63 4.89 46.29 ▇▁▁▁▁
splt_5 0 1 1.31 2.69 0.00 0.00 0.00 1.96 26.99 ▇▁▁▁▁
splt_6 0 1 2.70 3.91 0.00 0.00 0.00 3.94 30.87 ▇▁▁▁▁
splt_7 0 1 4.45 5.63 0.00 0.00 2.90 6.87 37.86 ▇▂▁▁▁
splt_8 0 1 11.22 8.69 0.00 3.00 9.85 16.55 50.57 ▇▅▂▁▁
splt_9 0 1 10.09 8.22 0.00 2.96 8.38 14.97 43.48 ▇▅▂▁▁
race_date_year 0 1 2019.20 1.00 2016.00 2019.00 2019.00 2020.00 2021.00 ▁▃▇▇▂
distance_X5.5F 0 1 0.06 0.24 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
distance_X6.0F 0 1 0.21 0.40 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▂
distance_X6.5F 0 1 0.08 0.27 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
distance_X7.0F 0 1 0.09 0.28 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
distance_X8.0F 0 1 0.27 0.44 0.00 0.00 0.00 1.00 1.00 ▇▁▁▁▃
distance_X8.5F 0 1 0.14 0.34 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
distance_X9.0F 0 1 0.05 0.21 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
distance_other 0 1 0.06 0.23 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
gender_Male 0 1 0.68 0.47 0.00 0.00 1.00 1.00 1.00 ▃▁▁▁▇
trk_code_BEL 0 1 0.09 0.29 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
trk_code_CD 0 1 0.05 0.21 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
trk_code_DMR 0 1 0.04 0.20 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
trk_code_GP 0 1 0.24 0.43 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▂
trk_code_GPW 0 1 0.03 0.18 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
trk_code_KEE 0 1 0.02 0.15 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
trk_code_SA 0 1 0.08 0.27 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
trk_code_SAR 0 1 0.05 0.23 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
trk_code_WO 0 1 0.03 0.16 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
trk_code_other 0 1 0.22 0.42 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▂
jky_E.Cancel 0 1 0.02 0.13 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
jky_E.Jaramillo 0 1 0.02 0.13 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
jky_E.Zayas 0 1 0.02 0.15 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
jky_I.Ortiz..Jr. 0 1 0.03 0.17 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
jky_J.Alvarado 0 1 0.02 0.13 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
jky_J.Lezcano 0 1 0.02 0.14 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
jky_J.Ortiz 0 1 0.03 0.16 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
jky_J.Rosario 0 1 0.02 0.14 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
jky_L.Reyes 0 1 0.02 0.13 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
jky_L.Saez 0 1 0.03 0.16 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
jky_M.Franco 0 1 0.02 0.15 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
jky_M.Vasquez 0 1 0.02 0.13 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
jky_P.Lopez 0 1 0.02 0.12 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
jky_R.Maragh 0 1 0.02 0.12 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
jky_T.Gaffalione 0 1 0.02 0.15 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
jky_other 0 1 0.67 0.47 0.00 0.00 1.00 1.00 1.00 ▃▁▁▁▇
form_cycle_A2 0 1 0.11 0.31 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
form_cycle_A3 0 1 0.07 0.26 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
form_cycle_AX 0 1 0.18 0.39 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▂
form_cycle_B1 0 1 0.06 0.25 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
form_cycle_B2 0 1 0.04 0.20 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
form_cycle_B3 0 1 0.03 0.17 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
form_cycle_BX 0 1 0.09 0.28 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
form_cycle_C1 0 1 0.05 0.22 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
form_cycle_C2 0 1 0.04 0.18 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
form_cycle_C3 0 1 0.03 0.16 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
form_cycle_CX 0 1 0.10 0.30 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
form_cycle_D1 0 1 0.02 0.16 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
form_cycle_D2 0 1 0.02 0.14 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
form_cycle_D3 0 1 0.01 0.12 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
form_cycle_DX 0 1 0.04 0.20 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
form_cycle_E1 0 1 0.01 0.10 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
form_cycle_E2 0 1 0.01 0.09 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
form_cycle_E3 0 1 0.00 0.07 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
form_cycle_EX 0 1 0.02 0.12 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
form_cycle_F1 0 1 0.00 0.06 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
form_cycle_F2 0 1 0.00 0.06 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
form_cycle_F3 0 1 0.00 0.04 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
form_cycle_FX 0 1 0.00 0.07 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
surf_Inner.Dirt 0 1 0.00 0.04 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
surf_Inner.Turf 0 1 0.03 0.17 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
surf_Synth 0 1 0.04 0.19 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
surf_Turf 0 1 0.35 0.48 0.00 0.00 0.00 1.00 1.00 ▇▁▁▁▅
wind_hvy 0 1 0.07 0.26 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
wind_vhvy 0 1 0.04 0.19 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
precip_rain 0 1 0.03 0.18 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
precip_snow 0 1 0.00 0.04 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
s_cond_Fst 0 1 0.51 0.50 0.00 0.00 1.00 1.00 1.00 ▇▁▁▁▇
s_cond_Gd 0 1 0.09 0.29 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
s_cond_My 0 1 0.02 0.14 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
s_cond_Sf 0 1 0.00 0.07 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
s_cond_Sly 0 1 0.06 0.23 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
s_cond_Trk_condit 0 1 0.00 0.03 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
s_cond_Yl 0 1 0.01 0.11 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
race_type_Sprint 0 1 0.52 0.50 0.00 0.00 1.00 1.00 1.00 ▇▁▁▁▇
surface_cond_frz 0 1 0.00 0.03 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
surface_cond_fst 0 1 0.45 0.50 0.00 0.00 0.00 1.00 1.00 ▇▁▁▁▆
surface_cond_gd 0 1 0.05 0.21 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
surface_cond_gd.slw 0 1 0.03 0.17 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
surface_cond_hvy 0 1 0.00 0.02 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
surface_cond_puaf 0 1 0.01 0.08 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
surface_cond_sft 0 1 0.01 0.09 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
surface_cond_vbw 0 1 0.00 0.03 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
surface_cond_vslw 0 1 0.05 0.21 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
surface_cond_wf 0 1 0.01 0.09 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
surface_cond_wsm 0 1 0.08 0.27 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
surface_cond_yld 0 1 0.01 0.11 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
race_date_month_Feb 0 1 0.09 0.29 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
race_date_month_Mar 0 1 0.06 0.24 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
race_date_month_Apr 0 1 0.05 0.22 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
race_date_month_May 0 1 0.07 0.25 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
race_date_month_Jun 0 1 0.08 0.27 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
race_date_month_Jul 0 1 0.08 0.27 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
race_date_month_Aug 0 1 0.09 0.29 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
race_date_month_Sep 0 1 0.09 0.29 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
race_date_month_Oct 0 1 0.09 0.29 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
race_date_month_Nov 0 1 0.10 0.30 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
race_date_month_Dec 0 1 0.09 0.28 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁

Setup a Workflow

We create the workflow by adding the formula and recipe

figs_wf <- workflow() %>% 
  add_recipe(figs_rec) 

Add Model to Workflow and Fit to each of the resamples.

members_metrics <- metric_set(roc_auc, accuracy, sensitivity, specificity)

doParallel::registerDoParallel()

glm_rs <- figs_wf %>% 
  add_model(glm_spec) %>% 
  fit_resamples(
    resamples = figs_boot,
    metrics = members_metrics,
    control = control_resamples(save_pred = TRUE)
  )

glm_rs
## # Resampling results
## # Bootstrap sampling 
## # A tibble: 25 x 5
##    splits            id         .metrics        .notes        .predictions      
##    <list>            <chr>      <list>          <list>        <list>            
##  1 <split [26713/98… Bootstrap… <tibble [4 × 4… <tibble [2 ×… <tibble [9,810 × …
##  2 <split [26713/98… Bootstrap… <tibble [4 × 4… <tibble [1 ×… <tibble [9,822 × …
##  3 <split [26713/98… Bootstrap… <tibble [4 × 4… <tibble [1 ×… <tibble [9,857 × …
##  4 <split [26713/97… Bootstrap… <tibble [4 × 4… <tibble [2 ×… <tibble [9,783 × …
##  5 <split [26713/98… Bootstrap… <tibble [4 × 4… <tibble [1 ×… <tibble [9,855 × …
##  6 <split [26713/98… Bootstrap… <tibble [4 × 4… <tibble [1 ×… <tibble [9,837 × …
##  7 <split [26713/98… Bootstrap… <tibble [4 × 4… <tibble [1 ×… <tibble [9,817 × …
##  8 <split [26713/98… Bootstrap… <tibble [4 × 4… <tibble [1 ×… <tibble [9,801 × …
##  9 <split [26713/97… Bootstrap… <tibble [4 × 4… <tibble [1 ×… <tibble [9,788 × …
## 10 <split [26713/98… Bootstrap… <tibble [4 × 4… <tibble [1 ×… <tibble [9,846 × …
## # … with 15 more rows

Evaluate Model

Now we use collect_metrics to see the results of our modeling. Here we use accuracy and roc to evaluate the model

collect_metrics(glm_rs)
## # A tibble: 4 x 6
##   .metric  .estimator  mean     n  std_err .config             
##   <chr>    <chr>      <dbl> <int>    <dbl> <chr>               
## 1 accuracy binary     0.679    25 0.000834 Preprocessor1_Model1
## 2 roc_auc  binary     0.757    25 0.000891 Preprocessor1_Model1
## 3 sens     binary     0.699    25 0.00219  Preprocessor1_Model1
## 4 spec     binary     0.673    25 0.00133  Preprocessor1_Model1

Now lets look at the confusion matrix

glm_rs %>% 
  conf_mat_resampled()
## # A tibble: 4 x 3
##   Prediction Truth  Freq
##   <fct>      <fct> <dbl>
## 1 Top        Top   1657.
## 2 Top        NoTop 2435.
## 3 NoTop      Top    715.
## 4 NoTop      NoTop 5009.

Finally, lets plot and roc curve to visualize our roc resutls

glm_rs %>% 
  collect_predictions() %>% 
  group_by(id) %>%   
  roc_curve(target, .pred_Top) %>% 
  ggplot(aes(1-specificity, sensitivity, color=id)) +
  geom_abline(lty=2, color="gray80", size= 1.5) +
  geom_path(show.legend =FALSE, alpha = 0.6, size =1.2) +
  coord_equal()

Finalize Model

Now lets peform the last_fit() to finalize our model against the test dataset

figs_final <- figs_wf %>% 
  add_model(glm_spec) %>% 
  last_fit(figs_split)

figs_final
## # Resampling results
## # Manual resampling 
## # A tibble: 1 x 6
##   splits         id          .metrics     .notes      .predictions     .workflow
##   <list>         <chr>       <list>       <list>      <list>           <list>   
## 1 <split [26713… train/test… <tibble [2 … <tibble [1… <tibble [8,904 … <workflo…

The metrics and prediction have been calculated against the test data

collect_metrics(figs_final)
## # A tibble: 2 x 4
##   .metric  .estimator .estimate .config             
##   <chr>    <chr>          <dbl> <chr>               
## 1 accuracy binary         0.671 Preprocessor1_Model1
## 2 roc_auc  binary         0.756 Preprocessor1_Model1
collect_predictions(figs_final) %>% 
  conf_mat(target, .pred_class)
##           Truth
## Prediction  Top NoTop
##      Top   1519  2306
##      NoTop  620  4458

The coefficients (which we can get out using tidy()) have been estimated using the training data. If we use exponentiate = TRUE, we have odds ratios.

options(scipen = 999)

figs_final$.workflow[[1]] %>% 
  tidy(exponentiate=TRUE) %>% 
  arrange(p.value) %>%
  DT::datatable()
figs_final %>%
  pull(.workflow) %>%
  pluck(1) %>%
  tidy() %>%
  filter(term != "(Intercept)") %>%
  ggplot(aes(estimate, fct_reorder(term, estimate))) +
  geom_vline(xintercept = 0, color = "gray50", lty = 2, size = 1.2) +
  geom_point(size = 2, color = "#85144B") +
  labs(y = NULL, x = "Coefficent from logistic regression")