Get the Data from GitHub
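
The walkthrough below leans on the tidyverse and tidymodels ecosystems. A sketch of the library() calls it assumes, inferred from the functions used (adjust to your setup):

library(RCurl)      # getURL()
library(tidyverse)  # read_csv(), dplyr verbs, ggplot2
library(lubridate)  # mdy()
library(skimr)      # skim()
library(tidymodels) # rsample, recipes, parsnip, workflows, dials, tune, yardstick
library(themis)     # step_upsample()
library(vip)        # variable importance plots
library(ggthemes)   # theme_fivethirtyeight()
library(DT)         # datatable()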

data <- getURL("https://raw.githubusercontent.com/Handicappr/Rstudio_test_project/main/fig_data2.csv")

figs <- read_csv(data) %>% 
  # recode the outcome as a two-level label
  mutate(eff_target = if_else(eff_target == 1, "Top", "NoTop")) %>% 
  # remove columns that can't be used in the model
  select(-target, -D1, -D2, -eff_TV, -top, -eff_top, -fv, -DST, -seed, -E, -E4, -TE, -split_800, -split_801, -split_802, -split_803, -split_804, -split_805, -won, -RST, -X) %>% 
  rename(target = eff_target) %>% 
  # change character data to factors
  mutate_if(is.character, factor) %>% 
  # parse race_date as a Date
  mutate(race_date = mdy(race_date))

  
Use skim() for a quick overview of every column.

skim(figs)
Data summary
Name figs
Number of rows 35617
Number of columns 59
_______________________
Column type frequency:
Date 1
factor 14
numeric 44
________________________
Group variables None

Variable type: Date

skim_variable n_missing complete_rate min max median n_unique
race_date 0 1 2016-04-13 2021-12-20 2019-10-31 1433

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
target 0 1 FALSE 2 NoT: 27061, Top: 8556
horse 0 1 FALSE 3387 CHA: 57, VIC: 57, STA: 54, EL : 53
surface_cond 0 1 FALSE 13 fst: 16366, frm: 10368, wsm: 2913, vsl: 1771
precip 0 1 FALSE 3 cle: 34259, rai: 1278, sno: 80
wind 0 1 FALSE 3 cal: 31586, hvy: 2581, vhv: 1450
gender 0 1 FALSE 2 Mal: 24217, Fem: 11400
distance 0 1 FALSE 22 8.0: 9316, 6.0: 7624, 8.5: 4621, 7.0: 3011
jky 0 1 FALSE 729 I O: 1034, L S: 930, M F: 915, J O: 879
trk_code 0 1 FALSE 145 GP: 8459, AQU: 4858, BEL: 3366, SA: 2892
surf 0 1 FALSE 5 Dir: 21482, Tur: 11757, Syn: 1271, Inn: 1038
s_cond 0 1 FALSE 8 Fst: 18433, Frm: 10362, Gd: 3275, Sly: 2180
form_cycle 0 1 FALSE 24 AX: 6600, CX: 3762, BX: 3536, A1: 3387
race_type 0 1 FALSE 2 Spr: 18816, Rou: 16801
trk 0 1 FALSE 130 GP: 8377, AQ: 4769, BE: 3359, SA: 2887

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
n 0 1 9.86 8.51 1.00 3.00 7.00 14.00 57.00 ▇▂▁▁▁
cls_val 0 1 1.99 0.89 1.00 1.15 1.62 3.00 12.59 ▇▁▁▁▁
cls_mv 0 1 0.07 0.68 -11.49 0.00 0.00 0.00 11.39 ▁▁▇▁▁
yr 0 1 2019.22 0.99 2016.00 2019.00 2019.00 2020.00 2021.00 ▁▃▇▇▂
age 0 1 3.64 1.30 1.00 3.00 3.00 4.00 11.00 ▇▆▁▁▁
ftl 0 1 0.08 0.27 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
clm_by_hot 0 1 -0.93 0.34 -1.00 -1.00 -1.00 -1.00 1.00 ▇▁▁▁▁
start 0 1 0.03 0.09 0.00 0.00 0.00 0.00 0.90 ▇▁▁▁▁
bled 0 1 0.00 0.02 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
off_turf 0 1 0.05 0.21 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
bfnr 0 1 0.01 0.11 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
lame 0 1 0.00 0.02 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
lsx 0 1 0.00 0.09 -1.00 0.00 0.00 0.00 1.00 ▁▁▇▁▁
blnkrs 0 1 0.04 0.27 -1.00 0.00 0.00 0.00 1.00 ▁▁▇▁▁
age_wks 0 1 204.45 65.52 37.00 154.00 190.00 240.00 559.00 ▂▇▂▁▁
wt 2 1 120.32 2.70 108.00 119.00 120.00 122.00 143.00 ▁▇▂▁▁
off_odds 0 1 12.17 17.09 0.11 2.50 6.00 13.00 99.00 ▇▁▁▁▁
fld 2 1 8.39 2.21 2.00 7.00 8.00 10.00 30.00 ▅▇▁▁▁
L3 0 1 18.01 7.03 1.75 13.33 16.92 21.42 99.00 ▇▃▁▁▁
L5 0 1 18.26 6.77 1.75 13.70 17.25 21.62 99.00 ▇▃▁▁▁
L7 0 1 18.43 6.66 1.75 13.96 17.43 21.75 99.00 ▇▃▁▁▁
rest 0 1 43.85 53.38 0.00 21.00 28.00 43.00 1081.00 ▇▁▁▁▁
avg_rest 0 1 39.75 21.66 7.25 25.67 34.70 46.78 489.00 ▇▁▁▁▁
efforts_last90 0 1 0.90 0.95 0.00 0.00 1.00 1.00 8.00 ▇▂▁▁▁
Lag1 0 1 0.21 0.40 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▂
Lag2 0 1 0.19 0.39 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▂
Lag3 0 1 0.17 0.37 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▂
Lag4 0 1 0.15 0.36 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▂
Lag5 0 1 0.14 0.34 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
LP1 0 1 -1.91 2.27 -5.00 -5.00 -2.00 0.00 1.00 ▆▂▃▂▇
LP2 0 1 -1.91 2.16 -5.00 -4.00 -2.00 0.00 1.00 ▆▂▆▂▇
LP3 0 1 -1.92 2.05 -5.00 -4.00 -2.00 0.00 1.00 ▆▂▇▂▇
LP4 0 1 -1.92 1.95 -5.00 -3.00 -2.00 0.00 1.00 ▅▁▇▁▆
LP5 0 1 -1.93 1.85 -5.00 -3.00 -2.00 0.00 1.00 ▃▁▇▁▅
pp_id 0 1 27487.78 16975.26 1.00 12623.00 26040.00 41459.00 59178.00 ▇▇▆▇▆
splt_1 0 1 4.48 6.00 0.00 0.00 2.89 6.79 51.00 ▇▁▁▁▁
splt_2 0 1 2.36 3.93 0.00 0.00 0.00 3.81 35.91 ▇▁▁▁▁
splt_3 0 1 2.60 5.16 0.00 0.00 0.00 3.57 45.30 ▇▁▁▁▁
splt_4 0 1 3.53 5.16 0.00 0.00 1.23 5.27 46.29 ▇▁▁▁▁
splt_5 0 1 1.34 2.75 0.00 0.00 0.00 1.93 27.99 ▇▁▁▁▁
splt_6 0 1 2.85 4.09 0.00 0.00 0.00 3.97 30.87 ▇▂▁▁▁
splt_7 0 1 4.68 5.87 0.00 0.00 2.90 7.42 37.86 ▇▂▁▁▁
splt_8 0 1 11.88 9.05 0.00 3.93 10.63 17.73 50.57 ▇▆▂▁▁
splt_9 0 1 10.57 8.58 0.00 2.97 9.35 16.01 47.05 ▇▅▂▁▁

Build Model

We can start by splitting our data into training and testing sets.

set.seed(123)

figs_split <- initial_split(figs, strata = target)
figs_train <- training(figs_split)
figs_test <- testing(figs_split)

figs_split
## <Analysis/Assess/Total>
## <26713/8904/35617>
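
Only about a quarter of the rows are Top efforts, so stratifying on target keeps the class balance comparable in both sets. A quick sanity check (sketch):

# class proportions should be nearly identical in train and test
figs_train %>% count(target, name = "count") %>% mutate(prop = count / sum(count))
figs_test %>% count(target, name = "count") %>% mutate(prop = count / sum(count))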

Create Model Specification

We create a model specification using the parsnip interface to the xgboost engine, marking the main hyperparameters for tuning.

xgb_spec <- boost_tree(
  trees = 1000,
  tree_depth = tune(),
  min_n = tune(),
  loss_reduction = tune(),
  sample_size = tune(),
  mtry = tune(),
  learn_rate = tune()
) %>% 
  set_engine("xgboost") %>% 
  set_mode("classification")

xgb_spec
## Boosted Tree Model Specification (classification)
## 
## Main Arguments:
##   mtry = tune()
##   trees = 1000
##   min_n = tune()
##   tree_depth = tune()
##   learn_rate = tune()
##   loss_reduction = tune()
##   sample_size = tune()
## 
## Computational engine: xgboost
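
parsnip's engine-agnostic arguments map onto xgboost's native parameters (tree_depth → max_depth, learn_rate → eta, loss_reduction → gamma, sample_size → subsample). translate() shows the underlying call:

# inspect how the specification translates to the xgboost engine
xgb_spec %>% translate()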

Create a Grid of Hyperparameter Values

A space-filling Latin hypercube design covers the six-dimensional search space more evenly than a regular grid of the same size. Because mtry depends on the number of predictors, it is finalized against the training data, and sample_size is supplied as a proportion via sample_prop().

xgb_grid <- grid_latin_hypercube(
  tree_depth(),
  min_n(),
  loss_reduction(),
  sample_size = sample_prop(),
  finalize(mtry(), figs_train),
  learn_rate(),
  size = 30
)
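
Each row of the grid is one candidate combination of the six tuned hyperparameters; a quick look (sketch):

dim(xgb_grid)  # 30 rows, one column per tuned hyperparameter
head(xgb_grid)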

Create a Recipe

The recipe performs the preprocessing needed before fitting the xgboost model: rare factor levels are pooled, nominal predictors are dummy-encoded, date features are extracted, and the minority class is upsampled.

figs_rec <- recipe(target ~ ., data = figs_train) %>%
  update_role(horse, new_role = "ID") %>% 
  update_role(pp_id, new_role = "ID") %>% 
  step_other(jky, threshold = 0.015) %>% 
  step_other(trk_code, threshold = 0.02) %>% 
  step_other(distance, threshold = 0.02) %>% 
  step_dummy(distance) %>% 
  step_dummy(gender) %>% 
  step_dummy(trk_code) %>% 
  step_dummy(jky) %>% 
  step_dummy(form_cycle) %>% 
  step_dummy(surf) %>% 
  step_dummy(wind) %>% 
  step_dummy(precip) %>% 
  step_dummy(s_cond) %>% 
  step_dummy(race_type) %>% 
  step_dummy(surface_cond) %>% 
  step_rm(yr, trk) %>% 
  step_date(race_date, features = c("month", "year")) %>% 
  step_dummy(race_date_month) %>% 
  step_rm(race_date, n) %>% 
  themis::step_upsample(target)
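
As an aside, the same preprocessing can be written more compactly with the recipes selector functions; an equivalent, untested sketch:

figs_rec_alt <- recipe(target ~ ., data = figs_train) %>%
  update_role(horse, pp_id, new_role = "ID") %>%
  step_other(jky, threshold = 0.015) %>%
  step_other(trk_code, distance, threshold = 0.02) %>%
  step_date(race_date, features = c("month", "year")) %>%
  step_rm(race_date, yr, trk, n) %>%
  # dummy-encode every remaining factor predictor in one step
  step_dummy(all_nominal_predictors()) %>%
  themis::step_upsample(target)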
  
  

figs_prep <- prep(figs_rec)

juice() returns the preprocessed training set. Note that step_upsample() has balanced the two classes, which is why the summary below shows more rows than the raw training data.

juiced <- juice(figs_prep)
skim(juiced)
Data summary
Name juiced
Number of rows 40592
Number of columns 142
_______________________
Column type frequency:
factor 2
numeric 140
________________________
Group variables None

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
horse 0 1 FALSE 3302 EYE: 68, OUR: 61, VIC: 57, STA: 56
target 0 1 FALSE 2 NoT: 20296, Top: 20296

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
cls_val 0 1 1.97 0.89 1.00 1.15 1.50 3.00 12.59 ▇▁▁▁▁
cls_mv 0 1 0.10 0.70 -11.49 0.00 0.00 0.00 11.39 ▁▁▇▁▁
age 0 1 3.51 1.26 1.00 3.00 3.00 4.00 11.00 ▇▅▁▁▁
ftl 0 1 0.07 0.25 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
clm_by_hot 0 1 -0.93 0.32 -1.00 -1.00 -1.00 -1.00 1.00 ▇▁▁▁▁
start 0 1 0.02 0.09 0.00 0.00 0.00 0.00 0.90 ▇▁▁▁▁
bled 0 1 0.00 0.01 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
off_turf 0 1 0.04 0.21 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
bfnr 0 1 0.01 0.09 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
lame 0 1 0.00 0.02 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
lsx 0 1 0.00 0.09 -1.00 0.00 0.00 0.00 1.00 ▁▁▇▁▁
blnkrs 0 1 0.04 0.27 -1.00 0.00 0.00 0.00 1.00 ▁▁▇▁▁
age_wks 0 1 197.86 62.73 44.00 150.00 183.00 230.00 559.00 ▃▇▂▁▁
wt 1 1 120.31 2.70 108.00 118.00 120.00 122.00 140.00 ▁▇▆▁▁
off_odds 0 1 12.10 17.13 0.11 2.50 5.00 13.00 99.00 ▇▁▁▁▁
fld 1 1 8.42 2.19 2.00 7.00 8.00 10.00 30.00 ▅▇▁▁▁
L3 0 1 18.02 6.91 1.75 13.33 17.00 21.50 99.00 ▇▃▁▁▁
L5 0 1 18.41 6.67 1.75 13.85 17.40 21.85 99.00 ▇▃▁▁▁
L7 0 1 18.63 6.56 1.75 14.18 17.67 22.06 99.00 ▇▃▁▁▁
rest 0 1 45.37 54.07 0.00 21.00 29.00 46.00 850.00 ▇▁▁▁▁
avg_rest 0 1 40.44 23.06 8.75 26.00 34.67 47.14 340.33 ▇▁▁▁▁
efforts_last90 0 1 0.98 0.94 0.00 0.00 1.00 2.00 8.00 ▇▃▁▁▁
Lag1 0 1 0.23 0.42 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▂
Lag2 0 1 0.19 0.39 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▂
Lag3 0 1 0.16 0.37 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▂
Lag4 0 1 0.14 0.35 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▂
Lag5 0 1 0.12 0.33 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
LP1 0 1 -1.61 2.27 -5.00 -4.00 -1.00 0.00 1.00 ▅▁▂▁▇
LP2 0 1 -1.75 2.11 -5.00 -3.00 -2.00 0.00 1.00 ▅▁▆▂▇
LP3 0 1 -1.83 1.98 -5.00 -3.00 -2.00 0.00 1.00 ▅▁▇▁▇
LP4 0 1 -1.86 1.87 -5.00 -2.00 -2.00 0.00 1.00 ▃▁▇▁▅
LP5 0 1 -1.90 1.76 -5.00 -2.00 -2.00 -1.00 1.00 ▃▁▇▁▃
pp_id 0 1 27292.06 16846.47 1.00 12675.75 25536.00 41076.25 59176.00 ▇▇▆▇▅
splt_1 0 1 4.38 5.77 0.00 0.00 2.93 6.72 51.00 ▇▁▁▁▁
splt_2 0 1 2.40 3.84 0.00 0.00 0.00 3.83 35.91 ▇▁▁▁▁
splt_3 0 1 2.31 4.84 0.00 0.00 0.00 2.92 45.30 ▇▁▁▁▁
splt_4 0 1 3.43 4.97 0.00 0.00 1.68 4.92 46.29 ▇▁▁▁▁
splt_5 0 1 1.32 2.67 0.00 0.00 0.00 1.96 27.99 ▇▁▁▁▁
splt_6 0 1 2.74 3.94 0.00 0.00 0.00 3.95 30.87 ▇▂▁▁▁
splt_7 0 1 4.45 5.62 0.00 0.00 2.90 6.87 37.86 ▇▂▁▁▁
splt_8 0 1 11.26 8.68 0.00 3.00 9.90 16.69 50.57 ▇▅▂▁▁
splt_9 0 1 10.07 8.20 0.00 2.96 8.32 15.01 47.05 ▇▅▂▁▁
distance_X5.5F 0 1 0.06 0.24 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
distance_X6.0F 0 1 0.21 0.40 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▂
distance_X6.5F 0 1 0.08 0.27 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
distance_X7.0F 0 1 0.09 0.28 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
distance_X8.0F 0 1 0.27 0.44 0.00 0.00 0.00 1.00 1.00 ▇▁▁▁▃
distance_X8.5F 0 1 0.13 0.34 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
distance_X9.0F 0 1 0.05 0.22 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
distance_other 0 1 0.06 0.23 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
gender_Male 0 1 0.67 0.47 0.00 0.00 1.00 1.00 1.00 ▃▁▁▁▇
trk_code_BEL 0 1 0.09 0.29 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
trk_code_CD 0 1 0.05 0.21 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
trk_code_DMR 0 1 0.04 0.20 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
trk_code_GP 0 1 0.24 0.43 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▂
trk_code_GPW 0 1 0.03 0.18 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
trk_code_KEE 0 1 0.02 0.15 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
trk_code_SA 0 1 0.08 0.27 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
trk_code_SAR 0 1 0.06 0.23 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
trk_code_WO 0 1 0.03 0.16 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
trk_code_other 0 1 0.22 0.42 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▂
jky_E.Cancel 0 1 0.02 0.13 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
jky_E.Jaramillo 0 1 0.02 0.12 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
jky_E.Zayas 0 1 0.02 0.15 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
jky_I.Ortiz..Jr. 0 1 0.03 0.17 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
jky_J.Alvarado 0 1 0.02 0.14 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
jky_J.Lezcano 0 1 0.02 0.14 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
jky_J.Ortiz 0 1 0.02 0.15 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
jky_J.Rosario 0 1 0.02 0.14 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
jky_L.Reyes 0 1 0.02 0.13 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
jky_L.Saez 0 1 0.03 0.16 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
jky_M.Franco 0 1 0.02 0.15 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
jky_M.Vasquez 0 1 0.02 0.13 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
jky_P.Lopez 0 1 0.02 0.13 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
jky_R.Maragh 0 1 0.01 0.12 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
jky_T.Gaffalione 0 1 0.02 0.15 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
jky_other 0 1 0.67 0.47 0.00 0.00 1.00 1.00 1.00 ▃▁▁▁▇
form_cycle_A2 0 1 0.11 0.31 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
form_cycle_A3 0 1 0.07 0.25 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
form_cycle_AX 0 1 0.18 0.39 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▂
form_cycle_B1 0 1 0.06 0.24 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
form_cycle_B2 0 1 0.04 0.20 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
form_cycle_B3 0 1 0.03 0.17 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
form_cycle_BX 0 1 0.09 0.28 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
form_cycle_C1 0 1 0.05 0.22 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
form_cycle_C2 0 1 0.03 0.18 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
form_cycle_C3 0 1 0.03 0.16 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
form_cycle_CX 0 1 0.10 0.30 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
form_cycle_D1 0 1 0.02 0.15 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
form_cycle_D2 0 1 0.02 0.14 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
form_cycle_D3 0 1 0.01 0.11 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
form_cycle_DX 0 1 0.04 0.20 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
form_cycle_E1 0 1 0.01 0.10 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
form_cycle_E2 0 1 0.01 0.09 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
form_cycle_E3 0 1 0.01 0.07 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
form_cycle_EX 0 1 0.02 0.12 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
form_cycle_F1 0 1 0.00 0.06 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
form_cycle_F2 0 1 0.00 0.05 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
form_cycle_F3 0 1 0.00 0.04 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
form_cycle_FX 0 1 0.00 0.06 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
surf_Inner.Dirt 0 1 0.00 0.05 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
surf_Inner.Turf 0 1 0.03 0.17 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
surf_Synth 0 1 0.04 0.19 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
surf_Turf 0 1 0.35 0.48 0.00 0.00 0.00 1.00 1.00 ▇▁▁▁▅
wind_hvy 0 1 0.07 0.26 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
wind_vhvy 0 1 0.04 0.19 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
precip_rain 0 1 0.03 0.18 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
precip_snow 0 1 0.00 0.04 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
s_cond_Fst 0 1 0.51 0.50 0.00 0.00 1.00 1.00 1.00 ▇▁▁▁▇
s_cond_Gd 0 1 0.09 0.29 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
s_cond_My 0 1 0.02 0.14 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
s_cond_Sf 0 1 0.00 0.07 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
s_cond_Sly 0 1 0.06 0.23 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
s_cond_Trk_condit 0 1 0.00 0.03 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
s_cond_Yl 0 1 0.01 0.11 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
race_type_Sprint 0 1 0.52 0.50 0.00 0.00 1.00 1.00 1.00 ▇▁▁▁▇
surface_cond_frz 0 1 0.00 0.02 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
surface_cond_fst 0 1 0.45 0.50 0.00 0.00 0.00 1.00 1.00 ▇▁▁▁▆
surface_cond_gd 0 1 0.05 0.21 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
surface_cond_gd.slw 0 1 0.03 0.17 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
surface_cond_hvy 0 1 0.00 0.02 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
surface_cond_puaf 0 1 0.01 0.08 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
surface_cond_sft 0 1 0.01 0.08 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
surface_cond_vbw 0 1 0.00 0.03 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
surface_cond_vslw 0 1 0.05 0.21 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
surface_cond_wf 0 1 0.01 0.10 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
surface_cond_wsm 0 1 0.08 0.27 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
surface_cond_yld 0 1 0.01 0.11 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
race_date_year 0 1 2019.20 1.00 2016.00 2019.00 2019.00 2020.00 2021.00 ▁▃▇▇▂
race_date_month_Feb 0 1 0.09 0.29 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
race_date_month_Mar 0 1 0.06 0.24 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
race_date_month_Apr 0 1 0.05 0.22 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
race_date_month_May 0 1 0.07 0.25 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
race_date_month_Jun 0 1 0.07 0.26 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
race_date_month_Jul 0 1 0.08 0.27 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
race_date_month_Aug 0 1 0.10 0.29 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
race_date_month_Sep 0 1 0.09 0.29 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
race_date_month_Oct 0 1 0.10 0.29 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
race_date_month_Nov 0 1 0.10 0.29 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
race_date_month_Dec 0 1 0.09 0.28 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁

Set Up a Workflow

We create the workflow by bundling the recipe and the model specification.

xgb_wf <- workflow() %>% 
  add_recipe(figs_rec) %>% 
  add_model(xgb_spec)

xgb_wf   
## ══ Workflow ════════════════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: boost_tree()
## 
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 19 Recipe Steps
## 
## ● step_other()
## ● step_other()
## ● step_other()
## ● step_dummy()
## ● step_dummy()
## ● step_dummy()
## ● step_dummy()
## ● step_dummy()
## ● step_dummy()
## ● step_dummy()
## ● ...
## ● and 9 more steps.
## 
## ── Model ───────────────────────────────────────────────────────────────────────
## Boosted Tree Model Specification (classification)
## 
## Main Arguments:
##   mtry = tune()
##   trees = 1000
##   min_n = tune()
##   tree_depth = tune()
##   learn_rate = tune()
##   loss_reduction = tune()
##   sample_size = tune()
## 
## Computational engine: xgboost

Now, let's create cross-validation resamples for tuning our xgboost model.

set.seed(123)
figs_folds <- vfold_cv(figs_train, strata = target)

figs_folds
## #  10-fold cross-validation using stratification 
## # A tibble: 10 x 2
##    splits               id    
##    <list>               <chr> 
##  1 <split [24041/2672]> Fold01
##  2 <split [24041/2672]> Fold02
##  3 <split [24041/2672]> Fold03
##  4 <split [24041/2672]> Fold04
##  5 <split [24041/2672]> Fold05
##  6 <split [24041/2672]> Fold06
##  7 <split [24042/2671]> Fold07
##  8 <split [24043/2670]> Fold08
##  9 <split [24043/2670]> Fold09
## 10 <split [24043/2670]> Fold10

Tune the Model

We use tune_grid() along with our tunable workflow, our resamples, and our grid of candidate parameters. control_grid(save_pred = TRUE) preserves the out-of-fold predictions for later analysis.

figs_metrics <- metric_set(roc_auc, accuracy, sensitivity, specificity)

doParallel::registerDoParallel()

set.seed(234)

xgb_res <- tune_grid(
  xgb_wf,
  resamples = figs_folds,
  grid = xgb_grid,
  metrics = figs_metrics,
  control = control_grid(save_pred = TRUE)
)
xgb_res
## # Tuning results
## # 10-fold cross-validation using stratification 
## # A tibble: 10 x 5
##    splits             id    .metrics         .notes         .predictions        
##    <list>             <chr> <list>           <list>         <list>              
##  1 <split [24041/267… Fold… <tibble [40 × 1… <tibble [0 × … <tibble [26,720 × 1…
##  2 <split [24041/267… Fold… <tibble [40 × 1… <tibble [0 × … <tibble [26,720 × 1…
##  3 <split [24041/267… Fold… <tibble [40 × 1… <tibble [0 × … <tibble [26,720 × 1…
##  4 <split [24041/267… Fold… <tibble [40 × 1… <tibble [0 × … <tibble [26,720 × 1…
##  5 <split [24041/267… Fold… <tibble [40 × 1… <tibble [0 × … <tibble [26,720 × 1…
##  6 <split [24041/267… Fold… <tibble [40 × 1… <tibble [0 × … <tibble [26,720 × 1…
##  7 <split [24042/267… Fold… <tibble [40 × 1… <tibble [0 × … <tibble [26,710 × 1…
##  8 <split [24043/267… Fold… <tibble [40 × 1… <tibble [0 × … <tibble [26,700 × 1…
##  9 <split [24043/267… Fold… <tibble [40 × 1… <tibble [0 × … <tibble [26,700 × 1…
## 10 <split [24043/267… Fold… <tibble [40 × 1… <tibble [0 × … <tibble [26,700 × 1…

Evaluate Model

Now we use collect_metrics() to see the results of our resampling. Here we focus on accuracy and ROC AUC to evaluate the model (sensitivity and specificity were computed as well).

collect_metrics(xgb_res)
## # A tibble: 40 x 12
##     mtry min_n tree_depth    learn_rate loss_reduction sample_size .metric 
##    <int> <int>      <int>         <dbl>          <dbl>       <dbl> <chr>   
##  1   110    17          1 0.00000719        0.147            0.454 accuracy
##  2   110    17          1 0.00000719        0.147            0.454 roc_auc 
##  3   110    17          1 0.00000719        0.147            0.454 sens    
##  4   110    17          1 0.00000719        0.147            0.454 spec    
##  5    48    19          6 0.00000000167     2.75             0.578 accuracy
##  6    48    19          6 0.00000000167     2.75             0.578 roc_auc 
##  7    48    19          6 0.00000000167     2.75             0.578 sens    
##  8    48    19          6 0.00000000167     2.75             0.578 spec    
##  9    63     6          7 0.000000268       0.00000126       0.966 accuracy
## 10    63     6          7 0.000000268       0.00000126       0.966 roc_auc 
## # … with 30 more rows, and 5 more variables: .estimator <chr>, mean <dbl>,
## #   n <int>, std_err <dbl>, .config <chr>

Visualize Tuning Results

Plotting each hyperparameter against the mean AUC shows which regions of the search space performed well.

xgb_res %>% 
  collect_metrics() %>% 
  filter(.metric == "roc_auc") %>% 
  select(mean, mtry:sample_size) %>% 
  pivot_longer(mtry:sample_size,
               values_to = "value",
               names_to = "parameter"
  ) %>% 
  ggplot(aes(value, mean, color = parameter)) +
  geom_point(alpha = 0.8, show.legend = FALSE) +
  facet_wrap(~parameter, scales = "free_x") +
  labs(x = NULL, y = "AUC")

What's the Best Model?

show_best(xgb_res, metric = "roc_auc")
## # A tibble: 5 x 12
##    mtry min_n tree_depth   learn_rate loss_reduction sample_size .metric
##   <int> <int>      <int>        <dbl>          <dbl>       <dbl> <chr>  
## 1    95    27         10 0.0153        0.0000000510        0.837 roc_auc
## 2   137    13         12 0.00877       0.00000000110       0.720 roc_auc
## 3    63     6          7 0.000000268   0.00000126          0.966 roc_auc
## 4   125    37          8 0.0000000130  0.00418             0.308 roc_auc
## 5    30    30         14 0.0000587     0.000535            0.808 roc_auc
## # … with 5 more variables: .estimator <chr>, mean <dbl>, n <int>,
## #   std_err <dbl>, .config <chr>
best_auc <- select_best(xgb_res, metric = "roc_auc")
best_auc
## # A tibble: 1 x 7
##    mtry min_n tree_depth learn_rate loss_reduction sample_size .config          
##   <int> <int>      <int>      <dbl>          <dbl>       <dbl> <chr>            
## 1    95    27         10     0.0153   0.0000000510       0.837 Preprocessor1_Mo…
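
If we preferred a simpler model, tune also provides select_by_one_std_err(), which picks the simplest candidate within one standard error of the best; a sketch, sorting by tree_depth (the choice of sorting variable is a judgment call):

# shallower trees are "simpler"; pick the shallowest near-best candidate
simpler_auc <- select_by_one_std_err(xgb_res, tree_depth, metric = "roc_auc")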

Finalize Model

Now let's finalize our tunable workflow with the best parameter values; last_fit() will then fit this final model on the training data and evaluate it against the test dataset.

final_xgb <- finalize_workflow(
  xgb_wf, 
  best_auc
)

final_xgb
## ══ Workflow ════════════════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: boost_tree()
## 
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 19 Recipe Steps
## 
## ● step_other()
## ● step_other()
## ● step_other()
## ● step_dummy()
## ● step_dummy()
## ● step_dummy()
## ● step_dummy()
## ● step_dummy()
## ● step_dummy()
## ● step_dummy()
## ● ...
## ● and 9 more steps.
## 
## ── Model ───────────────────────────────────────────────────────────────────────
## Boosted Tree Model Specification (classification)
## 
## Main Arguments:
##   mtry = 95
##   trees = 1000
##   min_n = 27
##   tree_depth = 10
##   learn_rate = 0.0153160089416973
##   loss_reduction = 5.10282054010883e-08
##   sample_size = 0.837027746313252
## 
## Computational engine: xgboost

What Are the Important Variables?

We fit the finalized workflow to the training data and plot feature importance with vip.

final_xgb %>% 
  fit(data = figs_train) %>% 
  pull_workflow_fit() %>% 
  vip(geom = "point", num_features=25L) +  
  labs(
    x = NULL,
    y = NULL,
    title= "Feature Importance") + theme_fivethirtyeight()
## [02:42:29] WARNING: amalgamation/../src/learner.cc:1061: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.
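
The underlying importance scores are also available as a tibble via vip::vi(); a sketch that fits once and reuses the fit:

# fit the finalized workflow once, then pull scores from the xgboost fit
fitted_wf <- fit(final_xgb, data = figs_train)
fitted_wf %>% pull_workflow_fit() %>% vip::vi()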

Final Results

last_fit() fits the finalized workflow on the full training set and calculates metrics and predictions against the held-out test data.

final_res <- last_fit(final_xgb, figs_split)

collect_metrics(final_res)
## # A tibble: 2 x 4
##   .metric  .estimator .estimate .config             
##   <chr>    <chr>          <dbl> <chr>               
## 1 accuracy binary         0.748 Preprocessor1_Model1
## 2 roc_auc  binary         0.793 Preprocessor1_Model1

Confusion Matrix

collect_predictions(final_res) %>% 
  conf_mat(target, .pred_class)
##           Truth
## Prediction NoTop  Top
##      NoTop  5372  854
##      Top    1393 1285
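
yardstick can derive a whole battery of metrics from this same confusion matrix; because Top is the second factor level, we set event_level accordingly (sketch):

# sensitivity, specificity, precision, etc., from the confusion matrix
collect_predictions(final_res) %>% 
  conf_mat(target, .pred_class) %>% 
  summary(event_level = "second")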

ROC Curve

final_res %>% 
  collect_predictions() %>% 
  roc_curve(target, .pred_Top, event_level = "second") %>% 
  ggplot(aes(x=1-specificity, y = sensitivity)) +
  geom_line(size = 1.5, color = "midnightblue") +
  geom_abline(
    lty = 2, alpha =0.5,
    color = "gray50",
    size = 1.2
  )
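
For a quicker look, yardstick's autoplot() method draws the same curve in one line:

final_res %>% 
  collect_predictions() %>% 
  roc_curve(target, .pred_Top, event_level = "second") %>% 
  autoplot()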

Predictions

xgb_predictions <- final_res %>% 
  collect_predictions() %>% 
  mutate(correct = if_else(target == .pred_class, "Correct", "Incorrect"))
datatable(xgb_predictions)
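
As a final sanity check, the labeled predictions should reproduce the test-set accuracy reported above (sketch):

# proportion of correct test-set predictions (should match accuracy ≈ 0.748)
xgb_predictions %>% 
  count(correct, name = "count") %>% 
  mutate(prop = count / sum(count))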