library(lubridate)
library(tidyverse)
library(tidymodels)
library(skimr)
library(ggthemes)
library(DBI)
library(RMariaDB)
library(odbc)
library(RcppRoll)
library(magick)
library(RCurl)
library(vip)
library(themis)
library(DT)
library(gt)

Get the data from GitHub

data <- getURL("https://raw.githubusercontent.com/Handicappr/Rstudio_test_project/main/fig_data2.csv")

figs <- read_csv(data) %>% 
  as_tibble() %>% 
  rename(n = `n`) %>%
  # Recode the outcome as a two-level label
  mutate(eff_target = if_else(eff_target == 1, "Top", "NoTop")) %>% 
  # Remove columns that can't be used in the model
  select(-target, -D1, -D2, -eff_TV, -top, -eff_top, -fv, -DST, -seed, -E, -E4, -TE, -split_800, -split_801, -split_802, -split_803, -split_804, -split_805, -won, -RST, -X) %>%
  rename(target = eff_target) %>% 
  # Change character data to factors
  mutate_if(is.character, factor) %>%
  # Parse the race date and impute missing field size and weight with typical values
  mutate(race_date = mdy(race_date)) %>% 
  mutate(fld = if_else(is.na(fld), 8, fld)) %>%
  mutate(wt = if_else(is.na(wt), 120, wt))  
  
  
skim(figs)
Data summary
Name figs
Number of rows 35617
Number of columns 59
_______________________
Column type frequency:
Date 1
factor 14
numeric 44
________________________
Group variables None

Variable type: Date

skim_variable n_missing complete_rate min max median n_unique
race_date 0 1 2016-04-13 2021-12-20 2019-10-31 1433

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
target 0 1 FALSE 2 NoT: 27061, Top: 8556
horse 0 1 FALSE 3387 CHA: 57, VIC: 57, STA: 54, EL : 53
surface_cond 0 1 FALSE 13 fst: 16366, frm: 10368, wsm: 2913, vsl: 1771
precip 0 1 FALSE 3 cle: 34259, rai: 1278, sno: 80
wind 0 1 FALSE 3 cal: 31586, hvy: 2581, vhv: 1450
gender 0 1 FALSE 2 Mal: 24217, Fem: 11400
distance 0 1 FALSE 22 8.0: 9316, 6.0: 7624, 8.5: 4621, 7.0: 3011
jky 0 1 FALSE 729 I O: 1034, L S: 930, M F: 915, J O: 879
trk_code 0 1 FALSE 145 GP: 8459, AQU: 4858, BEL: 3366, SA: 2892
surf 0 1 FALSE 5 Dir: 21482, Tur: 11757, Syn: 1271, Inn: 1038
s_cond 0 1 FALSE 8 Fst: 18433, Frm: 10362, Gd: 3275, Sly: 2180
form_cycle 0 1 FALSE 24 AX: 6600, CX: 3762, BX: 3536, A1: 3387
race_type 0 1 FALSE 2 Spr: 18816, Rou: 16801
trk 0 1 FALSE 130 GP: 8377, AQ: 4769, BE: 3359, SA: 2887

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
n 0 1 9.86 8.51 1.00 3.00 7.00 14.00 57.00 ▇▂▁▁▁
cls_val 0 1 1.99 0.89 1.00 1.15 1.62 3.00 12.59 ▇▁▁▁▁
cls_mv 0 1 0.07 0.68 -11.49 0.00 0.00 0.00 11.39 ▁▁▇▁▁
yr 0 1 2019.22 0.99 2016.00 2019.00 2019.00 2020.00 2021.00 ▁▃▇▇▂
age 0 1 3.64 1.30 1.00 3.00 3.00 4.00 11.00 ▇▆▁▁▁
ftl 0 1 0.08 0.27 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
clm_by_hot 0 1 -0.93 0.34 -1.00 -1.00 -1.00 -1.00 1.00 ▇▁▁▁▁
start 0 1 0.03 0.09 0.00 0.00 0.00 0.00 0.90 ▇▁▁▁▁
bled 0 1 0.00 0.02 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
off_turf 0 1 0.05 0.21 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
bfnr 0 1 0.01 0.11 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
lame 0 1 0.00 0.02 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
lsx 0 1 0.00 0.09 -1.00 0.00 0.00 0.00 1.00 ▁▁▇▁▁
blnkrs 0 1 0.04 0.27 -1.00 0.00 0.00 0.00 1.00 ▁▁▇▁▁
age_wks 0 1 204.45 65.52 37.00 154.00 190.00 240.00 559.00 ▂▇▂▁▁
wt 0 1 120.32 2.70 108.00 119.00 120.00 122.00 143.00 ▁▇▂▁▁
off_odds 0 1 12.17 17.09 0.11 2.50 6.00 13.00 99.00 ▇▁▁▁▁
fld 0 1 8.39 2.21 2.00 7.00 8.00 10.00 30.00 ▅▇▁▁▁
L3 0 1 18.01 7.03 1.75 13.33 16.92 21.42 99.00 ▇▃▁▁▁
L5 0 1 18.26 6.77 1.75 13.70 17.25 21.62 99.00 ▇▃▁▁▁
L7 0 1 18.43 6.66 1.75 13.96 17.43 21.75 99.00 ▇▃▁▁▁
rest 0 1 43.85 53.38 0.00 21.00 28.00 43.00 1081.00 ▇▁▁▁▁
avg_rest 0 1 39.75 21.66 7.25 25.67 34.70 46.78 489.00 ▇▁▁▁▁
efforts_last90 0 1 0.90 0.95 0.00 0.00 1.00 1.00 8.00 ▇▂▁▁▁
Lag1 0 1 0.21 0.40 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▂
Lag2 0 1 0.19 0.39 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▂
Lag3 0 1 0.17 0.37 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▂
Lag4 0 1 0.15 0.36 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▂
Lag5 0 1 0.14 0.34 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
LP1 0 1 -1.91 2.27 -5.00 -5.00 -2.00 0.00 1.00 ▆▂▃▂▇
LP2 0 1 -1.91 2.16 -5.00 -4.00 -2.00 0.00 1.00 ▆▂▆▂▇
LP3 0 1 -1.92 2.05 -5.00 -4.00 -2.00 0.00 1.00 ▆▂▇▂▇
LP4 0 1 -1.92 1.95 -5.00 -3.00 -2.00 0.00 1.00 ▅▁▇▁▆
LP5 0 1 -1.93 1.85 -5.00 -3.00 -2.00 0.00 1.00 ▃▁▇▁▅
pp_id 0 1 27487.78 16975.26 1.00 12623.00 26040.00 41459.00 59178.00 ▇▇▆▇▆
splt_1 0 1 4.48 6.00 0.00 0.00 2.89 6.79 51.00 ▇▁▁▁▁
splt_2 0 1 2.36 3.93 0.00 0.00 0.00 3.81 35.91 ▇▁▁▁▁
splt_3 0 1 2.60 5.16 0.00 0.00 0.00 3.57 45.30 ▇▁▁▁▁
splt_4 0 1 3.53 5.16 0.00 0.00 1.23 5.27 46.29 ▇▁▁▁▁
splt_5 0 1 1.34 2.75 0.00 0.00 0.00 1.93 27.99 ▇▁▁▁▁
splt_6 0 1 2.85 4.09 0.00 0.00 0.00 3.97 30.87 ▇▂▁▁▁
splt_7 0 1 4.68 5.87 0.00 0.00 2.90 7.42 37.86 ▇▂▁▁▁
splt_8 0 1 11.88 9.05 0.00 3.93 10.63 17.73 50.57 ▇▆▂▁▁
splt_9 0 1 10.57 8.58 0.00 2.97 9.35 16.01 47.05 ▇▅▂▁▁

Build Model

We can start by splitting our data into training and testing sets.

set.seed(123)

figs_split <- initial_split(figs, strata = target)
figs_train <- training(figs_split)
figs_test <- testing(figs_split)

figs_split
## <Analysis/Assess/Total>
## <26713/8904/35617>
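
Because the split is stratified on target, the Top/NoTop mix should be nearly identical in both sets. A quick check might look like this (a sketch, not part of the original output; the test set can be checked the same way):

# Class balance in the training set
figs_train %>% 
  count(target, name = "count") %>% 
  mutate(prop = count / sum(count))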

Create a Recipe

The recipe performs the preprocessing needed before fitting the model: it assigns ID roles, collapses rare factor levels, creates dummy variables, extracts date features, and upsamples the minority class.

figs_rec <- recipe(target ~ ., data = figs_train) %>%
  # Keep identifiers in the data, but out of the model
  update_role(horse, new_role = "ID") %>% 
  update_role(pp_id, new_role = "ID") %>% 
  # Collapse infrequent levels into an "other" category
  step_other(jky, threshold = 0.015) %>% 
  step_other(trk_code, threshold = 0.02) %>% 
  step_other(distance, threshold = 0.02) %>% 
  # Create dummy variables for the nominal predictors
  step_dummy(distance) %>% 
  step_dummy(gender) %>% 
  step_dummy(trk_code) %>% 
  step_dummy(jky) %>% 
  step_dummy(form_cycle) %>% 
  step_dummy(surf) %>% 
  step_dummy(wind) %>% 
  step_dummy(precip) %>% 
  step_dummy(s_cond) %>% 
  step_dummy(race_type) %>% 
  step_dummy(surface_cond) %>% 
  # Drop redundant columns, derive month/year from the race date, then drop the date
  step_rm(yr, trk) %>% 
  step_date(race_date, features = c("month", "year")) %>% 
  step_dummy(race_date_month) %>% 
  step_rm(race_date, n) %>% 
  # Balance the classes by upsampling the minority class in the training data
  themis::step_upsample(target)
  
  

figs_prep <- prep(figs_rec)

juiced <- juice(figs_prep)
skim(juiced)
Data summary
Name juiced
Number of rows 40592
Number of columns 142
_______________________
Column type frequency:
factor 2
numeric 140
________________________
Group variables None

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
horse 0 1 FALSE 3302 EYE: 66, SAL: 60, STA: 60, EL : 59
target 0 1 FALSE 2 NoT: 20296, Top: 20296

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
cls_val 0 1 1.97 0.89 1.00 1.15 1.50 3.00 12.59 ▇▁▁▁▁
cls_mv 0 1 0.10 0.71 -11.49 0.00 0.00 0.00 11.39 ▁▁▇▁▁
age 0 1 3.52 1.25 1.00 3.00 3.00 4.00 11.00 ▇▅▁▁▁
ftl 0 1 0.06 0.25 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
clm_by_hot 0 1 -0.93 0.32 -1.00 -1.00 -1.00 -1.00 1.00 ▇▁▁▁▁
start 0 1 0.02 0.09 0.00 0.00 0.00 0.00 0.90 ▇▁▁▁▁
bled 0 1 0.00 0.01 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
off_turf 0 1 0.04 0.20 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
bfnr 0 1 0.01 0.09 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
lame 0 1 0.00 0.02 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
lsx 0 1 0.00 0.09 -1.00 0.00 0.00 0.00 1.00 ▁▁▇▁▁
blnkrs 0 1 0.04 0.27 -1.00 0.00 0.00 0.00 1.00 ▁▁▇▁▁
age_wks 0 1 198.11 62.63 44.00 151.00 183.00 230.00 559.00 ▃▇▂▁▁
wt 0 1 120.34 2.71 108.00 119.00 120.00 122.00 140.00 ▁▇▆▁▁
off_odds 0 1 12.00 17.01 0.11 2.50 5.00 13.00 99.00 ▇▁▁▁▁
fld 0 1 8.44 2.21 2.00 7.00 8.00 10.00 30.00 ▅▇▁▁▁
L3 0 1 18.01 6.90 1.75 13.33 16.96 21.42 99.00 ▇▃▁▁▁
L5 0 1 18.41 6.67 1.75 13.88 17.40 21.83 99.00 ▇▃▁▁▁
L7 0 1 18.63 6.56 1.75 14.18 17.64 22.00 99.00 ▇▃▁▁▁
rest 0 1 45.67 54.66 0.00 21.00 29.00 46.00 850.00 ▇▁▁▁▁
avg_rest 0 1 40.56 23.28 8.75 26.00 34.67 47.25 340.33 ▇▁▁▁▁
efforts_last90 0 1 0.98 0.94 0.00 0.00 1.00 2.00 8.00 ▇▃▁▁▁
Lag1 0 1 0.23 0.42 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▂
Lag2 0 1 0.19 0.39 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▂
Lag3 0 1 0.16 0.37 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▂
Lag4 0 1 0.14 0.35 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▂
Lag5 0 1 0.12 0.33 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
LP1 0 1 -1.62 2.27 -5.00 -4.00 -1.00 0.00 1.00 ▅▁▂▁▇
LP2 0 1 -1.76 2.11 -5.00 -3.00 -2.00 0.00 1.00 ▆▁▆▂▇
LP3 0 1 -1.84 1.99 -5.00 -3.00 -2.00 0.00 1.00 ▅▁▇▁▇
LP4 0 1 -1.88 1.87 -5.00 -2.00 -2.00 0.00 1.00 ▃▁▇▁▅
LP5 0 1 -1.91 1.76 -5.00 -2.00 -2.00 -1.00 1.00 ▃▁▇▁▃
pp_id 0 1 27257.13 16858.55 1.00 12675.75 25486.50 41011.25 59176.00 ▇▇▆▇▅
splt_1 0 1 4.32 5.74 0.00 0.00 2.92 6.68 51.00 ▇▁▁▁▁
splt_2 0 1 2.39 3.85 0.00 0.00 0.00 3.81 35.91 ▇▁▁▁▁
splt_3 0 1 2.32 4.86 0.00 0.00 0.00 2.92 45.30 ▇▁▁▁▁
splt_4 0 1 3.39 4.96 0.00 0.00 1.02 4.87 46.29 ▇▁▁▁▁
splt_5 0 1 1.30 2.68 0.00 0.00 0.00 1.92 27.99 ▇▁▁▁▁
splt_6 0 1 2.73 3.95 0.00 0.00 0.00 3.94 30.87 ▇▂▁▁▁
splt_7 0 1 4.45 5.63 0.00 0.00 2.89 6.86 37.86 ▇▂▁▁▁
splt_8 0 1 11.22 8.68 0.00 3.00 9.84 16.67 50.57 ▇▅▂▁▁
splt_9 0 1 10.01 8.18 0.00 2.96 8.20 14.91 47.05 ▇▅▂▁▁
distance_X5.5F 0 1 0.06 0.24 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
distance_X6.0F 0 1 0.21 0.41 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▂
distance_X6.5F 0 1 0.08 0.27 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
distance_X7.0F 0 1 0.08 0.28 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
distance_X8.0F 0 1 0.27 0.45 0.00 0.00 0.00 1.00 1.00 ▇▁▁▁▃
distance_X8.5F 0 1 0.13 0.34 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
distance_X9.0F 0 1 0.05 0.22 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
distance_other 0 1 0.06 0.23 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
gender_Male 0 1 0.67 0.47 0.00 0.00 1.00 1.00 1.00 ▃▁▁▁▇
trk_code_BEL 0 1 0.09 0.29 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
trk_code_CD 0 1 0.05 0.21 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
trk_code_DMR 0 1 0.04 0.20 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
trk_code_GP 0 1 0.24 0.43 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▂
trk_code_GPW 0 1 0.03 0.18 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
trk_code_KEE 0 1 0.02 0.15 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
trk_code_SA 0 1 0.08 0.28 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
trk_code_SAR 0 1 0.06 0.23 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
trk_code_WO 0 1 0.03 0.16 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
trk_code_other 0 1 0.23 0.42 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▂
jky_E.Cancel 0 1 0.02 0.13 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
jky_E.Jaramillo 0 1 0.02 0.12 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
jky_E.Zayas 0 1 0.02 0.15 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
jky_I.Ortiz..Jr. 0 1 0.03 0.17 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
jky_J.Alvarado 0 1 0.02 0.13 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
jky_J.Lezcano 0 1 0.02 0.14 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
jky_J.Ortiz 0 1 0.02 0.15 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
jky_J.Rosario 0 1 0.02 0.14 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
jky_L.Reyes 0 1 0.02 0.13 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
jky_L.Saez 0 1 0.03 0.16 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
jky_M.Franco 0 1 0.02 0.15 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
jky_M.Vasquez 0 1 0.02 0.13 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
jky_P.Lopez 0 1 0.02 0.13 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
jky_R.Maragh 0 1 0.01 0.12 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
jky_T.Gaffalione 0 1 0.02 0.15 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
jky_other 0 1 0.67 0.47 0.00 0.00 1.00 1.00 1.00 ▃▁▁▁▇
form_cycle_A2 0 1 0.11 0.31 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
form_cycle_A3 0 1 0.07 0.25 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
form_cycle_AX 0 1 0.18 0.39 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▂
form_cycle_B1 0 1 0.06 0.24 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
form_cycle_B2 0 1 0.04 0.20 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
form_cycle_B3 0 1 0.03 0.17 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
form_cycle_BX 0 1 0.09 0.28 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
form_cycle_C1 0 1 0.05 0.22 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
form_cycle_C2 0 1 0.04 0.18 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
form_cycle_C3 0 1 0.03 0.16 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
form_cycle_CX 0 1 0.10 0.30 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
form_cycle_D1 0 1 0.02 0.15 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
form_cycle_D2 0 1 0.02 0.13 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
form_cycle_D3 0 1 0.01 0.11 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
form_cycle_DX 0 1 0.04 0.20 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
form_cycle_E1 0 1 0.01 0.10 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
form_cycle_E2 0 1 0.01 0.09 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
form_cycle_E3 0 1 0.01 0.07 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
form_cycle_EX 0 1 0.01 0.12 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
form_cycle_F1 0 1 0.00 0.07 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
form_cycle_F2 0 1 0.00 0.06 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
form_cycle_F3 0 1 0.00 0.04 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
form_cycle_FX 0 1 0.00 0.07 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
surf_Inner.Dirt 0 1 0.00 0.04 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
surf_Inner.Turf 0 1 0.03 0.17 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
surf_Synth 0 1 0.04 0.19 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
surf_Turf 0 1 0.35 0.48 0.00 0.00 0.00 1.00 1.00 ▇▁▁▁▅
wind_hvy 0 1 0.07 0.26 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
wind_vhvy 0 1 0.04 0.19 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
precip_rain 0 1 0.03 0.18 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
precip_snow 0 1 0.00 0.04 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
s_cond_Fst 0 1 0.51 0.50 0.00 0.00 1.00 1.00 1.00 ▇▁▁▁▇
s_cond_Gd 0 1 0.09 0.29 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
s_cond_My 0 1 0.02 0.14 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
s_cond_Sf 0 1 0.00 0.06 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
s_cond_Sly 0 1 0.06 0.23 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
s_cond_Trk_condit 0 1 0.00 0.04 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
s_cond_Yl 0 1 0.01 0.11 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
race_type_Sprint 0 1 0.52 0.50 0.00 0.00 1.00 1.00 1.00 ▇▁▁▁▇
surface_cond_frz 0 1 0.00 0.02 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
surface_cond_fst 0 1 0.45 0.50 0.00 0.00 0.00 1.00 1.00 ▇▁▁▁▆
surface_cond_gd 0 1 0.05 0.21 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
surface_cond_gd.slw 0 1 0.03 0.17 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
surface_cond_hvy 0 1 0.00 0.02 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
surface_cond_puaf 0 1 0.01 0.08 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
surface_cond_sft 0 1 0.01 0.09 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
surface_cond_vbw 0 1 0.00 0.03 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
surface_cond_vslw 0 1 0.05 0.21 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
surface_cond_wf 0 1 0.01 0.10 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
surface_cond_wsm 0 1 0.08 0.27 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
surface_cond_yld 0 1 0.01 0.11 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
race_date_year 0 1 2019.20 1.00 2016.00 2019.00 2019.00 2020.00 2021.00 ▁▃▇▇▂
race_date_month_Feb 0 1 0.09 0.29 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
race_date_month_Mar 0 1 0.06 0.24 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
race_date_month_Apr 0 1 0.05 0.22 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
race_date_month_May 0 1 0.07 0.25 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
race_date_month_Jun 0 1 0.07 0.26 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
race_date_month_Jul 0 1 0.08 0.27 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
race_date_month_Aug 0 1 0.09 0.29 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
race_date_month_Sep 0 1 0.09 0.29 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
race_date_month_Oct 0 1 0.10 0.29 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
race_date_month_Nov 0 1 0.09 0.29 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
race_date_month_Dec 0 1 0.09 0.29 0.00 0.00 0.00 0.00 1.00 ▇▁▁▁▁
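
A side note on prep() and juice() above: prep() estimates the recipe's steps on the training data, and juice() returns that processed (and upsampled) training set. On current recipes releases juice() is superseded, and an equivalent call (a sketch) is:

# Equivalent to juice(figs_prep) on current recipes releases
juiced <- bake(figs_prep, new_data = NULL)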

Tuning the Random Forest Model

We create a tuning specification for the random forest: trees is set to 1000, while mtry and min_n are the parameters to be tuned. We also set the engine to ranger and specify a classification model.

tune_spec <- rand_forest(
  mtry = tune(),
  trees = 1000,
  min_n = tune()
  ) %>% 
  set_mode("classification") %>% 
  set_engine("ranger")

Set Up a Workflow for the Model

tune_wf <- workflow() %>% 
  add_recipe(figs_rec) %>% 
  add_model(tune_spec)

set.seed(234)

figs_folds <- vfold_cv(figs_train)
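
vfold_cv() defaults to 10 folds. Given the class imbalance before upsampling, stratifying the resamples on the outcome is a common variation; a sketch of that alternative (not what was run here):

# Alternative: 10 cross-validation folds stratified on the outcome
figs_folds_strat <- vfold_cv(figs_train, v = 10, strata = target)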

Tune hyperparameters

The initial tuning takes place in the following code chunk. We use parallel processing to expedite the search. After tuning is complete, we collect the metrics and display the results.

set.seed(234)

members_metrics <- metric_set(roc_auc, accuracy, sensitivity, specificity)

doParallel::registerDoParallel(cores=28)

set.seed(345)

tune_res <- tune_grid(
  tune_wf,
  resamples = figs_folds,
  metrics = members_metrics,
  grid = 10
)

tune_res %>% 
  collect_metrics()
## # A tibble: 40 x 8
##     mtry min_n .metric  .estimator  mean     n std_err .config              
##    <int> <int> <chr>    <chr>      <dbl> <int>   <dbl> <chr>                
##  1    29    30 accuracy binary     0.774    10 0.00257 Preprocessor1_Model01
##  2    29    30 roc_auc  binary     0.775    10 0.00272 Preprocessor1_Model01
##  3    29    30 sens     binary     0.888    10 0.00194 Preprocessor1_Model01
##  4    29    30 spec     binary     0.416    10 0.00656 Preprocessor1_Model01
##  5   112    12 accuracy binary     0.777    10 0.00271 Preprocessor1_Model02
##  6   112    12 roc_auc  binary     0.775    10 0.00262 Preprocessor1_Model02
##  7   112    12 sens     binary     0.905    10 0.00166 Preprocessor1_Model02
##  8   112    12 spec     binary     0.372    10 0.00569 Preprocessor1_Model02
##  9    65    27 accuracy binary     0.775    10 0.00229 Preprocessor1_Model03
## 10    65    27 roc_auc  binary     0.776    10 0.00285 Preprocessor1_Model03
## # … with 30 more rows

Visualize the Tuning Results

tune_res %>% 
  collect_metrics() %>% 
  filter(.metric == "roc_auc") %>% 
  select(mean, min_n, mtry) %>% 
  pivot_longer(min_n:mtry,
               values_to = "value",
               names_to = "parameter") %>% 
  ggplot(aes(value, mean, color = parameter)) +
  geom_point(show.legend = FALSE) +
  facet_wrap(~ parameter, scales = "free_x")
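
As a numeric complement to the plot, show_best() from tune lists the top candidates directly (a sketch):

# Top five candidates from the initial random grid, ranked by ROC AUC
show_best(tune_res, metric = "roc_auc", n = 5)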

Secondary Tuning

Our initial tuning provided insight into the promising ranges for each parameter. We now tune over a regular grid within those ranges to settle on the final parameter values.

rf_grid <- grid_regular(
  mtry(range = c(30, 40)),
  min_n(range = c(14, 16)),
  levels = 4
)
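
With an integer range of 14 to 16, the four requested levels collapse to three unique min_n values, so the grid should hold 4 x 3 = 12 candidates, consistent with the 24 metric rows (two metrics per candidate) collected below. A quick sanity check (a sketch):

# Number of candidate parameter combinations in the regular grid
nrow(rf_grid)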



set.seed(456)
regular_res <- tune_grid(
  tune_wf,
  resamples = figs_folds,
  grid = rf_grid
)


regular_res %>% 
  collect_metrics()
## # A tibble: 24 x 8
##     mtry min_n .metric  .estimator  mean     n std_err .config              
##    <int> <int> <chr>    <chr>      <dbl> <int>   <dbl> <chr>                
##  1    30    14 accuracy binary     0.778    10 0.00255 Preprocessor1_Model01
##  2    30    14 roc_auc  binary     0.775    10 0.00279 Preprocessor1_Model01
##  3    33    14 accuracy binary     0.779    10 0.00247 Preprocessor1_Model02
##  4    33    14 roc_auc  binary     0.775    10 0.00277 Preprocessor1_Model02
##  5    36    14 accuracy binary     0.778    10 0.00195 Preprocessor1_Model03
##  6    36    14 roc_auc  binary     0.775    10 0.00293 Preprocessor1_Model03
##  7    40    14 accuracy binary     0.778    10 0.00208 Preprocessor1_Model04
##  8    40    14 roc_auc  binary     0.776    10 0.00265 Preprocessor1_Model04
##  9    30    15 accuracy binary     0.778    10 0.00227 Preprocessor1_Model05
## 10    30    15 roc_auc  binary     0.775    10 0.00265 Preprocessor1_Model05
## # … with 14 more rows

Visualize the Final Tuning Results

regular_res %>% 
  collect_metrics() %>% 
  filter(.metric == "roc_auc") %>% 
  mutate(min_n = factor(min_n)) %>% 
  ggplot(aes(mtry, mean, color=min_n)) +
  geom_line(alpha =0.5, size = 1.5) + 
  geom_point() + theme_fivethirtyeight() + labs(title="Tuning Grid")

Pick the Best Model

We use roc_auc as the metric for selecting the best model.

best_auc <- select_best(regular_res, "roc_auc")

final_rf <- finalize_model(
  tune_spec,
  best_auc
  )

Variable Importance

We use the vip package to identify the most important variables.

fpchart <- final_rf %>% 
  set_engine("ranger", importance = "permutation") %>% 
  fit(target ~.,
      data = juice(figs_prep) %>% select(-horse)) %>% 
  vip(geom = "point", num_features=25L) +  
  labs(
    x = NULL,
    y = NULL,
    title= "Feature Importance") 

fpchart + theme_fivethirtyeight()
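
If the fitted forest is stored in its own object rather than piped straight into vip(), vi() returns the same permutation importance scores as a tibble. A sketch (rf_fit is a hypothetical name, and this refits the model):

# Refit with permutation importance and pull the raw scores as a tibble
rf_fit <- final_rf %>% 
  set_engine("ranger", importance = "permutation") %>% 
  fit(target ~ ., data = juice(figs_prep) %>% select(-horse))

vi(rf_fit) %>% slice_head(n = 10)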

Finalize Model

The metrics for the final model are calculated and displayed below.

final_wf <- workflow() %>% 
  add_recipe(figs_rec) %>% 
  add_model(final_rf)

final_res <- final_wf %>% 
  last_fit(figs_split)

final_result <- final_res %>% 
  collect_metrics()

gt(final_result)
.metric   .estimator  .estimate  .config
accuracy  binary      0.7717880  Preprocessor1_Model1
roc_auc   binary      0.7705095  Preprocessor1_Model1

ROC Curve

final_res %>% 
  collect_predictions() %>% 
  roc_curve(target, .pred_Top, event_level = "second") %>% 
  ggplot(aes(x=1-specificity, y = sensitivity )) +
  geom_line(size = 1.5, color = "midnightblue") +
  geom_abline(
    lty = 2, alpha =0.5,
    color = "gray50",
    size = 1.2
    )  + 
  labs(title= "ROC Curve", x="1-Specificity",  y= "Sensitivity") + theme_fivethirtyeight()

Confusion Matrix

The confusion matrix for the final model is displayed below:

collect_predictions(final_res) %>% 
  conf_mat(target, .pred_class)
##           Truth
## Prediction NoTop  Top
##      NoTop  6109 1376
##      Top     656  763
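
yardstick's summary() method for conf_mat objects derives the standard metrics (sensitivity, specificity, precision, and so on) from the table above. A sketch:

# Derive summary metrics from the confusion matrix
collect_predictions(final_res) %>% 
  conf_mat(target, .pred_class) %>% 
  summary()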

Model Predictions

Finally, the model's predictions on the test set are shown in the data table below.

preTab <- final_res %>% 
  collect_predictions() %>% 
  mutate(correct = case_when(target == .pred_class ~ "Correct", 
                             TRUE ~ "Incorrect"))

datatable(preTab)
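
A quick tally of the prediction outcomes by class (a sketch, not part of the original output):

# Correct vs. incorrect predictions, broken out by the true class
preTab %>% 
  count(target, correct)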