XGBoost 2 - Data 698

Get the data from Github

data <- getURL("https://raw.githubusercontent.com/Handicappr/Rstudio_test_project/main/fig_data2.csv")

figs <- read_csv(data) %>% 
  as_tibble() %>% 
  rename(n = `ï»¿n`) %>% 
  mutate(eff_target = if_else(eff_target==1,"Top","NoTop")) %>% 
  # Remove columns that can't be used in model
  select(-target, -D1, -D2, -eff_TV, -top, -eff_top, -fv, -DST, -seed, -E, -E4, -TE, -split_800, -split_801, -split_802, -split_803, -split_804, -split_805, -won, -RST, -X) %>% 
  rename(target = eff_target) %>% 
  #change character data to factors
  mutate_if(is.character, factor) %>% 
  mutate(race_date = mdy(race_date))
  #change distance to numeric
  
  
skim(figs)

Data summary
Name	figs
Number of rows	35617
Number of columns	59
_______________________
Column type frequency:
Date	1
factor	14
numeric	44
________________________
Group variables	None

Variable type: Date

skim_variable	n_missing	complete_rate	min	max	median	n_unique
race_date	0	1	2016-04-13	2021-12-20	2019-10-31	1433

Variable type: factor

skim_variable	complete_rate	ordered	n_unique	top_counts
target	1	FALSE	2	NoT: 27061, Top: 8556
horse	1	FALSE	3387	CHA: 57, VIC: 57, STA: 54, EL : 53
surface_cond	1	FALSE	13	fst: 16366, frm: 10368, wsm: 2913, vsl: 1771
precip	1	FALSE	3	cle: 34259, rai: 1278, sno: 80
wind	1	FALSE	3	cal: 31586, hvy: 2581, vhv: 1450
gender	1	FALSE	2	Mal: 24217, Fem: 11400
distance	1	FALSE	22	8.0: 9316, 6.0: 7624, 8.5: 4621, 7.0: 3011
jky	1	FALSE	729	I O: 1034, L S: 930, M F: 915, J O: 879
trk_code	1	FALSE	145	GP: 8459, AQU: 4858, BEL: 3366, SA: 2892
surf	1	FALSE	5	Dir: 21482, Tur: 11757, Syn: 1271, Inn: 1038
s_cond	1	FALSE	8	Fst: 18433, Frm: 10362, Gd: 3275, Sly: 2180
form_cycle	1	FALSE	24	AX: 6600, CX: 3762, BX: 3536, A1: 3387
race_type	1	FALSE	2	Spr: 18816, Rou: 16801
trk	1	FALSE	130	GP: 8377, AQ: 4769, BE: 3359, SA: 2887

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
n	0	1	9.86	8.51	1.00	3.00	7.00	14.00	57.00	▇▂▁▁▁
cls_val	0	1	1.99	0.89	1.00	1.15	1.62	3.00	12.59	▇▁▁▁▁
cls_mv	0	1	0.07	0.68	-11.49	0.00	0.00	0.00	11.39	▁▁▇▁▁
yr	0	1	2019.22	0.99	2016.00	2019.00	2019.00	2020.00	2021.00	▁▃▇▇▂
age	0	1	3.64	1.30	1.00	3.00	3.00	4.00	11.00	▇▆▁▁▁
ftl	0	1	0.08	0.27	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
clm_by_hot	0	1	-0.93	0.34	-1.00	-1.00	-1.00	-1.00	1.00	▇▁▁▁▁
start	0	1	0.03	0.09	0.00	0.00	0.00	0.00	0.90	▇▁▁▁▁
bled	0	1	0.00	0.02	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
off_turf	0	1	0.05	0.21	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
bfnr	0	1	0.01	0.11	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
lame	0	1	0.00	0.02	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
lsx	0	1	0.00	0.09	-1.00	0.00	0.00	0.00	1.00	▁▁▇▁▁
blnkrs	0	1	0.04	0.27	-1.00	0.00	0.00	0.00	1.00	▁▁▇▁▁
age_wks	0	1	204.45	65.52	37.00	154.00	190.00	240.00	559.00	▂▇▂▁▁
wt	2	1	120.32	2.70	108.00	119.00	120.00	122.00	143.00	▁▇▂▁▁
off_odds	0	1	12.17	17.09	0.11	2.50	6.00	13.00	99.00	▇▁▁▁▁
fld	2	1	8.39	2.21	2.00	7.00	8.00	10.00	30.00	▅▇▁▁▁
L3	0	1	18.01	7.03	1.75	13.33	16.92	21.42	99.00	▇▃▁▁▁
L5	0	1	18.26	6.77	1.75	13.70	17.25	21.62	99.00	▇▃▁▁▁
L7	0	1	18.43	6.66	1.75	13.96	17.43	21.75	99.00	▇▃▁▁▁
rest	0	1	43.85	53.38	0.00	21.00	28.00	43.00	1081.00	▇▁▁▁▁
avg_rest	0	1	39.75	21.66	7.25	25.67	34.70	46.78	489.00	▇▁▁▁▁
efforts_last90	0	1	0.90	0.95	0.00	0.00	1.00	1.00	8.00	▇▂▁▁▁
Lag1	0	1	0.21	0.40	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▂
Lag2	0	1	0.19	0.39	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▂
Lag3	0	1	0.17	0.37	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▂
Lag4	0	1	0.15	0.36	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▂
Lag5	0	1	0.14	0.34	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
LP1	0	1	-1.91	2.27	-5.00	-5.00	-2.00	0.00	1.00	▆▂▃▂▇
LP2	0	1	-1.91	2.16	-5.00	-4.00	-2.00	0.00	1.00	▆▂▆▂▇
LP3	0	1	-1.92	2.05	-5.00	-4.00	-2.00	0.00	1.00	▆▂▇▂▇
LP4	0	1	-1.92	1.95	-5.00	-3.00	-2.00	0.00	1.00	▅▁▇▁▆
LP5	0	1	-1.93	1.85	-5.00	-3.00	-2.00	0.00	1.00	▃▁▇▁▅
pp_id	0	1	27487.78	16975.26	1.00	12623.00	26040.00	41459.00	59178.00	▇▇▆▇▆
splt_1	0	1	4.48	6.00	0.00	0.00	2.89	6.79	51.00	▇▁▁▁▁
splt_2	0	1	2.36	3.93	0.00	0.00	0.00	3.81	35.91	▇▁▁▁▁
splt_3	0	1	2.60	5.16	0.00	0.00	0.00	3.57	45.30	▇▁▁▁▁
splt_4	0	1	3.53	5.16	0.00	0.00	1.23	5.27	46.29	▇▁▁▁▁
splt_5	0	1	1.34	2.75	0.00	0.00	0.00	1.93	27.99	▇▁▁▁▁
splt_6	0	1	2.85	4.09	0.00	0.00	0.00	3.97	30.87	▇▂▁▁▁
splt_7	0	1	4.68	5.87	0.00	0.00	2.90	7.42	37.86	▇▂▁▁▁
splt_8	0	1	11.88	9.05	0.00	3.93	10.63	17.73	50.57	▇▆▂▁▁
splt_9	0	1	10.57	8.58	0.00	2.97	9.35	16.01	47.05	▇▅▂▁▁

Build Model

We can start by splitting our data into training and testing sets.

set.seed(123)

figs_split <- initial_split(figs, strata = target)
figs_train <- training(figs_split)
figs_test <- testing(figs_split)

figs_split

## <Analysis/Assess/Total>
## <26713/8904/35617>

Create Model Specification

We create a model specification utilizing the xgboost package

xgb_spec <- boost_tree(
  trees = 1000,
  tree_depth = tune(),
  min_n = tune(),
  loss_reduction = tune(),
  sample_size = tune(),
  mtry = tune(),
  learn_rate = tune(),
) %>% 
  set_engine("xgboost") %>% 
  set_mode("classification")

xgb_spec

## Boosted Tree Model Specification (classification)
## 
## Main Arguments:
##   mtry = tune()
##   trees = 1000
##   min_n = tune()
##   tree_depth = tune()
##   learn_rate = tune()
##   loss_reduction = tune()
##   sample_size = tune()
## 
## Computational engine: xgboost

Create a Grid for the Hyperparameter value options

xgb_grid <- grid_latin_hypercube(
  tree_depth(),
  min_n(),
  loss_reduction(),
  sample_size = sample_prop(),
  finalize(mtry(), figs_train),
  learn_rate(),
  size = 30
)

Create a Recipe

The recipe performs preprocessing necessary to fit a logistic regression model

figs_rec <- recipe(target ~ ., data=figs_train) %>%
  update_role(horse, new_role = "ID") %>% 
  update_role(pp_id, new_role = "ID") %>% 
  step_other(jky , threshold = 0.015) %>% 
  step_other(trk_code , threshold = 0.02) %>% 
  step_other(distance, threshold = 0.02) %>% 
  step_dummy(distance) %>% 
  step_dummy(gender) %>% 
  step_dummy(trk_code) %>% 
  step_dummy(jky) %>% 
  step_dummy(form_cycle) %>% 
  step_dummy(surf) %>% 
  step_dummy(wind) %>% 
  step_dummy(precip) %>% 
  step_dummy(s_cond) %>% 
  step_dummy(race_type) %>% 
  step_dummy(surface_cond) %>% 
  step_rm(yr, trk) %>% 
  step_date(race_date, features = c("month", "year")) %>% 
  step_dummy(race_date_month) %>% 
  step_rm(race_date, n) %>% 
  themis::step_upsample(target)
  
  

figs_prep <- prep(figs_rec)

juiced <- juice(figs_prep)
skim(juiced)

Data summary
Name	juiced
Number of rows	40592
Number of columns	142
_______________________
Column type frequency:
factor	2
numeric	140
________________________
Group variables	None

Variable type: factor

skim_variable	n_missing	complete_rate	ordered	n_unique	top_counts
horse	0	1	FALSE	3302	EYE: 68, OUR: 61, VIC: 57, STA: 56
target	0	1	FALSE	2	NoT: 20296, Top: 20296

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
cls_val	0	1	1.97	0.89	1.00	1.15	1.50	3.00	12.59	▇▁▁▁▁
cls_mv	0	1	0.10	0.70	-11.49	0.00	0.00	0.00	11.39	▁▁▇▁▁
age	0	1	3.51	1.26	1.00	3.00	3.00	4.00	11.00	▇▅▁▁▁
ftl	0	1	0.07	0.25	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
clm_by_hot	0	1	-0.93	0.32	-1.00	-1.00	-1.00	-1.00	1.00	▇▁▁▁▁
start	0	1	0.02	0.09	0.00	0.00	0.00	0.00	0.90	▇▁▁▁▁
bled	0	1	0.00	0.01	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
off_turf	0	1	0.04	0.21	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
bfnr	0	1	0.01	0.09	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
lame	0	1	0.00	0.02	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
lsx	0	1	0.00	0.09	-1.00	0.00	0.00	0.00	1.00	▁▁▇▁▁
blnkrs	0	1	0.04	0.27	-1.00	0.00	0.00	0.00	1.00	▁▁▇▁▁
age_wks	0	1	197.86	62.73	44.00	150.00	183.00	230.00	559.00	▃▇▂▁▁
wt	1	1	120.31	2.70	108.00	118.00	120.00	122.00	140.00	▁▇▆▁▁
off_odds	0	1	12.10	17.13	0.11	2.50	5.00	13.00	99.00	▇▁▁▁▁
fld	1	1	8.42	2.19	2.00	7.00	8.00	10.00	30.00	▅▇▁▁▁
L3	0	1	18.02	6.91	1.75	13.33	17.00	21.50	99.00	▇▃▁▁▁
L5	0	1	18.41	6.67	1.75	13.85	17.40	21.85	99.00	▇▃▁▁▁
L7	0	1	18.63	6.56	1.75	14.18	17.67	22.06	99.00	▇▃▁▁▁
rest	0	1	45.37	54.07	0.00	21.00	29.00	46.00	850.00	▇▁▁▁▁
avg_rest	0	1	40.44	23.06	8.75	26.00	34.67	47.14	340.33	▇▁▁▁▁
efforts_last90	0	1	0.98	0.94	0.00	0.00	1.00	2.00	8.00	▇▃▁▁▁
Lag1	0	1	0.23	0.42	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▂
Lag2	0	1	0.19	0.39	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▂
Lag3	0	1	0.16	0.37	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▂
Lag4	0	1	0.14	0.35	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▂
Lag5	0	1	0.12	0.33	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
LP1	0	1	-1.61	2.27	-5.00	-4.00	-1.00	0.00	1.00	▅▁▂▁▇
LP2	0	1	-1.75	2.11	-5.00	-3.00	-2.00	0.00	1.00	▅▁▆▂▇
LP3	0	1	-1.83	1.98	-5.00	-3.00	-2.00	0.00	1.00	▅▁▇▁▇
LP4	0	1	-1.86	1.87	-5.00	-2.00	-2.00	0.00	1.00	▃▁▇▁▅
LP5	0	1	-1.90	1.76	-5.00	-2.00	-2.00	-1.00	1.00	▃▁▇▁▃
pp_id	0	1	27292.06	16846.47	1.00	12675.75	25536.00	41076.25	59176.00	▇▇▆▇▅
splt_1	0	1	4.38	5.77	0.00	0.00	2.93	6.72	51.00	▇▁▁▁▁
splt_2	0	1	2.40	3.84	0.00	0.00	0.00	3.83	35.91	▇▁▁▁▁
splt_3	0	1	2.31	4.84	0.00	0.00	0.00	2.92	45.30	▇▁▁▁▁
splt_4	0	1	3.43	4.97	0.00	0.00	1.68	4.92	46.29	▇▁▁▁▁
splt_5	0	1	1.32	2.67	0.00	0.00	0.00	1.96	27.99	▇▁▁▁▁
splt_6	0	1	2.74	3.94	0.00	0.00	0.00	3.95	30.87	▇▂▁▁▁
splt_7	0	1	4.45	5.62	0.00	0.00	2.90	6.87	37.86	▇▂▁▁▁
splt_8	0	1	11.26	8.68	0.00	3.00	9.90	16.69	50.57	▇▅▂▁▁
splt_9	0	1	10.07	8.20	0.00	2.96	8.32	15.01	47.05	▇▅▂▁▁
distance_X5.5F	0	1	0.06	0.24	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
distance_X6.0F	0	1	0.21	0.40	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▂
distance_X6.5F	0	1	0.08	0.27	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
distance_X7.0F	0	1	0.09	0.28	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
distance_X8.0F	0	1	0.27	0.44	0.00	0.00	0.00	1.00	1.00	▇▁▁▁▃
distance_X8.5F	0	1	0.13	0.34	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
distance_X9.0F	0	1	0.05	0.22	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
distance_other	0	1	0.06	0.23	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
gender_Male	0	1	0.67	0.47	0.00	0.00	1.00	1.00	1.00	▃▁▁▁▇
trk_code_BEL	0	1	0.09	0.29	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
trk_code_CD	0	1	0.05	0.21	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
trk_code_DMR	0	1	0.04	0.20	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
trk_code_GP	0	1	0.24	0.43	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▂
trk_code_GPW	0	1	0.03	0.18	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
trk_code_KEE	0	1	0.02	0.15	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
trk_code_SA	0	1	0.08	0.27	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
trk_code_SAR	0	1	0.06	0.23	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
trk_code_WO	0	1	0.03	0.16	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
trk_code_other	0	1	0.22	0.42	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▂
jky_E.Cancel	0	1	0.02	0.13	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
jky_E.Jaramillo	0	1	0.02	0.12	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
jky_E.Zayas	0	1	0.02	0.15	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
jky_I.Ortiz..Jr.	0	1	0.03	0.17	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
jky_J.Alvarado	0	1	0.02	0.14	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
jky_J.Lezcano	0	1	0.02	0.14	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
jky_J.Ortiz	0	1	0.02	0.15	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
jky_J.Rosario	0	1	0.02	0.14	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
jky_L.Reyes	0	1	0.02	0.13	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
jky_L.Saez	0	1	0.03	0.16	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
jky_M.Franco	0	1	0.02	0.15	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
jky_M.Vasquez	0	1	0.02	0.13	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
jky_P.Lopez	0	1	0.02	0.13	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
jky_R.Maragh	0	1	0.01	0.12	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
jky_T.Gaffalione	0	1	0.02	0.15	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
jky_other	0	1	0.67	0.47	0.00	0.00	1.00	1.00	1.00	▃▁▁▁▇
form_cycle_A2	0	1	0.11	0.31	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
form_cycle_A3	0	1	0.07	0.25	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
form_cycle_AX	0	1	0.18	0.39	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▂
form_cycle_B1	0	1	0.06	0.24	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
form_cycle_B2	0	1	0.04	0.20	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
form_cycle_B3	0	1	0.03	0.17	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
form_cycle_BX	0	1	0.09	0.28	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
form_cycle_C1	0	1	0.05	0.22	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
form_cycle_C2	0	1	0.03	0.18	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
form_cycle_C3	0	1	0.03	0.16	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
form_cycle_CX	0	1	0.10	0.30	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
form_cycle_D1	0	1	0.02	0.15	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
form_cycle_D2	0	1	0.02	0.14	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
form_cycle_D3	0	1	0.01	0.11	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
form_cycle_DX	0	1	0.04	0.20	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
form_cycle_E1	0	1	0.01	0.10	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
form_cycle_E2	0	1	0.01	0.09	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
form_cycle_E3	0	1	0.01	0.07	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
form_cycle_EX	0	1	0.02	0.12	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
form_cycle_F1	0	1	0.00	0.06	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
form_cycle_F2	0	1	0.00	0.05	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
form_cycle_F3	0	1	0.00	0.04	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
form_cycle_FX	0	1	0.00	0.06	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
surf_Inner.Dirt	0	1	0.00	0.05	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
surf_Inner.Turf	0	1	0.03	0.17	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
surf_Synth	0	1	0.04	0.19	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
surf_Turf	0	1	0.35	0.48	0.00	0.00	0.00	1.00	1.00	▇▁▁▁▅
wind_hvy	0	1	0.07	0.26	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
wind_vhvy	0	1	0.04	0.19	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
precip_rain	0	1	0.03	0.18	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
precip_snow	0	1	0.00	0.04	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
s_cond_Fst	0	1	0.51	0.50	0.00	0.00	1.00	1.00	1.00	▇▁▁▁▇
s_cond_Gd	0	1	0.09	0.29	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
s_cond_My	0	1	0.02	0.14	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
s_cond_Sf	0	1	0.00	0.07	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
s_cond_Sly	0	1	0.06	0.23	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
s_cond_Trk_condit	0	1	0.00	0.03	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
s_cond_Yl	0	1	0.01	0.11	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
race_type_Sprint	0	1	0.52	0.50	0.00	0.00	1.00	1.00	1.00	▇▁▁▁▇
surface_cond_frz	0	1	0.00	0.02	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
surface_cond_fst	0	1	0.45	0.50	0.00	0.00	0.00	1.00	1.00	▇▁▁▁▆
surface_cond_gd	0	1	0.05	0.21	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
surface_cond_gd.slw	0	1	0.03	0.17	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
surface_cond_hvy	0	1	0.00	0.02	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
surface_cond_puaf	0	1	0.01	0.08	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
surface_cond_sft	0	1	0.01	0.08	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
surface_cond_vbw	0	1	0.00	0.03	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
surface_cond_vslw	0	1	0.05	0.21	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
surface_cond_wf	0	1	0.01	0.10	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
surface_cond_wsm	0	1	0.08	0.27	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
surface_cond_yld	0	1	0.01	0.11	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
race_date_year	0	1	2019.20	1.00	2016.00	2019.00	2019.00	2020.00	2021.00	▁▃▇▇▂
race_date_month_Feb	0	1	0.09	0.29	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
race_date_month_Mar	0	1	0.06	0.24	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
race_date_month_Apr	0	1	0.05	0.22	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
race_date_month_May	0	1	0.07	0.25	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
race_date_month_Jun	0	1	0.07	0.26	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
race_date_month_Jul	0	1	0.08	0.27	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
race_date_month_Aug	0	1	0.10	0.29	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
race_date_month_Sep	0	1	0.09	0.29	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
race_date_month_Oct	0	1	0.10	0.29	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
race_date_month_Nov	0	1	0.10	0.29	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
race_date_month_Dec	0	1	0.09	0.28	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁

Setup a Workflow

We create the workflow by adding the recipe and the model

xgb_wf <- workflow() %>% 
  add_recipe(figs_rec) %>% 
  add_model(xgb_spec)

xgb_wf

## ══ Workflow ════════════════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: boost_tree()
## 
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 19 Recipe Steps
## 
## ● step_other()
## ● step_other()
## ● step_other()
## ● step_dummy()
## ● step_dummy()
## ● step_dummy()
## ● step_dummy()
## ● step_dummy()
## ● step_dummy()
## ● step_dummy()
## ● ...
## ● and 9 more steps.
## 
## ── Model ───────────────────────────────────────────────────────────────────────
## Boosted Tree Model Specification (classification)
## 
## Main Arguments:
##   mtry = tune()
##   trees = 1000
##   min_n = tune()
##   tree_depth = tune()
##   learn_rate = tune()
##   loss_reduction = tune()
##   sample_size = tune()
## 
## Computational engine: xgboost

Now, let’s create cross-validation resamples for tuning our xgboost model

set.seed(123)
vb_folds <- vfold_cv(figs_train, strata = target)

vb_folds

## #  10-fold cross-validation using stratification 
## # A tibble: 10 x 2
##    splits               id    
##    <list>               <chr> 
##  1 <split [24041/2672]> Fold01
##  2 <split [24041/2672]> Fold02
##  3 <split [24041/2672]> Fold03
##  4 <split [24041/2672]> Fold04
##  5 <split [24041/2672]> Fold05
##  6 <split [24041/2672]> Fold06
##  7 <split [24042/2671]> Fold07
##  8 <split [24043/2670]> Fold08
##  9 <split [24043/2670]> Fold09
## 10 <split [24043/2670]> Fold10

Tune The model

We use tune_grid() along gwith our tunable workflow, our resamples and our grid of parameters. We will use control_grid(save_pred=TRUE) to preserve the prediction for later analysis.

members_metrics <- metric_set(roc_auc, accuracy, sensitivity, specificity)

doParallel::registerDoParallel()

set.seed(234)

xgb_res <- tune_grid(
  xgb_wf,
  resamples = vb_folds,
  metrics = members_metrics,
  control = control_grid(save_pred = TRUE)
)

## i Creating pre-processing data to finalize unknown parameter: mtry

xgb_res

## # Tuning results
## # 10-fold cross-validation using stratification 
## # A tibble: 10 x 5
##    splits             id    .metrics         .notes         .predictions        
##    <list>             <chr> <list>           <list>         <list>              
##  1 <split [24041/267… Fold… <tibble [40 × 1… <tibble [0 × … <tibble [26,720 × 1…
##  2 <split [24041/267… Fold… <tibble [40 × 1… <tibble [0 × … <tibble [26,720 × 1…
##  3 <split [24041/267… Fold… <tibble [40 × 1… <tibble [0 × … <tibble [26,720 × 1…
##  4 <split [24041/267… Fold… <tibble [40 × 1… <tibble [0 × … <tibble [26,720 × 1…
##  5 <split [24041/267… Fold… <tibble [40 × 1… <tibble [0 × … <tibble [26,720 × 1…
##  6 <split [24041/267… Fold… <tibble [40 × 1… <tibble [0 × … <tibble [26,720 × 1…
##  7 <split [24042/267… Fold… <tibble [40 × 1… <tibble [0 × … <tibble [26,710 × 1…
##  8 <split [24043/267… Fold… <tibble [40 × 1… <tibble [0 × … <tibble [26,700 × 1…
##  9 <split [24043/267… Fold… <tibble [40 × 1… <tibble [0 × … <tibble [26,700 × 1…
## 10 <split [24043/267… Fold… <tibble [40 × 1… <tibble [0 × … <tibble [26,700 × 1…

Evaluate Model

Now we use collect_metrics to see the results of our modeling. Here we use accuracy and roc to evaluate the model

collect_metrics(xgb_res)

## # A tibble: 40 x 12
##     mtry min_n tree_depth    learn_rate loss_reduction sample_size .metric 
##    <int> <int>      <int>         <dbl>          <dbl>       <dbl> <chr>   
##  1   110    17          1 0.00000719        0.147            0.454 accuracy
##  2   110    17          1 0.00000719        0.147            0.454 roc_auc 
##  3   110    17          1 0.00000719        0.147            0.454 sens    
##  4   110    17          1 0.00000719        0.147            0.454 spec    
##  5    48    19          6 0.00000000167     2.75             0.578 accuracy
##  6    48    19          6 0.00000000167     2.75             0.578 roc_auc 
##  7    48    19          6 0.00000000167     2.75             0.578 sens    
##  8    48    19          6 0.00000000167     2.75             0.578 spec    
##  9    63     6          7 0.000000268       0.00000126       0.966 accuracy
## 10    63     6          7 0.000000268       0.00000126       0.966 roc_auc 
## # … with 30 more rows, and 5 more variables: .estimator <chr>, mean <dbl>,
## #   n <int>, std_err <dbl>, .config <chr>

Visualize Tuning Results

xgb_res %>% 
  collect_metrics() %>% 
  filter(.metric == "roc_auc") %>% 
  select(mean, mtry:sample_size) %>% 
  pivot_longer(mtry:sample_size,
               values_to = "value",
               names_to = "parameter"
  ) %>% 
  ggplot(aes(value, mean, color = parameter)) +
  geom_point(alpha = 0.8, show.legend = FALSE) +
  facet_wrap(~parameter, scales = "free_x") +
  labs(x = NULL, y = "AUC")

What’s the Best Model

show_best(xgb_res, "roc_auc")

## # A tibble: 5 x 12
##    mtry min_n tree_depth   learn_rate loss_reduction sample_size .metric
##   <int> <int>      <int>        <dbl>          <dbl>       <dbl> <chr>  
## 1    95    27         10 0.0153        0.0000000510        0.837 roc_auc
## 2   137    13         12 0.00877       0.00000000110       0.720 roc_auc
## 3    63     6          7 0.000000268   0.00000126          0.966 roc_auc
## 4   125    37          8 0.0000000130  0.00418             0.308 roc_auc
## 5    30    30         14 0.0000587     0.000535            0.808 roc_auc
## # … with 5 more variables: .estimator <chr>, mean <dbl>, n <int>,
## #   std_err <dbl>, .config <chr>

best_auc <- select_best(xgb_res, "roc_auc")
best_auc

## # A tibble: 1 x 7
##    mtry min_n tree_depth learn_rate loss_reduction sample_size .config          
##   <int> <int>      <int>      <dbl>          <dbl>       <dbl> <chr>            
## 1    95    27         10     0.0153   0.0000000510       0.837 Preprocessor1_Mo…

Finalize Model

Now lets peform the last_fit() to finalize our model against the test dataset

final_xgb <- finalize_workflow(
  xgb_wf, 
  best_auc
)

final_xgb

## ══ Workflow ════════════════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: boost_tree()
## 
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 19 Recipe Steps
## 
## ● step_other()
## ● step_other()
## ● step_other()
## ● step_dummy()
## ● step_dummy()
## ● step_dummy()
## ● step_dummy()
## ● step_dummy()
## ● step_dummy()
## ● step_dummy()
## ● ...
## ● and 9 more steps.
## 
## ── Model ───────────────────────────────────────────────────────────────────────
## Boosted Tree Model Specification (classification)
## 
## Main Arguments:
##   mtry = 95
##   trees = 1000
##   min_n = 27
##   tree_depth = 10
##   learn_rate = 0.0153160089416973
##   loss_reduction = 5.10282054010883e-08
##   sample_size = 0.837027746313252
## 
## Computational engine: xgboost

The metrics and prediction have been calculated against the test data

What are the important variables

final_xgb %>% 
  fit(data = figs_train) %>% 
  pull_workflow_fit() %>% 
  vip(geom = "point", num_features=25L) +  
  labs(
    x = NULL,
    y = NULL,
    title= "Feature Importance") + theme_fivethirtyeight()

## [02:42:29] WARNING: amalgamation/../src/learner.cc:1061: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.

Final Results

final_res <- last_fit(final_xgb, figs_split)

collect_metrics(final_res)

## # A tibble: 2 x 4
##   .metric  .estimator .estimate .config             
##   <chr>    <chr>          <dbl> <chr>               
## 1 accuracy binary         0.748 Preprocessor1_Model1
## 2 roc_auc  binary         0.793 Preprocessor1_Model1

Confusion Matrix

collect_predictions(final_res) %>% 
  conf_mat(target, .pred_class)

##           Truth
## Prediction NoTop  Top
##      NoTop  5372  854
##      Top    1393 1285

ROC Curve

final_res %>% 
  collect_predictions() %>% 
  roc_curve(target, .pred_Top, event_level = "second") %>% 
  ggplot(aes(x=1-specificity, y = sensitivity)) +
  geom_line(size = 1.5, color = "midnightblue") +
  geom_abline(
    lty = 2, alpha =0.5,
    color = "gray50",
    size = 1.2
  )

Predictions

xgb_predicitions <- final_res %>% 
  collect_predictions() %>% 
  mutate(correct = case_when(target == .pred_class ~ "Correct",
                             TRUE ~ "Incorrect"))
datatable(xgb_predicitions)