library(lubridate)
library(tidyverse)
library(tidymodels)
library(skimr)
library(ggthemes)
library(DBI)
library(RMariaDB)
library(odbc)
library(RcppRoll)
library(magick)
library(RCurl)
library(vip)
library(themis)
library(DT)
library(gt)
Get the data from GitHub
data <- getURL("https://raw.githubusercontent.com/Handicappr/Rstudio_test_project/main/fig_data2.csv")
figs <- read_csv(data) %>%
  as_tibble() %>%
  rename(n = `n`) %>%
  # Recode the outcome as a two-level label
  mutate(eff_target = if_else(eff_target == 1, "Top", "NoTop")) %>%
  # Remove columns that can't be used in the model
  select(-target, -D1, -D2, -eff_TV, -top, -eff_top, -fv, -DST, -seed,
         -E, -E4, -TE, -split_800, -split_801, -split_802, -split_803,
         -split_804, -split_805, -won, -RST, -X) %>%
  rename(target = eff_target) %>%
  # Change character columns to factors
  mutate_if(is.character, factor) %>%
  # Parse dates and fill missing field size and weight with typical values
  mutate(race_date = mdy(race_date)) %>%
  mutate(fld = if_else(is.na(fld), 8, fld)) %>%
  mutate(wt = if_else(is.na(wt), 120, wt))
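The recoded outcome is imbalanced, at roughly three "NoTop" efforts per "Top", which is why the split later on is stratified on `target`. A quick tally (using `name =` so the count column does not collide with the existing `n` variable):
figs %>%
  count(target, name = "races") %>%
  mutate(prop = races / sum(races))
Next, `skim()` gives a fuller profile of every column.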
skim(figs)
Data summary

|                        |       |
|------------------------|-------|
| Name                   | figs  |
| Number of rows         | 35617 |
| Number of columns      | 59    |
| Column type frequency: |       |
| Date                   | 1     |
| factor                 | 14    |
| numeric                | 44    |
| Group variables        | None  |
Variable type: Date

| skim_variable | n_missing | complete_rate | min        | max        | median     | n_unique |
|---------------|-----------|---------------|------------|------------|------------|----------|
| race_date     | 0         | 1             | 2016-04-13 | 2021-12-20 | 2019-10-31 | 1433     |
Variable type: factor

| skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
|---------------|-----------|---------------|---------|----------|------------|
| target        | 0 | 1 | FALSE | 2    | NoT: 27061, Top: 8556 |
| horse         | 0 | 1 | FALSE | 3387 | CHA: 57, VIC: 57, STA: 54, EL : 53 |
| surface_cond  | 0 | 1 | FALSE | 13   | fst: 16366, frm: 10368, wsm: 2913, vsl: 1771 |
| precip        | 0 | 1 | FALSE | 3    | cle: 34259, rai: 1278, sno: 80 |
| wind          | 0 | 1 | FALSE | 3    | cal: 31586, hvy: 2581, vhv: 1450 |
| gender        | 0 | 1 | FALSE | 2    | Mal: 24217, Fem: 11400 |
| distance      | 0 | 1 | FALSE | 22   | 8.0: 9316, 6.0: 7624, 8.5: 4621, 7.0: 3011 |
| jky           | 0 | 1 | FALSE | 729  | I O: 1034, L S: 930, M F: 915, J O: 879 |
| trk_code      | 0 | 1 | FALSE | 145  | GP: 8459, AQU: 4858, BEL: 3366, SA: 2892 |
| surf          | 0 | 1 | FALSE | 5    | Dir: 21482, Tur: 11757, Syn: 1271, Inn: 1038 |
| s_cond        | 0 | 1 | FALSE | 8    | Fst: 18433, Frm: 10362, Gd: 3275, Sly: 2180 |
| form_cycle    | 0 | 1 | FALSE | 24   | AX: 6600, CX: 3762, BX: 3536, A1: 3387 |
| race_type     | 0 | 1 | FALSE | 2    | Spr: 18816, Rou: 16801 |
| trk           | 0 | 1 | FALSE | 130  | GP: 8377, AQ: 4769, BE: 3359, SA: 2887 |
Variable type: numeric

| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| n | 0 | 1 | 9.86 | 8.51 | 1.00 | 3.00 | 7.00 | 14.00 | 57.00 | ▇▂▁▁▁ |
| cls_val | 0 | 1 | 1.99 | 0.89 | 1.00 | 1.15 | 1.62 | 3.00 | 12.59 | ▇▁▁▁▁ |
| cls_mv | 0 | 1 | 0.07 | 0.68 | -11.49 | 0.00 | 0.00 | 0.00 | 11.39 | ▁▁▇▁▁ |
| yr | 0 | 1 | 2019.22 | 0.99 | 2016.00 | 2019.00 | 2019.00 | 2020.00 | 2021.00 | ▁▃▇▇▂ |
| age | 0 | 1 | 3.64 | 1.30 | 1.00 | 3.00 | 3.00 | 4.00 | 11.00 | ▇▆▁▁▁ |
| ftl | 0 | 1 | 0.08 | 0.27 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| clm_by_hot | 0 | 1 | -0.93 | 0.34 | -1.00 | -1.00 | -1.00 | -1.00 | 1.00 | ▇▁▁▁▁ |
| start | 0 | 1 | 0.03 | 0.09 | 0.00 | 0.00 | 0.00 | 0.00 | 0.90 | ▇▁▁▁▁ |
| bled | 0 | 1 | 0.00 | 0.02 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| off_turf | 0 | 1 | 0.05 | 0.21 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| bfnr | 0 | 1 | 0.01 | 0.11 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| lame | 0 | 1 | 0.00 | 0.02 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| lsx | 0 | 1 | 0.00 | 0.09 | -1.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▁▁▇▁▁ |
| blnkrs | 0 | 1 | 0.04 | 0.27 | -1.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▁▁▇▁▁ |
| age_wks | 0 | 1 | 204.45 | 65.52 | 37.00 | 154.00 | 190.00 | 240.00 | 559.00 | ▂▇▂▁▁ |
| wt | 0 | 1 | 120.32 | 2.70 | 108.00 | 119.00 | 120.00 | 122.00 | 143.00 | ▁▇▂▁▁ |
| off_odds | 0 | 1 | 12.17 | 17.09 | 0.11 | 2.50 | 6.00 | 13.00 | 99.00 | ▇▁▁▁▁ |
| fld | 0 | 1 | 8.39 | 2.21 | 2.00 | 7.00 | 8.00 | 10.00 | 30.00 | ▅▇▁▁▁ |
| L3 | 0 | 1 | 18.01 | 7.03 | 1.75 | 13.33 | 16.92 | 21.42 | 99.00 | ▇▃▁▁▁ |
| L5 | 0 | 1 | 18.26 | 6.77 | 1.75 | 13.70 | 17.25 | 21.62 | 99.00 | ▇▃▁▁▁ |
| L7 | 0 | 1 | 18.43 | 6.66 | 1.75 | 13.96 | 17.43 | 21.75 | 99.00 | ▇▃▁▁▁ |
| rest | 0 | 1 | 43.85 | 53.38 | 0.00 | 21.00 | 28.00 | 43.00 | 1081.00 | ▇▁▁▁▁ |
| avg_rest | 0 | 1 | 39.75 | 21.66 | 7.25 | 25.67 | 34.70 | 46.78 | 489.00 | ▇▁▁▁▁ |
| efforts_last90 | 0 | 1 | 0.90 | 0.95 | 0.00 | 0.00 | 1.00 | 1.00 | 8.00 | ▇▂▁▁▁ |
| Lag1 | 0 | 1 | 0.21 | 0.40 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▂ |
| Lag2 | 0 | 1 | 0.19 | 0.39 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▂ |
| Lag3 | 0 | 1 | 0.17 | 0.37 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▂ |
| Lag4 | 0 | 1 | 0.15 | 0.36 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▂ |
| Lag5 | 0 | 1 | 0.14 | 0.34 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| LP1 | 0 | 1 | -1.91 | 2.27 | -5.00 | -5.00 | -2.00 | 0.00 | 1.00 | ▆▂▃▂▇ |
| LP2 | 0 | 1 | -1.91 | 2.16 | -5.00 | -4.00 | -2.00 | 0.00 | 1.00 | ▆▂▆▂▇ |
| LP3 | 0 | 1 | -1.92 | 2.05 | -5.00 | -4.00 | -2.00 | 0.00 | 1.00 | ▆▂▇▂▇ |
| LP4 | 0 | 1 | -1.92 | 1.95 | -5.00 | -3.00 | -2.00 | 0.00 | 1.00 | ▅▁▇▁▆ |
| LP5 | 0 | 1 | -1.93 | 1.85 | -5.00 | -3.00 | -2.00 | 0.00 | 1.00 | ▃▁▇▁▅ |
| pp_id | 0 | 1 | 27487.78 | 16975.26 | 1.00 | 12623.00 | 26040.00 | 41459.00 | 59178.00 | ▇▇▆▇▆ |
| splt_1 | 0 | 1 | 4.48 | 6.00 | 0.00 | 0.00 | 2.89 | 6.79 | 51.00 | ▇▁▁▁▁ |
| splt_2 | 0 | 1 | 2.36 | 3.93 | 0.00 | 0.00 | 0.00 | 3.81 | 35.91 | ▇▁▁▁▁ |
| splt_3 | 0 | 1 | 2.60 | 5.16 | 0.00 | 0.00 | 0.00 | 3.57 | 45.30 | ▇▁▁▁▁ |
| splt_4 | 0 | 1 | 3.53 | 5.16 | 0.00 | 0.00 | 1.23 | 5.27 | 46.29 | ▇▁▁▁▁ |
| splt_5 | 0 | 1 | 1.34 | 2.75 | 0.00 | 0.00 | 0.00 | 1.93 | 27.99 | ▇▁▁▁▁ |
| splt_6 | 0 | 1 | 2.85 | 4.09 | 0.00 | 0.00 | 0.00 | 3.97 | 30.87 | ▇▂▁▁▁ |
| splt_7 | 0 | 1 | 4.68 | 5.87 | 0.00 | 0.00 | 2.90 | 7.42 | 37.86 | ▇▂▁▁▁ |
| splt_8 | 0 | 1 | 11.88 | 9.05 | 0.00 | 3.93 | 10.63 | 17.73 | 50.57 | ▇▆▂▁▁ |
| splt_9 | 0 | 1 | 10.57 | 8.58 | 0.00 | 2.97 | 9.35 | 16.01 | 47.05 | ▇▅▂▁▁ |
Build Model
We start by splitting the data into training and testing sets, stratified on the outcome so both sets preserve the Top/NoTop ratio.
set.seed(123)
figs_split <- initial_split(figs, strata = target)
figs_train <- training(figs_split)
figs_test <- testing(figs_split)
figs_split
## <Analysis/Assess/Total>
## <26713/8904/35617>
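Because the split was stratified, the class proportions should be nearly identical in the two sets. A quick verification:
bind_rows(
  train = count(figs_train, target, name = "races"),
  test  = count(figs_test, target, name = "races"),
  .id = "set"
) %>%
  group_by(set) %>%
  mutate(prop = races / sum(races))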
Tuning the Random Forest Model
We create a tuning specification for the random forest. `trees` is fixed at 1000, while `mtry` and `min_n` are flagged for tuning. We also set the model engine to ranger and specify a classification model.
tune_spec <- rand_forest(
  mtry = tune(),
  trees = 1000,
  min_n = tune()
) %>%
  set_mode("classification") %>%
  set_engine("ranger")
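On a recent tidymodels installation, the parameters awaiting values can be listed with `extract_parameter_set_dials()`; note that the upper bound of `mtry` stays unknown until it is finalized against the predictor count:
# Show the tune() placeholders in the specification
extract_parameter_set_dials(tune_spec)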
Set Up a Workflow for the Model
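The recipe `figs_rec` (and its prepped counterpart `figs_prep`, used later for variable importance) comes from an earlier step that is not shown here. A minimal sketch, assuming an identifier role for `horse`, removal of the raw date, and themis downsampling to offset the class imbalance, might look like:
# Hypothetical reconstruction; the actual recipe may differ
figs_rec <- recipe(target ~ ., data = figs_train) %>%
  update_role(horse, new_role = "id") %>%  # keep the horse name as an identifier
  step_rm(race_date) %>%                   # drop the raw date before modeling
  step_downsample(target)                  # themis: balance the outcome classes

figs_prep <- prep(figs_rec)
With a recipe in place, the tuning workflow bundles it with the model specification: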
tune_wf <- workflow() %>%
  add_recipe(figs_rec) %>%
  add_model(tune_spec)

set.seed(234)
figs_folds <- vfold_cv(figs_train)
Tune hyperparameters
The initial tuning takes place in the following code chunk. We employ parallel processing to expedite the search. After tuning completes, we collect the metrics and display the results.
set.seed(234)
# Metrics to compute during resampling
members_metrics <- metric_set(roc_auc, accuracy, sensitivity, specificity)

doParallel::registerDoParallel(cores = 28)

set.seed(345)
tune_res <- tune_grid(
  tune_wf,
  resamples = figs_folds,
  metrics = members_metrics,
  grid = 10
)

tune_res %>%
  collect_metrics()
## # A tibble: 40 x 8
## mtry min_n .metric .estimator mean n std_err .config
## <int> <int> <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 29 30 accuracy binary 0.774 10 0.00257 Preprocessor1_Model01
## 2 29 30 roc_auc binary 0.775 10 0.00272 Preprocessor1_Model01
## 3 29 30 sens binary 0.888 10 0.00194 Preprocessor1_Model01
## 4 29 30 spec binary 0.416 10 0.00656 Preprocessor1_Model01
## 5 112 12 accuracy binary 0.777 10 0.00271 Preprocessor1_Model02
## 6 112 12 roc_auc binary 0.775 10 0.00262 Preprocessor1_Model02
## 7 112 12 sens binary 0.905 10 0.00166 Preprocessor1_Model02
## 8 112 12 spec binary 0.372 10 0.00569 Preprocessor1_Model02
## 9 65 27 accuracy binary 0.775 10 0.00229 Preprocessor1_Model03
## 10 65 27 roc_auc binary 0.776 10 0.00285 Preprocessor1_Model03
## # … with 30 more rows
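`show_best()` gives a quicker summary of the leading candidates for a single metric:
# Top parameter combinations ranked by resampled ROC AUC
show_best(tune_res, metric = "roc_auc", n = 5)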
Visualize The Tuning Results
tune_res %>%
  collect_metrics() %>%
  filter(.metric == "roc_auc") %>%
  select(mean, min_n, mtry) %>%
  pivot_longer(min_n:mtry,
               values_to = "value",
               names_to = "parameter") %>%
  ggplot(aes(value, mean, color = parameter)) +
  geom_point(show.legend = FALSE) +
  facet_wrap(~ parameter, scales = "free_x")
[Plot: resampled ROC AUC against the candidate values of min_n and mtry]
Secondary Tuning
Our initial tuning provided insight into promising parameter ranges. We now tune a second time over a regular grid to pin down the optimal settings. With `levels = 4`, the integer `min_n` range of 14 to 16 yields only three distinct values, so the grid holds 4 × 3 = 12 candidates; and because no metric set is supplied this time, `tune_grid()` falls back to its classification defaults, accuracy and ROC AUC.
rf_grid <- grid_regular(
  mtry(range = c(30, 40)),
  min_n(range = c(14, 16)),
  levels = 4
)

set.seed(456)
regular_res <- tune_grid(
  tune_wf,
  resamples = figs_folds,
  grid = rf_grid
)

regular_res %>%
  collect_metrics()
## # A tibble: 24 x 8
## mtry min_n .metric .estimator mean n std_err .config
## <int> <int> <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 30 14 accuracy binary 0.778 10 0.00255 Preprocessor1_Model01
## 2 30 14 roc_auc binary 0.775 10 0.00279 Preprocessor1_Model01
## 3 33 14 accuracy binary 0.779 10 0.00247 Preprocessor1_Model02
## 4 33 14 roc_auc binary 0.775 10 0.00277 Preprocessor1_Model02
## 5 36 14 accuracy binary 0.778 10 0.00195 Preprocessor1_Model03
## 6 36 14 roc_auc binary 0.775 10 0.00293 Preprocessor1_Model03
## 7 40 14 accuracy binary 0.778 10 0.00208 Preprocessor1_Model04
## 8 40 14 roc_auc binary 0.776 10 0.00265 Preprocessor1_Model04
## 9 30 15 accuracy binary 0.778 10 0.00227 Preprocessor1_Model05
## 10 30 15 roc_auc binary 0.775 10 0.00265 Preprocessor1_Model05
## # … with 14 more rows
Visualize the final tuning results
regular_res %>%
  collect_metrics() %>%
  filter(.metric == "roc_auc") %>%
  mutate(min_n = factor(min_n)) %>%
  ggplot(aes(mtry, mean, color = min_n)) +
  geom_line(alpha = 0.5, size = 1.5) +
  geom_point() +
  theme_fivethirtyeight() +
  labs(title = "Tuning Grid")
[Plot: ROC AUC by mtry, colored by min_n, across the regular tuning grid]
Pick the Best Model
We use ROC AUC as the selection metric and finalize the specification with the winning parameters.
best_auc <- select_best(regular_res, metric = "roc_auc")

final_rf <- finalize_model(
  tune_spec,
  best_auc
)
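Printing the objects confirms that the `tune()` placeholders have been replaced with the selected values:
best_auc  # the winning parameter combination
final_rf  # the finalized model specification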
Variable Importance
We use the vip package to identify the most important variables, refitting the finalized model with `importance = "permutation"` so that ranger computes permutation-based scores.
fpchart <- final_rf %>%
  set_engine("ranger", importance = "permutation") %>%
  fit(target ~ .,
      data = juice(figs_prep) %>% select(-horse)) %>%
  vip(geom = "point", num_features = 25L) +
  labs(
    x = NULL,
    y = NULL,
    title = "Feature Importance"
  )

fpchart + theme_fivethirtyeight()
[Plot: permutation feature importance for the top 25 features]
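If the raw scores are needed rather than a chart, `vip::vi()` returns the same permutation importances as a tibble. A sketch (this refits the model, so scores will vary slightly from run to run):
# Permutation importance as a tibble instead of a plot
rf_fit <- final_rf %>%
  set_engine("ranger", importance = "permutation") %>%
  fit(target ~ ., data = juice(figs_prep) %>% select(-horse))

vi(rf_fit) %>%
  slice_max(Importance, n = 10)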
Finalize Model
`last_fit()` fits the finalized workflow on the full training set and evaluates it once on the held-out test set. The resulting metrics are displayed below.
final_wf <- workflow() %>%
  add_recipe(figs_rec) %>%
  add_model(final_rf)

final_res <- final_wf %>%
  last_fit(figs_split)

final_result <- final_res %>%
  collect_metrics()

gt(final_result)
| .metric  | .estimator | .estimate | .config              |
|----------|------------|-----------|----------------------|
| accuracy | binary     | 0.7717880 | Preprocessor1_Model1 |
| roc_auc  | binary     | 0.7705095 | Preprocessor1_Model1 |
ROC Curve
final_res %>%
  collect_predictions() %>%
  roc_curve(target, .pred_Top, event_level = "second") %>%
  ggplot(aes(x = 1 - specificity, y = sensitivity)) +
  geom_line(size = 1.5, color = "midnightblue") +
  geom_abline(
    lty = 2, alpha = 0.5,
    color = "gray50",
    size = 1.2
  ) +
  labs(title = "ROC Curve", x = "1 - Specificity", y = "Sensitivity") +
  theme_fivethirtyeight()
[Plot: ROC curve for the final model on the test set]
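The area under this curve can also be computed directly; `event_level = "second"` again marks "Top" (the second factor level) as the event of interest:
# AUC on the held-out predictions; matches the roc_auc row in the metrics table
final_res %>%
  collect_predictions() %>%
  roc_auc(target, .pred_Top, event_level = "second")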
Confusion Matrix
The confusion matrix for the final model is displayed below:
collect_predictions(final_res) %>%
  conf_mat(target, .pred_class)
## Truth
## Prediction NoTop Top
## NoTop 6109 1376
## Top 656 763
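yardstick's `summary()` method expands the confusion matrix into the usual derived statistics; passing `event_level = "second"` keeps "Top" as the positive class:
# Sensitivity, specificity, precision, and more from the confusion matrix
collect_predictions(final_res) %>%
  conf_mat(target, .pred_class) %>%
  summary(event_level = "second")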
Model Predictions
Finally, the model's predictions on the test set, labeled as correct or incorrect, are presented in the interactive data table below.
preTab <- final_res %>%
  collect_predictions() %>%
  mutate(correct = case_when(
    target == .pred_class ~ "Correct",
    TRUE ~ "Incorrect"
  ))

datatable(preTab)
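To score future races, the fitted workflow can be extracted from the `last_fit()` result (tune 0.1.6 or later) and saved for reuse. A sketch, where `new_races` is a hypothetical data frame with the same columns as `figs`:
# Pull out the workflow trained during last_fit() and persist it
final_fitted <- extract_workflow(final_res)
saveRDS(final_fitted, "figs_rf_model.rds")

# predict(final_fitted, new_data = new_races, type = "prob")  # hypothetical scoring call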