Logistic Regression 2

Get the data from Github

data <- getURL("https://raw.githubusercontent.com/Handicappr/Rstudio_test_project/main/fig_data2.csv")

figs <- read_csv(data) %>% 
  as_tibble() %>% 
  rename(n = `ï»¿n`) %>% 
  mutate(eff_target = if_else(eff_target==1, "Top", "NoTop")) %>% 
  # Remove columns that can't be used in model
  select(-target, -D1, -D2, -eff_TV, -top, -eff_top, -fv, -DST, -seed, -E, -E4, -TE, -split_800, -split_801, -split_802, -split_803, -split_804, -split_805, -RST, -X) %>% 
  rename(target = eff_target) %>% 
  #change character data to factors
  mutate_if(is.character, factor) %>% 
  mutate(race_date = mdy(race_date)) %>% 
  mutate(target = factor(target, c("Top", "NoTop")))
  # Change Top to first level
  # levels(figs$target)

  
skim(figs)

Data summary
Name	figs
Number of rows	35617
Number of columns	60
_______________________
Column type frequency:
Date	1
factor	14
numeric	45
________________________
Group variables	None

Variable type: Date

skim_variable	n_missing	complete_rate	min	max	median	n_unique
race_date	0	1	2016-04-13	2021-12-20	2019-10-31	1433

Variable type: factor

skim_variable	complete_rate	ordered	n_unique	top_counts
target	1	FALSE	2	NoT: 27061, Top: 8556
horse	1	FALSE	3387	CHA: 57, VIC: 57, STA: 54, EL : 53
surface_cond	1	FALSE	13	fst: 16366, frm: 10368, wsm: 2913, vsl: 1771
precip	1	FALSE	3	cle: 34259, rai: 1278, sno: 80
wind	1	FALSE	3	cal: 31586, hvy: 2581, vhv: 1450
gender	1	FALSE	2	Mal: 24217, Fem: 11400
distance	1	FALSE	22	8.0: 9316, 6.0: 7624, 8.5: 4621, 7.0: 3011
jky	1	FALSE	729	I O: 1034, L S: 930, M F: 915, J O: 879
trk_code	1	FALSE	145	GP: 8459, AQU: 4858, BEL: 3366, SA: 2892
surf	1	FALSE	5	Dir: 21482, Tur: 11757, Syn: 1271, Inn: 1038
s_cond	1	FALSE	8	Fst: 18433, Frm: 10362, Gd: 3275, Sly: 2180
form_cycle	1	FALSE	24	AX: 6600, CX: 3762, BX: 3536, A1: 3387
race_type	1	FALSE	2	Spr: 18816, Rou: 16801
trk	1	FALSE	130	GP: 8377, AQ: 4769, BE: 3359, SA: 2887

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
n	0	1	9.86	8.51	1.00	3.00	7.00	14.00	57.00	▇▂▁▁▁
cls_val	0	1	1.99	0.89	1.00	1.15	1.62	3.00	12.59	▇▁▁▁▁
cls_mv	0	1	0.07	0.68	-11.49	0.00	0.00	0.00	11.39	▁▁▇▁▁
yr	0	1	2019.22	0.99	2016.00	2019.00	2019.00	2020.00	2021.00	▁▃▇▇▂
age	0	1	3.64	1.30	1.00	3.00	3.00	4.00	11.00	▇▆▁▁▁
ftl	0	1	0.08	0.27	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
clm_by_hot	0	1	-0.93	0.34	-1.00	-1.00	-1.00	-1.00	1.00	▇▁▁▁▁
start	0	1	0.03	0.09	0.00	0.00	0.00	0.00	0.90	▇▁▁▁▁
bled	0	1	0.00	0.02	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
off_turf	0	1	0.05	0.21	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
bfnr	0	1	0.01	0.11	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
lame	0	1	0.00	0.02	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
lsx	0	1	0.00	0.09	-1.00	0.00	0.00	0.00	1.00	▁▁▇▁▁
blnkrs	0	1	0.04	0.27	-1.00	0.00	0.00	0.00	1.00	▁▁▇▁▁
age_wks	0	1	204.45	65.52	37.00	154.00	190.00	240.00	559.00	▂▇▂▁▁
wt	2	1	120.32	2.70	108.00	119.00	120.00	122.00	143.00	▁▇▂▁▁
off_odds	0	1	12.17	17.09	0.11	2.50	6.00	13.00	99.00	▇▁▁▁▁
fld	2	1	8.39	2.21	2.00	7.00	8.00	10.00	30.00	▅▇▁▁▁
L3	0	1	18.01	7.03	1.75	13.33	16.92	21.42	99.00	▇▃▁▁▁
L5	0	1	18.26	6.77	1.75	13.70	17.25	21.62	99.00	▇▃▁▁▁
L7	0	1	18.43	6.66	1.75	13.96	17.43	21.75	99.00	▇▃▁▁▁
rest	0	1	43.85	53.38	0.00	21.00	28.00	43.00	1081.00	▇▁▁▁▁
avg_rest	0	1	39.75	21.66	7.25	25.67	34.70	46.78	489.00	▇▁▁▁▁
efforts_last90	0	1	0.90	0.95	0.00	0.00	1.00	1.00	8.00	▇▂▁▁▁
Lag1	0	1	0.21	0.40	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▂
Lag2	0	1	0.19	0.39	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▂
Lag3	0	1	0.17	0.37	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▂
Lag4	0	1	0.15	0.36	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▂
Lag5	0	1	0.14	0.34	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
LP1	0	1	-1.91	2.27	-5.00	-5.00	-2.00	0.00	1.00	▆▂▃▂▇
LP2	0	1	-1.91	2.16	-5.00	-4.00	-2.00	0.00	1.00	▆▂▆▂▇
LP3	0	1	-1.92	2.05	-5.00	-4.00	-2.00	0.00	1.00	▆▂▇▂▇
LP4	0	1	-1.92	1.95	-5.00	-3.00	-2.00	0.00	1.00	▅▁▇▁▆
LP5	0	1	-1.93	1.85	-5.00	-3.00	-2.00	0.00	1.00	▃▁▇▁▅
pp_id	0	1	27487.78	16975.26	1.00	12623.00	26040.00	41459.00	59178.00	▇▇▆▇▆
won	0	1	0.19	0.39	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▂
splt_1	0	1	4.48	6.00	0.00	0.00	2.89	6.79	51.00	▇▁▁▁▁
splt_2	0	1	2.36	3.93	0.00	0.00	0.00	3.81	35.91	▇▁▁▁▁
splt_3	0	1	2.60	5.16	0.00	0.00	0.00	3.57	45.30	▇▁▁▁▁
splt_4	0	1	3.53	5.16	0.00	0.00	1.23	5.27	46.29	▇▁▁▁▁
splt_5	0	1	1.34	2.75	0.00	0.00	0.00	1.93	27.99	▇▁▁▁▁
splt_6	0	1	2.85	4.09	0.00	0.00	0.00	3.97	30.87	▇▂▁▁▁
splt_7	0	1	4.68	5.87	0.00	0.00	2.90	7.42	37.86	▇▂▁▁▁
splt_8	0	1	11.88	9.05	0.00	3.93	10.63	17.73	50.57	▇▆▂▁▁
splt_9	0	1	10.57	8.58	0.00	2.97	9.35	16.01	47.05	▇▅▂▁▁

Build Model

We can start by splitting our data into training and testing sets.

set.seed(123)

figs_split <- initial_split(figs, strata = target)
figs_train <- training(figs_split)
figs_test <- testing(figs_split)


figs_split

## <Analysis/Assess/Total>
## <26713/8904/35617>

Create Bootstrap

Next, let’s create bootstrap resamples of the training data, to evaluate our models.

set.seed(234)
figs_boot <- bootstraps(figs_train)
figs_boot

## # Bootstrap sampling 
## # A tibble: 25 x 2
##    splits               id         
##    <list>               <chr>      
##  1 <split [26713/9810]> Bootstrap01
##  2 <split [26713/9822]> Bootstrap02
##  3 <split [26713/9857]> Bootstrap03
##  4 <split [26713/9783]> Bootstrap04
##  5 <split [26713/9855]> Bootstrap05
##  6 <split [26713/9837]> Bootstrap06
##  7 <split [26713/9817]> Bootstrap07
##  8 <split [26713/9801]> Bootstrap08
##  9 <split [26713/9788]> Bootstrap09
## 10 <split [26713/9846]> Bootstrap10
## # … with 15 more rows

Create Model Specification

We create a model specification utilizing the glm package

glm_spec <- logistic_reg() %>% 
  set_engine("glm")

glm_spec

## Logistic Regression Model Specification (classification)
## 
## Computational engine: glm

Create a Recipe

The recipe performs preprocessing necessary to fit a logistic regression model

figs_rec <- recipe(target ~ ., data=figs_train) %>%
  update_role(horse, new_role = "ID") %>% 
  update_role(pp_id, new_role = "ID") %>% 
  update_role(won, new_role = "ID") %>% 
  step_date(race_date, features = c("month", "year")) %>% 
  step_other(jky , threshold = 0.015) %>% 
  step_other(trk_code , threshold = 0.02) %>% 
  step_other(distance, threshold = 0.02) %>% 
  step_dummy(distance) %>% 
  step_dummy(gender) %>% 
  step_dummy(trk_code) %>% 
  step_dummy(jky) %>% 
  step_dummy(form_cycle) %>% 
  step_dummy(surf) %>% 
  step_dummy(wind) %>% 
  step_dummy(precip) %>% 
  step_dummy(s_cond) %>% 
  step_dummy(race_type) %>% 
  step_dummy(surface_cond) %>% 
  step_dummy(race_date_month) %>%
  step_rm(yr, trk) %>% 
  step_rm(race_date, n) %>% 
  step_zv(all_predictors(), -all_outcomes()) %>% 
  step_corr(all_numeric(), -all_outcomes()) %>% 
  themis::step_downsample(target)
  
  

figs_prep <- prep(figs_rec)

juiced <- juice(figs_prep) 

skim(juiced)

Data summary
Name	juiced
Number of rows	12834
Number of columns	140
_______________________
Column type frequency:
factor	2
numeric	138
________________________
Group variables	None

Variable type: factor

skim_variable	n_missing	complete_rate	ordered	n_unique	top_counts
horse	0	1	FALSE	2955	PLA: 22, RAD: 20, HON: 19, SAL: 19
target	0	1	FALSE	2	Top: 6417, NoT: 6417

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
cls_val	0	1	1.97	0.89	1.00	1.15	1.50	3.00	12.59	▇▁▁▁▁
cls_mv	0	1	0.10	0.70	-7.51	0.00	0.00	0.00	11.39	▁▇▂▁▁
ftl	0	1	0.07	0.25	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
clm_by_hot	0	1	-0.93	0.32	-1.00	-1.00	-1.00	-1.00	1.00	▇▁▁▁▁
start	0	1	0.02	0.09	0.00	0.00	0.00	0.00	0.90	▇▁▁▁▁
bled	0	1	0.00	0.01	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
off_turf	0	1	0.04	0.20	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
bfnr	0	1	0.01	0.09	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
lame	0	1	0.00	0.02	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
lsx	0	1	0.00	0.10	-1.00	0.00	0.00	0.00	1.00	▁▁▇▁▁
blnkrs	0	1	0.04	0.27	-1.00	0.00	0.00	0.00	1.00	▁▁▇▁▁
age_wks	0	1	198.15	62.81	44.00	151.00	183.00	230.00	557.00	▃▇▂▁▁
wt	1	1	120.31	2.70	108.00	118.00	120.00	122.00	140.00	▁▇▆▁▁
off_odds	0	1	11.92	16.87	0.11	2.50	5.00	13.00	99.00	▇▁▁▁▁
fld	1	1	8.42	2.20	2.00	7.00	8.00	10.00	30.00	▅▇▁▁▁
L7	0	1	18.65	6.66	3.25	14.14	17.62	22.05	99.00	▇▂▁▁▁
rest	0	1	45.27	53.38	0.00	21.00	29.00	46.00	730.00	▇▁▁▁▁
avg_rest	0	1	40.42	22.76	9.33	26.00	34.83	47.14	297.00	▇▁▁▁▁
efforts_last90	0	1	0.98	0.94	0.00	0.00	1.00	2.00	7.00	▇▂▁▁▁
Lag1	0	1	0.23	0.42	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▂
Lag2	0	1	0.19	0.39	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▂
Lag3	0	1	0.16	0.37	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▂
Lag4	0	1	0.14	0.35	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▂
Lag5	0	1	0.13	0.33	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
LP1	0	1	-1.62	2.27	-5.00	-4.00	-1.00	0.00	1.00	▅▁▂▁▇
LP2	0	1	-1.75	2.11	-5.00	-3.00	-2.00	0.00	1.00	▅▁▆▂▇
LP3	0	1	-1.85	1.98	-5.00	-3.00	-2.00	0.00	1.00	▅▁▇▁▇
LP4	0	1	-1.86	1.87	-5.00	-2.00	-2.00	0.00	1.00	▃▁▇▁▅
LP5	0	1	-1.90	1.76	-5.00	-2.00	-2.00	-1.00	1.00	▃▁▇▁▃
pp_id	0	1	27262.91	16891.16	3.00	12649.50	25503.50	41077.50	59175.00	▇▇▆▇▅
won	0	1	0.24	0.43	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▂
splt_1	0	1	4.36	5.76	0.00	0.00	2.93	6.71	51.00	▇▁▁▁▁
splt_2	0	1	2.38	3.81	0.00	0.00	0.00	3.82	33.07	▇▁▁▁▁
splt_3	0	1	2.31	4.87	0.00	0.00	0.00	2.92	45.30	▇▁▁▁▁
splt_4	0	1	3.43	4.99	0.00	0.00	1.63	4.89	46.29	▇▁▁▁▁
splt_5	0	1	1.31	2.69	0.00	0.00	0.00	1.96	26.99	▇▁▁▁▁
splt_6	0	1	2.70	3.91	0.00	0.00	0.00	3.94	30.87	▇▁▁▁▁
splt_7	0	1	4.45	5.63	0.00	0.00	2.90	6.87	37.86	▇▂▁▁▁
splt_8	0	1	11.22	8.69	0.00	3.00	9.85	16.55	50.57	▇▅▂▁▁
splt_9	0	1	10.09	8.22	0.00	2.96	8.38	14.97	43.48	▇▅▂▁▁
race_date_year	0	1	2019.20	1.00	2016.00	2019.00	2019.00	2020.00	2021.00	▁▃▇▇▂
distance_X5.5F	0	1	0.06	0.24	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
distance_X6.0F	0	1	0.21	0.40	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▂
distance_X6.5F	0	1	0.08	0.27	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
distance_X7.0F	0	1	0.09	0.28	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
distance_X8.0F	0	1	0.27	0.44	0.00	0.00	0.00	1.00	1.00	▇▁▁▁▃
distance_X8.5F	0	1	0.14	0.34	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
distance_X9.0F	0	1	0.05	0.21	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
distance_other	0	1	0.06	0.23	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
gender_Male	0	1	0.68	0.47	0.00	0.00	1.00	1.00	1.00	▃▁▁▁▇
trk_code_BEL	0	1	0.09	0.29	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
trk_code_CD	0	1	0.05	0.21	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
trk_code_DMR	0	1	0.04	0.20	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
trk_code_GP	0	1	0.24	0.43	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▂
trk_code_GPW	0	1	0.03	0.18	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
trk_code_KEE	0	1	0.02	0.15	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
trk_code_SA	0	1	0.08	0.27	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
trk_code_SAR	0	1	0.05	0.23	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
trk_code_WO	0	1	0.03	0.16	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
trk_code_other	0	1	0.22	0.42	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▂
jky_E.Cancel	0	1	0.02	0.13	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
jky_E.Jaramillo	0	1	0.02	0.13	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
jky_E.Zayas	0	1	0.02	0.15	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
jky_I.Ortiz..Jr.	0	1	0.03	0.17	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
jky_J.Alvarado	0	1	0.02	0.13	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
jky_J.Lezcano	0	1	0.02	0.14	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
jky_J.Ortiz	0	1	0.03	0.16	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
jky_J.Rosario	0	1	0.02	0.14	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
jky_L.Reyes	0	1	0.02	0.13	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
jky_L.Saez	0	1	0.03	0.16	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
jky_M.Franco	0	1	0.02	0.15	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
jky_M.Vasquez	0	1	0.02	0.13	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
jky_P.Lopez	0	1	0.02	0.12	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
jky_R.Maragh	0	1	0.02	0.12	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
jky_T.Gaffalione	0	1	0.02	0.15	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
jky_other	0	1	0.67	0.47	0.00	0.00	1.00	1.00	1.00	▃▁▁▁▇
form_cycle_A2	0	1	0.11	0.31	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
form_cycle_A3	0	1	0.07	0.26	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
form_cycle_AX	0	1	0.18	0.39	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▂
form_cycle_B1	0	1	0.06	0.25	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
form_cycle_B2	0	1	0.04	0.20	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
form_cycle_B3	0	1	0.03	0.17	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
form_cycle_BX	0	1	0.09	0.28	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
form_cycle_C1	0	1	0.05	0.22	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
form_cycle_C2	0	1	0.04	0.18	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
form_cycle_C3	0	1	0.03	0.16	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
form_cycle_CX	0	1	0.10	0.30	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
form_cycle_D1	0	1	0.02	0.16	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
form_cycle_D2	0	1	0.02	0.14	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
form_cycle_D3	0	1	0.01	0.12	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
form_cycle_DX	0	1	0.04	0.20	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
form_cycle_E1	0	1	0.01	0.10	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
form_cycle_E2	0	1	0.01	0.09	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
form_cycle_E3	0	1	0.00	0.07	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
form_cycle_EX	0	1	0.02	0.12	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
form_cycle_F1	0	1	0.00	0.06	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
form_cycle_F2	0	1	0.00	0.06	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
form_cycle_F3	0	1	0.00	0.04	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
form_cycle_FX	0	1	0.00	0.07	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
surf_Inner.Dirt	0	1	0.00	0.04	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
surf_Inner.Turf	0	1	0.03	0.17	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
surf_Synth	0	1	0.04	0.19	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
surf_Turf	0	1	0.35	0.48	0.00	0.00	0.00	1.00	1.00	▇▁▁▁▅
wind_hvy	0	1	0.07	0.26	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
wind_vhvy	0	1	0.04	0.19	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
precip_rain	0	1	0.03	0.18	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
precip_snow	0	1	0.00	0.04	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
s_cond_Fst	0	1	0.51	0.50	0.00	0.00	1.00	1.00	1.00	▇▁▁▁▇
s_cond_Gd	0	1	0.09	0.29	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
s_cond_My	0	1	0.02	0.14	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
s_cond_Sf	0	1	0.00	0.07	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
s_cond_Sly	0	1	0.06	0.23	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
s_cond_Trk_condit	0	1	0.00	0.03	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
s_cond_Yl	0	1	0.01	0.11	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
race_type_Sprint	0	1	0.52	0.50	0.00	0.00	1.00	1.00	1.00	▇▁▁▁▇
surface_cond_frz	0	1	0.00	0.03	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
surface_cond_fst	0	1	0.45	0.50	0.00	0.00	0.00	1.00	1.00	▇▁▁▁▆
surface_cond_gd	0	1	0.05	0.21	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
surface_cond_gd.slw	0	1	0.03	0.17	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
surface_cond_hvy	0	1	0.00	0.02	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
surface_cond_puaf	0	1	0.01	0.08	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
surface_cond_sft	0	1	0.01	0.09	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
surface_cond_vbw	0	1	0.00	0.03	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
surface_cond_vslw	0	1	0.05	0.21	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
surface_cond_wf	0	1	0.01	0.09	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
surface_cond_wsm	0	1	0.08	0.27	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
surface_cond_yld	0	1	0.01	0.11	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
race_date_month_Feb	0	1	0.09	0.29	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
race_date_month_Mar	0	1	0.06	0.24	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
race_date_month_Apr	0	1	0.05	0.22	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
race_date_month_May	0	1	0.07	0.25	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
race_date_month_Jun	0	1	0.08	0.27	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
race_date_month_Jul	0	1	0.08	0.27	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
race_date_month_Aug	0	1	0.09	0.29	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
race_date_month_Sep	0	1	0.09	0.29	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
race_date_month_Oct	0	1	0.09	0.29	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
race_date_month_Nov	0	1	0.10	0.30	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁
race_date_month_Dec	0	1	0.09	0.28	0.00	0.00	0.00	0.00	1.00	▇▁▁▁▁

Setup a Workflow

We create the workflow by adding the formula and recipe

figs_wf <- workflow() %>% 
  add_recipe(figs_rec)

Add Model to Workflow and Fit to each of the resamples.

members_metrics <- metric_set(roc_auc, accuracy, sensitivity, specificity)

doParallel::registerDoParallel()

glm_rs <- figs_wf %>% 
  add_model(glm_spec) %>% 
  fit_resamples(
    resamples = figs_boot,
    metrics = members_metrics,
    control = control_resamples(save_pred = TRUE)
  )

glm_rs

## # Resampling results
## # Bootstrap sampling 
## # A tibble: 25 x 5
##    splits            id         .metrics        .notes        .predictions      
##    <list>            <chr>      <list>          <list>        <list>            
##  1 <split [26713/98… Bootstrap… <tibble [4 × 4… <tibble [2 ×… <tibble [9,810 × …
##  2 <split [26713/98… Bootstrap… <tibble [4 × 4… <tibble [1 ×… <tibble [9,822 × …
##  3 <split [26713/98… Bootstrap… <tibble [4 × 4… <tibble [1 ×… <tibble [9,857 × …
##  4 <split [26713/97… Bootstrap… <tibble [4 × 4… <tibble [2 ×… <tibble [9,783 × …
##  5 <split [26713/98… Bootstrap… <tibble [4 × 4… <tibble [1 ×… <tibble [9,855 × …
##  6 <split [26713/98… Bootstrap… <tibble [4 × 4… <tibble [1 ×… <tibble [9,837 × …
##  7 <split [26713/98… Bootstrap… <tibble [4 × 4… <tibble [1 ×… <tibble [9,817 × …
##  8 <split [26713/98… Bootstrap… <tibble [4 × 4… <tibble [1 ×… <tibble [9,801 × …
##  9 <split [26713/97… Bootstrap… <tibble [4 × 4… <tibble [1 ×… <tibble [9,788 × …
## 10 <split [26713/98… Bootstrap… <tibble [4 × 4… <tibble [1 ×… <tibble [9,846 × …
## # … with 15 more rows

Evaluate Model

Now we use collect_metrics to see the results of our modeling. Here we use accuracy and roc to evaluate the model

collect_metrics(glm_rs)

## # A tibble: 4 x 6
##   .metric  .estimator  mean     n  std_err .config             
##   <chr>    <chr>      <dbl> <int>    <dbl> <chr>               
## 1 accuracy binary     0.679    25 0.000834 Preprocessor1_Model1
## 2 roc_auc  binary     0.757    25 0.000891 Preprocessor1_Model1
## 3 sens     binary     0.699    25 0.00219  Preprocessor1_Model1
## 4 spec     binary     0.673    25 0.00133  Preprocessor1_Model1

Now lets look at the confusion matrix

glm_rs %>% 
  conf_mat_resampled()

## # A tibble: 4 x 3
##   Prediction Truth  Freq
##   <fct>      <fct> <dbl>
## 1 Top        Top   1657.
## 2 Top        NoTop 2435.
## 3 NoTop      Top    715.
## 4 NoTop      NoTop 5009.

Finally, lets plot and roc curve to visualize our roc resutls

glm_rs %>% 
  collect_predictions() %>% 
  group_by(id) %>%   
  roc_curve(target, .pred_Top) %>% 
  ggplot(aes(1-specificity, sensitivity, color=id)) +
  geom_abline(lty=2, color="gray80", size= 1.5) +
  geom_path(show.legend =FALSE, alpha = 0.6, size =1.2) +
  coord_equal()

Finalize Model

Now lets peform the last_fit() to finalize our model against the test dataset

figs_final <- figs_wf %>% 
  add_model(glm_spec) %>% 
  last_fit(figs_split)

figs_final

## # Resampling results
## # Manual resampling 
## # A tibble: 1 x 6
##   splits         id          .metrics     .notes      .predictions     .workflow
##   <list>         <chr>       <list>       <list>      <list>           <list>   
## 1 <split [26713… train/test… <tibble [2 … <tibble [1… <tibble [8,904 … <workflo…

The metrics and prediction have been calculated against the test data

collect_metrics(figs_final)

## # A tibble: 2 x 4
##   .metric  .estimator .estimate .config             
##   <chr>    <chr>          <dbl> <chr>               
## 1 accuracy binary         0.671 Preprocessor1_Model1
## 2 roc_auc  binary         0.756 Preprocessor1_Model1

collect_predictions(figs_final) %>% 
  conf_mat(target, .pred_class)

##           Truth
## Prediction  Top NoTop
##      Top   1519  2306
##      NoTop  620  4458

The coefficients (which we can get out using tidy()) have been estimated using the training data. If we use exponentiate = TRUE, we have odds ratios.

options(scipen = 999)

figs_final$.workflow[[1]] %>% 
  tidy(exponentiate=TRUE) %>% 
  arrange(p.value) %>%
  DT::datatable()

figs_final %>%
  pull(.workflow) %>%
  pluck(1) %>%
  tidy() %>%
  filter(term != "(Intercept)") %>%
  ggplot(aes(estimate, fct_reorder(term, estimate))) +
  geom_vline(xintercept = 0, color = "gray50", lty = 2, size = 1.2) +
  geom_point(size = 2, color = "#85144B") +
  labs(y = NULL, x = "Coefficent from logistic regression")

Logistic Regression 2 - Data 698

Get the data from Github

Build Model

We can start by splitting our data into training and testing sets.

Create Bootstrap

Next, let’s create bootstrap resamples of the training data, to evaluate our models.

Create Model Specification

We create a model specification utilizing the glm package

Create a Recipe

The recipe performs preprocessing necessary to fit a logistic regression model

Setup a Workflow

We create the workflow by adding the formula and recipe

Add Model to Workflow and Fit to each of the resamples.

Evaluate Model

Now we use collect_metrics to see the results of our modeling. Here we use accuracy and roc to evaluate the model

Now lets look at the confusion matrix

Finally, lets plot and roc curve to visualize our roc resutls

Finalize Model

Now lets peform the last_fit() to finalize our model against the test dataset

The metrics and prediction have been calculated against the test data

The coefficients (which we can get out using tidy()) have been estimated using the training data. If we use exponentiate = TRUE, we have odds ratios.