Logistic Regression

Get the data from Github

data <- getURL("https://raw.githubusercontent.com/Handicappr/Rstudio_test_project/main/fig_data.csv")

figs <- read_csv(data) %>% 
  as_tibble() %>% 
  rename(target = `ï»¿target`) %>% 
  mutate(PT3_2 = PT3) %>% 
  mutate(PT3_2 = str_replace_all(PT3_2,"0","T")) %>%
  mutate(PT3_2 = str_replace_all(PT3_2,"1","A")) %>% 
  mutate(PT3_2 = str_replace_all(PT3_2,"2","B")) %>%
  mutate(PT3_2 = str_replace_all(PT3_2,"3","C")) %>% 
  mutate(PT3_2 = str_replace_all(PT3_2,"4","D")) %>% 
  mutate(PT3 = PT3_2) %>% 
  select(-PT3_2, -D1, -D2, -eff_TV, -n, -top, -eff_top, -fv, -DST) %>%  
  #change character data to factors
  mutate_if(is.character, factor)
  #change distance to numeric
  
 
  
skim(figs)

Data summary
Name	figs
Number of rows	34918
Number of columns	41
_______________________
Column type frequency:
Date	1
factor	14
numeric	26
________________________
Group variables	None

Variable type: Date

skim_variable	n_missing	complete_rate	min	max	median	n_unique
race_date	0	1	2016-04-13	2021-12-20	2019-10-31	1433

Variable type: factor

skim_variable	complete_rate	ordered	n_unique	top_counts
target	1	FALSE	2	NoT: 27015, TOP: 7903
horse	1	FALSE	3388	CHA: 57, EL : 53, STA: 53, STA: 52
surface_cond	1	FALSE	13	fst: 16041, frm: 10188, wsm: 2845, vsl: 1734
precip	1	FALSE	3	cle: 33590, rai: 1250, sno: 78
wind	1	FALSE	3	cal: 30965, hvy: 2537, vhv: 1416
gender	1	FALSE	2	Mal: 23692, Fem: 11226
distance	1	FALSE	22	8.0: 9161, 6.0: 7606, 8.5: 4518, 7.0: 2920
jky	1	FALSE	729	I O: 1009, L S: 910, M F: 886, J O: 853
trk_code	1	FALSE	145	GP: 8279, AQU: 4746, BEL: 3269, SA: 2878
surf	1	FALSE	5	Dir: 21037, Tur: 11548, Syn: 1245, Inn: 1019
s_cond	1	FALSE	8	Fst: 18059, Frm: 10175, Gd: 3216, Sly: 2129
PT3	1	FALSE	352	–: 3388, P: 3007, XXX: 2290, TP: 1717
form_cycle	1	FALSE	24	AX: 6410, CX: 3647, BX: 3470, A1: 3388
race_type	1	FALSE	2	Spr: 18436, Rou: 16482

Variable type: numeric

skim_variable	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
yr	1	2019.22	0.99	2016.00	2019.00	2019.00	2020.00	2021.0	▁▃▇▇▂
age	1	3.64	1.30	1.00	3.00	3.00	4.00	11.0	▇▆▁▁▁
ftl	1	0.08	0.27	0.00	0.00	0.00	0.00	1.0	▇▁▁▁▁
clm_by_hot	1	-0.93	0.34	-1.00	-1.00	-1.00	-1.00	1.0	▇▁▁▁▁
start	1	0.03	0.09	0.00	0.00	0.00	0.00	0.9	▇▁▁▁▁
bled	1	0.00	0.02	0.00	0.00	0.00	0.00	1.0	▇▁▁▁▁
off_turf	1	0.05	0.21	0.00	0.00	0.00	0.00	1.0	▇▁▁▁▁
bfnr	1	0.01	0.11	0.00	0.00	0.00	0.00	1.0	▇▁▁▁▁
lame	1	0.00	0.02	0.00	0.00	0.00	0.00	1.0	▇▁▁▁▁
lsx	1	0.00	0.09	-1.00	0.00	0.00	0.00	1.0	▁▁▇▁▁
blnkrs	1	0.04	0.27	-1.00	0.00	0.00	0.00	1.0	▁▁▇▁▁
age_wks	1	204.49	65.51	37.00	154.00	190.00	240.00	559.0	▂▇▂▁▁
wt	1	120.33	2.70	108.00	119.00	120.00	122.00	143.0	▁▇▂▁▁
off_odds	1	12.19	17.14	0.11	2.50	6.00	13.00	99.0	▇▁▁▁▁
fld	1	8.40	2.21	2.00	7.00	8.00	10.00	30.0	▅▇▁▁▁
L3	1	18.03	7.04	1.75	13.33	16.92	21.42	99.0	▇▃▁▁▁
L5	1	18.27	6.78	1.75	13.75	17.25	21.62	99.0	▇▃▁▁▁
L7	1	18.44	6.68	1.75	14.00	17.43	21.75	99.0	▇▃▁▁▁
rest	1	44.73	53.55	2.00	21.00	28.00	44.00	1081.0	▇▁▁▁▁
avg_rest	1	39.90	21.70	9.67	25.68	34.80	46.90	489.0	▇▁▁▁▁
efforts_last90	1	0.90	0.95	0.00	0.00	1.00	1.00	8.0	▇▂▁▁▁
Lag1	1	0.20	0.40	0.00	0.00	0.00	0.00	1.0	▇▁▁▁▂
Lag2	1	0.19	0.39	0.00	0.00	0.00	0.00	1.0	▇▁▁▁▂
Lag3	1	0.17	0.37	0.00	0.00	0.00	0.00	1.0	▇▁▁▁▂
Lag4	1	0.15	0.36	0.00	0.00	0.00	0.00	1.0	▇▁▁▁▂
Lag5	1	0.14	0.34	0.00	0.00	0.00	0.00	1.0	▇▁▁▁▁

Build Model

We can start by splitting our data into training and testing sets.

set.seed(123)

figs_split <- initial_split(figs, strata = target)
figs_train <- training(figs_split)
figs_test <- testing(figs_split)

Create Bootstrap

Next, let’s create bootstrap resamples of the training data, to evaluate our models.

set.seed(234)
figs_boot <- bootstraps(figs_train)
figs_boot

## # Bootstrap sampling 
## # A tibble: 25 x 2
##    splits               id         
##    <list>               <chr>      
##  1 <split [26190/9615]> Bootstrap01
##  2 <split [26190/9619]> Bootstrap02
##  3 <split [26190/9658]> Bootstrap03
##  4 <split [26190/9585]> Bootstrap04
##  5 <split [26190/9666]> Bootstrap05
##  6 <split [26190/9641]> Bootstrap06
##  7 <split [26190/9609]> Bootstrap07
##  8 <split [26190/9605]> Bootstrap08
##  9 <split [26190/9596]> Bootstrap09
## 10 <split [26190/9661]> Bootstrap10
## # … with 15 more rows

Create Model Specification

We create a model specification utilizing the glm package

glm_spec <- logistic_reg() %>% 
  set_engine("glm")

glm_spec

## Logistic Regression Model Specification (classification)
## 
## Computational engine: glm

Create a Recipe

The recipe performs preprocessing necessary to fit a logistic regression model

figs_rec<-  recipe(target ~ ., data=figs_train) %>% 
  update_role(horse, new_role = "ID") %>% 
  step_zv(all_numeric()) %>% 
  step_corr(all_numeric()) %>% 
  step_other(jky , threshold = 0.015) %>% 
  step_other(trk_code , threshold = 0.02) %>% 
  step_other(distance, threshold = 0.02) %>% 
  step_dummy(distance) %>% 
  step_dummy(PT3) %>% 
  step_dummy(gender) %>% 
  step_dummy(trk_code) %>% 
  step_dummy(jky) %>% 
  step_dummy(form_cycle) %>% 
  step_dummy(surf) %>% 
  step_dummy(wind) %>% 
  step_dummy(precip) %>% 
  step_dummy(s_cond) %>% 
  step_dummy(race_type) %>% 
  step_dummy(surface_cond) %>% 
  step_rm(yr) %>% 
  step_date(race_date, features = c("month", "year")) %>% 
  step_rm(race_date) %>% 
  #step_zv(all_predictors()) %>% 
  #step_corr(all_predictors()) %>% 
  themis::step_downsample(target)
  
  

figs_prep <- prep(figs_rec)

Setup a Workflow

We create the workflow by adding the formula and recipe

figs_wf <- workflow() %>% 
  add_recipe(figs_rec)

Add Model to Workflow and Fit to each of the resamples.

glm_rs <- figs_wf %>% 
  add_model(glm_spec) %>% 
  fit_resamples(
    resamples = figs_boot,
    control = control_resamples(save_pred = TRUE)
  )

## ! Bootstrap01: preprocessor 1/1, model 1/1: glm.fit: fitted probabilities numerically 0...

## ! Bootstrap01: preprocessor 1/1, model 1/1 (predictions): prediction from a rank-defici...

## ! Bootstrap02: preprocessor 1/1, model 1/1: glm.fit: fitted probabilities numerically 0...

## ! Bootstrap02: preprocessor 1/1, model 1/1 (predictions): prediction from a rank-defici...

## ! Bootstrap03: preprocessor 1/1, model 1/1: glm.fit: fitted probabilities numerically 0...

## ! Bootstrap03: preprocessor 1/1, model 1/1 (predictions): prediction from a rank-defici...

## ! Bootstrap04: preprocessor 1/1, model 1/1: glm.fit: fitted probabilities numerically 0...

## ! Bootstrap04: preprocessor 1/1, model 1/1 (predictions): prediction from a rank-defici...

## ! Bootstrap05: preprocessor 1/1, model 1/1 (predictions): prediction from a rank-defici...

## ! Bootstrap06: preprocessor 1/1, model 1/1: glm.fit: fitted probabilities numerically 0...

## ! Bootstrap06: preprocessor 1/1, model 1/1 (predictions): prediction from a rank-defici...

## ! Bootstrap07: preprocessor 1/1, model 1/1: glm.fit: fitted probabilities numerically 0...

## ! Bootstrap07: preprocessor 1/1, model 1/1 (predictions): prediction from a rank-defici...

## ! Bootstrap08: preprocessor 1/1, model 1/1: glm.fit: fitted probabilities numerically 0...

## ! Bootstrap08: preprocessor 1/1, model 1/1 (predictions): prediction from a rank-defici...

## ! Bootstrap09: preprocessor 1/1, model 1/1: glm.fit: fitted probabilities numerically 0...

## ! Bootstrap09: preprocessor 1/1, model 1/1 (predictions): prediction from a rank-defici...

## ! Bootstrap10: preprocessor 1/1, model 1/1: glm.fit: fitted probabilities numerically 0...

## ! Bootstrap10: preprocessor 1/1, model 1/1 (predictions): prediction from a rank-defici...

## ! Bootstrap11: preprocessor 1/1, model 1/1: glm.fit: fitted probabilities numerically 0...

## ! Bootstrap11: preprocessor 1/1, model 1/1 (predictions): prediction from a rank-defici...

## ! Bootstrap12: preprocessor 1/1, model 1/1: glm.fit: fitted probabilities numerically 0...

## ! Bootstrap12: preprocessor 1/1, model 1/1 (predictions): prediction from a rank-defici...

## ! Bootstrap13: preprocessor 1/1, model 1/1: glm.fit: fitted probabilities numerically 0...

## ! Bootstrap13: preprocessor 1/1, model 1/1 (predictions): prediction from a rank-defici...

## ! Bootstrap14: preprocessor 1/1, model 1/1: glm.fit: fitted probabilities numerically 0...

## ! Bootstrap14: preprocessor 1/1, model 1/1 (predictions): prediction from a rank-defici...

## ! Bootstrap15: preprocessor 1/1, model 1/1: glm.fit: fitted probabilities numerically 0...

## ! Bootstrap15: preprocessor 1/1, model 1/1 (predictions): prediction from a rank-defici...

## ! Bootstrap16: preprocessor 1/1, model 1/1: glm.fit: fitted probabilities numerically 0...

## ! Bootstrap16: preprocessor 1/1, model 1/1 (predictions): prediction from a rank-defici...

## ! Bootstrap17: preprocessor 1/1, model 1/1: glm.fit: fitted probabilities numerically 0...

## ! Bootstrap17: preprocessor 1/1, model 1/1 (predictions): prediction from a rank-defici...

## ! Bootstrap18: preprocessor 1/1, model 1/1: glm.fit: fitted probabilities numerically 0...

## ! Bootstrap18: preprocessor 1/1, model 1/1 (predictions): prediction from a rank-defici...

## ! Bootstrap19: preprocessor 1/1, model 1/1: glm.fit: fitted probabilities numerically 0...

## ! Bootstrap19: preprocessor 1/1, model 1/1 (predictions): prediction from a rank-defici...

## ! Bootstrap20: preprocessor 1/1, model 1/1: glm.fit: fitted probabilities numerically 0...

## ! Bootstrap20: preprocessor 1/1, model 1/1 (predictions): prediction from a rank-defici...

## ! Bootstrap21: preprocessor 1/1, model 1/1: glm.fit: fitted probabilities numerically 0...

## ! Bootstrap21: preprocessor 1/1, model 1/1 (predictions): prediction from a rank-defici...

## ! Bootstrap22: preprocessor 1/1, model 1/1: glm.fit: fitted probabilities numerically 0...

## ! Bootstrap22: preprocessor 1/1, model 1/1 (predictions): prediction from a rank-defici...

## ! Bootstrap23: preprocessor 1/1, model 1/1: glm.fit: fitted probabilities numerically 0...

## ! Bootstrap23: preprocessor 1/1, model 1/1 (predictions): prediction from a rank-defici...

## ! Bootstrap24: preprocessor 1/1, model 1/1: glm.fit: fitted probabilities numerically 0...

## ! Bootstrap24: preprocessor 1/1, model 1/1 (predictions): prediction from a rank-defici...

## ! Bootstrap25: preprocessor 1/1, model 1/1: glm.fit: fitted probabilities numerically 0...

## ! Bootstrap25: preprocessor 1/1, model 1/1 (predictions): prediction from a rank-defici...

glm_rs

## Warning: This tuning result has notes. Example notes on model fitting include:
## preprocessor 1/1, model 1/1 (predictions): prediction from a rank-deficient fit may be misleading
## preprocessor 1/1, model 1/1: glm.fit: fitted probabilities numerically 0 or 1 occurred
## preprocessor 1/1, model 1/1: glm.fit: fitted probabilities numerically 0 or 1 occurred

## # Resampling results
## # Bootstrap sampling 
## # A tibble: 25 x 5
##    splits            id         .metrics        .notes        .predictions      
##    <list>            <chr>      <list>          <list>        <list>            
##  1 <split [26190/96… Bootstrap… <tibble [2 × 4… <tibble [2 ×… <tibble [9,615 × …
##  2 <split [26190/96… Bootstrap… <tibble [2 × 4… <tibble [2 ×… <tibble [9,619 × …
##  3 <split [26190/96… Bootstrap… <tibble [2 × 4… <tibble [2 ×… <tibble [9,658 × …
##  4 <split [26190/95… Bootstrap… <tibble [2 × 4… <tibble [2 ×… <tibble [9,585 × …
##  5 <split [26190/96… Bootstrap… <tibble [2 × 4… <tibble [1 ×… <tibble [9,666 × …
##  6 <split [26190/96… Bootstrap… <tibble [2 × 4… <tibble [2 ×… <tibble [9,641 × …
##  7 <split [26190/96… Bootstrap… <tibble [2 × 4… <tibble [2 ×… <tibble [9,609 × …
##  8 <split [26190/96… Bootstrap… <tibble [2 × 4… <tibble [2 ×… <tibble [9,605 × …
##  9 <split [26190/95… Bootstrap… <tibble [2 × 4… <tibble [2 ×… <tibble [9,596 × …
## 10 <split [26190/96… Bootstrap… <tibble [2 × 4… <tibble [2 ×… <tibble [9,661 × …
## # … with 15 more rows

Evaluate Model

Now we use collect_metrics to see the results of our modeling. Here we use accuracy and roc to evaluate the model

collect_metrics(glm_rs)

## # A tibble: 2 x 6
##   .metric  .estimator  mean     n  std_err .config             
##   <chr>    <chr>      <dbl> <int>    <dbl> <chr>               
## 1 accuracy binary     0.680    25 0.000917 Preprocessor1_Model1
## 2 roc_auc  binary     0.772    25 0.000989 Preprocessor1_Model1

Now lets look at the confusion matrix

glm_rs %>% 
  conf_mat_resampled()

## # A tibble: 4 x 3
##   Prediction Truth  Freq
##   <fct>      <fct> <dbl>
## 1 NoTop      NoTop 4965.
## 2 NoTop      TOP    589.
## 3 TOP        NoTop 2486.
## 4 TOP        TOP   1580.

Finally, lets plot and roc curve to visualize our roc resutls

glm_rs %>% 
  collect_predictions() %>% 
  group_by(id) %>%   
  roc_curve(target, .pred_TOP, event_level = "second") %>% 
  ggplot(aes(1-specificity, sensitivity, color=id)) +
  geom_abline(lty=2, color="gray80", size= 1.5) +
  geom_path(show.legend =FALSE, alpha = 0.6, size =1.2) +
  coord_equal()

Finalize Model

Now lets peform the last_fit() to finalize our model against the test dataset

figs_final <- figs_wf %>% 
  add_model(glm_spec) %>% 
  last_fit(figs_split)

## ! train/test split: preprocessor 1/1, model 1/1: glm.fit: fitted probabilities numerically 0...

## ! train/test split: preprocessor 1/1, model 1/1 (predictions): prediction from a rank-defici...

figs_final

## Warning: This tuning result has notes. Example notes on model fitting include:
## preprocessor 1/1, model 1/1: glm.fit: fitted probabilities numerically 0 or 1 occurred
## preprocessor 1/1, model 1/1 (predictions): prediction from a rank-deficient fit may be misleading

## # Resampling results
## # Manual resampling 
## # A tibble: 1 x 6
##   splits         id          .metrics     .notes      .predictions     .workflow
##   <list>         <chr>       <list>       <list>      <list>           <list>   
## 1 <split [26190… train/test… <tibble [2 … <tibble [2… <tibble [8,728 … <workflo…

The metrics and prediction have been calculated against the test data

collect_metrics(figs_final)

## # A tibble: 2 x 4
##   .metric  .estimator .estimate .config             
##   <chr>    <chr>          <dbl> <chr>               
## 1 accuracy binary         0.676 Preprocessor1_Model1
## 2 roc_auc  binary         0.777 Preprocessor1_Model1

collect_predictions(figs_final) %>% 
  conf_mat(target, .pred_class)

##           Truth
## Prediction NoTop  TOP
##      NoTop  4416  494
##      TOP    2337 1481

The coefficients (which we can get out using tidy()) have been estimated using the training data. If we use exponentiate = TRUE, we have odds ratios.

figs_final$.workflow[[1]] %>% 
  tidy(exponentiate=TRUE)

## # A tibble: 475 x 5
##    term        estimate std.error statistic p.value
##    <chr>          <dbl>     <dbl>     <dbl>   <dbl>
##  1 (Intercept) 1.68e+54  146.      0.856    0.392  
##  2 ftl         1.16e+ 0    0.151   0.991    0.322  
##  3 clm_by_hot  9.54e- 1    0.0721 -0.653    0.514  
##  4 start       2.22e+ 0    0.286   2.80     0.00516
##  5 bled        1.56e- 8 2029.     -0.00886  0.993  
##  6 off_turf    9.69e- 1    0.121  -0.261    0.794  
##  7 bfnr        5.23e- 6  304.     -0.0399   0.968  
##  8 lame        4.62e+ 0 2613.      0.000586 1.00   
##  9 lsx         1.80e+ 0    0.228   2.58     0.00992
## 10 blnkrs      8.81e- 1    0.0925 -1.37     0.172  
## # … with 465 more rows

Logistic Regression - Data 698

Get the data from Github

Build Model

We can start by splitting our data into training and testing sets.

Create Bootstrap

Next, let’s create bootstrap resamples of the training data, to evaluate our models.

Create Model Specification

We create a model specification utilizing the glm package

Create a Recipe

The recipe performs preprocessing necessary to fit a logistic regression model

Setup a Workflow

We create the workflow by adding the formula and recipe

Add Model to Workflow and Fit to each of the resamples.

Evaluate Model

Now we use collect_metrics to see the results of our modeling. Here we use accuracy and roc to evaluate the model

Now lets look at the confusion matrix

Finally, lets plot and roc curve to visualize our roc resutls

Finalize Model

Now lets peform the last_fit() to finalize our model against the test dataset

The metrics and prediction have been calculated against the test data

The coefficients (which we can get out using tidy()) have been estimated using the training data. If we use exponentiate = TRUE, we have odds ratios.