Get the data from Github

data <- getURL("https://raw.githubusercontent.com/Handicappr/Rstudio_test_project/main/fig_data.csv")

figs <- read_csv(data) %>% 
  as_tibble() %>% 
  rename(target = `target`) %>% 
  mutate(PT3_2 = PT3) %>% 
  mutate(PT3_2 = str_replace_all(PT3_2,"0","T")) %>%
  mutate(PT3_2 = str_replace_all(PT3_2,"1","A")) %>% 
  mutate(PT3_2 = str_replace_all(PT3_2,"2","B")) %>%
  mutate(PT3_2 = str_replace_all(PT3_2,"3","C")) %>% 
  mutate(PT3_2 = str_replace_all(PT3_2,"4","D")) %>% 
  mutate(PT3 = PT3_2) %>% 
  select(-PT3_2, -D1, -D2, -eff_TV, -n, -top, -eff_top, -fv, -DST) %>%  
  #change character data to factors
  mutate_if(is.character, factor)
  #change distance to numeric
  
 
  
skim(figs)
Data summary
Name figs
Number of rows 34918
Number of columns 41
_______________________
Column type frequency:
Date 1
factor 14
numeric 26
________________________
Group variables None

Variable type: Date

skim_variable n_missing complete_rate min max median n_unique
race_date 0 1 2016-04-13 2021-12-20 2019-10-31 1433

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
target 0 1 FALSE 2 NoT: 27015, TOP: 7903
horse 0 1 FALSE 3388 CHA: 57, EL : 53, STA: 53, STA: 52
surface_cond 0 1 FALSE 13 fst: 16041, frm: 10188, wsm: 2845, vsl: 1734
precip 0 1 FALSE 3 cle: 33590, rai: 1250, sno: 78
wind 0 1 FALSE 3 cal: 30965, hvy: 2537, vhv: 1416
gender 0 1 FALSE 2 Mal: 23692, Fem: 11226
distance 0 1 FALSE 22 8.0: 9161, 6.0: 7606, 8.5: 4518, 7.0: 2920
jky 0 1 FALSE 729 I O: 1009, L S: 910, M F: 886, J O: 853
trk_code 0 1 FALSE 145 GP: 8279, AQU: 4746, BEL: 3269, SA: 2878
surf 0 1 FALSE 5 Dir: 21037, Tur: 11548, Syn: 1245, Inn: 1019
s_cond 0 1 FALSE 8 Fst: 18059, Frm: 10175, Gd: 3216, Sly: 2129
PT3 0 1 FALSE 352 –: 3388, P: 3007, XXX: 2290, TP: 1717
form_cycle 0 1 FALSE 24 AX: 6410, CX: 3647, BX: 3470, A1: 3388
race_type 0 1 FALSE 2 Spr: 18436, Rou: 16482

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
yr 0 1 2019.22 0.99 2016.00 2019.00 2019.00 2020.00 2021.0 ▁▃▇▇▂
age 0 1 3.64 1.30 1.00 3.00 3.00 4.00 11.0 ▇▆▁▁▁
ftl 0 1 0.08 0.27 0.00 0.00 0.00 0.00 1.0 ▇▁▁▁▁
clm_by_hot 0 1 -0.93 0.34 -1.00 -1.00 -1.00 -1.00 1.0 ▇▁▁▁▁
start 0 1 0.03 0.09 0.00 0.00 0.00 0.00 0.9 ▇▁▁▁▁
bled 0 1 0.00 0.02 0.00 0.00 0.00 0.00 1.0 ▇▁▁▁▁
off_turf 0 1 0.05 0.21 0.00 0.00 0.00 0.00 1.0 ▇▁▁▁▁
bfnr 0 1 0.01 0.11 0.00 0.00 0.00 0.00 1.0 ▇▁▁▁▁
lame 0 1 0.00 0.02 0.00 0.00 0.00 0.00 1.0 ▇▁▁▁▁
lsx 0 1 0.00 0.09 -1.00 0.00 0.00 0.00 1.0 ▁▁▇▁▁
blnkrs 0 1 0.04 0.27 -1.00 0.00 0.00 0.00 1.0 ▁▁▇▁▁
age_wks 0 1 204.49 65.51 37.00 154.00 190.00 240.00 559.0 ▂▇▂▁▁
wt 0 1 120.33 2.70 108.00 119.00 120.00 122.00 143.0 ▁▇▂▁▁
off_odds 0 1 12.19 17.14 0.11 2.50 6.00 13.00 99.0 ▇▁▁▁▁
fld 0 1 8.40 2.21 2.00 7.00 8.00 10.00 30.0 ▅▇▁▁▁
L3 0 1 18.03 7.04 1.75 13.33 16.92 21.42 99.0 ▇▃▁▁▁
L5 0 1 18.27 6.78 1.75 13.75 17.25 21.62 99.0 ▇▃▁▁▁
L7 0 1 18.44 6.68 1.75 14.00 17.43 21.75 99.0 ▇▃▁▁▁
rest 0 1 44.73 53.55 2.00 21.00 28.00 44.00 1081.0 ▇▁▁▁▁
avg_rest 0 1 39.90 21.70 9.67 25.68 34.80 46.90 489.0 ▇▁▁▁▁
efforts_last90 0 1 0.90 0.95 0.00 0.00 1.00 1.00 8.0 ▇▂▁▁▁
Lag1 0 1 0.20 0.40 0.00 0.00 0.00 0.00 1.0 ▇▁▁▁▂
Lag2 0 1 0.19 0.39 0.00 0.00 0.00 0.00 1.0 ▇▁▁▁▂
Lag3 0 1 0.17 0.37 0.00 0.00 0.00 0.00 1.0 ▇▁▁▁▂
Lag4 0 1 0.15 0.36 0.00 0.00 0.00 0.00 1.0 ▇▁▁▁▂
Lag5 0 1 0.14 0.34 0.00 0.00 0.00 0.00 1.0 ▇▁▁▁▁

Build Model

We can start by splitting our data into training and testing sets.

set.seed(123)

figs_split <- initial_split(figs, strata = target)
figs_train <- training(figs_split)
figs_test <- testing(figs_split)

Create Bootstrap

Next, let’s create bootstrap resamples of the training data, to evaluate our models.

set.seed(234)
figs_boot <- bootstraps(figs_train)
figs_boot
## # Bootstrap sampling 
## # A tibble: 25 x 2
##    splits               id         
##    <list>               <chr>      
##  1 <split [26190/9615]> Bootstrap01
##  2 <split [26190/9619]> Bootstrap02
##  3 <split [26190/9658]> Bootstrap03
##  4 <split [26190/9585]> Bootstrap04
##  5 <split [26190/9666]> Bootstrap05
##  6 <split [26190/9641]> Bootstrap06
##  7 <split [26190/9609]> Bootstrap07
##  8 <split [26190/9605]> Bootstrap08
##  9 <split [26190/9596]> Bootstrap09
## 10 <split [26190/9661]> Bootstrap10
## # … with 15 more rows

Create Model Specification

We create a model specification utilizing the glm package

glm_spec <- logistic_reg() %>% 
  set_engine("glm")

glm_spec
## Logistic Regression Model Specification (classification)
## 
## Computational engine: glm

Create a Recipe

The recipe performs preprocessing necessary to fit a logistic regression model

figs_rec<-  recipe(target ~ ., data=figs_train) %>% 
  update_role(horse, new_role = "ID") %>% 
  step_zv(all_numeric()) %>% 
  step_corr(all_numeric()) %>% 
  step_other(jky , threshold = 0.015) %>% 
  step_other(trk_code , threshold = 0.02) %>% 
  step_other(distance, threshold = 0.02) %>% 
  step_dummy(distance) %>% 
  step_dummy(PT3) %>% 
  step_dummy(gender) %>% 
  step_dummy(trk_code) %>% 
  step_dummy(jky) %>% 
  step_dummy(form_cycle) %>% 
  step_dummy(surf) %>% 
  step_dummy(wind) %>% 
  step_dummy(precip) %>% 
  step_dummy(s_cond) %>% 
  step_dummy(race_type) %>% 
  step_dummy(surface_cond) %>% 
  step_rm(yr) %>% 
  step_date(race_date, features = c("month", "year")) %>% 
  step_rm(race_date) %>% 
  #step_zv(all_predictors()) %>% 
  #step_corr(all_predictors()) %>% 
  themis::step_downsample(target)
  
  

figs_prep <- prep(figs_rec)

Setup a Workflow

We create the workflow by adding the formula and recipe

figs_wf <- workflow() %>% 
  add_recipe(figs_rec) 

Add Model to Workflow and Fit to each of the resamples.

glm_rs <- figs_wf %>% 
  add_model(glm_spec) %>% 
  fit_resamples(
    resamples = figs_boot,
    control = control_resamples(save_pred = TRUE)
  )
## ! Bootstrap01: preprocessor 1/1, model 1/1: glm.fit: fitted probabilities numerically 0...
## ! Bootstrap01: preprocessor 1/1, model 1/1 (predictions): prediction from a rank-defici...
## ! Bootstrap02: preprocessor 1/1, model 1/1: glm.fit: fitted probabilities numerically 0...
## ! Bootstrap02: preprocessor 1/1, model 1/1 (predictions): prediction from a rank-defici...
## ! Bootstrap03: preprocessor 1/1, model 1/1: glm.fit: fitted probabilities numerically 0...
## ! Bootstrap03: preprocessor 1/1, model 1/1 (predictions): prediction from a rank-defici...
## ! Bootstrap04: preprocessor 1/1, model 1/1: glm.fit: fitted probabilities numerically 0...
## ! Bootstrap04: preprocessor 1/1, model 1/1 (predictions): prediction from a rank-defici...
## ! Bootstrap05: preprocessor 1/1, model 1/1 (predictions): prediction from a rank-defici...
## ! Bootstrap06: preprocessor 1/1, model 1/1: glm.fit: fitted probabilities numerically 0...
## ! Bootstrap06: preprocessor 1/1, model 1/1 (predictions): prediction from a rank-defici...
## ! Bootstrap07: preprocessor 1/1, model 1/1: glm.fit: fitted probabilities numerically 0...
## ! Bootstrap07: preprocessor 1/1, model 1/1 (predictions): prediction from a rank-defici...
## ! Bootstrap08: preprocessor 1/1, model 1/1: glm.fit: fitted probabilities numerically 0...
## ! Bootstrap08: preprocessor 1/1, model 1/1 (predictions): prediction from a rank-defici...
## ! Bootstrap09: preprocessor 1/1, model 1/1: glm.fit: fitted probabilities numerically 0...
## ! Bootstrap09: preprocessor 1/1, model 1/1 (predictions): prediction from a rank-defici...
## ! Bootstrap10: preprocessor 1/1, model 1/1: glm.fit: fitted probabilities numerically 0...
## ! Bootstrap10: preprocessor 1/1, model 1/1 (predictions): prediction from a rank-defici...
## ! Bootstrap11: preprocessor 1/1, model 1/1: glm.fit: fitted probabilities numerically 0...
## ! Bootstrap11: preprocessor 1/1, model 1/1 (predictions): prediction from a rank-defici...
## ! Bootstrap12: preprocessor 1/1, model 1/1: glm.fit: fitted probabilities numerically 0...
## ! Bootstrap12: preprocessor 1/1, model 1/1 (predictions): prediction from a rank-defici...
## ! Bootstrap13: preprocessor 1/1, model 1/1: glm.fit: fitted probabilities numerically 0...
## ! Bootstrap13: preprocessor 1/1, model 1/1 (predictions): prediction from a rank-defici...
## ! Bootstrap14: preprocessor 1/1, model 1/1: glm.fit: fitted probabilities numerically 0...
## ! Bootstrap14: preprocessor 1/1, model 1/1 (predictions): prediction from a rank-defici...
## ! Bootstrap15: preprocessor 1/1, model 1/1: glm.fit: fitted probabilities numerically 0...
## ! Bootstrap15: preprocessor 1/1, model 1/1 (predictions): prediction from a rank-defici...
## ! Bootstrap16: preprocessor 1/1, model 1/1: glm.fit: fitted probabilities numerically 0...
## ! Bootstrap16: preprocessor 1/1, model 1/1 (predictions): prediction from a rank-defici...
## ! Bootstrap17: preprocessor 1/1, model 1/1: glm.fit: fitted probabilities numerically 0...
## ! Bootstrap17: preprocessor 1/1, model 1/1 (predictions): prediction from a rank-defici...
## ! Bootstrap18: preprocessor 1/1, model 1/1: glm.fit: fitted probabilities numerically 0...
## ! Bootstrap18: preprocessor 1/1, model 1/1 (predictions): prediction from a rank-defici...
## ! Bootstrap19: preprocessor 1/1, model 1/1: glm.fit: fitted probabilities numerically 0...
## ! Bootstrap19: preprocessor 1/1, model 1/1 (predictions): prediction from a rank-defici...
## ! Bootstrap20: preprocessor 1/1, model 1/1: glm.fit: fitted probabilities numerically 0...
## ! Bootstrap20: preprocessor 1/1, model 1/1 (predictions): prediction from a rank-defici...
## ! Bootstrap21: preprocessor 1/1, model 1/1: glm.fit: fitted probabilities numerically 0...
## ! Bootstrap21: preprocessor 1/1, model 1/1 (predictions): prediction from a rank-defici...
## ! Bootstrap22: preprocessor 1/1, model 1/1: glm.fit: fitted probabilities numerically 0...
## ! Bootstrap22: preprocessor 1/1, model 1/1 (predictions): prediction from a rank-defici...
## ! Bootstrap23: preprocessor 1/1, model 1/1: glm.fit: fitted probabilities numerically 0...
## ! Bootstrap23: preprocessor 1/1, model 1/1 (predictions): prediction from a rank-defici...
## ! Bootstrap24: preprocessor 1/1, model 1/1: glm.fit: fitted probabilities numerically 0...
## ! Bootstrap24: preprocessor 1/1, model 1/1 (predictions): prediction from a rank-defici...
## ! Bootstrap25: preprocessor 1/1, model 1/1: glm.fit: fitted probabilities numerically 0...
## ! Bootstrap25: preprocessor 1/1, model 1/1 (predictions): prediction from a rank-defici...
glm_rs
## Warning: This tuning result has notes. Example notes on model fitting include:
## preprocessor 1/1, model 1/1 (predictions): prediction from a rank-deficient fit may be misleading
## preprocessor 1/1, model 1/1: glm.fit: fitted probabilities numerically 0 or 1 occurred
## preprocessor 1/1, model 1/1: glm.fit: fitted probabilities numerically 0 or 1 occurred
## # Resampling results
## # Bootstrap sampling 
## # A tibble: 25 x 5
##    splits            id         .metrics        .notes        .predictions      
##    <list>            <chr>      <list>          <list>        <list>            
##  1 <split [26190/96… Bootstrap… <tibble [2 × 4… <tibble [2 ×… <tibble [9,615 × …
##  2 <split [26190/96… Bootstrap… <tibble [2 × 4… <tibble [2 ×… <tibble [9,619 × …
##  3 <split [26190/96… Bootstrap… <tibble [2 × 4… <tibble [2 ×… <tibble [9,658 × …
##  4 <split [26190/95… Bootstrap… <tibble [2 × 4… <tibble [2 ×… <tibble [9,585 × …
##  5 <split [26190/96… Bootstrap… <tibble [2 × 4… <tibble [1 ×… <tibble [9,666 × …
##  6 <split [26190/96… Bootstrap… <tibble [2 × 4… <tibble [2 ×… <tibble [9,641 × …
##  7 <split [26190/96… Bootstrap… <tibble [2 × 4… <tibble [2 ×… <tibble [9,609 × …
##  8 <split [26190/96… Bootstrap… <tibble [2 × 4… <tibble [2 ×… <tibble [9,605 × …
##  9 <split [26190/95… Bootstrap… <tibble [2 × 4… <tibble [2 ×… <tibble [9,596 × …
## 10 <split [26190/96… Bootstrap… <tibble [2 × 4… <tibble [2 ×… <tibble [9,661 × …
## # … with 15 more rows

Evaluate Model

Now we use collect_metrics to see the results of our modeling. Here we use accuracy and roc to evaluate the model

collect_metrics(glm_rs)
## # A tibble: 2 x 6
##   .metric  .estimator  mean     n  std_err .config             
##   <chr>    <chr>      <dbl> <int>    <dbl> <chr>               
## 1 accuracy binary     0.680    25 0.000917 Preprocessor1_Model1
## 2 roc_auc  binary     0.772    25 0.000989 Preprocessor1_Model1

Now lets look at the confusion matrix

glm_rs %>% 
  conf_mat_resampled()
## # A tibble: 4 x 3
##   Prediction Truth  Freq
##   <fct>      <fct> <dbl>
## 1 NoTop      NoTop 4965.
## 2 NoTop      TOP    589.
## 3 TOP        NoTop 2486.
## 4 TOP        TOP   1580.

Finally, lets plot and roc curve to visualize our roc resutls

glm_rs %>% 
  collect_predictions() %>% 
  group_by(id) %>%   
  roc_curve(target, .pred_TOP, event_level = "second") %>% 
  ggplot(aes(1-specificity, sensitivity, color=id)) +
  geom_abline(lty=2, color="gray80", size= 1.5) +
  geom_path(show.legend =FALSE, alpha = 0.6, size =1.2) +
  coord_equal()

Finalize Model

Now lets peform the last_fit() to finalize our model against the test dataset

figs_final <- figs_wf %>% 
  add_model(glm_spec) %>% 
  last_fit(figs_split)
## ! train/test split: preprocessor 1/1, model 1/1: glm.fit: fitted probabilities numerically 0...
## ! train/test split: preprocessor 1/1, model 1/1 (predictions): prediction from a rank-defici...
figs_final
## Warning: This tuning result has notes. Example notes on model fitting include:
## preprocessor 1/1, model 1/1: glm.fit: fitted probabilities numerically 0 or 1 occurred
## preprocessor 1/1, model 1/1 (predictions): prediction from a rank-deficient fit may be misleading
## # Resampling results
## # Manual resampling 
## # A tibble: 1 x 6
##   splits         id          .metrics     .notes      .predictions     .workflow
##   <list>         <chr>       <list>       <list>      <list>           <list>   
## 1 <split [26190… train/test… <tibble [2 … <tibble [2… <tibble [8,728 … <workflo…

The metrics and prediction have been calculated against the test data

collect_metrics(figs_final)
## # A tibble: 2 x 4
##   .metric  .estimator .estimate .config             
##   <chr>    <chr>          <dbl> <chr>               
## 1 accuracy binary         0.676 Preprocessor1_Model1
## 2 roc_auc  binary         0.777 Preprocessor1_Model1
collect_predictions(figs_final) %>% 
  conf_mat(target, .pred_class)
##           Truth
## Prediction NoTop  TOP
##      NoTop  4416  494
##      TOP    2337 1481

The coefficients (which we can get out using tidy()) have been estimated using the training data. If we use exponentiate = TRUE, we have odds ratios.

figs_final$.workflow[[1]] %>% 
  tidy(exponentiate=TRUE)
## # A tibble: 475 x 5
##    term        estimate std.error statistic p.value
##    <chr>          <dbl>     <dbl>     <dbl>   <dbl>
##  1 (Intercept) 1.68e+54  146.      0.856    0.392  
##  2 ftl         1.16e+ 0    0.151   0.991    0.322  
##  3 clm_by_hot  9.54e- 1    0.0721 -0.653    0.514  
##  4 start       2.22e+ 0    0.286   2.80     0.00516
##  5 bled        1.56e- 8 2029.     -0.00886  0.993  
##  6 off_turf    9.69e- 1    0.121  -0.261    0.794  
##  7 bfnr        5.23e- 6  304.     -0.0399   0.968  
##  8 lame        4.62e+ 0 2613.      0.000586 1.00   
##  9 lsx         1.80e+ 0    0.228   2.58     0.00992
## 10 blnkrs      8.81e- 1    0.0925 -1.37     0.172  
## # … with 465 more rows