data <- getURL("https://raw.githubusercontent.com/Handicappr/Rstudio_test_project/main/fig_data.csv")
figs <- read_csv(data) %>%
as_tibble() %>%
rename(target = `target`) %>%
mutate(PT3_2 = PT3) %>%
mutate(PT3_2 = str_replace_all(PT3_2,"0","T")) %>%
mutate(PT3_2 = str_replace_all(PT3_2,"1","A")) %>%
mutate(PT3_2 = str_replace_all(PT3_2,"2","B")) %>%
mutate(PT3_2 = str_replace_all(PT3_2,"3","C")) %>%
mutate(PT3_2 = str_replace_all(PT3_2,"4","D")) %>%
mutate(PT3 = PT3_2) %>%
select(-PT3_2, -D1, -D2, -eff_TV, -n, -top, -eff_top, -fv, -DST) %>%
#change character data to factors
mutate_if(is.character, factor)
#change distance to numeric
skim(figs)
| Name | figs |
| Number of rows | 34918 |
| Number of columns | 41 |
| _______________________ | |
| Column type frequency: | |
| Date | 1 |
| factor | 14 |
| numeric | 26 |
| ________________________ | |
| Group variables | None |
Variable type: Date
| skim_variable | n_missing | complete_rate | min | max | median | n_unique |
|---|---|---|---|---|---|---|
| race_date | 0 | 1 | 2016-04-13 | 2021-12-20 | 2019-10-31 | 1433 |
Variable type: factor
| skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
|---|---|---|---|---|---|
| target | 0 | 1 | FALSE | 2 | NoT: 27015, TOP: 7903 |
| horse | 0 | 1 | FALSE | 3388 | CHA: 57, EL : 53, STA: 53, STA: 52 |
| surface_cond | 0 | 1 | FALSE | 13 | fst: 16041, frm: 10188, wsm: 2845, vsl: 1734 |
| precip | 0 | 1 | FALSE | 3 | cle: 33590, rai: 1250, sno: 78 |
| wind | 0 | 1 | FALSE | 3 | cal: 30965, hvy: 2537, vhv: 1416 |
| gender | 0 | 1 | FALSE | 2 | Mal: 23692, Fem: 11226 |
| distance | 0 | 1 | FALSE | 22 | 8.0: 9161, 6.0: 7606, 8.5: 4518, 7.0: 2920 |
| jky | 0 | 1 | FALSE | 729 | I O: 1009, L S: 910, M F: 886, J O: 853 |
| trk_code | 0 | 1 | FALSE | 145 | GP: 8279, AQU: 4746, BEL: 3269, SA: 2878 |
| surf | 0 | 1 | FALSE | 5 | Dir: 21037, Tur: 11548, Syn: 1245, Inn: 1019 |
| s_cond | 0 | 1 | FALSE | 8 | Fst: 18059, Frm: 10175, Gd: 3216, Sly: 2129 |
| PT3 | 0 | 1 | FALSE | 352 | –: 3388, P: 3007, XXX: 2290, TP: 1717 |
| form_cycle | 0 | 1 | FALSE | 24 | AX: 6410, CX: 3647, BX: 3470, A1: 3388 |
| race_type | 0 | 1 | FALSE | 2 | Spr: 18436, Rou: 16482 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| yr | 0 | 1 | 2019.22 | 0.99 | 2016.00 | 2019.00 | 2019.00 | 2020.00 | 2021.0 | ▁▃▇▇▂ |
| age | 0 | 1 | 3.64 | 1.30 | 1.00 | 3.00 | 3.00 | 4.00 | 11.0 | ▇▆▁▁▁ |
| ftl | 0 | 1 | 0.08 | 0.27 | 0.00 | 0.00 | 0.00 | 0.00 | 1.0 | ▇▁▁▁▁ |
| clm_by_hot | 0 | 1 | -0.93 | 0.34 | -1.00 | -1.00 | -1.00 | -1.00 | 1.0 | ▇▁▁▁▁ |
| start | 0 | 1 | 0.03 | 0.09 | 0.00 | 0.00 | 0.00 | 0.00 | 0.9 | ▇▁▁▁▁ |
| bled | 0 | 1 | 0.00 | 0.02 | 0.00 | 0.00 | 0.00 | 0.00 | 1.0 | ▇▁▁▁▁ |
| off_turf | 0 | 1 | 0.05 | 0.21 | 0.00 | 0.00 | 0.00 | 0.00 | 1.0 | ▇▁▁▁▁ |
| bfnr | 0 | 1 | 0.01 | 0.11 | 0.00 | 0.00 | 0.00 | 0.00 | 1.0 | ▇▁▁▁▁ |
| lame | 0 | 1 | 0.00 | 0.02 | 0.00 | 0.00 | 0.00 | 0.00 | 1.0 | ▇▁▁▁▁ |
| lsx | 0 | 1 | 0.00 | 0.09 | -1.00 | 0.00 | 0.00 | 0.00 | 1.0 | ▁▁▇▁▁ |
| blnkrs | 0 | 1 | 0.04 | 0.27 | -1.00 | 0.00 | 0.00 | 0.00 | 1.0 | ▁▁▇▁▁ |
| age_wks | 0 | 1 | 204.49 | 65.51 | 37.00 | 154.00 | 190.00 | 240.00 | 559.0 | ▂▇▂▁▁ |
| wt | 0 | 1 | 120.33 | 2.70 | 108.00 | 119.00 | 120.00 | 122.00 | 143.0 | ▁▇▂▁▁ |
| off_odds | 0 | 1 | 12.19 | 17.14 | 0.11 | 2.50 | 6.00 | 13.00 | 99.0 | ▇▁▁▁▁ |
| fld | 0 | 1 | 8.40 | 2.21 | 2.00 | 7.00 | 8.00 | 10.00 | 30.0 | ▅▇▁▁▁ |
| L3 | 0 | 1 | 18.03 | 7.04 | 1.75 | 13.33 | 16.92 | 21.42 | 99.0 | ▇▃▁▁▁ |
| L5 | 0 | 1 | 18.27 | 6.78 | 1.75 | 13.75 | 17.25 | 21.62 | 99.0 | ▇▃▁▁▁ |
| L7 | 0 | 1 | 18.44 | 6.68 | 1.75 | 14.00 | 17.43 | 21.75 | 99.0 | ▇▃▁▁▁ |
| rest | 0 | 1 | 44.73 | 53.55 | 2.00 | 21.00 | 28.00 | 44.00 | 1081.0 | ▇▁▁▁▁ |
| avg_rest | 0 | 1 | 39.90 | 21.70 | 9.67 | 25.68 | 34.80 | 46.90 | 489.0 | ▇▁▁▁▁ |
| efforts_last90 | 0 | 1 | 0.90 | 0.95 | 0.00 | 0.00 | 1.00 | 1.00 | 8.0 | ▇▂▁▁▁ |
| Lag1 | 0 | 1 | 0.20 | 0.40 | 0.00 | 0.00 | 0.00 | 0.00 | 1.0 | ▇▁▁▁▂ |
| Lag2 | 0 | 1 | 0.19 | 0.39 | 0.00 | 0.00 | 0.00 | 0.00 | 1.0 | ▇▁▁▁▂ |
| Lag3 | 0 | 1 | 0.17 | 0.37 | 0.00 | 0.00 | 0.00 | 0.00 | 1.0 | ▇▁▁▁▂ |
| Lag4 | 0 | 1 | 0.15 | 0.36 | 0.00 | 0.00 | 0.00 | 0.00 | 1.0 | ▇▁▁▁▂ |
| Lag5 | 0 | 1 | 0.14 | 0.34 | 0.00 | 0.00 | 0.00 | 0.00 | 1.0 | ▇▁▁▁▁ |
set.seed(123)
figs_split <- initial_split(figs, strata = target)
figs_train <- training(figs_split)
figs_test <- testing(figs_split)
set.seed(234)
figs_boot <- bootstraps(figs_train)
figs_boot
## # Bootstrap sampling
## # A tibble: 25 x 2
## splits id
## <list> <chr>
## 1 <split [26190/9615]> Bootstrap01
## 2 <split [26190/9619]> Bootstrap02
## 3 <split [26190/9658]> Bootstrap03
## 4 <split [26190/9585]> Bootstrap04
## 5 <split [26190/9666]> Bootstrap05
## 6 <split [26190/9641]> Bootstrap06
## 7 <split [26190/9609]> Bootstrap07
## 8 <split [26190/9605]> Bootstrap08
## 9 <split [26190/9596]> Bootstrap09
## 10 <split [26190/9661]> Bootstrap10
## # … with 15 more rows
glm_spec <- logistic_reg() %>%
set_engine("glm")
glm_spec
## Logistic Regression Model Specification (classification)
##
## Computational engine: glm
figs_rec<- recipe(target ~ ., data=figs_train) %>%
update_role(horse, new_role = "ID") %>%
step_zv(all_numeric()) %>%
step_corr(all_numeric()) %>%
step_other(jky , threshold = 0.015) %>%
step_other(trk_code , threshold = 0.02) %>%
step_other(distance, threshold = 0.02) %>%
step_dummy(distance) %>%
step_dummy(PT3) %>%
step_dummy(gender) %>%
step_dummy(trk_code) %>%
step_dummy(jky) %>%
step_dummy(form_cycle) %>%
step_dummy(surf) %>%
step_dummy(wind) %>%
step_dummy(precip) %>%
step_dummy(s_cond) %>%
step_dummy(race_type) %>%
step_dummy(surface_cond) %>%
step_rm(yr) %>%
step_date(race_date, features = c("month", "year")) %>%
step_rm(race_date) %>%
#step_zv(all_predictors()) %>%
#step_corr(all_predictors()) %>%
themis::step_downsample(target)
figs_prep <- prep(figs_rec)
figs_wf <- workflow() %>%
add_recipe(figs_rec)
glm_rs <- figs_wf %>%
add_model(glm_spec) %>%
fit_resamples(
resamples = figs_boot,
control = control_resamples(save_pred = TRUE)
)
## ! Bootstrap01: preprocessor 1/1, model 1/1: glm.fit: fitted probabilities numerically 0...
## ! Bootstrap01: preprocessor 1/1, model 1/1 (predictions): prediction from a rank-defici...
## ! Bootstrap02: preprocessor 1/1, model 1/1: glm.fit: fitted probabilities numerically 0...
## ! Bootstrap02: preprocessor 1/1, model 1/1 (predictions): prediction from a rank-defici...
## ! Bootstrap03: preprocessor 1/1, model 1/1: glm.fit: fitted probabilities numerically 0...
## ! Bootstrap03: preprocessor 1/1, model 1/1 (predictions): prediction from a rank-defici...
## ! Bootstrap04: preprocessor 1/1, model 1/1: glm.fit: fitted probabilities numerically 0...
## ! Bootstrap04: preprocessor 1/1, model 1/1 (predictions): prediction from a rank-defici...
## ! Bootstrap05: preprocessor 1/1, model 1/1 (predictions): prediction from a rank-defici...
## ! Bootstrap06: preprocessor 1/1, model 1/1: glm.fit: fitted probabilities numerically 0...
## ! Bootstrap06: preprocessor 1/1, model 1/1 (predictions): prediction from a rank-defici...
## ! Bootstrap07: preprocessor 1/1, model 1/1: glm.fit: fitted probabilities numerically 0...
## ! Bootstrap07: preprocessor 1/1, model 1/1 (predictions): prediction from a rank-defici...
## ! Bootstrap08: preprocessor 1/1, model 1/1: glm.fit: fitted probabilities numerically 0...
## ! Bootstrap08: preprocessor 1/1, model 1/1 (predictions): prediction from a rank-defici...
## ! Bootstrap09: preprocessor 1/1, model 1/1: glm.fit: fitted probabilities numerically 0...
## ! Bootstrap09: preprocessor 1/1, model 1/1 (predictions): prediction from a rank-defici...
## ! Bootstrap10: preprocessor 1/1, model 1/1: glm.fit: fitted probabilities numerically 0...
## ! Bootstrap10: preprocessor 1/1, model 1/1 (predictions): prediction from a rank-defici...
## ! Bootstrap11: preprocessor 1/1, model 1/1: glm.fit: fitted probabilities numerically 0...
## ! Bootstrap11: preprocessor 1/1, model 1/1 (predictions): prediction from a rank-defici...
## ! Bootstrap12: preprocessor 1/1, model 1/1: glm.fit: fitted probabilities numerically 0...
## ! Bootstrap12: preprocessor 1/1, model 1/1 (predictions): prediction from a rank-defici...
## ! Bootstrap13: preprocessor 1/1, model 1/1: glm.fit: fitted probabilities numerically 0...
## ! Bootstrap13: preprocessor 1/1, model 1/1 (predictions): prediction from a rank-defici...
## ! Bootstrap14: preprocessor 1/1, model 1/1: glm.fit: fitted probabilities numerically 0...
## ! Bootstrap14: preprocessor 1/1, model 1/1 (predictions): prediction from a rank-defici...
## ! Bootstrap15: preprocessor 1/1, model 1/1: glm.fit: fitted probabilities numerically 0...
## ! Bootstrap15: preprocessor 1/1, model 1/1 (predictions): prediction from a rank-defici...
## ! Bootstrap16: preprocessor 1/1, model 1/1: glm.fit: fitted probabilities numerically 0...
## ! Bootstrap16: preprocessor 1/1, model 1/1 (predictions): prediction from a rank-defici...
## ! Bootstrap17: preprocessor 1/1, model 1/1: glm.fit: fitted probabilities numerically 0...
## ! Bootstrap17: preprocessor 1/1, model 1/1 (predictions): prediction from a rank-defici...
## ! Bootstrap18: preprocessor 1/1, model 1/1: glm.fit: fitted probabilities numerically 0...
## ! Bootstrap18: preprocessor 1/1, model 1/1 (predictions): prediction from a rank-defici...
## ! Bootstrap19: preprocessor 1/1, model 1/1: glm.fit: fitted probabilities numerically 0...
## ! Bootstrap19: preprocessor 1/1, model 1/1 (predictions): prediction from a rank-defici...
## ! Bootstrap20: preprocessor 1/1, model 1/1: glm.fit: fitted probabilities numerically 0...
## ! Bootstrap20: preprocessor 1/1, model 1/1 (predictions): prediction from a rank-defici...
## ! Bootstrap21: preprocessor 1/1, model 1/1: glm.fit: fitted probabilities numerically 0...
## ! Bootstrap21: preprocessor 1/1, model 1/1 (predictions): prediction from a rank-defici...
## ! Bootstrap22: preprocessor 1/1, model 1/1: glm.fit: fitted probabilities numerically 0...
## ! Bootstrap22: preprocessor 1/1, model 1/1 (predictions): prediction from a rank-defici...
## ! Bootstrap23: preprocessor 1/1, model 1/1: glm.fit: fitted probabilities numerically 0...
## ! Bootstrap23: preprocessor 1/1, model 1/1 (predictions): prediction from a rank-defici...
## ! Bootstrap24: preprocessor 1/1, model 1/1: glm.fit: fitted probabilities numerically 0...
## ! Bootstrap24: preprocessor 1/1, model 1/1 (predictions): prediction from a rank-defici...
## ! Bootstrap25: preprocessor 1/1, model 1/1: glm.fit: fitted probabilities numerically 0...
## ! Bootstrap25: preprocessor 1/1, model 1/1 (predictions): prediction from a rank-defici...
glm_rs
## Warning: This tuning result has notes. Example notes on model fitting include:
## preprocessor 1/1, model 1/1 (predictions): prediction from a rank-deficient fit may be misleading
## preprocessor 1/1, model 1/1: glm.fit: fitted probabilities numerically 0 or 1 occurred
## preprocessor 1/1, model 1/1: glm.fit: fitted probabilities numerically 0 or 1 occurred
## # Resampling results
## # Bootstrap sampling
## # A tibble: 25 x 5
## splits id .metrics .notes .predictions
## <list> <chr> <list> <list> <list>
## 1 <split [26190/96… Bootstrap… <tibble [2 × 4… <tibble [2 ×… <tibble [9,615 × …
## 2 <split [26190/96… Bootstrap… <tibble [2 × 4… <tibble [2 ×… <tibble [9,619 × …
## 3 <split [26190/96… Bootstrap… <tibble [2 × 4… <tibble [2 ×… <tibble [9,658 × …
## 4 <split [26190/95… Bootstrap… <tibble [2 × 4… <tibble [2 ×… <tibble [9,585 × …
## 5 <split [26190/96… Bootstrap… <tibble [2 × 4… <tibble [1 ×… <tibble [9,666 × …
## 6 <split [26190/96… Bootstrap… <tibble [2 × 4… <tibble [2 ×… <tibble [9,641 × …
## 7 <split [26190/96… Bootstrap… <tibble [2 × 4… <tibble [2 ×… <tibble [9,609 × …
## 8 <split [26190/96… Bootstrap… <tibble [2 × 4… <tibble [2 ×… <tibble [9,605 × …
## 9 <split [26190/95… Bootstrap… <tibble [2 × 4… <tibble [2 ×… <tibble [9,596 × …
## 10 <split [26190/96… Bootstrap… <tibble [2 × 4… <tibble [2 ×… <tibble [9,661 × …
## # … with 15 more rows
collect_metrics(glm_rs)
## # A tibble: 2 x 6
## .metric .estimator mean n std_err .config
## <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 accuracy binary 0.680 25 0.000917 Preprocessor1_Model1
## 2 roc_auc binary 0.772 25 0.000989 Preprocessor1_Model1
glm_rs %>%
conf_mat_resampled()
## # A tibble: 4 x 3
## Prediction Truth Freq
## <fct> <fct> <dbl>
## 1 NoTop NoTop 4965.
## 2 NoTop TOP 589.
## 3 TOP NoTop 2486.
## 4 TOP TOP 1580.
glm_rs %>%
collect_predictions() %>%
group_by(id) %>%
roc_curve(target, .pred_TOP, event_level = "second") %>%
ggplot(aes(1-specificity, sensitivity, color=id)) +
geom_abline(lty=2, color="gray80", size= 1.5) +
geom_path(show.legend =FALSE, alpha = 0.6, size =1.2) +
coord_equal()
figs_final <- figs_wf %>%
add_model(glm_spec) %>%
last_fit(figs_split)
## ! train/test split: preprocessor 1/1, model 1/1: glm.fit: fitted probabilities numerically 0...
## ! train/test split: preprocessor 1/1, model 1/1 (predictions): prediction from a rank-defici...
figs_final
## Warning: This tuning result has notes. Example notes on model fitting include:
## preprocessor 1/1, model 1/1: glm.fit: fitted probabilities numerically 0 or 1 occurred
## preprocessor 1/1, model 1/1 (predictions): prediction from a rank-deficient fit may be misleading
## # Resampling results
## # Manual resampling
## # A tibble: 1 x 6
## splits id .metrics .notes .predictions .workflow
## <list> <chr> <list> <list> <list> <list>
## 1 <split [26190… train/test… <tibble [2 … <tibble [2… <tibble [8,728 … <workflo…
collect_metrics(figs_final)
## # A tibble: 2 x 4
## .metric .estimator .estimate .config
## <chr> <chr> <dbl> <chr>
## 1 accuracy binary 0.676 Preprocessor1_Model1
## 2 roc_auc binary 0.777 Preprocessor1_Model1
collect_predictions(figs_final) %>%
conf_mat(target, .pred_class)
## Truth
## Prediction NoTop TOP
## NoTop 4416 494
## TOP 2337 1481
figs_final$.workflow[[1]] %>%
tidy(exponentiate=TRUE)
## # A tibble: 475 x 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 1.68e+54 146. 0.856 0.392
## 2 ftl 1.16e+ 0 0.151 0.991 0.322
## 3 clm_by_hot 9.54e- 1 0.0721 -0.653 0.514
## 4 start 2.22e+ 0 0.286 2.80 0.00516
## 5 bled 1.56e- 8 2029. -0.00886 0.993
## 6 off_turf 9.69e- 1 0.121 -0.261 0.794
## 7 bfnr 5.23e- 6 304. -0.0399 0.968
## 8 lame 4.62e+ 0 2613. 0.000586 1.00
## 9 lsx 1.80e+ 0 0.228 2.58 0.00992
## 10 blnkrs 8.81e- 1 0.0925 -1.37 0.172
## # … with 465 more rows