Get the data from Github
data <- getURL("https://raw.githubusercontent.com/Handicappr/Rstudio_test_project/main/fig_data2.csv")
figs <- read_csv(data) %>%
as_tibble() %>%
rename(n = `n`) %>%
mutate(eff_target = if_else(eff_target==1,"Top","NoTop")) %>%
# Remove columns that can't be used in model
select(-target, -D1, -D2, -eff_TV, -top, -eff_top, -fv, -DST, -seed, -E, -E4, -TE, -split_800, -split_801, -split_802, -split_803, -split_804, -split_805, -won, -RST, -X) %>%
rename(target = eff_target) %>%
#change character data to factors
mutate_if(is.character, factor) %>%
mutate(race_date = mdy(race_date))
#change distance to numeric
skim(figs)
Data summary
| Name |
figs |
| Number of rows |
35617 |
| Number of columns |
59 |
| _______________________ |
|
| Column type frequency: |
|
| Date |
1 |
| factor |
14 |
| numeric |
44 |
| ________________________ |
|
| Group variables |
None |
Variable type: Date
| race_date |
0 |
1 |
2016-04-13 |
2021-12-20 |
2019-10-31 |
1433 |
Variable type: factor
| target |
0 |
1 |
FALSE |
2 |
NoT: 27061, Top: 8556 |
| horse |
0 |
1 |
FALSE |
3387 |
CHA: 57, VIC: 57, STA: 54, EL : 53 |
| surface_cond |
0 |
1 |
FALSE |
13 |
fst: 16366, frm: 10368, wsm: 2913, vsl: 1771 |
| precip |
0 |
1 |
FALSE |
3 |
cle: 34259, rai: 1278, sno: 80 |
| wind |
0 |
1 |
FALSE |
3 |
cal: 31586, hvy: 2581, vhv: 1450 |
| gender |
0 |
1 |
FALSE |
2 |
Mal: 24217, Fem: 11400 |
| distance |
0 |
1 |
FALSE |
22 |
8.0: 9316, 6.0: 7624, 8.5: 4621, 7.0: 3011 |
| jky |
0 |
1 |
FALSE |
729 |
I O: 1034, L S: 930, M F: 915, J O: 879 |
| trk_code |
0 |
1 |
FALSE |
145 |
GP: 8459, AQU: 4858, BEL: 3366, SA: 2892 |
| surf |
0 |
1 |
FALSE |
5 |
Dir: 21482, Tur: 11757, Syn: 1271, Inn: 1038 |
| s_cond |
0 |
1 |
FALSE |
8 |
Fst: 18433, Frm: 10362, Gd: 3275, Sly: 2180 |
| form_cycle |
0 |
1 |
FALSE |
24 |
AX: 6600, CX: 3762, BX: 3536, A1: 3387 |
| race_type |
0 |
1 |
FALSE |
2 |
Spr: 18816, Rou: 16801 |
| trk |
0 |
1 |
FALSE |
130 |
GP: 8377, AQ: 4769, BE: 3359, SA: 2887 |
Variable type: numeric
| n |
0 |
1 |
9.86 |
8.51 |
1.00 |
3.00 |
7.00 |
14.00 |
57.00 |
▇▂▁▁▁ |
| cls_val |
0 |
1 |
1.99 |
0.89 |
1.00 |
1.15 |
1.62 |
3.00 |
12.59 |
▇▁▁▁▁ |
| cls_mv |
0 |
1 |
0.07 |
0.68 |
-11.49 |
0.00 |
0.00 |
0.00 |
11.39 |
▁▁▇▁▁ |
| yr |
0 |
1 |
2019.22 |
0.99 |
2016.00 |
2019.00 |
2019.00 |
2020.00 |
2021.00 |
▁▃▇▇▂ |
| age |
0 |
1 |
3.64 |
1.30 |
1.00 |
3.00 |
3.00 |
4.00 |
11.00 |
▇▆▁▁▁ |
| ftl |
0 |
1 |
0.08 |
0.27 |
0.00 |
0.00 |
0.00 |
0.00 |
1.00 |
▇▁▁▁▁ |
| clm_by_hot |
0 |
1 |
-0.93 |
0.34 |
-1.00 |
-1.00 |
-1.00 |
-1.00 |
1.00 |
▇▁▁▁▁ |
| start |
0 |
1 |
0.03 |
0.09 |
0.00 |
0.00 |
0.00 |
0.00 |
0.90 |
▇▁▁▁▁ |
| bled |
0 |
1 |
0.00 |
0.02 |
0.00 |
0.00 |
0.00 |
0.00 |
1.00 |
▇▁▁▁▁ |
| off_turf |
0 |
1 |
0.05 |
0.21 |
0.00 |
0.00 |
0.00 |
0.00 |
1.00 |
▇▁▁▁▁ |
| bfnr |
0 |
1 |
0.01 |
0.11 |
0.00 |
0.00 |
0.00 |
0.00 |
1.00 |
▇▁▁▁▁ |
| lame |
0 |
1 |
0.00 |
0.02 |
0.00 |
0.00 |
0.00 |
0.00 |
1.00 |
▇▁▁▁▁ |
| lsx |
0 |
1 |
0.00 |
0.09 |
-1.00 |
0.00 |
0.00 |
0.00 |
1.00 |
▁▁▇▁▁ |
| blnkrs |
0 |
1 |
0.04 |
0.27 |
-1.00 |
0.00 |
0.00 |
0.00 |
1.00 |
▁▁▇▁▁ |
| age_wks |
0 |
1 |
204.45 |
65.52 |
37.00 |
154.00 |
190.00 |
240.00 |
559.00 |
▂▇▂▁▁ |
| wt |
2 |
1 |
120.32 |
2.70 |
108.00 |
119.00 |
120.00 |
122.00 |
143.00 |
▁▇▂▁▁ |
| off_odds |
0 |
1 |
12.17 |
17.09 |
0.11 |
2.50 |
6.00 |
13.00 |
99.00 |
▇▁▁▁▁ |
| fld |
2 |
1 |
8.39 |
2.21 |
2.00 |
7.00 |
8.00 |
10.00 |
30.00 |
▅▇▁▁▁ |
| L3 |
0 |
1 |
18.01 |
7.03 |
1.75 |
13.33 |
16.92 |
21.42 |
99.00 |
▇▃▁▁▁ |
| L5 |
0 |
1 |
18.26 |
6.77 |
1.75 |
13.70 |
17.25 |
21.62 |
99.00 |
▇▃▁▁▁ |
| L7 |
0 |
1 |
18.43 |
6.66 |
1.75 |
13.96 |
17.43 |
21.75 |
99.00 |
▇▃▁▁▁ |
| rest |
0 |
1 |
43.85 |
53.38 |
0.00 |
21.00 |
28.00 |
43.00 |
1081.00 |
▇▁▁▁▁ |
| avg_rest |
0 |
1 |
39.75 |
21.66 |
7.25 |
25.67 |
34.70 |
46.78 |
489.00 |
▇▁▁▁▁ |
| efforts_last90 |
0 |
1 |
0.90 |
0.95 |
0.00 |
0.00 |
1.00 |
1.00 |
8.00 |
▇▂▁▁▁ |
| Lag1 |
0 |
1 |
0.21 |
0.40 |
0.00 |
0.00 |
0.00 |
0.00 |
1.00 |
▇▁▁▁▂ |
| Lag2 |
0 |
1 |
0.19 |
0.39 |
0.00 |
0.00 |
0.00 |
0.00 |
1.00 |
▇▁▁▁▂ |
| Lag3 |
0 |
1 |
0.17 |
0.37 |
0.00 |
0.00 |
0.00 |
0.00 |
1.00 |
▇▁▁▁▂ |
| Lag4 |
0 |
1 |
0.15 |
0.36 |
0.00 |
0.00 |
0.00 |
0.00 |
1.00 |
▇▁▁▁▂ |
| Lag5 |
0 |
1 |
0.14 |
0.34 |
0.00 |
0.00 |
0.00 |
0.00 |
1.00 |
▇▁▁▁▁ |
| LP1 |
0 |
1 |
-1.91 |
2.27 |
-5.00 |
-5.00 |
-2.00 |
0.00 |
1.00 |
▆▂▃▂▇ |
| LP2 |
0 |
1 |
-1.91 |
2.16 |
-5.00 |
-4.00 |
-2.00 |
0.00 |
1.00 |
▆▂▆▂▇ |
| LP3 |
0 |
1 |
-1.92 |
2.05 |
-5.00 |
-4.00 |
-2.00 |
0.00 |
1.00 |
▆▂▇▂▇ |
| LP4 |
0 |
1 |
-1.92 |
1.95 |
-5.00 |
-3.00 |
-2.00 |
0.00 |
1.00 |
▅▁▇▁▆ |
| LP5 |
0 |
1 |
-1.93 |
1.85 |
-5.00 |
-3.00 |
-2.00 |
0.00 |
1.00 |
▃▁▇▁▅ |
| pp_id |
0 |
1 |
27487.78 |
16975.26 |
1.00 |
12623.00 |
26040.00 |
41459.00 |
59178.00 |
▇▇▆▇▆ |
| splt_1 |
0 |
1 |
4.48 |
6.00 |
0.00 |
0.00 |
2.89 |
6.79 |
51.00 |
▇▁▁▁▁ |
| splt_2 |
0 |
1 |
2.36 |
3.93 |
0.00 |
0.00 |
0.00 |
3.81 |
35.91 |
▇▁▁▁▁ |
| splt_3 |
0 |
1 |
2.60 |
5.16 |
0.00 |
0.00 |
0.00 |
3.57 |
45.30 |
▇▁▁▁▁ |
| splt_4 |
0 |
1 |
3.53 |
5.16 |
0.00 |
0.00 |
1.23 |
5.27 |
46.29 |
▇▁▁▁▁ |
| splt_5 |
0 |
1 |
1.34 |
2.75 |
0.00 |
0.00 |
0.00 |
1.93 |
27.99 |
▇▁▁▁▁ |
| splt_6 |
0 |
1 |
2.85 |
4.09 |
0.00 |
0.00 |
0.00 |
3.97 |
30.87 |
▇▂▁▁▁ |
| splt_7 |
0 |
1 |
4.68 |
5.87 |
0.00 |
0.00 |
2.90 |
7.42 |
37.86 |
▇▂▁▁▁ |
| splt_8 |
0 |
1 |
11.88 |
9.05 |
0.00 |
3.93 |
10.63 |
17.73 |
50.57 |
▇▆▂▁▁ |
| splt_9 |
0 |
1 |
10.57 |
8.58 |
0.00 |
2.97 |
9.35 |
16.01 |
47.05 |
▇▅▂▁▁ |
Build Model
We can start by splitting our data into training and testing sets.
set.seed(123)
figs_split <- initial_split(figs, strata = target)
figs_train <- training(figs_split)
figs_test <- testing(figs_split)
figs_split
## <Analysis/Assess/Total>
## <26713/8904/35617>
Create Model Specification
We create a model specification utilizing the xgboost package
xgb_spec <- boost_tree(
trees = 1000,
tree_depth = tune(),
min_n = tune(),
loss_reduction = tune(),
sample_size = tune(),
mtry = tune(),
learn_rate = tune(),
) %>%
set_engine("xgboost") %>%
set_mode("classification")
xgb_spec
## Boosted Tree Model Specification (classification)
##
## Main Arguments:
## mtry = tune()
## trees = 1000
## min_n = tune()
## tree_depth = tune()
## learn_rate = tune()
## loss_reduction = tune()
## sample_size = tune()
##
## Computational engine: xgboost
Create a Grid for the Hyperparameter value options
xgb_grid <- grid_latin_hypercube(
tree_depth(),
min_n(),
loss_reduction(),
sample_size = sample_prop(),
finalize(mtry(), figs_train),
learn_rate(),
size = 30
)
Setup a Workflow
We create the workflow by adding the recipe and the model
xgb_wf <- workflow() %>%
add_recipe(figs_rec) %>%
add_model(xgb_spec)
xgb_wf
## ══ Workflow ════════════════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: boost_tree()
##
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 19 Recipe Steps
##
## ● step_other()
## ● step_other()
## ● step_other()
## ● step_dummy()
## ● step_dummy()
## ● step_dummy()
## ● step_dummy()
## ● step_dummy()
## ● step_dummy()
## ● step_dummy()
## ● ...
## ● and 9 more steps.
##
## ── Model ───────────────────────────────────────────────────────────────────────
## Boosted Tree Model Specification (classification)
##
## Main Arguments:
## mtry = tune()
## trees = 1000
## min_n = tune()
## tree_depth = tune()
## learn_rate = tune()
## loss_reduction = tune()
## sample_size = tune()
##
## Computational engine: xgboost
Now, let’s create cross-validation resamples for tuning our xgboost model
set.seed(123)
vb_folds <- vfold_cv(figs_train, strata = target)
vb_folds
## # 10-fold cross-validation using stratification
## # A tibble: 10 x 2
## splits id
## <list> <chr>
## 1 <split [24041/2672]> Fold01
## 2 <split [24041/2672]> Fold02
## 3 <split [24041/2672]> Fold03
## 4 <split [24041/2672]> Fold04
## 5 <split [24041/2672]> Fold05
## 6 <split [24041/2672]> Fold06
## 7 <split [24042/2671]> Fold07
## 8 <split [24043/2670]> Fold08
## 9 <split [24043/2670]> Fold09
## 10 <split [24043/2670]> Fold10
Tune The model
We use tune_grid() along gwith our tunable workflow, our resamples and our grid of parameters. We will use control_grid(save_pred=TRUE) to preserve the prediction for later analysis.
members_metrics <- metric_set(roc_auc, accuracy, sensitivity, specificity)
doParallel::registerDoParallel()
set.seed(234)
xgb_res <- tune_grid(
xgb_wf,
resamples = vb_folds,
metrics = members_metrics,
control = control_grid(save_pred = TRUE)
)
## i Creating pre-processing data to finalize unknown parameter: mtry
xgb_res
## # Tuning results
## # 10-fold cross-validation using stratification
## # A tibble: 10 x 5
## splits id .metrics .notes .predictions
## <list> <chr> <list> <list> <list>
## 1 <split [24041/267… Fold… <tibble [40 × 1… <tibble [0 × … <tibble [26,720 × 1…
## 2 <split [24041/267… Fold… <tibble [40 × 1… <tibble [0 × … <tibble [26,720 × 1…
## 3 <split [24041/267… Fold… <tibble [40 × 1… <tibble [0 × … <tibble [26,720 × 1…
## 4 <split [24041/267… Fold… <tibble [40 × 1… <tibble [0 × … <tibble [26,720 × 1…
## 5 <split [24041/267… Fold… <tibble [40 × 1… <tibble [0 × … <tibble [26,720 × 1…
## 6 <split [24041/267… Fold… <tibble [40 × 1… <tibble [0 × … <tibble [26,720 × 1…
## 7 <split [24042/267… Fold… <tibble [40 × 1… <tibble [0 × … <tibble [26,710 × 1…
## 8 <split [24043/267… Fold… <tibble [40 × 1… <tibble [0 × … <tibble [26,700 × 1…
## 9 <split [24043/267… Fold… <tibble [40 × 1… <tibble [0 × … <tibble [26,700 × 1…
## 10 <split [24043/267… Fold… <tibble [40 × 1… <tibble [0 × … <tibble [26,700 × 1…
Evaluate Model
Now we use collect_metrics to see the results of our modeling. Here we use accuracy and roc to evaluate the model
collect_metrics(xgb_res)
## # A tibble: 40 x 12
## mtry min_n tree_depth learn_rate loss_reduction sample_size .metric
## <int> <int> <int> <dbl> <dbl> <dbl> <chr>
## 1 110 17 1 0.00000719 0.147 0.454 accuracy
## 2 110 17 1 0.00000719 0.147 0.454 roc_auc
## 3 110 17 1 0.00000719 0.147 0.454 sens
## 4 110 17 1 0.00000719 0.147 0.454 spec
## 5 48 19 6 0.00000000167 2.75 0.578 accuracy
## 6 48 19 6 0.00000000167 2.75 0.578 roc_auc
## 7 48 19 6 0.00000000167 2.75 0.578 sens
## 8 48 19 6 0.00000000167 2.75 0.578 spec
## 9 63 6 7 0.000000268 0.00000126 0.966 accuracy
## 10 63 6 7 0.000000268 0.00000126 0.966 roc_auc
## # … with 30 more rows, and 5 more variables: .estimator <chr>, mean <dbl>,
## # n <int>, std_err <dbl>, .config <chr>
Visualize Tuning Results
xgb_res %>%
collect_metrics() %>%
filter(.metric == "roc_auc") %>%
select(mean, mtry:sample_size) %>%
pivot_longer(mtry:sample_size,
values_to = "value",
names_to = "parameter"
) %>%
ggplot(aes(value, mean, color = parameter)) +
geom_point(alpha = 0.8, show.legend = FALSE) +
facet_wrap(~parameter, scales = "free_x") +
labs(x = NULL, y = "AUC")

What’s the Best Model
show_best(xgb_res, "roc_auc")
## # A tibble: 5 x 12
## mtry min_n tree_depth learn_rate loss_reduction sample_size .metric
## <int> <int> <int> <dbl> <dbl> <dbl> <chr>
## 1 95 27 10 0.0153 0.0000000510 0.837 roc_auc
## 2 137 13 12 0.00877 0.00000000110 0.720 roc_auc
## 3 63 6 7 0.000000268 0.00000126 0.966 roc_auc
## 4 125 37 8 0.0000000130 0.00418 0.308 roc_auc
## 5 30 30 14 0.0000587 0.000535 0.808 roc_auc
## # … with 5 more variables: .estimator <chr>, mean <dbl>, n <int>,
## # std_err <dbl>, .config <chr>
best_auc <- select_best(xgb_res, "roc_auc")
best_auc
## # A tibble: 1 x 7
## mtry min_n tree_depth learn_rate loss_reduction sample_size .config
## <int> <int> <int> <dbl> <dbl> <dbl> <chr>
## 1 95 27 10 0.0153 0.0000000510 0.837 Preprocessor1_Mo…
Finalize Model
The metrics and prediction have been calculated against the test data
What are the important variables
final_xgb %>%
fit(data = figs_train) %>%
pull_workflow_fit() %>%
vip(geom = "point", num_features=25L) +
labs(
x = NULL,
y = NULL,
title= "Feature Importance") + theme_fivethirtyeight()
## [02:42:29] WARNING: amalgamation/../src/learner.cc:1061: Starting in XGBoost 1.3.0, the default evaluation metric used with the objective 'binary:logistic' was changed from 'error' to 'logloss'. Explicitly set eval_metric if you'd like to restore the old behavior.

Final Results
final_res <- last_fit(final_xgb, figs_split)
collect_metrics(final_res)
## # A tibble: 2 x 4
## .metric .estimator .estimate .config
## <chr> <chr> <dbl> <chr>
## 1 accuracy binary 0.748 Preprocessor1_Model1
## 2 roc_auc binary 0.793 Preprocessor1_Model1
Confusion Matrix
collect_predictions(final_res) %>%
conf_mat(target, .pred_class)
## Truth
## Prediction NoTop Top
## NoTop 5372 854
## Top 1393 1285
ROC Curve
final_res %>%
collect_predictions() %>%
roc_curve(target, .pred_Top, event_level = "second") %>%
ggplot(aes(x=1-specificity, y = sensitivity)) +
geom_line(size = 1.5, color = "midnightblue") +
geom_abline(
lty = 2, alpha =0.5,
color = "gray50",
size = 1.2
)

Predictions
xgb_predicitions <- final_res %>%
collect_predictions() %>%
mutate(correct = case_when(target == .pred_class ~ "Correct",
TRUE ~ "Incorrect"))
datatable(xgb_predicitions)