library(lubridate)
library(tidyverse)
library(tidymodels)
library(skimr)
library(ggthemes)
library(DBI)
library(RMariaDB)
library(odbc)
library(RcppRoll)
library(magick)
library(RCurl)
library(vip)
library(themis)
library(DT)
library(gt)
Get the data from GitHub
data <- getURL("https://raw.githubusercontent.com/Handicappr/Rstudio_test_project/main/fig_data2.csv")
figs <- read_csv(data) %>%
  as_tibble() %>%
  rename(n = `n`) %>%
  # Recode the outcome as a two-level label
  mutate(eff_target = if_else(eff_target == 1, "Top", "NoTop")) %>%
  # Remove columns that can't be used in the model
  select(-target, -D1, -D2, -eff_TV, -top, -eff_top, -fv, -DST, -seed,
         -E, -E4, -TE, -split_800, -split_801, -split_802, -split_803,
         -split_804, -split_805, -won, -RST, -X) %>%
  rename(target = eff_target) %>%
  # Change character columns to factors
  mutate_if(is.character, factor) %>%
  # Parse dates and fill missing field size and weight with typical values
  mutate(race_date = mdy(race_date)) %>%
  mutate(fld = if_else(is.na(fld), 8, fld)) %>%
  mutate(wt = if_else(is.na(wt), 120, wt))
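The recoded outcome is imbalanced, at roughly three "NoTop" efforts per "Top", which is why the split later on is stratified on `target`. A quick tally (using `name =` so the count column does not collide with the existing `n` variable):
figs %>%
  count(target, name = "races") %>%
  mutate(prop = races / sum(races))
Next, `skim()` gives a fuller profile of every column.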
skim(figs)
Data summary

|                        |       |
|------------------------|-------|
| Name                   | figs  |
| Number of rows         | 35617 |
| Number of columns      | 59    |
| Column type frequency: |       |
| Date                   | 1     |
| factor                 | 14    |
| numeric                | 44    |
| Group variables        | None  |
Variable type: Date

| skim_variable | n_missing | complete_rate | min        | max        | median     | n_unique |
|---------------|-----------|---------------|------------|------------|------------|----------|
| race_date     | 0         | 1             | 2016-04-13 | 2021-12-20 | 2019-10-31 | 1433     |
Variable type: factor

| skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
|---------------|-----------|---------------|---------|----------|------------|
| target        | 0 | 1 | FALSE | 2    | NoT: 27061, Top: 8556 |
| horse         | 0 | 1 | FALSE | 3387 | CHA: 57, VIC: 57, STA: 54, EL : 53 |
| surface_cond  | 0 | 1 | FALSE | 13   | fst: 16366, frm: 10368, wsm: 2913, vsl: 1771 |
| precip        | 0 | 1 | FALSE | 3    | cle: 34259, rai: 1278, sno: 80 |
| wind          | 0 | 1 | FALSE | 3    | cal: 31586, hvy: 2581, vhv: 1450 |
| gender        | 0 | 1 | FALSE | 2    | Mal: 24217, Fem: 11400 |
| distance      | 0 | 1 | FALSE | 22   | 8.0: 9316, 6.0: 7624, 8.5: 4621, 7.0: 3011 |
| jky           | 0 | 1 | FALSE | 729  | I O: 1034, L S: 930, M F: 915, J O: 879 |
| trk_code      | 0 | 1 | FALSE | 145  | GP: 8459, AQU: 4858, BEL: 3366, SA: 2892 |
| surf          | 0 | 1 | FALSE | 5    | Dir: 21482, Tur: 11757, Syn: 1271, Inn: 1038 |
| s_cond        | 0 | 1 | FALSE | 8    | Fst: 18433, Frm: 10362, Gd: 3275, Sly: 2180 |
| form_cycle    | 0 | 1 | FALSE | 24   | AX: 6600, CX: 3762, BX: 3536, A1: 3387 |
| race_type     | 0 | 1 | FALSE | 2    | Spr: 18816, Rou: 16801 |
| trk           | 0 | 1 | FALSE | 130  | GP: 8377, AQ: 4769, BE: 3359, SA: 2887 |
Variable type: numeric

| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| n | 0 | 1 | 9.86 | 8.51 | 1.00 | 3.00 | 7.00 | 14.00 | 57.00 | ▇▂▁▁▁ |
| cls_val | 0 | 1 | 1.99 | 0.89 | 1.00 | 1.15 | 1.62 | 3.00 | 12.59 | ▇▁▁▁▁ |
| cls_mv | 0 | 1 | 0.07 | 0.68 | -11.49 | 0.00 | 0.00 | 0.00 | 11.39 | ▁▁▇▁▁ |
| yr | 0 | 1 | 2019.22 | 0.99 | 2016.00 | 2019.00 | 2019.00 | 2020.00 | 2021.00 | ▁▃▇▇▂ |
| age | 0 | 1 | 3.64 | 1.30 | 1.00 | 3.00 | 3.00 | 4.00 | 11.00 | ▇▆▁▁▁ |
| ftl | 0 | 1 | 0.08 | 0.27 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| clm_by_hot | 0 | 1 | -0.93 | 0.34 | -1.00 | -1.00 | -1.00 | -1.00 | 1.00 | ▇▁▁▁▁ |
| start | 0 | 1 | 0.03 | 0.09 | 0.00 | 0.00 | 0.00 | 0.00 | 0.90 | ▇▁▁▁▁ |
| bled | 0 | 1 | 0.00 | 0.02 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| off_turf | 0 | 1 | 0.05 | 0.21 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| bfnr | 0 | 1 | 0.01 | 0.11 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| lame | 0 | 1 | 0.00 | 0.02 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| lsx | 0 | 1 | 0.00 | 0.09 | -1.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▁▁▇▁▁ |
| blnkrs | 0 | 1 | 0.04 | 0.27 | -1.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▁▁▇▁▁ |
| age_wks | 0 | 1 | 204.45 | 65.52 | 37.00 | 154.00 | 190.00 | 240.00 | 559.00 | ▂▇▂▁▁ |
| wt | 0 | 1 | 120.32 | 2.70 | 108.00 | 119.00 | 120.00 | 122.00 | 143.00 | ▁▇▂▁▁ |
| off_odds | 0 | 1 | 12.17 | 17.09 | 0.11 | 2.50 | 6.00 | 13.00 | 99.00 | ▇▁▁▁▁ |
| fld | 0 | 1 | 8.39 | 2.21 | 2.00 | 7.00 | 8.00 | 10.00 | 30.00 | ▅▇▁▁▁ |
| L3 | 0 | 1 | 18.01 | 7.03 | 1.75 | 13.33 | 16.92 | 21.42 | 99.00 | ▇▃▁▁▁ |
| L5 | 0 | 1 | 18.26 | 6.77 | 1.75 | 13.70 | 17.25 | 21.62 | 99.00 | ▇▃▁▁▁ |
| L7 | 0 | 1 | 18.43 | 6.66 | 1.75 | 13.96 | 17.43 | 21.75 | 99.00 | ▇▃▁▁▁ |
| rest | 0 | 1 | 43.85 | 53.38 | 0.00 | 21.00 | 28.00 | 43.00 | 1081.00 | ▇▁▁▁▁ |
| avg_rest | 0 | 1 | 39.75 | 21.66 | 7.25 | 25.67 | 34.70 | 46.78 | 489.00 | ▇▁▁▁▁ |
| efforts_last90 | 0 | 1 | 0.90 | 0.95 | 0.00 | 0.00 | 1.00 | 1.00 | 8.00 | ▇▂▁▁▁ |
| Lag1 | 0 | 1 | 0.21 | 0.40 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▂ |
| Lag2 | 0 | 1 | 0.19 | 0.39 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▂ |
| Lag3 | 0 | 1 | 0.17 | 0.37 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▂ |
| Lag4 | 0 | 1 | 0.15 | 0.36 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▂ |
| Lag5 | 0 | 1 | 0.14 | 0.34 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| LP1 | 0 | 1 | -1.91 | 2.27 | -5.00 | -5.00 | -2.00 | 0.00 | 1.00 | ▆▂▃▂▇ |
| LP2 | 0 | 1 | -1.91 | 2.16 | -5.00 | -4.00 | -2.00 | 0.00 | 1.00 | ▆▂▆▂▇ |
| LP3 | 0 | 1 | -1.92 | 2.05 | -5.00 | -4.00 | -2.00 | 0.00 | 1.00 | ▆▂▇▂▇ |
| LP4 | 0 | 1 | -1.92 | 1.95 | -5.00 | -3.00 | -2.00 | 0.00 | 1.00 | ▅▁▇▁▆ |
| LP5 | 0 | 1 | -1.93 | 1.85 | -5.00 | -3.00 | -2.00 | 0.00 | 1.00 | ▃▁▇▁▅ |
| pp_id | 0 | 1 | 27487.78 | 16975.26 | 1.00 | 12623.00 | 26040.00 | 41459.00 | 59178.00 | ▇▇▆▇▆ |
| splt_1 | 0 | 1 | 4.48 | 6.00 | 0.00 | 0.00 | 2.89 | 6.79 | 51.00 | ▇▁▁▁▁ |
| splt_2 | 0 | 1 | 2.36 | 3.93 | 0.00 | 0.00 | 0.00 | 3.81 | 35.91 | ▇▁▁▁▁ |
| splt_3 | 0 | 1 | 2.60 | 5.16 | 0.00 | 0.00 | 0.00 | 3.57 | 45.30 | ▇▁▁▁▁ |
| splt_4 | 0 | 1 | 3.53 | 5.16 | 0.00 | 0.00 | 1.23 | 5.27 | 46.29 | ▇▁▁▁▁ |
| splt_5 | 0 | 1 | 1.34 | 2.75 | 0.00 | 0.00 | 0.00 | 1.93 | 27.99 | ▇▁▁▁▁ |
| splt_6 | 0 | 1 | 2.85 | 4.09 | 0.00 | 0.00 | 0.00 | 3.97 | 30.87 | ▇▂▁▁▁ |
| splt_7 | 0 | 1 | 4.68 | 5.87 | 0.00 | 0.00 | 2.90 | 7.42 | 37.86 | ▇▂▁▁▁ |
| splt_8 | 0 | 1 | 11.88 | 9.05 | 0.00 | 3.93 | 10.63 | 17.73 | 50.57 | ▇▆▂▁▁ |
| splt_9 | 0 | 1 | 10.57 | 8.58 | 0.00 | 2.97 | 9.35 | 16.01 | 47.05 | ▇▅▂▁▁ |
Build Model
We start by splitting the data into training and testing sets, stratified on the outcome so both sets preserve the Top/NoTop ratio.
set.seed(123)
figs_split <- initial_split(figs, strata = target)
figs_train <- training(figs_split)
figs_test <- testing(figs_split)
figs_split
## <Analysis/Assess/Total>
## <26713/8904/35617>
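Because the split was stratified, the class proportions should be nearly identical in the two sets. A quick verification:
bind_rows(
  train = count(figs_train, target, name = "races"),
  test  = count(figs_test, target, name = "races"),
  .id = "set"
) %>%
  group_by(set) %>%
  mutate(prop = races / sum(races))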
Tuning the Random Forest Model
We create a tuning specification for the random forest. `trees` is fixed at 1000, while `mtry` and `min_n` are flagged for tuning. We also set the model engine to ranger and specify a classification model.
tune_spec <- rand_forest(
  mtry = tune(),
  trees = 1000,
  min_n = tune()
) %>%
  set_mode("classification") %>%
  set_engine("ranger")
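On a recent tidymodels installation, the parameters awaiting values can be listed with `extract_parameter_set_dials()`; note that the upper bound of `mtry` stays unknown until it is finalized against the predictor count:
# Show the tune() placeholders in the specification
extract_parameter_set_dials(tune_spec)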
Set Up a Workflow for the Model
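The recipe `figs_rec` (and its prepped counterpart `figs_prep`, used later for variable importance) comes from an earlier step that is not shown here. A minimal sketch, assuming an identifier role for `horse`, removal of the raw date, and themis downsampling to offset the class imbalance, might look like:
# Hypothetical reconstruction; the actual recipe may differ
figs_rec <- recipe(target ~ ., data = figs_train) %>%
  update_role(horse, new_role = "id") %>%  # keep the horse name as an identifier
  step_rm(race_date) %>%                   # drop the raw date before modeling
  step_downsample(target)                  # themis: balance the outcome classes

figs_prep <- prep(figs_rec)
With a recipe in place, the tuning workflow bundles it with the model specification: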
tune_wf <- workflow() %>%
  add_recipe(figs_rec) %>%
  add_model(tune_spec)

set.seed(234)
figs_folds <- vfold_cv(figs_train)
Tune hyperparameters
The initial tuning takes place in the following code chunk. We employ parallel processing to expedite the search. After tuning completes, we collect the metrics and display the results.
set.seed(234)
# Metrics to compute during resampling
members_metrics <- metric_set(roc_auc, accuracy, sensitivity, specificity)

doParallel::registerDoParallel(cores = 28)

set.seed(345)
tune_res <- tune_grid(
  tune_wf,
  resamples = figs_folds,
  metrics = members_metrics,
  grid = 10
)

tune_res %>%
  collect_metrics()
## # A tibble: 40 x 8
## mtry min_n .metric .estimator mean n std_err .config
## <int> <int> <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 29 30 accuracy binary 0.774 10 0.00257 Preprocessor1_Model01
## 2 29 30 roc_auc binary 0.775 10 0.00272 Preprocessor1_Model01
## 3 29 30 sens binary 0.888 10 0.00194 Preprocessor1_Model01
## 4 29 30 spec binary 0.416 10 0.00656 Preprocessor1_Model01
## 5 112 12 accuracy binary 0.777 10 0.00271 Preprocessor1_Model02
## 6 112 12 roc_auc binary 0.775 10 0.00262 Preprocessor1_Model02
## 7 112 12 sens binary 0.905 10 0.00166 Preprocessor1_Model02
## 8 112 12 spec binary 0.372 10 0.00569 Preprocessor1_Model02
## 9 65 27 accuracy binary 0.775 10 0.00229 Preprocessor1_Model03
## 10 65 27 roc_auc binary 0.776 10 0.00285 Preprocessor1_Model03
## # … with 30 more rows
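`show_best()` gives a quicker summary of the leading candidates for a single metric:
# Top parameter combinations ranked by resampled ROC AUC
show_best(tune_res, metric = "roc_auc", n = 5)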
Visualize The Tuning Results
tune_res %>%
  collect_metrics() %>%
  filter(.metric == "roc_auc") %>%
  select(mean, min_n, mtry) %>%
  pivot_longer(min_n:mtry,
               values_to = "value",
               names_to = "parameter") %>%
  ggplot(aes(value, mean, color = parameter)) +
  geom_point(show.legend = FALSE) +
  facet_wrap(~ parameter, scales = "free_x")
[Plot: resampled ROC AUC against the candidate values of min_n and mtry]
Secondary Tuning
Our initial tuning provided insight into promising parameter ranges. We now tune a second time over a regular grid to pin down the optimal settings. With `levels = 4`, the integer `min_n` range of 14 to 16 yields only three distinct values, so the grid holds 4 × 3 = 12 candidates; and because no metric set is supplied this time, `tune_grid()` falls back to its classification defaults, accuracy and ROC AUC.
rf_grid <- grid_regular(
  mtry(range = c(30, 40)),
  min_n(range = c(14, 16)),
  levels = 4
)

set.seed(456)
regular_res <- tune_grid(
  tune_wf,
  resamples = figs_folds,
  grid = rf_grid
)

regular_res %>%
  collect_metrics()
## # A tibble: 24 x 8
## mtry min_n .metric .estimator mean n std_err .config
## <int> <int> <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 30 14 accuracy binary 0.778 10 0.00255 Preprocessor1_Model01
## 2 30 14 roc_auc binary 0.775 10 0.00279 Preprocessor1_Model01
## 3 33 14 accuracy binary 0.779 10 0.00247 Preprocessor1_Model02
## 4 33 14 roc_auc binary 0.775 10 0.00277 Preprocessor1_Model02
## 5 36 14 accuracy binary 0.778 10 0.00195 Preprocessor1_Model03
## 6 36 14 roc_auc binary 0.775 10 0.00293 Preprocessor1_Model03
## 7 40 14 accuracy binary 0.778 10 0.00208 Preprocessor1_Model04
## 8 40 14 roc_auc binary 0.776 10 0.00265 Preprocessor1_Model04
## 9 30 15 accuracy binary 0.778 10 0.00227 Preprocessor1_Model05
## 10 30 15 roc_auc binary 0.775 10 0.00265 Preprocessor1_Model05
## # … with 14 more rows
Visualize the final tuning results
regular_res %>%
  collect_metrics() %>%
  filter(.metric == "roc_auc") %>%
  mutate(min_n = factor(min_n)) %>%
  ggplot(aes(mtry, mean, color = min_n)) +
  geom_line(alpha = 0.5, size = 1.5) +
  geom_point() +
  theme_fivethirtyeight() +
  labs(title = "Tuning Grid")
[Plot: ROC AUC by mtry, colored by min_n, across the regular tuning grid]
Pick the Best Model
We use ROC AUC as the selection metric and finalize the specification with the winning parameters.
best_auc <- select_best(regular_res, metric = "roc_auc")

final_rf <- finalize_model(
  tune_spec,
  best_auc
)
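Printing the objects confirms that the `tune()` placeholders have been replaced with the selected values:
best_auc  # the winning parameter combination
final_rf  # the finalized model specification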
Variable Importance
We use the vip package to identify the most important variables, refitting the finalized model with `importance = "permutation"` so that ranger computes permutation-based scores.
fpchart <- final_rf %>%
  set_engine("ranger", importance = "permutation") %>%
  fit(target ~ .,
      data = juice(figs_prep) %>% select(-horse)) %>%
  vip(geom = "point", num_features = 25L) +
  labs(
    x = NULL,
    y = NULL,
    title = "Feature Importance"
  )

fpchart + theme_fivethirtyeight()
[Plot: permutation feature importance for the top 25 features]
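If the raw scores are needed rather than a chart, `vip::vi()` returns the same permutation importances as a tibble. A sketch (this refits the model, so scores will vary slightly from run to run):
# Permutation importance as a tibble instead of a plot
rf_fit <- final_rf %>%
  set_engine("ranger", importance = "permutation") %>%
  fit(target ~ ., data = juice(figs_prep) %>% select(-horse))

vi(rf_fit) %>%
  slice_max(Importance, n = 10)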
Finalize Model
`last_fit()` fits the finalized workflow on the full training set and evaluates it once on the held-out test set. The resulting metrics are displayed below.
final_wf <- workflow() %>%
  add_recipe(figs_rec) %>%
  add_model(final_rf)

final_res <- final_wf %>%
  last_fit(figs_split)

final_result <- final_res %>%
  collect_metrics()

gt(final_result)
| .metric  | .estimator | .estimate | .config              |
|----------|------------|-----------|----------------------|
| accuracy | binary     | 0.7717880 | Preprocessor1_Model1 |
| roc_auc  | binary     | 0.7705095 | Preprocessor1_Model1 |
ROC Curve
final_res %>%
  collect_predictions() %>%
  roc_curve(target, .pred_Top, event_level = "second") %>%
  ggplot(aes(x = 1 - specificity, y = sensitivity)) +
  geom_line(size = 1.5, color = "midnightblue") +
  geom_abline(
    lty = 2, alpha = 0.5,
    color = "gray50",
    size = 1.2
  ) +
  labs(title = "ROC Curve", x = "1 - Specificity", y = "Sensitivity") +
  theme_fivethirtyeight()
[Plot: ROC curve for the final model on the test set]
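The area under this curve can also be computed directly; `event_level = "second"` again marks "Top" (the second factor level) as the event of interest:
# AUC on the held-out predictions; matches the roc_auc row in the metrics table
final_res %>%
  collect_predictions() %>%
  roc_auc(target, .pred_Top, event_level = "second")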
Confusion Matrix
The confusion matrix for the final model is displayed below:
collect_predictions(final_res) %>%
  conf_mat(target, .pred_class)
## Truth
## Prediction NoTop Top
## NoTop 6109 1376
## Top 656 763
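yardstick's `summary()` method expands the confusion matrix into the usual derived statistics; passing `event_level = "second"` keeps "Top" as the positive class:
# Sensitivity, specificity, precision, and more from the confusion matrix
collect_predictions(final_res) %>%
  conf_mat(target, .pred_class) %>%
  summary(event_level = "second")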
Model Predictions
Finally, the model's predictions on the test set, labeled as correct or incorrect, are presented in the interactive data table below.
preTab <- final_res %>%
  collect_predictions() %>%
  mutate(correct = case_when(
    target == .pred_class ~ "Correct",
    TRUE ~ "Incorrect"
  ))

datatable(preTab)
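To score future races, the fitted workflow can be extracted from the `last_fit()` result (tune 0.1.6 or later) and saved for reuse. A sketch, where `new_races` is a hypothetical data frame with the same columns as `figs`:
# Pull out the workflow trained during last_fit() and persist it
final_fitted <- extract_workflow(final_res)
saveRDS(final_fitted, "figs_rf_model.rds")

# predict(final_fitted, new_data = new_races, type = "prob")  # hypothetical scoring call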