Explore data
library(tidyverse)

train_raw <- read_csv("train.csv")

# Proportion of home runs by pitch location over the plate
train_raw %>%
  ggplot(aes(plate_x, plate_z, z = is_home_run)) +
  stat_summary_hex(alpha = 0.8, bins = 10) +
  scale_fill_viridis_c(labels = scales::percent) +
  labs(fill = "% home runs")

# Proportion of home runs by launch angle and launch speed
train_raw %>%
  ggplot(aes(launch_angle, launch_speed, z = is_home_run)) +
  stat_summary_hex(alpha = 0.8, bins = 15) +
  scale_fill_viridis_c(labels = scales::percent) +
  labs(fill = "% home runs")

# How do balls, strikes, and inning relate to home runs?
train_raw %>%
  mutate(is_home_run = if_else(as.logical(is_home_run), "yes", "no")) %>%
  select(is_home_run, balls, strikes, inning) %>%
  pivot_longer(balls:inning) %>%
  mutate(name = fct_inorder(name)) %>%
  ggplot(aes(value, after_stat(density), fill = is_home_run)) +
  geom_histogram(alpha = 0.5, binwidth = 1, position = "identity") +
  facet_wrap(vars(name), scales = "free") +
  labs(x = NULL, fill = "Home run?")
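Before building a model, it's worth checking how common home runs are overall, since that is the class balance we stratify on when splitting the data below. A quick sketch (not part of the original analysis), treating is_home_run as a 0/1 indicator:

# Sketch: overall home run rate in the training data
train_raw %>%
  summarise(home_run_rate = mean(is_home_run))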

Build a model
library(tidymodels)

# Split into training and testing sets, stratified on the outcome
set.seed(123)
bb_split <- train_raw %>%
  mutate(
    is_home_run = if_else(as.logical(is_home_run), "HR", "no"),
    is_home_run = factor(is_home_run)
  ) %>%
  initial_split(strata = is_home_run)
bb_train <- training(bb_split)
bb_test <- testing(bb_split)

# Cross-validation folds from the training set
set.seed(234)
bb_folds <- vfold_cv(bb_train, strata = is_home_run)
bb_folds
## # 10-fold cross-validation using stratification
## # A tibble: 10 × 2
##    splits               id
##    <list>               <chr>
##  1 <split [31214/3469]> Fold01
##  2 <split [31214/3469]> Fold02
##  3 <split [31214/3469]> Fold03
##  4 <split [31215/3468]> Fold04
##  5 <split [31215/3468]> Fold05
##  6 <split [31215/3468]> Fold06
##  7 <split [31215/3468]> Fold07
##  8 <split [31215/3468]> Fold08
##  9 <split [31215/3468]> Fold09
## 10 <split [31215/3468]> Fold10
# Feature engineering: week of the year, unknown levels, dummy variables, imputation
bb_rec <-
  recipe(is_home_run ~ launch_angle + launch_speed + plate_x + plate_z +
           bb_type + bearing + pitch_mph + is_pitcher_lefty + is_batter_lefty +
           inning + balls + strikes + game_date,
         data = bb_train) %>%
  step_date(game_date, features = c("week"), keep_original_cols = FALSE) %>%
  step_unknown(all_nominal_predictors()) %>%
  step_dummy(all_nominal_predictors(), one_hot = TRUE) %>%
  step_impute_median(all_numeric_predictors(), -launch_angle, -launch_speed) %>%
  step_impute_linear(launch_angle, launch_speed,
                     impute_with = imp_vars(plate_x, plate_z, pitch_mph)) %>%
  step_nzv(all_predictors())

prep(bb_rec)
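To sanity-check what this recipe produces, we can prep it and bake it on the training data. This is just an exploratory sketch using the objects defined above; it isn't part of the original output:

# Sketch: inspect the preprocessed training data
# bake(new_data = NULL) returns the data the recipe was prepped on
bb_rec %>%
  prep() %>%
  bake(new_data = NULL) %>%
  glimpse()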
# Tunable xgboost specification; only the learning rate is fixed
xgb_spec <-
  boost_tree(
    trees = tune(),
    min_n = tune(),
    mtry = tune(),
    learn_rate = 0.01
  ) %>%
  set_engine("xgboost") %>%
  set_mode("classification")

xgb_wf <- workflow(bb_rec, xgb_spec)
Using racing methods to tune our model
library(finetune)
doParallel::registerDoParallel()

# Tune with racing: parameter combinations that perform poorly are eliminated early
set.seed(345)
xgb_rs <- tune_race_anova(
  xgb_wf,
  resamples = bb_folds,
  grid = 15,
  metrics = metric_set(mn_log_loss),
  control = control_race(verbose_elim = TRUE)
)
xgb_rs
## # Tuning results
## # 10-fold cross-validation using stratification
## # A tibble: 10 × 5
##    splits               id     .order .metrics          .notes
##    <list>               <chr>   <int> <list>            <list>
##  1 <split [31214/3469]> Fold01      2 <tibble [15 × 7]> <tibble [0 × 3]>
##  2 <split [31214/3469]> Fold02      3 <tibble [15 × 7]> <tibble [0 × 3]>
##  3 <split [31215/3468]> Fold10      1 <tibble [15 × 7]> <tibble [0 × 3]>
##  4 <split [31215/3468]> Fold07      4 <tibble [6 × 7]>  <tibble [0 × 3]>
##  5 <split [31214/3469]> Fold03      5 <tibble [1 × 7]>  <tibble [0 × 3]>
##  6 <split [31215/3468]> Fold04      8 <tibble [1 × 7]>  <tibble [0 × 3]>
##  7 <split [31215/3468]> Fold05      6 <tibble [1 × 7]>  <tibble [0 × 3]>
##  8 <split [31215/3468]> Fold06      9 <tibble [1 × 7]>  <tibble [0 × 3]>
##  9 <split [31215/3468]> Fold08     10 <tibble [1 × 7]>  <tibble [0 × 3]>
## 10 <split [31215/3468]> Fold09      7 <tibble [1 × 7]>  <tibble [0 × 3]>
plot_race(xgb_rs)

show_best(xgb_rs)
## # A tibble: 1 × 9
##    mtry trees min_n .metric     .estimator   mean     n std_err .config
##   <int> <int> <int> <chr>       <chr>       <dbl> <int>   <dbl> <chr>
## 1     6  1536    11 mn_log_loss binary     0.0981    10 0.00173 Preprocessor1_M…
# Finalize the workflow with the best parameters, fit on the training set,
# and evaluate on the testing set
xgb_last <- xgb_wf %>%
  finalize_workflow(select_best(xgb_rs, metric = "mn_log_loss")) %>%
  last_fit(bb_split)
xgb_last
## # Resampling results
## # Manual resampling
## # A tibble: 1 × 6
##   splits                id             .metrics .notes   .predictions .workflow
##   <list>                <chr>          <list>   <list>   <list>       <list>
## 1 <split [34683/11561]> train/test sp… <tibble> <tibble> <tibble>     <workflow>
# Mean log loss on the held-out test set
collect_predictions(xgb_last) %>%
  mn_log_loss(is_home_run, .pred_HR)
## # A tibble: 1 × 3
##   .metric     .estimator .estimate
##   <chr>       <chr>          <dbl>
## 1 mn_log_loss binary        0.0974
library(vip)

# Variable importance for the fitted xgboost model
extract_workflow(xgb_last) %>%
  extract_fit_parsnip() %>%
  vip(geom = "point", num_features = 15)
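If we wanted predictions for new observations, the workflow stored by last_fit() is already trained on bb_train, so it can predict directly. The sketch below is illustrative only; the "test.csv" file and its columns are assumptions, not part of the data shown here:

# Sketch: predicted probability of a home run for new data
# Assumes a hypothetical test.csv with the same predictor columns as train.csv
test_raw <- read_csv("test.csv")

extract_workflow(xgb_last) %>%
  predict(new_data = test_raw, type = "prob")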
