Explore data

library(tidyverse)
train_raw <- read_csv("train.csv")
train_raw %>%
  ggplot(aes(plate_x, plate_z, z = is_home_run)) +
  stat_summary_hex(alpha = 0.8, bins = 10) +
  scale_fill_viridis_c(labels = scales::percent) +
  labs(fill = "% home runs")

train_raw %>%
  ggplot(aes(launch_angle, launch_speed, z = is_home_run)) +
  stat_summary_hex(alpha = 0.8, bins = 15) +
  scale_fill_viridis_c(labels = scales::percent) +
  labs(fill = "% home runs")

train_raw %>%
  mutate(is_home_run = if_else(as.logical(is_home_run), "yes", "no")) %>%
  select(is_home_run, balls, strikes, inning) %>%
  pivot_longer(balls:inning) %>%
  mutate(name = fct_inorder(name)) %>%
  ggplot(aes(value, after_stat(density), fill = is_home_run)) +
  geom_histogram(alpha = 0.5, binwidth = 1, position = "identity") +
  facet_wrap(vars(name), scales = "free") +
  labs(x = NULL, fill = "Home run?")
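
Before modeling, it helps to quantify how rare home runs are overall, since that imbalance is why we stratify the resampling later. A minimal sketch, assuming is_home_run is coded 0/1 in train.csv:

# Rough check of the overall home run rate (assumes a 0/1 is_home_run column)
train_raw %>%
  summarise(
    n_bip = n(),
    prop_home_run = mean(is_home_run)
  )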

Build a model

library(tidymodels)

set.seed(123)
bb_split <- train_raw %>%
  mutate(is_home_run = if_else(as.logical(is_home_run), "HR", "no"),
         factor(is_home_run)) %>%
  initial_split(strata = is_home_run)

bb_train <- training(bb_split)
bb_test <- testing(bb_split)

set.seed(234)
bb_folds <- vfold_cv(bb_train, strata = is_home_run)

bb_folds
## #  10-fold cross-validation using stratification 
## # A tibble: 10 × 2
##    splits               id    
##    <list>               <chr> 
##  1 <split [31214/3469]> Fold01
##  2 <split [31214/3469]> Fold02
##  3 <split [31214/3469]> Fold03
##  4 <split [31215/3468]> Fold04
##  5 <split [31215/3468]> Fold05
##  6 <split [31215/3468]> Fold06
##  7 <split [31215/3468]> Fold07
##  8 <split [31215/3468]> Fold08
##  9 <split [31215/3468]> Fold09
## 10 <split [31215/3468]> Fold10
bb_rec <-
  recipe(is_home_run ~ launch_angle + launch_speed + plate_x + plate_z +
    bb_type + bearing + pitch_mph + is_pitcher_lefty + is_batter_lefty +
    inning + balls + strikes + game_date,
  data = bb_train) %>%
  step_date(game_date, features = c("week"), keep_original_cols = FALSE) %>%
  step_unknown(all_nominal_predictors()) %>%
  step_dummy(all_nominal_predictors(), one_hot = TRUE) %>%
  step_impute_median(all_numeric_predictors(), -launch_angle, -launch_speed) %>%
  step_impute_linear(launch_angle, launch_speed,
    impute_with = imp_vars(plate_x, plate_z, pitch_mph)) %>%
  step_nzv(all_predictors())

prep(bb_rec)
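
If we want to double-check what the preprocessing actually produces, we can bake the prepped recipe and look at the processed training data. A minimal sketch:

# Inspect the columns coming out of the feature engineering recipe
bb_rec %>%
  prep() %>%
  bake(new_data = NULL) %>%
  glimpse()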
xgb_spec <-
  boost_tree(
    trees = tune(),
    min_n = tune(),
    mtry = tune(),
    learn_rate = 0.01
  ) %>%
  set_engine("xgboost") %>%
  set_mode("classification")

xgb_wf <- workflow(bb_rec, xgb_spec)

Using racing methods to tune our model

library(finetune)
doParallel::registerDoParallel()

set.seed(345)
xgb_rs <- tune_race_anova(
  xgb_wf,
  resamples = bb_folds,
  grid = 15,
  metrics = metric_set(mn_log_loss),
  control = control_race(verbose_elim = TRUE)
)

xgb_rs
## # Tuning results
## # 10-fold cross-validation using stratification 
## # A tibble: 10 × 5
##    splits               id     .order .metrics          .notes          
##    <list>               <chr>   <int> <list>            <list>          
##  1 <split [31214/3469]> Fold01      2 <tibble [15 × 7]> <tibble [0 × 3]>
##  2 <split [31214/3469]> Fold02      3 <tibble [15 × 7]> <tibble [0 × 3]>
##  3 <split [31215/3468]> Fold10      1 <tibble [15 × 7]> <tibble [0 × 3]>
##  4 <split [31215/3468]> Fold07      4 <tibble [6 × 7]>  <tibble [0 × 3]>
##  5 <split [31214/3469]> Fold03      5 <tibble [1 × 7]>  <tibble [0 × 3]>
##  6 <split [31215/3468]> Fold04      8 <tibble [1 × 7]>  <tibble [0 × 3]>
##  7 <split [31215/3468]> Fold05      6 <tibble [1 × 7]>  <tibble [0 × 3]>
##  8 <split [31215/3468]> Fold06      9 <tibble [1 × 7]>  <tibble [0 × 3]>
##  9 <split [31215/3468]> Fold08     10 <tibble [1 × 7]>  <tibble [0 × 3]>
## 10 <split [31215/3468]> Fold09      7 <tibble [1 × 7]>  <tibble [0 × 3]>
plot_race(xgb_rs)

show_best(xgb_rs)
## # A tibble: 1 × 9
##    mtry trees min_n .metric     .estimator   mean     n std_err .config         
##   <int> <int> <int> <chr>       <chr>       <dbl> <int>   <dbl> <chr>           
## 1     6  1536    11 mn_log_loss binary     0.0981    10 0.00173 Preprocessor1_M…
xgb_last <- xgb_wf %>%
  finalize_workflow(select_best(xgb_rs, "mn_log_loss")) %>%
  last_fit(bb_split)

xgb_last
## # Resampling results
## # Manual resampling 
## # A tibble: 1 × 6
##   splits                id             .metrics .notes   .predictions .workflow 
##   <list>                <chr>          <list>   <list>   <list>       <list>    
## 1 <split [34683/11561]> train/test sp… <tibble> <tibble> <tibble>     <workflow>
collect_predictions(xgb_last) %>%
  mn_log_loss(is_home_run, .pred_HR)
## # A tibble: 1 × 3
##   .metric     .estimator .estimate
##   <chr>       <chr>          <dbl>
## 1 mn_log_loss binary        0.0974
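Since the test-set predictions are already collected, we could also compute a threshold-free metric such as ROC AUC alongside the log loss. A short sketch (not part of the original evaluation):

# ROC AUC on the held-out test set; "HR" is the first factor level,
# so .pred_HR is the event probability under yardstick's defaults
collect_predictions(xgb_last) %>%
  roc_auc(is_home_run, .pred_HR)
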
library(vip)

extract_workflow(xgb_last) %>%
  extract_fit_parsnip() %>%
  vip(geom = "point", num_features = 15)