library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
train <- read_csv("../00_data/train 2.csv")  
## Rows: 46244 Columns: 25
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr   (8): home_team, away_team, batter_team, batter_name, pitcher_name, bb_...
## dbl  (16): bip_id, batter_id, pitcher_id, is_batter_lefty, is_pitcher_lefty,...
## date  (1): game_date
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
train_raw <- train %>% 
    mutate(game_date = as.Date(game_date))
train_raw %>%
  ggplot(aes(plate_x, plate_z, z = is_home_run)) +
  stat_summary_hex(alpha = 0.8, bins = 10) +
  scale_fill_viridis_c(labels = scales::percent) +
  labs(fill = "% home runs")

train_raw %>%
  ggplot(aes(launch_angle, launch_speed, z = is_home_run)) +
  stat_summary_hex(alpha = 0.8, bins = 15) +
  scale_fill_viridis_c(labels = scales::percent) +
  labs(fill = "% home runs")
## Warning: Removed 20378 rows containing non-finite values
## (`stat_summary_hex()`).

train_raw %>%
  mutate(is_home_run = if_else(as.logical(is_home_run), "yes", "no")) %>%
  select(is_home_run, balls, strikes, inning) %>%
  pivot_longer(balls:inning) %>%
  mutate(name = fct_inorder(name)) %>%
  ggplot(aes(value, after_stat(density), fill = is_home_run)) +
  geom_histogram(alpha = 0.5, binwidth = 1, position = "identity") +
  facet_wrap(~name, scales = "free") +
  labs(fill = "Home run?")

Build a model

library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.1.1 ──
## ✔ broom        1.0.5     ✔ rsample      1.2.0
## ✔ dials        1.2.0     ✔ tune         1.1.2
## ✔ infer        1.0.5     ✔ workflows    1.1.3
## ✔ modeldata    1.2.0     ✔ workflowsets 1.0.1
## ✔ parsnip      1.1.1     ✔ yardstick    1.2.0
## ✔ recipes      1.0.8
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter()   masks stats::filter()
## ✖ recipes::fixed()  masks stringr::fixed()
## ✖ dplyr::lag()      masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step()   masks stats::step()
## • Use suppressPackageStartupMessages() to eliminate package startup messages
set.seed(123)
# Stratify on the outcome so the train/test split preserves the rare HR rate
bb_split <- train_raw %>%
  mutate(
    is_home_run = if_else(as.logical(is_home_run), "HR", "no"),
    is_home_run = factor(is_home_run)
  ) %>%
  initial_split(strata = is_home_run)
bb_train <- training(bb_split)
bb_test <- testing(bb_split)

set.seed(234)
bb_folds <- vfold_cv(bb_train, strata = is_home_run)
bb_folds
## #  10-fold cross-validation using stratification 
## # A tibble: 10 × 2
##    splits               id    
##    <list>               <chr> 
##  1 <split [31214/3469]> Fold01
##  2 <split [31214/3469]> Fold02
##  3 <split [31214/3469]> Fold03
##  4 <split [31215/3468]> Fold04
##  5 <split [31215/3468]> Fold05
##  6 <split [31215/3468]> Fold06
##  7 <split [31215/3468]> Fold07
##  8 <split [31215/3468]> Fold08
##  9 <split [31215/3468]> Fold09
## 10 <split [31215/3468]> Fold10
bb_rec <-
  recipe(
    is_home_run ~ launch_angle + launch_speed + plate_x + plate_z +
      inning + balls + strikes + game_date + bb_type + bearing +
      pitch_mph + is_pitcher_lefty + is_batter_lefty,
    data = bb_train
  ) %>%
  step_date(game_date, features = c("week"), keep_original_cols = FALSE) %>%
  step_unknown(all_nominal_predictors()) %>%
  step_dummy(all_nominal_predictors(), one_hot = TRUE) %>%
  step_impute_median(all_numeric_predictors(), -launch_angle, -launch_speed) %>%
  step_impute_linear(launch_angle, launch_speed,
    impute_with = imp_vars(plate_x, plate_z, pitch_mph)
  ) %>%
  step_nzv(all_predictors())

prep(bb_rec)
## 
## ── Recipe ──────────────────────────────────────────────────────────────────────
## 
## ── Inputs
## Number of variables by role
## outcome:    1
## predictor: 13
## 
## ── Training information
## Training data contained 34683 data points and 15255 incomplete rows.
## 
## ── Operations
## • Date features from: game_date | Trained
## • Unknown factor level assignment for: bb_type, bearing | Trained
## • Dummy variables from: bb_type, bearing | Trained
## • Median imputation for: plate_x, plate_z, inning, balls, ... | Trained
## • Linear regression imputation for: launch_angle, launch_speed | Trained
## • Sparse, unbalanced variable filter removed: bb_type_unknown, ... | Trained
xgb_spec <-
  boost_tree(
    trees = tune(),    # number of boosting iterations, tuned
    min_n = tune(),    # minimum observations per terminal node, tuned
    mtry = tune(),     # predictors sampled at each split, tuned
    learn_rate = 0.01  # fixed, conservative learning rate
  ) %>%
  set_engine("xgboost") %>%
  set_mode("classification")

xgb_wf <- workflow(bb_rec, xgb_spec)
library(finetune)
doParallel::registerDoParallel()

set.seed(345)
# ANOVA racing scores all candidates on a few folds first, then stops
# evaluating parameter combinations that are clearly worse
xgb_rs <- tune_race_anova(
  xgb_wf,
  resamples = bb_folds,
  grid = 15,
  metrics = metric_set(mn_log_loss),
  control = control_race(verbose_elim = TRUE)
)
## i Creating pre-processing data to finalize unknown parameter: mtry
## ℹ Racing will minimize the mn_log_loss metric.
## ℹ Resamples are analyzed in a random order.
## ℹ Fold10: 9 eliminated; 6 candidates remain.
## 
## ℹ Fold07: All but one parameter combination were eliminated.
plot_race(xgb_rs)

show_best(xgb_rs)
## # A tibble: 1 × 9
##    mtry trees min_n .metric     .estimator   mean     n std_err .config         
##   <int> <int> <int> <chr>       <chr>       <dbl> <int>   <dbl> <chr>           
## 1     6  1536    11 mn_log_loss binary     0.0980    10 0.00169 Preprocessor1_M…
xgb_last <- xgb_wf %>%
  finalize_workflow(select_best(xgb_rs, metric = "mn_log_loss")) %>%
  last_fit(bb_split)

xgb_last
## # Resampling results
## # Manual resampling 
## # A tibble: 1 × 6
##   splits                id             .metrics .notes   .predictions .workflow 
##   <list>                <chr>          <list>   <list>   <list>       <list>    
## 1 <split [34683/11561]> train/test sp… <tibble> <tibble> <tibble>     <workflow>
collect_predictions(xgb_last) %>%
  mn_log_loss(is_home_run, .pred_HR)
## # A tibble: 1 × 3
##   .metric     .estimator .estimate
##   <chr>       <chr>          <dbl>
## 1 mn_log_loss binary        0.0973
library(vip)
## 
## Attaching package: 'vip'
## The following object is masked from 'package:utils':
## 
##     vi
extract_workflow(xgb_last) %>%
  extract_fit_parsnip() %>%
  vip(geom = "point", num_features = 15)

1. What is the research question?

- Can we predict whether a batted ball is a home run from pitch and batted-ball characteristics such as launch angle, launch speed, and pitch location?

2. Data Exploration and Transformation:

- The transformation recodes is_home_run from 0/1 to the labels "HR" and "no" and stores it as a factor, which is the outcome format the tidymodels classification functions expect.
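Home runs are rare, which is why both the initial split and the resampling folds are stratified on is_home_run. As a quick illustrative check (a sketch; its output is not part of the original analysis), the class balance in the training set can be inspected with:

bb_train %>%
  count(is_home_run) %>%
  mutate(prop = n / sum(n))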

3. Data Preparation and Modeling:

- The recipe applies several preprocessing steps (a toy example follows below):
  - step_date() extracts the week of the year from game_date and drops the original date column.
  - step_unknown() assigns missing values in the nominal predictors (bb_type, bearing) to a new "unknown" factor level.
  - step_dummy() converts the nominal predictors into one-hot-encoded binary indicator columns.
  - step_impute_median() fills missing values in the numeric predictors (other than launch_angle and launch_speed) with their training-set medians.
  - step_impute_linear() fits linear regression models that impute launch_angle and launch_speed from plate_x, plate_z, and pitch_mph.
  - step_nzv() removes sparse, near-zero-variance predictors.
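To make these steps concrete, here is a minimal, self-contained sketch. The toy tibble demo_df and all of its values are invented for illustration; it mimics the real recipe by sending a factor with a missing value through step_unknown() and step_dummy(), then imputing two numeric columns:

library(tidymodels)

# Hypothetical toy data: one factor predictor and two numeric
# predictors, each containing a missing value
demo_df <- tibble(
  y  = factor(c("HR", "no", "no", "no")),
  bb = factor(c("fly_ball", NA, "line_drive", "fly_ball")),
  x1 = c(25, 12, NA, 30),
  x2 = c(98, 85, 90, NA)
)

demo_rec <- recipe(y ~ ., data = demo_df) %>%
  step_unknown(bb) %>%                # NA in the factor becomes level "unknown"
  step_dummy(bb, one_hot = TRUE) %>%  # one 0/1 indicator column per level
  step_impute_median(x2) %>%          # NA in x2 replaced by its training median
  step_impute_linear(x1, impute_with = imp_vars(x2))  # NA in x1 predicted from x2

demo_rec %>% prep() %>% bake(new_data = NULL)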

4. Model Evaluation:

- The test-set mean log loss is 0.0973, essentially matching the cross-validation estimate of 0.0980 from tuning, so the model generalizes well to held-out data without notable overfitting.
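Beyond log loss, other yardstick metrics can be computed from the same held-out predictions. This is a sketch of additional checks, not part of the original output:

preds <- collect_predictions(xgb_last)

# Area under the ROC curve, using the predicted probability of the
# first factor level ("HR"), which yardstick treats as the event
preds %>% roc_auc(is_home_run, .pred_HR)

# Confusion matrix from the hard class predictions
preds %>% conf_mat(is_home_run, .pred_class)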

5. Conclusion:

- A tuned xgboost model predicts home runs from batted-ball and pitch characteristics with a test-set mean log loss of about 0.097. Racing with tune_race_anova() kept tuning efficient by eliminating clearly inferior parameter combinations after only a few resamples.
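If the model were used on new balls in play, the fitted workflow can be extracted from the last_fit() result and used for prediction. Note that new_bip below is a hypothetical data frame standing in for new data with the same predictor columns as bb_train:

fitted_wf <- extract_workflow(xgb_last)

# new_bip is hypothetical: it must contain the predictor columns used in
# the recipe (launch_angle, launch_speed, plate_x, plate_z, game_date, ...)
predict(fitted_wf, new_data = new_bip, type = "prob")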