library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.3 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.3 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
train <- read_csv("../00_data/train 2.csv")
## Rows: 46244 Columns: 25
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (8): home_team, away_team, batter_team, batter_name, pitcher_name, bb_...
## dbl (16): bip_id, batter_id, pitcher_id, is_batter_lefty, is_pitcher_lefty,...
## date (1): game_date
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
train_raw <- train %>%
mutate(game_date = as.Date(game_date))
train_raw %>%
ggplot(aes(plate_x, plate_z, z = is_home_run)) +
stat_summary_hex(alpha = 0.8, bins = 10) +
scale_fill_viridis_c(labels = scales::percent) +
labs(fill = "% home runs")
train_raw %>%
ggplot(aes(launch_angle, launch_speed, z = is_home_run)) +
stat_summary_hex(alpha = 0.8, bins = 15) +
scale_fill_viridis_c(labels = scales::percent) +
labs(fill = "% home runs")
## Warning: Removed 20378 rows containing non-finite values
## (`stat_summary_hex()`).
train_raw %>%
mutate(is_home_run = if_else(as.logical(is_home_run), "yes", "no")) %>%
select(is_home_run, balls, strikes, inning) %>%
pivot_longer(balls:inning) %>%
mutate(name = fct_inorder(name)) %>%
ggplot(aes(value, after_stat(density), fill = is_home_run)) +
geom_histogram(alpha = 0.5, binwidth = 1, position = "identity") +
facet_wrap(~name, scales = "free") +
labs(fill = "Home run?")
library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.1.1 ──
## ✔ broom 1.0.5 ✔ rsample 1.2.0
## ✔ dials 1.2.0 ✔ tune 1.1.2
## ✔ infer 1.0.5 ✔ workflows 1.1.3
## ✔ modeldata 1.2.0 ✔ workflowsets 1.0.1
## ✔ parsnip 1.1.1 ✔ yardstick 1.2.0
## ✔ recipes 1.0.8
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter() masks stats::filter()
## ✖ recipes::fixed() masks stringr::fixed()
## ✖ dplyr::lag() masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step() masks stats::step()
## • Use suppressPackageStartupMessages() to eliminate package startup messages
set.seed(123)
bb_split <- train_raw %>%
mutate(
is_home_run = if_else(as.logical(is_home_run), "HR", "no"),
is_home_run = factor(is_home_run)
) %>%
initial_split(strata = is_home_run)
bb_train <- training(bb_split)
bb_test <- testing(bb_split)
set.seed(234)
bb_folds <- vfold_cv(bb_train, strata = is_home_run)
bb_folds
## # 10-fold cross-validation using stratification
## # A tibble: 10 × 2
## splits id
## <list> <chr>
## 1 <split [31214/3469]> Fold01
## 2 <split [31214/3469]> Fold02
## 3 <split [31214/3469]> Fold03
## 4 <split [31215/3468]> Fold04
## 5 <split [31215/3468]> Fold05
## 6 <split [31215/3468]> Fold06
## 7 <split [31215/3468]> Fold07
## 8 <split [31215/3468]> Fold08
## 9 <split [31215/3468]> Fold09
## 10 <split [31215/3468]> Fold10
bb_rec <-
recipe(is_home_run ~ launch_angle + launch_speed + plate_x + plate_z +
  inning + balls + strikes + game_date + bb_type + bearing +
  pitch_mph + is_pitcher_lefty + is_batter_lefty,
data = bb_train
) %>%
step_date(game_date, features = c("week"), keep_original_cols = FALSE) %>%
step_unknown(all_nominal_predictors()) %>%
step_dummy(all_nominal_predictors(), one_hot = TRUE) %>%
step_impute_median(all_numeric_predictors(), -launch_angle, -launch_speed) %>%
step_impute_linear(launch_angle, launch_speed,
impute_with = imp_vars(plate_x, plate_z, pitch_mph)
) %>%
step_nzv(all_predictors())
prep(bb_rec)
##
## ── Recipe ──────────────────────────────────────────────────────────────────────
##
## ── Inputs
## Number of variables by role
## outcome: 1
## predictor: 13
##
## ── Training information
## Training data contained 34683 data points and 15255 incomplete rows.
##
## ── Operations
## • Date features from: game_date | Trained
## • Unknown factor level assignment for: bb_type, bearing | Trained
## • Dummy variables from: bb_type, bearing | Trained
## • Median imputation for: plate_x, plate_z, inning, balls, ... | Trained
## • Linear regression imputation for: launch_angle, launch_speed | Trained
## • Sparse, unbalanced variable filter removed: bb_type_unknown, ... | Trained
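As a quick sanity check (not part of the original write-up), we can bake the prepped recipe on the training data and confirm that the imputation steps left no missing values in the two linearly imputed columns:
# Bake the prepped recipe to inspect the processed training set;
# both imputed columns should now contain zero NAs.
bb_rec %>%
  prep() %>%
  bake(new_data = NULL) %>%
  summarise(across(c(launch_angle, launch_speed), ~ sum(is.na(.x))))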
xgb_spec <-
boost_tree(
trees = tune(),
min_n = tune(),
mtry = tune(),
learn_rate = 0.01
) %>%
set_engine("xgboost") %>%
set_mode("classification")
xgb_wf <- workflow(bb_rec, xgb_spec)
library(finetune)
doParallel::registerDoParallel()
set.seed(345)
xgb_rs <- tune_race_anova(
xgb_wf,
resamples = bb_folds,
grid = 15,
metrics = metric_set(mn_log_loss),
control = control_race(verbose_elim = TRUE)
)
## i Creating pre-processing data to finalize unknown parameter: mtry
## ℹ Racing will minimize the mn_log_loss metric.
## ℹ Resamples are analyzed in a random order.
## ℹ Fold10: 9 eliminated; 6 candidates remain.
##
## ℹ Fold07: All but one parameter combination were eliminated.
plot_race(xgb_rs)
show_best(xgb_rs)
## # A tibble: 1 × 9
## mtry trees min_n .metric .estimator mean n std_err .config
## <int> <int> <int> <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 6 1536 11 mn_log_loss binary 0.0980 10 0.00169 Preprocessor1_M…
xgb_last <- xgb_wf %>%
finalize_workflow(select_best(xgb_rs, metric = "mn_log_loss")) %>%
last_fit(bb_split)
xgb_last
## # Resampling results
## # Manual resampling
## # A tibble: 1 × 6
## splits id .metrics .notes .predictions .workflow
## <list> <chr> <list> <list> <list> <list>
## 1 <split [34683/11561]> train/test sp… <tibble> <tibble> <tibble> <workflow>
collect_predictions(xgb_last) %>%
mn_log_loss(is_home_run, .pred_HR)
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 mn_log_loss binary 0.0973
library(vip)
##
## Attaching package: 'vip'
## The following object is masked from 'package:utils':
##
## vi
extract_workflow(xgb_last) %>%
extract_fit_parsnip() %>%
vip(geom = "point", num_features = 15)
#2. Data Exploration and Transformation: - Before splitting, is_home_run was converted to a factor with the levels "HR" and "no", since classification models in tidymodels expect a factor outcome.
- The data prep relies on four recipe steps (illustrated in the sketch below):
  - step_unknown() assigns missing values in a factor to a new "unknown" level.
  - step_dummy() converts nominal predictors into numeric binary indicator columns (one-hot encoded here).
  - step_impute_median() replaces missing values in the other numeric predictors with their training-set medians.
  - step_impute_linear() fits linear regression models to impute the missing launch_angle and launch_speed values from plate location and pitch speed.
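As a minimal sketch of the first two of those steps, here is a toy dataset (invented for illustration, not from the training data) run through step_unknown() and step_dummy():
# Toy example: a factor with one missing value.
toy <- tibble(y = c(1, 2, 3), bearing = factor(c("left", NA, "right")))
recipe(y ~ bearing, data = toy) %>%
  step_unknown(bearing) %>%                # NA becomes the level "unknown"
  step_dummy(bearing, one_hot = TRUE) %>%  # one indicator column per level
  prep() %>%
  bake(new_data = NULL)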
#4. Model Evaluation: - The test-set mn_log_loss of 0.0973 closely matches the best cross-validated value of 0.0980, so the model predicts home runs well and shows no sign of overfitting the training data.
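As an extra check beyond the original write-up, the same test-set predictions can be scored with other yardstick metrics, for example ROC AUC and the ROC curve:
# ROC AUC on the held-out test set ("HR" is the first factor level,
# so it is treated as the event by default).
collect_predictions(xgb_last) %>%
  roc_auc(is_home_run, .pred_HR)

# And the corresponding ROC curve.
collect_predictions(xgb_last) %>%
  roc_curve(is_home_run, .pred_HR) %>%
  autoplot()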