library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Read the training CSV from the working directory into a tibble.
train_raw <- readr::read_csv(file = "train.csv")
## Rows: 46244 Columns: 25
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr   (8): home_team, away_team, batter_team, batter_name, pitcher_name, bb_...
## dbl  (16): bip_id, batter_id, pitcher_id, is_batter_lefty, is_pitcher_lefty,...
## date  (1): game_date
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Hexbin map of plate location (plate_x vs. plate_z). Each hexagon is filled
# with the mean of `is_home_run` in that bin, shown as a percentage.
ggplot(
  data = train_raw,
  mapping = aes(x = plate_x, y = plate_z, z = is_home_run)
) +
  stat_summary_hex(bins = 10, alpha = 0.8) +
  scale_fill_viridis_c(labels = scales::percent) +
  labs(fill = "% home runs")

# Hexbin map of launch angle vs. launch speed, filled with the mean of
# `is_home_run` per bin. Rows with non-finite launch values are dropped by
# the stat (ggplot2 warns about them).
ggplot(
  data = train_raw,
  mapping = aes(x = launch_angle, y = launch_speed, z = is_home_run)
) +
  stat_summary_hex(bins = 15, alpha = 0.8) +
  scale_fill_viridis_c(labels = scales::percent) +
  labs(fill = "% home runs")
## Warning: Removed 20378 rows containing non-finite values
## (`stat_summary_hex()`).

# Overlaid density histograms of balls, strikes, and inning, split by whether
# the play was a home run; one free-scaled facet per variable.
train_raw %>%
  select(is_home_run, balls, strikes, inning) %>%
  # Relabel the 0/1 outcome for a readable legend.
  mutate(is_home_run = if_else(as.logical(is_home_run), "yes", "no")) %>%
  pivot_longer(cols = balls:inning, names_to = "name", values_to = "value") %>%
  # Keep facets in column order (balls, strikes, inning), not alphabetical.
  mutate(name = fct_inorder(name)) %>%
  ggplot(mapping = aes(x = value, y = after_stat(density), fill = is_home_run)) +
  geom_histogram(binwidth = 1, position = "identity", alpha = 0.5) +
  facet_wrap(vars(name), scales = "free") +
  labs(fill = "Home run?")

Build a model

library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.1.1 ──
## ✔ broom        1.0.5     ✔ rsample      1.2.0
## ✔ dials        1.2.0     ✔ tune         1.1.2
## ✔ infer        1.0.4     ✔ workflows    1.1.3
## ✔ modeldata    1.2.0     ✔ workflowsets 1.0.1
## ✔ parsnip      1.1.1     ✔ yardstick    1.2.0
## ✔ recipes      1.0.8
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter()   masks stats::filter()
## ✖ recipes::fixed()  masks stringr::fixed()
## ✖ dplyr::lag()      masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step()   masks stats::step()
## • Learn how to get started at https://www.tidymodels.org/start/
# Seed the RNG so the stratified train/test split is reproducible.
set.seed(123)
bb_split <- initial_split(
  data = mutate(
    train_raw,
    # Recode the 0/1 outcome as a two-level factor ("HR" / "no").
    is_home_run = factor(if_else(as.logical(is_home_run), "HR", "no"))
  ),
  strata = is_home_run
)
bb_test <- testing(bb_split)
bb_train <- training(bb_split)

# 10-fold cross-validation resamples of the training set, stratified by the
# outcome; seeded for reproducibility.
set.seed(234)
bb_folds <- bb_train %>%
  vfold_cv(v = 10, strata = is_home_run)
bb_folds
## #  10-fold cross-validation using stratification 
## # A tibble: 10 × 2
##    splits               id    
##    <list>               <chr> 
##  1 <split [31214/3469]> Fold01
##  2 <split [31214/3469]> Fold02
##  3 <split [31214/3469]> Fold03
##  4 <split [31215/3468]> Fold04
##  5 <split [31215/3468]> Fold05
##  6 <split [31215/3468]> Fold06
##  7 <split [31215/3468]> Fold07
##  8 <split [31215/3468]> Fold08
##  9 <split [31215/3468]> Fold09
## 10 <split [31215/3468]> Fold10
# Preprocessing recipe: predict `is_home_run` from 13 predictors in the
# training set. Steps run in the order written, so later steps see the
# columns produced by earlier ones.
bb_rec <-
  recipe(is_home_run ~ launch_angle + launch_speed + plate_x + plate_z +
    bb_type + bearing + pitch_mph +
    is_pitcher_lefty + is_batter_lefty +
    inning + balls + strikes + game_date,
  data = bb_train) %>%
  # Replace game_date with a week feature; the original date column is dropped.
  step_date(game_date, features = c("week"), keep_original_cols = FALSE) %>%
  # Assign missing values in nominal predictors (bb_type, bearing) to an
  # "unknown" level.
  step_unknown(all_nominal_predictors()) %>%
  # One-hot encode the nominal predictors (one indicator column per level).
  step_dummy(all_nominal_predictors(), one_hot = TRUE) %>%
  # Median-impute every numeric predictor except the two launch variables,
  # which get model-based imputation in the next step.
  step_impute_median(all_numeric_predictors(), -launch_angle, -launch_speed) %>%
  # Impute launch_angle and launch_speed by linear regression on plate_x,
  # plate_z, and pitch_mph (already median-imputed above).
  step_impute_linear(launch_angle, launch_speed,
    impute_with = imp_vars(plate_x, plate_z, pitch_mph)) %>%
  # Drop near-zero-variance predictors (e.g. sparse indicator columns).
  step_nzv(all_predictors())

## we can `prep()` just to check that it works
prep(bb_rec)
## 
## ── Recipe ──────────────────────────────────────────────────────────────────────
## 
## ── Inputs
## Number of variables by role
## outcome:    1
## predictor: 13
## 
## ── Training information
## Training data contained 34683 data points and 15255 incomplete rows.
## 
## ── Operations
## • Date features from: game_date | Trained
## • Unknown factor level assignment for: bb_type, bearing | Trained
## • Dummy variables from: bb_type, bearing | Trained
## • Median imputation for: plate_x, plate_z, pitch_mph, ... | Trained
## • Linear regression imputation for: launch_angle, launch_speed | Trained
## • Sparse, unbalanced variable filter removed: bb_type_unknown, ... | Trained
# Boosted-tree classifier on the xgboost engine with a fixed learning rate.
# The number of trees, minimum node size, and number of predictors sampled
# (`mtry`) are left as tuning parameters.
xgb_spec <-
  boost_tree(
    mode = "classification",
    mtry = tune(),
    trees = tune(),
    min_n = tune(),
    learn_rate = 0.01
  ) %>%
  set_engine("xgboost")

# Bundle the recipe and the model specification into a single workflow.
xgb_wf <- workflow() %>%
  add_recipe(bb_rec) %>%
  add_model(xgb_spec)

Use racing to tune xgboost

library(finetune)
# Register a parallel backend for the tuning run below.
doParallel::registerDoParallel()

# Race 15 candidate parameter sets across the CV folds using the ANOVA racing
# method, scored by mean log loss; eliminations are reported as they happen.
set.seed(345)
xgb_rs <- xgb_wf %>%
  tune_race_anova(
    resamples = bb_folds,
    grid = 15,
    control = control_race(verbose_elim = TRUE),
    metrics = metric_set(mn_log_loss)
  )
## i Creating pre-processing data to finalize unknown parameter: mtry
## ℹ Racing will minimize the mn_log_loss metric.
## ℹ Resamples are analyzed in a random order.
## ℹ Fold10: 9 eliminated; 6 candidates remain.
## 
## ℹ Fold07: All but one parameter combination were eliminated.
# Plot how candidates performed as resamples were added during the race, then
# list the best surviving configuration(s) by mean log loss.
xgb_rs %>% plot_race()

xgb_rs %>% show_best(metric = "mn_log_loss")
## # A tibble: 1 × 9
##    mtry trees min_n .metric     .estimator   mean     n std_err .config         
##   <int> <int> <int> <chr>       <chr>       <dbl> <int>   <dbl> <chr>           
## 1     6  1536    11 mn_log_loss binary     0.0981    10 0.00171 Preprocessor1_M…
# Plug the winning hyperparameters into the workflow, fit it once on the
# training portion of `bb_split`, and evaluate on the held-out test portion.
#
# `metric` is passed by name on purpose: newer tune releases place `metric`
# after `...` in `select_best()` and require the dots to be empty, so a
# positional "mn_log_loss" fails there. The named form works on old and new
# versions alike.
xgb_last <- xgb_wf %>%
  finalize_workflow(select_best(xgb_rs, metric = "mn_log_loss")) %>%
  last_fit(bb_split)

xgb_last
## # Resampling results
## # Manual resampling 
## # A tibble: 1 × 6
##   splits                id             .metrics .notes   .predictions .workflow 
##   <list>                <chr>          <list>   <list>   <list>       <list>    
## 1 <split [34683/11561]> train/test sp… <tibble> <tibble> <tibble>     <workflow>
# Log loss of the final model on the test set, computed from the predicted
# probability of the "HR" class.
mn_log_loss(
  collect_predictions(xgb_last),
  truth = is_home_run,
  .pred_HR
)
## # A tibble: 1 × 3
##   .metric     .estimator .estimate
##   <chr>       <chr>          <dbl>
## 1 mn_log_loss binary        0.0975
library(vip)
## 
## Attaching package: 'vip'
## The following object is masked from 'package:utils':
## 
##     vi
# Variable-importance plot (top 15 features, drawn as points) for the fitted
# model extracted from the final workflow.
vip(
  extract_fit_parsnip(extract_workflow(xgb_last)),
  num_features = 15,
  geom = "point"
)

  1. Question and Data:
    • What is the research question? Clearly state the research question you aim to address using the new dataset. Can we predict whether a particular baseball play will result in a home run based on features such as launch angle, launch speed, plate position, pitch type, and other game-related variables?

    • Describe the data briefly: Provide an overview of the new dataset, highlighting its key characteristics and dimensions. The data set contains 46,244 observations of 25 variables. It is data containing various variables about baseball. Some variables are game_date, home_team, away_team, batter_name, launch speed, plate position, pitch type, launch angle and more.

    • What are the characteristics of the key variables used in the analysis? Describe the primary variables of interest in the dataset and their characteristics. The primary variables of interest in the data set that are used in the analysis are launch_angle, launch_speed, plate_x, plate_z, bb_type, bearing, pitch_mph, is_pitcher_lefty, is_batter_lefty, inning, balls, strikes, game_date, and is_home_run. We use all of the other variables to predict is_home_run. Launch_angle, launch_speed, plate_x, plate_z, pitch_mph, is_pitcher_lefty, is_batter_lefty, inning, balls, and strikes are all numerical data. Bb_type and bearing are character data, and game_date is a date.

  2. Data Exploration and Transformation:
    • Describe the differences between the original data and the data transformed for modeling. Why? Explain any preprocessing or transformations performed on the new dataset compared to the original data. Discuss why these changes were necessary or beneficial. The original data set had 46,244 observations of 25 variables. We used a few data preprocessing and transformation steps in the code-along video. We used mutate(is_home_run = if_else(as.logical(is_home_run), "yes", "no")) to change the is_home_run variable from 0 or 1 to no or yes. This makes the data easier to read and use in the model. We used select(is_home_run, balls, strikes, inning) to narrow down the dataset to only include the four variables is_home_run, balls, strikes, and inning. This narrows down the data set so that we get rid of any unnecessary data to better train the model. We used pivot_longer(balls:inning) to transform the data from wide format to long format. We used this to transform the three columns balls, strikes, and inning into two columns, name and value. We also used mutate(name = fct_inorder(name)) to modify the name column. This function changed the variable from character data to factor data and arranged its levels in the order they appear in the data set.
  3. Data Preparation and Modeling:
    • What are the names of data preparation steps mentioned in the video? List and describe any data preparation steps or techniques mentioned in the CA video that you applied to the new dataset. The data preparation steps we used were mutate, select, pivot_longer, and mutate with fct_inorder. We also split the data into train and test. This is necessary to be sure we are using a sample of the data to train and a separate sample to test the data.

    • What is the name of the machine learning model(s) used in the analysis? Specify the machine learning model(s) you employed for your analysis and briefly explain their relevance to the research question. The machine learning model we used in the analysis was XGBoost. We used boost_tree and set_engine. XGBoost was used to predict home runs based on various variables such as plate_x, plate_z, inning, balls, strikes, pitch_mph, launch_speed, and launch_angle. XGBoost is great at handling classification tasks.

  4. Model Evaluation:
    • What metrics are used in the model evaluation? Detail the evaluation metrics you used to assess the performance of your machine learning model(s) on the new dataset. Discuss the significance of these metrics in the context of your research question. The metric we used was log loss, computed with collect_predictions(xgb_last) %>% mn_log_loss(is_home_run, .pred_HR). Log Loss is a metric that tests the accuracy of a classifier. It is suitable for binary and multi-class classification problems. Because the research question is aiming to predict a binary outcome (whether a baseball hit is a home run or not), Log Loss is an appropriate metric to use. It considers the correctness of a prediction and also factors in the confidence of the prediction. Log loss helps assess how well a model can predict the likelihood of an outcome.
  5. Conclusion:
    • What are the major findings? Summarize the key findings and insights obtained from your analysis of the new dataset. After tuning the model, the best parameters were selected based on the Log Loss metric. The analysis provides insights into factors affecting home runs and uses XGBoost to predict the likelihood of a play being a home run. The final model achieved a log loss (mn_log_loss) of 0.0975 on the test set, close to the cross-validated mean of 0.0981 obtained during tuning.