library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.3 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.3 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
train_raw <- read_csv("train.csv")
## Rows: 46244 Columns: 25
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (8): home_team, away_team, batter_team, batter_name, pitcher_name, bb_...
## dbl (16): bip_id, batter_id, pitcher_id, is_batter_lefty, is_pitcher_lefty,...
## date (1): game_date
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
train_raw %>%
ggplot(aes(plate_x, plate_z, z = is_home_run)) +
stat_summary_hex(alpha = 0.8, bins = 10) +
scale_fill_viridis_c(labels = scales::percent) +
labs(fill = "% home runs")
train_raw %>%
ggplot(aes(launch_angle, launch_speed, z = is_home_run)) +
stat_summary_hex(alpha = 0.8, bins = 15) +
scale_fill_viridis_c(labels = scales::percent) +
labs(fill = "% home runs")
## Warning: Removed 20378 rows containing non-finite values
## (`stat_summary_hex()`).
train_raw %>%
mutate(is_home_run = if_else(as.logical(is_home_run), "yes", "no")) %>%
select(is_home_run, balls, strikes, inning) %>%
pivot_longer(balls:inning) %>%
mutate(name = fct_inorder(name)) %>%
ggplot(aes(value, after_stat(density), fill = is_home_run)) +
geom_histogram(alpha = 0.5, binwidth = 1, position = "identity") +
facet_wrap(~name, scales = "free") +
labs(fill = "Home run?")
library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.1.1 ──
## ✔ broom 1.0.5 ✔ rsample 1.2.0
## ✔ dials 1.2.0 ✔ tune 1.1.2
## ✔ infer 1.0.4 ✔ workflows 1.1.3
## ✔ modeldata 1.2.0 ✔ workflowsets 1.0.1
## ✔ parsnip 1.1.1 ✔ yardstick 1.2.0
## ✔ recipes 1.0.8
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter() masks stats::filter()
## ✖ recipes::fixed() masks stringr::fixed()
## ✖ dplyr::lag() masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step() masks stats::step()
## • Learn how to get started at https://www.tidymodels.org/start/
set.seed(123)
bb_split <- train_raw %>%
mutate(
is_home_run = if_else(as.logical(is_home_run), "HR", "no"),
is_home_run = factor(is_home_run)
) %>%
initial_split(strata = is_home_run)
bb_train <- training(bb_split)
bb_test <- testing(bb_split)
set.seed(234)
bb_folds <- vfold_cv(bb_train, strata = is_home_run)
bb_folds
## # 10-fold cross-validation using stratification
## # A tibble: 10 × 2
## splits id
## <list> <chr>
## 1 <split [31214/3469]> Fold01
## 2 <split [31214/3469]> Fold02
## 3 <split [31214/3469]> Fold03
## 4 <split [31215/3468]> Fold04
## 5 <split [31215/3468]> Fold05
## 6 <split [31215/3468]> Fold06
## 7 <split [31215/3468]> Fold07
## 8 <split [31215/3468]> Fold08
## 9 <split [31215/3468]> Fold09
## 10 <split [31215/3468]> Fold10
bb_rec <-
recipe(is_home_run ~ launch_angle + launch_speed + plate_x + plate_z +
bb_type + bearing + pitch_mph +
is_pitcher_lefty + is_batter_lefty +
inning + balls + strikes + game_date,
data = bb_train) %>%
step_date(game_date, features = c("week"), keep_original_cols = FALSE) %>%
step_unknown(all_nominal_predictors()) %>%
step_dummy(all_nominal_predictors(), one_hot = TRUE) %>%
step_impute_median(all_numeric_predictors(), -launch_angle, -launch_speed) %>%
step_impute_linear(launch_angle, launch_speed,
impute_with = imp_vars(plate_x, plate_z, pitch_mph)) %>%
step_nzv(all_predictors())
## we can `prep()` just to check that it works
prep(bb_rec)
##
## ── Recipe ──────────────────────────────────────────────────────────────────────
##
## ── Inputs
## Number of variables by role
## outcome: 1
## predictor: 13
##
## ── Training information
## Training data contained 34683 data points and 15255 incomplete rows.
##
## ── Operations
## • Date features from: game_date | Trained
## • Unknown factor level assignment for: bb_type, bearing | Trained
## • Dummy variables from: bb_type, bearing | Trained
## • Median imputation for: plate_x, plate_z, pitch_mph, ... | Trained
## • Linear regression imputation for: launch_angle, launch_speed | Trained
## • Sparse, unbalanced variable filter removed: bb_type_unknown, ... | Trained
xgb_spec <-
boost_tree(
trees = tune(),
min_n = tune(),
mtry = tune(),
learn_rate = 0.01
) %>%
set_engine("xgboost") %>%
set_mode("classification")
xgb_wf <- workflow(bb_rec, xgb_spec)
library(finetune)
doParallel::registerDoParallel()
set.seed(345)
xgb_rs <- tune_race_anova(
xgb_wf,
resamples = bb_folds,
grid = 15,
metrics = metric_set(mn_log_loss),
control = control_race(verbose_elim = TRUE)
)
## i Creating pre-processing data to finalize unknown parameter: mtry
## ℹ Racing will minimize the mn_log_loss metric.
## ℹ Resamples are analyzed in a random order.
## ℹ Fold10: 9 eliminated; 6 candidates remain.
##
## ℹ Fold07: All but one parameter combination were eliminated.
plot_race(xgb_rs)
show_best(xgb_rs)
## # A tibble: 1 × 9
## mtry trees min_n .metric .estimator mean n std_err .config
## <int> <int> <int> <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 6 1536 11 mn_log_loss binary 0.0981 10 0.00171 Preprocessor1_M…
xgb_last <- xgb_wf %>%
finalize_workflow(select_best(xgb_rs, "mn_log_loss")) %>%
last_fit(bb_split)
xgb_last
## # Resampling results
## # Manual resampling
## # A tibble: 1 × 6
## splits id .metrics .notes .predictions .workflow
## <list> <chr> <list> <list> <list> <list>
## 1 <split [34683/11561]> train/test sp… <tibble> <tibble> <tibble> <workflow>
collect_predictions(xgb_last) %>%
mn_log_loss(is_home_run, .pred_HR)
## # A tibble: 1 × 3
## .metric .estimator .estimate
## <chr> <chr> <dbl>
## 1 mn_log_loss binary 0.0975
library(vip)
##
## Attaching package: 'vip'
## The following object is masked from 'package:utils':
##
## vi
extract_workflow(xgb_last) %>%
extract_fit_parsnip() %>%
vip(geom = "point", num_features = 15)
What is the research question? Clearly state the research question you aim to address using the new dataset. Can we predict whether a particulare baseball play will result in a home run based on feature such as launch angle, launch speed, plate position, pitch type and other game related variables.
Describe the data briefly: Provide an overview of the new dataset, highlighting its key characteristics and dimensions. The data set contains 46,244 observations of 25 variables. It is data containing various variables about baseball. Some variables are game_date, home_team, away_team, batter_name, launch speed, plate position, pitch type, launch angle and more.
What are the characteristics of the key variables used in the analysis? Describe the primary variables of interest in the dataset and their characteristics. The primary variables of interest in the data set that are used in the analysis are launch_angle, launch_speed, plate_x, plate_z, bb_type, bearing, pitch_mph, is_pitcher_lefty, is_batter_lefty, inning, balls, strikes, game_date and is_home_run. We are using all of these variables to predict is_home_run. Launch_angle, launch_speed. plate_x, plate_z, plate_mph, is_pitcher_lefty, is_battery_lefty, innings, balls, and strikes are all numerical data. Bb_type and bearing are character data, and game_date is date.
What are the names of data preparation steps mentioned in the video? List and describe any data preparation steps or techniques mentioned in the CA video that you applied to the new dataset. The data preparation steps we used were mutate, select, pivot_longer, and mutate with fct_inorder. We also split the data into train and test. This is necessary to be sure we are using a sample of the data to train and a separate sample to test the data.
What is the name of the machine learning model(s) used in the analysis? Specify the machine learning model(s) you employed for your analysis and briefly explain their relevance to the research question. The machine learning models we used in the analysis was XGBoost. We used boost_tree and set_engine. XGBoost was used to predict the home runs based on various variables such as plate_x, plate_z, inning, balls, strikes, pitch_mph, launch_speed. and launch_angle. XGBoost is great at handling classification tasks.