Question: Swing probability is the backbone of several other important metrics that compare individual players to league average. Create one such metric and aggregate it by player for Season 2. In 250 words, explain the metric methodology. Send the top 10 and bottom 10 players in the leaderboard for this metric.

#library(tidyverse)
#library(xgboost)
#library(DT)

Statistic 1: Combo Rating

This statistic will calculate the effectiveness of pitch combos. A combo will be judged on its ability to induce chases and called strikes on the second pitch. This ability will be measured against the independent swing probability of the second pitch.

For example, Pitcher A throws a SL / FB combo, where the FB alone would have a swing probability of 0.8; however, the SL / FB sequence “freezes” the batter, inducing a take in the zone. This would be recorded as a credit to the effectiveness of the combo.

This calculation requires at-bat id, which the provided data does not have. I will take the liberty of using full Statcast data for 2023 for this statistic.

I will calculate a second statistic below that can be calculated with the provided data to fully comply with the prompt.

Apply model from Q1 to 2023 Statcast data

# Restore the saved workspace image; the next statement expects it to supply
# an object named `Statcast2023`.
load('Statcast2023.RData')

# Trim to the columns used below. game_pk / at_bat_number / pitch_number are
# the at-bat identifiers that the provided data set lacks.
full_data <- select(
  Statcast2023,
  game_pk, at_bat_number, pitch_number, pitch_type, release_speed, pitcher,
  description, stand, p_throws, balls, strikes, pfx_x, pfx_z,
  plate_x, plate_z, sz_top, sz_bot, true_zone
)

# Data cleaning processes described in more detail in Q1 script

# `description` values that are treated as a swing.
swing_vector <- c(
  "foul", "hit_into_play", "swinging_strike", "foul_tip",
  "swinging_strike_blocked", "foul_pitchout"
)

# `description` values for bunt attempts; these pitches are removed entirely.
bunt_vector <- c("foul_bunt", "missed_bunt", "bunt_foul_tip")

# Drop bunt attempts, then flag each remaining pitch as swing / no swing.
full_data <- full_data %>%
  filter(!is.element(description, bunt_vector)) %>%
  mutate(swing = is.element(description, swing_vector))


# Classify combos

# Tag every pitch with its "previous / current" pitch-type combo.
#
# Rows are ordered by game, at-bat and pitch number so that lag() inside each
# (game_pk, at_bat_number) group returns the pitch thrown immediately before
# within the same at-bat. The first pitch of an at-bat has no predecessor, so
# both prev_pitch_type and combo are NA there.
#
# NA_character_ keeps the fallback typed (an untyped NA is rejected by older
# dplyr releases), and ungroup() stops the per-at-bat grouping from leaking
# into later steps.
full_data <- full_data %>%
  arrange(game_pk, at_bat_number, pitch_number) %>%
  group_by(game_pk, at_bat_number) %>%
  mutate(
    prev_pitch_type = lag(pitch_type), # previous pitch in the same at-bat
    combo = if_else(
      is.na(prev_pitch_type),
      NA_character_,
      paste0(prev_pitch_type, " / ", pitch_type)
    )
  ) %>%
  ungroup()

Calculate derived variables for modeling

# Derived variables 1-3, added in a single pass:
#   1. plate_z_rel - vertical location relative to the midpoint of the
#                    batter's strike zone, (sz_top + sz_bot) / 2
#   2. batter_r    - 1 for a right-handed hitter, 0 otherwise
#   3. pitcher_r   - 1 for a right-handed pitcher, 0 otherwise
# The raw plate_z column is dropped once plate_z_rel exists.
stat_data <- full_data %>%
  mutate(
    plate_z_rel = plate_z - (sz_top + sz_bot) / 2,
    batter_r = ifelse(stand == "R", 1, 0),
    pitcher_r = ifelse(p_throws == "R", 1, 0)
  ) %>%
  select(-plate_z)

# 4. Share of a pitcher's pitches, in a given count and against a given batter
#    side, that are of this pitch type.

stat_data <- stat_data %>%
  group_by(pitcher, batter_r, balls, strikes) %>%
  mutate(n_in_count = n()) %>% # denominator: all pitches in this situation
  group_by(pitch_type, .add = TRUE) %>%
  mutate(pitcher_pitch_usage = n() / n_in_count) %>% # numerator: this type
  ungroup() %>%
  select(-n_in_count) # drop the helper column

# 5. Velocity gap between each pitch and the pitcher's most-used pitch

# Lookup table: one row per pitcher / batter side holding the mean release
# speed of the pitch type thrown most often in that split. Usage is the
# per-type count divided by a total that is constant within the split, so the
# most-used type is simply the one with the largest count.
most_used_velo <- stat_data %>%
  group_by(pitcher, batter_r, pitch_type) %>%
  mutate(pitch_type_count = n()) %>%
  group_by(pitcher, batter_r) %>%
  filter(pitch_type == pitch_type[which.max(pitch_type_count)]) %>%
  summarize(avg_velo_most_used = mean(release_speed))

# Attach the lookup value, take the difference, and drop the helper column.
stat_data <- stat_data %>%
  left_join(most_used_velo, by = c("pitcher", "batter_r")) %>%
  mutate(diff_velo_most_used = release_speed - avg_velo_most_used) %>%
  select(-avg_velo_most_used)
## `summarise()` has grouped output by 'pitcher'. You can override using the
## `.groups` argument.

Apply model to find swing probability

# Score every pitch in stat_data with the swing-probability model from Q1.
load('model_3.rda') # final model from Q1; expected to provide `model_3`

# Model inputs, selected by name instead of by position so that adding or
# reordering columns upstream cannot silently feed the wrong features to the
# model. These are the columns that sat at positions 5, 10:14 and 21:25.
feature_cols <- c("release_speed", "balls", "strikes", "pfx_x", "pfx_z",
                  "plate_x", "plate_z_rel", "batter_r", "pitcher_r",
                  "pitcher_pitch_usage", "diff_velo_most_used")

# prepare for xgboost
ddata <- xgb.DMatrix(as.matrix(stat_data[, feature_cols]),
                     label = stat_data$swing)

# predictions
stat_model_pred <- predict(model_3, ddata)

# add preds to the data
stat_data$swing_prob <- stat_model_pred

Compare swing probability to actual swing result.

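The difference is signed so that a positive value is always a credit to the effectiveness of the combo: for a pitch in the zone it is the swing probability minus the actual swing (a take is rewarded), and for a pitch outside the zone it is the actual swing minus the swing probability (a chase is rewarded).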

# Signed gap between the observed swing and the modelled swing probability.
# In the zone a take scores positive; out of the zone a chase scores positive.
# Any other true_zone value (including NA) leaves swing_diff as NA.
stat_data <- stat_data %>%
  mutate(
    swing_diff = if_else(
      true_zone == 'strike',
      swing_prob - swing,
      if_else(true_zone == 'ball', swing - swing_prob, NA_real_)
    )
  )
# Average swing_diff per pitcher / combo, rounded to 3 decimals, keeping only
# combos seen more than 50 times and sorting from highest to lowest rating.
# First pitches of an at-bat (combo is NA) are excluded up front.
# `count` is the number of rows in the group, including rows whose swing_diff
# is NA.
combo_ratings <- stat_data %>%
  filter(!is.na(combo)) %>%
  group_by(pitcher, combo) %>%
  summarize(combo_rating = round(mean(swing_diff, na.rm = TRUE), 3),
            count = n()) %>%
  filter(count > 50) %>%
  arrange(desc(combo_rating))
## `summarise()` has grouped output by 'pitcher'. You can override using the
## `.groups` argument.

Add player names

# match IDs to names (from previous project work)
pitcher_info <- read.csv('pitcher_info.csv')

# Swap the pitcher ID for the player name. combo_ratings arrives grouped by
# pitcher, and select() always keeps grouping columns, so the grouping is
# dropped first; otherwise `pitcher` is silently re-added to the result.
# NOTE(review): assumes pitcher_info has one row per `id`; duplicate ids
# would duplicate combo rows in the join.
combo_ratings <- combo_ratings %>%
  ungroup() %>%
  left_join(pitcher_info, by = c('pitcher' = 'id')) %>%
  select(player_name, combo, combo_rating)

# Ten highest-rated combos (combo_ratings is already sorted descending).
DT::datatable(head(combo_ratings, 10))

This table shows the ten combos that are most deceptive to batters.

Two important notes:

  1. This model only judges deceptiveness in swing decisions, not in overall effectiveness. To judge overall effectiveness, a statistic such as run value could replace swing probability diff.

  2. The swing diff measures the deceptiveness of the pitch in the combo compared to the standard deceptiveness of that particular pitch. In other words, it measures a pitcher in the context of his own arsenal.

For example, the highest-rated combo may not be the most deceptive in the league, but it is the combo which most elevates that particular pitcher’s deceptiveness.

Therefore, rather than a league ranking, this statistic might be most meaningful as a personal ranking for each pitcher.

# Pitcher whose personal combo ranking is displayed.
pitcher_name <- 'Chris Sale'

# Keep only that pitcher's rows and render them as an interactive table.
player_table <- filter(combo_ratings, player_name == pitcher_name)
DT::datatable(player_table)

This table shows that Chris Sale’s Changeup/Slider combo is his most deceptive, and his Fastball/Changeup combo is his least deceptive.

Statistic 2: Plate Discipline Against Average

This statistic will measure a batter’s plate discipline, comparing his swing decisions to swing probabilities.

# Season 2 rows of all_data (from the Q1 script), with derived variables 1-3
# added in one pass:
#   1. plate_z_rel - vertical location relative to the midpoint of the
#                    batter's strike zone, (sz_top + sz_bot) / 2
#   2. batter_r    - 1 for a right-handed hitter, 0 otherwise
#   3. pitcher_r   - 1 for a right-handed pitcher, 0 otherwise
year2_data <- all_data %>%
  filter(season == 2) %>%
  mutate(
    plate_z_rel = plate_z - (sz_top + sz_bot) / 2,
    batter_r = ifelse(stand == "R", 1, 0),
    pitcher_r = ifelse(p_throws == "R", 1, 0)
  )

# 4. Share of a pitcher's pitches, in a given count and against a given batter
#    side, that are of this pitch type.

year2_data <- year2_data %>%
  group_by(pitcher, batter_r, balls, strikes) %>%
  mutate(n_in_count = n()) %>% # denominator: all pitches in this situation
  group_by(pitch_type, .add = TRUE) %>%
  mutate(pitcher_pitch_usage = n() / n_in_count) %>% # numerator: this type
  ungroup() %>%
  select(-n_in_count) # drop the helper column
 
# 5. Velocity gap between each pitch and the pitcher's most-used pitch

# Lookup table: one row per pitcher / batter side holding the mean release
# speed of the pitch type thrown most often in that split. Usage is the
# per-type count divided by a total that is constant within the split, so the
# most-used type is simply the one with the largest count.
year2_most_used_velo <- year2_data %>%
  group_by(pitcher, batter_r, pitch_type) %>%
  mutate(pitch_type_count = n()) %>%
  group_by(pitcher, batter_r) %>%
  filter(pitch_type == pitch_type[which.max(pitch_type_count)]) %>%
  summarize(avg_velo_most_used = mean(release_speed))

# Attach the lookup value, take the difference, and drop the helper column.
year2_data <- year2_data %>%
  left_join(year2_most_used_velo, by = c("pitcher", "batter_r")) %>%
  mutate(diff_velo_most_used = release_speed - avg_velo_most_used) %>%
  select(-avg_velo_most_used)
## `summarise()` has grouped output by 'pitcher'. You can override using the
## `.groups` argument.
# Feature matrix for the Q1 model, selected by column position.
# NOTE(review): presumably these positions are the same eleven features used
# for stat_data (release_speed, count, movement, location, handedness, usage,
# velocity diff) - confirm against the column layout of all_data.
year2_features <- as.matrix(year2_data[, c(3, 10:14, 19:23)])
dyear2 <- xgb.DMatrix(data = year2_features, label = year2_data$swing)

# Predicted swing probability for every Season 2 pitch.
year2_data$swing_prob <- predict(model_3, dyear2)

Use the same diff calculations as described above.

# Label each pitch 'strike' when it lies strictly inside the box bounded by
# sz_bot / sz_top vertically and -0.83 / 0.83 horizontally, 'ball' otherwise.
year2_data <- year2_data %>%
  mutate(
    true_zone = ifelse(
      plate_x > (-0.83) & plate_x < (0.83) &
        plate_z > sz_bot & plate_z < sz_top,
      'strike', 'ball'
    )
  )
 

# Signed gap between the batter's swing decision and the modelled swing
# probability, oriented so that a positive value credits the BATTER:
#   in the zone     -> swing - swing_prob  (swinging at a strike is rewarded)
#   out of the zone -> swing_prob - swing  (laying off a ball is rewarded)
# This is the pitcher-side combo calculation with the sign reversed. Reusing
# the pitcher-side sign here would reward zone takes and chases, ranking the
# least disciplined hitters highest.
# Any other true_zone value (including NA) leaves swing_diff as NA.
year2_data <- year2_data %>%
  mutate(swing_diff = case_when(
                          true_zone == 'strike' ~ swing - swing_prob,
                          true_zone == 'ball' ~ swing_prob - swing))

# Per-batter leaderboard: mean swing_diff (NAs ignored) rounded to 3 decimals,
# with the number of pitches in `count`, sorted from highest to lowest rating.
# NOTE(review): no minimum-pitch filter is applied, so batters with very few
# pitches can land at either end of the table - confirm this is intended.
plate_disc <- year2_data %>%
  group_by(batter) %>%
  summarize(pd_rating = mean(swing_diff, na.rm = TRUE),
            count = n()) %>%
  mutate(pd_rating = round(pd_rating, 3)) %>%
  arrange(desc(pd_rating))

# First ten and last ten rows of the sorted leaderboard.
DT::datatable(head(plate_disc, 10))
DT::datatable(tail(plate_disc, 10))