# library(tidyverse)
# library(baseballr)

# Load the Pitcher X CSV (path is relative to the working directory).
full_data <- read.csv(file = 'Pitcher X Data 2024.csv')

Explore Player X data

This script explores the provided data and prepares it for Tableau.

# Per-pitch-type usage share and mean swing / whiff probabilities.
#
# Rows with a missing PitchType are dropped first. Usage is each type's share
# of the *retained* rows, so the shares sum to 1 even when some rows have no
# PitchType (dividing by nrow(full_data) would count those rows in the
# denominator only).
by_pitch <- full_data %>%
  filter(!is.na(PitchType)) %>%
  group_by(PitchType) %>%
  summarize(
    n_pitches = n(),
    swing_prob = round(mean(swing_prob, na.rm = TRUE), 3),
    whiff_prob_gs = round(mean(whiff_prob_gs, na.rm = TRUE), 3),
    whiff_prob = round(mean(whiff_prob, na.rm = TRUE), 3)
  ) %>%
  # summarize() has dropped the single grouping level, so sum() spans all types
  mutate(usage = round(n_pitches / sum(n_pitches), 2)) %>%
  # keep the original column order and drop the helper count
  select(PitchType, usage, swing_prob, whiff_prob_gs, whiff_prob)

by_pitch
## # A tibble: 5 × 5
##   PitchType usage swing_prob whiff_prob_gs whiff_prob
##   <chr>     <dbl>      <dbl>         <dbl>      <dbl>
## 1 CHANGEUP   0.17      0.545         0.398      0.149
## 2 CURVEBALL  0.08      0.361         0.423      0.122
## 3 CUTTER     0.16      0.471         0.297      0.081
## 4 FASTBALL   0.4       0.418         0.242      0.056
## 5 SLIDER     0.19      0.455         0.555      0.176
# Release-point summary per pitch type: mean and standard deviation of
# ReleaseHeight and ReleaseSide. Rows without a PitchType are excluded.
release_data <- full_data %>%
  filter(!is.na(PitchType)) %>%
  group_by(PitchType) %>%
  summarise(
    avg_rel_height = mean(ReleaseHeight),
    sd_rel_height = sd(ReleaseHeight),
    avg_rel_side = mean(ReleaseSide),
    sd_rel_side = sd(ReleaseSide)
  )

# release_data

#release_plot <- ggplot(data = release_data) +
#  geom_point(aes(x = avg_rel_side, y = avg_rel_height, size = sd_rel_side), alpha = 0.5) +
#  xlim(0, 3) +
#  ylim(0, 8)

#release_plot

# *exploratory, not utilized

Create Summary Player X Data

# One row per Player X pitch type: pitch count, mean velocity and break, and
# mean swing / whiff probabilities, prefixed with identifying columns so the
# table lines up column-for-column with the league table built further down.
#
# All means use na.rm = TRUE, matching ch_dat and league_metrics, so a single
# missing measurement does not turn a whole pitch type's average into NA.
pitcher_x <- full_data %>%
  filter(!is.na(PitchType)) %>%
  group_by(PitchType) %>%
  summarize(
    n = n(),
    avg_velo = mean(ReleaseSpeed, na.rm = TRUE),
    h_break = mean(HorzBreak, na.rm = TRUE),
    v_break = mean(InducedVertBreak, na.rm = TRUE),
    swing_prob = mean(swing_prob, na.rm = TRUE),
    whiff_prob = mean(whiff_prob, na.rm = TRUE),
    whiff_prob_gs = mean(whiff_prob_gs, na.rm = TRUE)
  ) %>%
  # placeholder identity columns; player_id is a character string here
  mutate(
    player_id = 'xxxxxx',
    player_full_name = 'Player X',
    team_name = 'Seattle Mariners'
  ) %>%
  rename(pitch_name = PitchType) %>%
  # identity columns first, then every remaining column in its existing order
  select(player_id, player_full_name, team_name, everything())

pitcher_x
## # A tibble: 5 × 11
##   player_id player_full_name team_name pitch_name     n avg_velo h_break v_break
##   <chr>     <chr>            <chr>     <chr>      <int>    <dbl>   <dbl>   <dbl>
## 1 xxxxxx    Player X         Seattle … CHANGEUP     451     85.5   13.9     4.13
## 2 xxxxxx    Player X         Seattle … CURVEBALL    222     79.4   -6.40   -7.54
## 3 xxxxxx    Player X         Seattle … CUTTER       420     89.9   -1.29   10.6 
## 4 xxxxxx    Player X         Seattle … FASTBALL    1065     92.0   11.2    11.0 
## 5 xxxxxx    Player X         Seattle … SLIDER       504     81.7  -10.5     1.14
## # ℹ 3 more variables: swing_prob <dbl>, whiff_prob <dbl>, whiff_prob_gs <dbl>
# Single-row changeup profile for Player X: identity columns followed by mean
# spin rate, velocity, and horizontal / induced vertical break (NAs ignored).
ch_dat <- full_data %>%
  filter(PitchType == 'CHANGEUP') %>%
  summarize(
    player_id = 'xxxxxx',
    player_full_name = 'Player X',
    team_name = 'Seattle Mariners',
    avg_spin = mean(SpinRate, na.rm = TRUE),
    avg_velo = mean(ReleaseSpeed, na.rm = TRUE),
    h_break = mean(HorzBreak, na.rm = TRUE),
    v_break = mean(InducedVertBreak, na.rm = TRUE)
  )

League Data

# Brings the objects stored in the .RData file into the workspace; the code
# below expects one of them to be `Statcast2023`.
load("G:/My Drive/Baseball/R Projects/Data/Statcast2023.RData")

# Keep right-handed pitchers only, drop bunt / pitchout / unknown-strike
# outcomes, then add 0/1 indicator columns for whiffs and swings based on the
# pitch `description`.
rhp_data <- Statcast2023 %>%
  filter(
    p_throws == 'R',
    !description %in% c('foul_bunt', 'missed_bunt', 'pitchout',
                        'bunt_foul_tip', 'unknown_strike')
  ) %>%
  mutate(
    # whiff: swung and missed
    whiff = if_else(
      description %in% c('swinging_strike', 'swinging_strike_blocked'),
      1, 0
    ),
    # swing: any outcome that involves a swing, including contact
    swing = if_else(
      description %in% c('hit_into_play', 'foul', 'foul_tip',
                         'swinging_strike', 'swinging_strike_blocked'),
      1, 0
    )
  )

# Create summary table
# Per pitcher / pitch-type summary for the league sample: pitch count, mean
# velocity and break, swing rate, whiff rate, and whiff rate given a swing.
#
# `.groups = "drop"` returns an ungrouped table, so the leftover per-pitcher
# grouping (and the accompanying summarise() message) does not leak into the
# join and row-binding steps that consume this table.
league_metrics <- rhp_data %>%
  group_by(pitcher, pitch_name) %>%
  summarize(
    n = n(),
    avg_velo = mean(release_speed, na.rm = TRUE),
    # NOTE(review): presumably pfx_x / pfx_z are in feet, so * 12 gives inches
    # and the sign flip aligns with Player X's HorzBreak convention - confirm.
    h_break = mean(pfx_x, na.rm = TRUE) * -12,
    v_break = mean(pfx_z, na.rm = TRUE) * 12,
    swing_prob = mean(swing, na.rm = TRUE),
    whiff_prob = mean(whiff, na.rm = TRUE),
    whiff_prob_gs = sum(whiff) / sum(swing), # whiff prob given swing
    .groups = "drop"
  ) %>%
  filter(n > 200) %>% # sample size
  rename(player_id = pitcher)
## `summarise()` has grouped output by 'pitcher'. You can override using the
## `.groups` argument.
# Using the baseballr package, get player names from player_id

mlb_stats_23 <- mlb_stats(stat_type = 'season', player_pool = 'All',
                          stat_group = 'pitching', season = 2023)

# Attach player name and team to each league row, keep only the identity
# columns plus the league_metrics columns, and relabel pitch names to match
# the upper-case labels used in the Pitcher X data.
#
# The metric columns are selected by name with all_of(names(league_metrics)),
# not everything(league_metrics): that form passes a data frame into
# everything()'s `vars` argument and only works because it selects by column
# position.
# NOTE(review): if mlb_stats_23 holds more than one row per player_id, this
# join duplicates league rows - confirm it is unique per player.
league_pitches <- league_metrics %>%
  left_join(mlb_stats_23, by = 'player_id') %>%
  select(player_id, player_full_name, team_name,
         all_of(names(league_metrics))) %>%
  # Match syntax of Pitcher X data; any other pitch name is left unchanged
  mutate(pitch_name = case_when(
    pitch_name == '4-Seam Fastball' ~ 'FASTBALL',
    pitch_name == 'Slider' ~ 'SLIDER',
    pitch_name == 'Curveball' ~ 'CURVEBALL',
    pitch_name == 'Cutter' ~ 'CUTTER',
    pitch_name == 'Changeup' ~ 'CHANGEUP',
    TRUE ~ pitch_name
  ))

head(league_pitches)
## # A tibble: 6 × 11
## # Groups:   player_id [2]
##   player_id player_full_name team_name pitch_name     n avg_velo h_break v_break
##       <dbl> <chr>            <chr>     <chr>      <int>    <dbl>   <dbl>   <dbl>
## 1    425794 Adam Wainwright  St. Loui… CURVEBALL    545     71.5  -16.6   -13.6 
## 2    425794 Adam Wainwright  St. Loui… CUTTER       403     82.9   -5.58    7.68
## 3    425794 Adam Wainwright  St. Loui… Sinker       557     86.9   13.2    11.2 
## 4    425844 Zack Greinke     Kansas C… FASTBALL     587     89.6    2.86   15.7 
## 5    425844 Zack Greinke     Kansas C… CHANGEUP     368     86.5   13.2     4.22
## 6    425844 Zack Greinke     Kansas C… CURVEBALL    311     72.5  -11.5   -10.8 
## # ℹ 3 more variables: swing_prob <dbl>, whiff_prob <dbl>, whiff_prob_gs <dbl>

Combine Player X and League Data

# Stack the Player X rows on top of the league rows

all_pitch_data <- rbind(pitcher_x, league_pitches)

# Write the combined table to the working directory
write.csv(all_pitch_data, file = 'all_pitch_data.csv')
# Averages per pitch type: unweighted means across the rows of the combined
# table (so Player X's own rows are part of each average)
league_averages <- all_pitch_data %>%
  group_by(`Pitch Type` = pitch_name) %>%
  summarise(
    Swing = mean(swing_prob),
    Whiff = mean(whiff_prob),
    `Whiff GS` = mean(whiff_prob_gs)
  )

# Write the averages table to the working directory
write.csv(league_averages, file = 'league_averages.csv')
# Changeup comparison set: per-pitcher changeup averages from the league data,
# narrowed to pitchers whose spin, velocity, and horizontal break fall inside
# the target windows below.
ch_league <- rhp_data %>%
  filter(pitch_name == 'Changeup') %>%
  group_by(player_id = pitcher) %>%
  summarise(
    avg_spin = mean(release_spin_rate, na.rm = TRUE),
    avg_velo = mean(release_speed, na.rm = TRUE),
    h_break = mean(pfx_x, na.rm = TRUE) * -12,
    v_break = mean(pfx_z, na.rm = TRUE) * 12
  ) %>%
  # similar to Player X, plus the desired improvements (all bounds exclusive)
  filter(
    avg_spin > 1800, avg_spin < 2000,
    avg_velo > 86, avg_velo < 87,
    h_break > 14, h_break < 17
  )

  # baseballr names
# Attach player name and team to the comparison pitchers, keeping only the
# identity columns plus the ch_league metric columns. Columns are selected by
# name with all_of(names(ch_league)), not everything(ch_league), which only
# works through positional matching on the joined table.
# NOTE(review): if mlb_stats_23 holds more than one row per player_id, this
# join duplicates pitchers - confirm it is unique per player.
ch_names <- ch_league %>%
  left_join(mlb_stats_23, by = 'player_id') %>%
  select(player_id, player_full_name, team_name, all_of(names(ch_league)))

# Player X's changeup row first, followed by the comparison pitchers
changeup_data <- rbind(ch_dat, ch_names)

write.csv(changeup_data, 'changeup_data.csv')