Task:

Create a system that matches a pitcher with the other pitchers around the league whose arsenals and characteristics most closely resemble his own.

library(baseballr)
## Warning: package 'baseballr' was built under R version 4.2.3
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.2.3
## Warning: package 'ggplot2' was built under R version 4.2.3
## Warning: package 'tibble' was built under R version 4.2.3
## Warning: package 'tidyr' was built under R version 4.2.3
## Warning: package 'readr' was built under R version 4.2.3
## Warning: package 'dplyr' was built under R version 4.2.3
## Warning: package 'forcats' was built under R version 4.2.3
## Warning: package 'lubridate' was built under R version 4.2.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.2     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.2     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Load season data
# load() restores the objects saved in the .RData file; the rest of this
# script expects that to include a pitch-level data frame named `Statcast2022`.

load("G:/My Drive/Baseball/Summer 2023/R Projects/Data/Statcast2022.RData")

# Create player statcast data for subject. ex: Freddy Peralta, RHP, MIL

first_name <- 'Freddy'
last_name <- 'Peralta'

# A name lookup is not guaranteed to yield exactly one usable id (namesakes,
# or register entries without an MLBAM id). Everything downstream treats
# `player_id` as a single value -- `pitcher == player_id` would silently
# recycle a longer vector -- so fail loudly instead of producing a wrong profile.
player_id <- unique(playerid_lookup(last_name, first_name)$mlbam_id)
player_id <- player_id[!is.na(player_id)]

if (length(player_id) != 1) {
  stop(
    "Expected exactly one MLBAM id for ", first_name, " ", last_name,
    " but found ", length(player_id), ".",
    call. = FALSE
  )
}

# Every pitch thrown by the subject during the loaded season.
player_data <- Statcast2022 %>%
  filter(pitcher == player_id)

# Create usage and pitch profile data for subject
#
# Result: `pitch_usage`, one row per pitch type seen anywhere in the league
# (`all_pitches`), with columns player_id, pitch, count, usage, h_break,
# v_break, velo. Pitch types the subject never threw get count 0, usage 0 and
# NaN movement/velocity.

all_pitches <- unique(Statcast2022$pitch_name)

# Summarise the subject's pitches in a single grouped pass instead of
# re-filtering and rbind()-ing once per pitch type. na.rm = TRUE keeps one
# pitch with a missing tracking value from turning the whole average into NA.
subject_thrown <- player_data %>%
  group_by(pitch_name) %>%
  summarize(
    count = n(),
    h_break = round(mean(pfx_x, na.rm = TRUE) * 12, 2),  # pfx_* * 12: feet -> inches (assumed units; axis labels below say "in")
    v_break = round(mean(pfx_z, na.rm = TRUE) * 12, 2),
    velo = round(mean(release_speed, na.rm = TRUE), 2),
    .groups = "drop"
  ) %>%
  rename(pitch = pitch_name)

# Start from the full league pitch list so the row order follows `all_pitches`
# and pitch types the subject does not throw are still represented.
pitch_usage <- data.frame(player_id = player_id,
                          pitch = all_pitches,
                          stringsAsFactors = FALSE) %>%
  left_join(subject_thrown, by = "pitch") %>%
  mutate(
    count = coalesce(count, 0L),
    usage = round(count / nrow(player_data), 2),
    # mean() of an empty vector is NaN; keep that marker for unthrown pitches.
    across(c(h_break, v_break, velo), ~ replace(.x, is.na(.x), NaN))
  ) %>%
  select(player_id, pitch, count, usage, h_break, v_break, velo)

# Create same data for every other league pitcher
#
# Result: `league_usage`, one row per (right-handed pitcher, pitch type) with
# columns player_id, pitch, count, usage, h_break, v_break, velo. Rows are
# ordered by pitcher (in `all_pitcher` order) and, within a pitcher, by pitch
# type (in `all_pitches` order).

rh_data <- Statcast2022 %>%
  filter(p_throws == 'R')

all_pitcher <- unique(rh_data$pitcher)

# One grouped pass over the season replaces re-filtering the full table for
# every pitcher and every pitch type. As before, a pitcher's profile is built
# from all of his rows in Statcast2022, not only those in `rh_data`.
league_thrown <- Statcast2022 %>%
  filter(pitcher %in% all_pitcher) %>%
  group_by(pitcher, pitch_name) %>%
  summarize(
    count = n(),
    h_break = round(mean(pfx_x, na.rm = TRUE) * 12, 2),
    v_break = round(mean(pfx_z, na.rm = TRUE) * 12, 2),
    velo = round(mean(release_speed, na.rm = TRUE), 2),
    .groups = "drop_last"
  ) %>%
  # Still grouped by pitcher here, so sum(count) is that pitcher's total.
  mutate(usage = round(count / sum(count), 2)) %>%
  ungroup() %>%
  rename(player_id = pitcher, pitch = pitch_name)

# expand.grid() varies its first argument fastest, which yields the
# pitcher-major / pitch-minor ordering described above.
league_usage <- expand.grid(pitch = all_pitches,
                            player_id = all_pitcher,
                            KEEP.OUT.ATTRS = FALSE,
                            stringsAsFactors = FALSE) %>%
  left_join(league_thrown, by = c("player_id", "pitch")) %>%
  mutate(
    # Pitch types a pitcher never threw: zero count/usage, NaN averages.
    count = coalesce(count, 0L),
    usage = coalesce(usage, 0),
    across(c(h_break, v_break, velo), ~ replace(.x, is.na(.x), NaN))
  ) %>%
  select(player_id, pitch, count, usage, h_break, v_break, velo)

# Combine both tables with mlb_stats() table to retrieve name and team


mlb_stats_22 <- mlb_stats(stat_type =  'season', player_pool = 'All',
                          stat_group = 'pitching', season = 2022)

# Attach first name, last name and team to a usage table and keep pitchers only.
#
# usage_tbl: data frame with player_id, pitch, count, usage, h_break, v_break,
#            velo (extra columns are dropped, so re-running this is safe).
# stats_tbl: table with player_id, player_first_name, player_last_name,
#            position_name and team_name.
# Returns usage_tbl's rows whose player is listed with position 'Pitcher',
# with columns player_id, player_first_name, player_last_name, team_name
# followed by the usage columns.
add_player_info <- function(usage_tbl, stats_tbl) {
  usage_cols <- c("pitch", "count", "usage", "h_break", "v_break", "velo")

  # Keep only the lookup columns, one row per player, so the join can neither
  # duplicate usage rows nor collide with similarly named stat columns.
  player_info <- stats_tbl %>%
    select(player_id, player_first_name, player_last_name, position_name,
           team_name) %>%
    distinct(player_id, .keep_all = TRUE)

  usage_tbl %>%
    select(player_id, all_of(usage_cols)) %>%
    left_join(player_info, by = 'player_id') %>%
    # Removes position players; players missing from stats_tbl have an NA
    # position and are dropped here as well.
    filter(position_name == 'Pitcher') %>%
    select(player_id, player_first_name, player_last_name, team_name,
           all_of(usage_cols))
}

league_usage <- add_player_info(league_usage, mlb_stats_22)

pitch_usage <- add_player_info(pitch_usage, mlb_stats_22)

print(pitch_usage)
##    player_id player_first_name player_last_name         team_name
## 1     642547            Freddy          Peralta Milwaukee Brewers
## 2     642547            Freddy          Peralta Milwaukee Brewers
## 3     642547            Freddy          Peralta Milwaukee Brewers
## 4     642547            Freddy          Peralta Milwaukee Brewers
## 5     642547            Freddy          Peralta Milwaukee Brewers
## 6     642547            Freddy          Peralta Milwaukee Brewers
## 7     642547            Freddy          Peralta Milwaukee Brewers
## 8     642547            Freddy          Peralta Milwaukee Brewers
## 9     642547            Freddy          Peralta Milwaukee Brewers
## 10    642547            Freddy          Peralta Milwaukee Brewers
## 11    642547            Freddy          Peralta Milwaukee Brewers
## 12    642547            Freddy          Peralta Milwaukee Brewers
## 13    642547            Freddy          Peralta Milwaukee Brewers
## 14    642547            Freddy          Peralta Milwaukee Brewers
## 15    642547            Freddy          Peralta Milwaukee Brewers
## 16    642547            Freddy          Peralta Milwaukee Brewers
##              pitch count usage h_break v_break  velo
## 1  4-Seam Fastball   743  0.55   -8.79   16.08 92.60
## 2           Slider   250  0.18    8.00    3.64 80.26
## 3          Sweeper     0  0.00     NaN     NaN   NaN
## 4           Sinker     0  0.00     NaN     NaN   NaN
## 5        Curveball   219  0.16    5.62   -3.66 75.79
## 6         Changeup   141  0.10  -14.69    3.82 85.87
## 7           Cutter     0  0.00     NaN     NaN   NaN
## 8     Split-Finger     0  0.00     NaN     NaN   NaN
## 9    Knuckle Curve     0  0.00     NaN     NaN   NaN
## 10      Slow Curve     0  0.00     NaN     NaN   NaN
## 11          Slurve     0  0.00     NaN     NaN   NaN
## 12       Pitch Out     0  0.00     NaN     NaN   NaN
## 13           Other     0  0.00     NaN     NaN   NaN
## 14          Eephus     0  0.00     NaN     NaN   NaN
## 15                     0  0.00     NaN     NaN   NaN
## 16     Knuckleball     0  0.00     NaN     NaN   NaN
# Preview the first 40 rows of the league table.
league_usage %>%
  head(n = 40)
##    player_id player_first_name player_last_name          team_name
## 1     506433                Yu          Darvish   San Diego Padres
## 2     506433                Yu          Darvish   San Diego Padres
## 3     506433                Yu          Darvish   San Diego Padres
## 4     506433                Yu          Darvish   San Diego Padres
## 5     506433                Yu          Darvish   San Diego Padres
## 6     506433                Yu          Darvish   San Diego Padres
## 7     506433                Yu          Darvish   San Diego Padres
## 8     506433                Yu          Darvish   San Diego Padres
## 9     506433                Yu          Darvish   San Diego Padres
## 10    506433                Yu          Darvish   San Diego Padres
## 11    506433                Yu          Darvish   San Diego Padres
## 12    506433                Yu          Darvish   San Diego Padres
## 13    506433                Yu          Darvish   San Diego Padres
## 14    506433                Yu          Darvish   San Diego Padres
## 15    506433                Yu          Darvish   San Diego Padres
## 16    506433                Yu          Darvish   San Diego Padres
## 17    425844              Zack          Greinke Kansas City Royals
## 18    425844              Zack          Greinke Kansas City Royals
## 19    425844              Zack          Greinke Kansas City Royals
## 20    425844              Zack          Greinke Kansas City Royals
## 21    425844              Zack          Greinke Kansas City Royals
## 22    425844              Zack          Greinke Kansas City Royals
## 23    425844              Zack          Greinke Kansas City Royals
## 24    425844              Zack          Greinke Kansas City Royals
## 25    425844              Zack          Greinke Kansas City Royals
## 26    425844              Zack          Greinke Kansas City Royals
## 27    425844              Zack          Greinke Kansas City Royals
## 28    425844              Zack          Greinke Kansas City Royals
## 29    425844              Zack          Greinke Kansas City Royals
## 30    425844              Zack          Greinke Kansas City Royals
## 31    425844              Zack          Greinke Kansas City Royals
## 32    425844              Zack          Greinke Kansas City Royals
## 33    641816             Tyler            Mahle    Minnesota Twins
## 34    641816             Tyler            Mahle    Minnesota Twins
## 35    641816             Tyler            Mahle    Minnesota Twins
## 36    641816             Tyler            Mahle    Minnesota Twins
## 37    641816             Tyler            Mahle    Minnesota Twins
## 38    641816             Tyler            Mahle    Minnesota Twins
## 39    641816             Tyler            Mahle    Minnesota Twins
## 40    641816             Tyler            Mahle    Minnesota Twins
##              pitch count usage h_break v_break  velo
## 1  4-Seam Fastball   828  0.25   -7.17   17.08 94.86
## 2           Slider  1037  0.31    5.74   -0.15 85.91
## 3          Sweeper   559  0.17   16.55   -0.07 82.51
## 4           Sinker   300  0.09  -14.05   11.76 94.58
## 5        Curveball   138  0.04   11.53  -14.51 72.54
## 6         Changeup     0  0.00     NaN     NaN   NaN
## 7           Cutter   145  0.04    2.13   10.60 90.56
## 8     Split-Finger   245  0.07   -6.36    5.06 89.21
## 9    Knuckle Curve   103  0.03    9.52  -10.82 80.30
## 10      Slow Curve     2  0.00   10.32  -16.98 65.55
## 11          Slurve     0  0.00     NaN     NaN   NaN
## 12       Pitch Out     0  0.00     NaN     NaN   NaN
## 13           Other     0  0.00     NaN     NaN   NaN
## 14          Eephus     0  0.00     NaN     NaN   NaN
## 15                     0  0.00     NaN     NaN   NaN
## 16     Knuckleball     0  0.00     NaN     NaN   NaN
## 17 4-Seam Fastball   870  0.38   -2.10   15.88 89.15
## 18          Slider   168  0.07    8.38    6.01 81.78
## 19         Sweeper     0  0.00     NaN     NaN   NaN
## 20          Sinker    51  0.02  -11.74    9.89 89.44
## 21       Curveball   462  0.20   12.54  -11.18 71.96
## 22        Changeup   374  0.16  -12.81    3.80 86.47
## 23          Cutter   347  0.15    6.57    6.08 85.82
## 24    Split-Finger     0  0.00     NaN     NaN   NaN
## 25   Knuckle Curve     0  0.00     NaN     NaN   NaN
## 26      Slow Curve     0  0.00     NaN     NaN   NaN
## 27          Slurve     0  0.00     NaN     NaN   NaN
## 28       Pitch Out     0  0.00     NaN     NaN   NaN
## 29           Other     0  0.00     NaN     NaN   NaN
## 30          Eephus     0  0.00     NaN     NaN   NaN
## 31                     0  0.00     NaN     NaN   NaN
## 32     Knuckleball     0  0.00     NaN     NaN   NaN
## 33 4-Seam Fastball  1093  0.52  -10.22   18.07 93.21
## 34          Slider   240  0.11    4.98    1.84 83.86
## 35         Sweeper     0  0.00     NaN     NaN   NaN
## 36          Sinker     0  0.00     NaN     NaN   NaN
## 37       Curveball     0  0.00     NaN     NaN   NaN
## 38        Changeup     0  0.00     NaN     NaN   NaN
## 39          Cutter   272  0.13    1.75   10.23 85.50
## 40    Split-Finger   496  0.24  -14.32    9.92 85.39

Mutate league table to calculate differences between league pitchers and subject pitcher

# Subject's value for each pitch type, renamed so the columns can sit next to
# the league columns after the join.
subject_profile <- pitch_usage %>%
  select(pitch,
         subject_usage = usage,
         subject_h_break = h_break,
         subject_v_break = v_break,
         subject_velo = velo)

# For every league pitcher and pitch type, replace usage, break and velocity
# with (league pitcher - subject). A single join does the work of the nested
# pitcher x pitch loops without growing a data frame row by row; rows keep
# the order they have in `league_usage`. Differences are NaN wherever either
# pitcher does not throw the pitch.
diff_table <- league_usage %>%
  filter(player_id %in% all_pitcher) %>%
  inner_join(subject_profile, by = "pitch") %>%
  mutate(usage = usage - subject_usage,
         h_break = h_break - subject_h_break,
         v_break = v_break - subject_v_break,
         velo = velo - subject_velo) %>%
  select(-starts_with("subject_"))

## Add a column holding the Pythagorean (straight-line) size of the break
## difference, combining the horizontal and vertical differences.

diff_table <- mutate(
  diff_table,
  `break` = round(sqrt(h_break^2 + v_break^2), 2)
)

head(diff_table, 40)
##    player_id player_first_name player_last_name          team_name
## 1     506433                Yu          Darvish   San Diego Padres
## 2     506433                Yu          Darvish   San Diego Padres
## 3     506433                Yu          Darvish   San Diego Padres
## 4     506433                Yu          Darvish   San Diego Padres
## 5     506433                Yu          Darvish   San Diego Padres
## 6     506433                Yu          Darvish   San Diego Padres
## 7     506433                Yu          Darvish   San Diego Padres
## 8     506433                Yu          Darvish   San Diego Padres
## 9     506433                Yu          Darvish   San Diego Padres
## 10    506433                Yu          Darvish   San Diego Padres
## 11    506433                Yu          Darvish   San Diego Padres
## 12    506433                Yu          Darvish   San Diego Padres
## 13    506433                Yu          Darvish   San Diego Padres
## 14    506433                Yu          Darvish   San Diego Padres
## 15    506433                Yu          Darvish   San Diego Padres
## 16    506433                Yu          Darvish   San Diego Padres
## 17    425844              Zack          Greinke Kansas City Royals
## 18    425844              Zack          Greinke Kansas City Royals
## 19    425844              Zack          Greinke Kansas City Royals
## 20    425844              Zack          Greinke Kansas City Royals
## 21    425844              Zack          Greinke Kansas City Royals
## 22    425844              Zack          Greinke Kansas City Royals
## 23    425844              Zack          Greinke Kansas City Royals
## 24    425844              Zack          Greinke Kansas City Royals
## 25    425844              Zack          Greinke Kansas City Royals
## 26    425844              Zack          Greinke Kansas City Royals
## 27    425844              Zack          Greinke Kansas City Royals
## 28    425844              Zack          Greinke Kansas City Royals
## 29    425844              Zack          Greinke Kansas City Royals
## 30    425844              Zack          Greinke Kansas City Royals
## 31    425844              Zack          Greinke Kansas City Royals
## 32    425844              Zack          Greinke Kansas City Royals
## 33    641816             Tyler            Mahle    Minnesota Twins
## 34    641816             Tyler            Mahle    Minnesota Twins
## 35    641816             Tyler            Mahle    Minnesota Twins
## 36    641816             Tyler            Mahle    Minnesota Twins
## 37    641816             Tyler            Mahle    Minnesota Twins
## 38    641816             Tyler            Mahle    Minnesota Twins
## 39    641816             Tyler            Mahle    Minnesota Twins
## 40    641816             Tyler            Mahle    Minnesota Twins
##              pitch count usage h_break v_break  velo break
## 1  4-Seam Fastball   828 -0.30    1.62    1.00  2.26  1.90
## 2           Slider  1037  0.13   -2.26   -3.79  5.65  4.41
## 3          Sweeper   559  0.17     NaN     NaN   NaN   NaN
## 4           Sinker   300  0.09     NaN     NaN   NaN   NaN
## 5        Curveball   138 -0.12    5.91  -10.85 -3.25 12.36
## 6         Changeup     0 -0.10     NaN     NaN   NaN   NaN
## 7           Cutter   145  0.04     NaN     NaN   NaN   NaN
## 8     Split-Finger   245  0.07     NaN     NaN   NaN   NaN
## 9    Knuckle Curve   103  0.03     NaN     NaN   NaN   NaN
## 10      Slow Curve     2  0.00     NaN     NaN   NaN   NaN
## 11          Slurve     0  0.00     NaN     NaN   NaN   NaN
## 12       Pitch Out     0  0.00     NaN     NaN   NaN   NaN
## 13           Other     0  0.00     NaN     NaN   NaN   NaN
## 14          Eephus     0  0.00     NaN     NaN   NaN   NaN
## 15                     0  0.00     NaN     NaN   NaN   NaN
## 16     Knuckleball     0  0.00     NaN     NaN   NaN   NaN
## 17 4-Seam Fastball   870 -0.17    6.69   -0.20 -3.45  6.69
## 18          Slider   168 -0.11    0.38    2.37  1.52  2.40
## 19         Sweeper     0  0.00     NaN     NaN   NaN   NaN
## 20          Sinker    51  0.02     NaN     NaN   NaN   NaN
## 21       Curveball   462  0.04    6.92   -7.52 -3.83 10.22
## 22        Changeup   374  0.06    1.88   -0.02  0.60  1.88
## 23          Cutter   347  0.15     NaN     NaN   NaN   NaN
## 24    Split-Finger     0  0.00     NaN     NaN   NaN   NaN
## 25   Knuckle Curve     0  0.00     NaN     NaN   NaN   NaN
## 26      Slow Curve     0  0.00     NaN     NaN   NaN   NaN
## 27          Slurve     0  0.00     NaN     NaN   NaN   NaN
## 28       Pitch Out     0  0.00     NaN     NaN   NaN   NaN
## 29           Other     0  0.00     NaN     NaN   NaN   NaN
## 30          Eephus     0  0.00     NaN     NaN   NaN   NaN
## 31                     0  0.00     NaN     NaN   NaN   NaN
## 32     Knuckleball     0  0.00     NaN     NaN   NaN   NaN
## 33 4-Seam Fastball  1093 -0.03   -1.43    1.99  0.61  2.45
## 34          Slider   240 -0.07   -3.02   -1.80  3.60  3.52
## 35         Sweeper     0  0.00     NaN     NaN   NaN   NaN
## 36          Sinker     0  0.00     NaN     NaN   NaN   NaN
## 37       Curveball     0 -0.16     NaN     NaN   NaN   NaN
## 38        Changeup     0 -0.10     NaN     NaN   NaN   NaN
## 39          Cutter   272  0.13     NaN     NaN   NaN   NaN
## 40    Split-Finger   496  0.24     NaN     NaN   NaN   NaN

Create totals table to simplify and sort

Look for players with the smallest differences in usage and break to determine the best matches.

# One row per league pitcher:
#   total_usage - sum of absolute usage differences across all pitch types
#   total_break - sum of break-difference magnitudes over the pitch types
#                 both pitchers throw (NaN rows are skipped by na.rm = TRUE)
# .groups = "drop" returns an ungrouped tibble; otherwise the result stays
# grouped by player and later row-wise verbs such as slice_head() operate
# within each one-row group instead of on the whole table.
totals <- diff_table %>% 
  group_by(player_id, player_first_name, player_last_name) %>% 
  summarize(
    total_usage = sum(abs(usage)),
    total_break = sum(`break`, na.rm = TRUE),
    .groups = "drop"
  )
## `summarise()` has grouped output by 'player_id', 'player_first_name'. You can
## override using the `.groups` argument.
# Show the 10 closest candidates. ungroup() makes slice_head() count rows
# across the whole table rather than per player, and arrange() puts the
# smallest usage difference first (ties broken by break difference).
totals %>%
  ungroup() %>%
  arrange(total_usage, total_break) %>%
  slice_head(n = 10)
## # A tibble: 589 × 5
## # Groups:   player_id, player_first_name [589]
##    player_id player_first_name player_last_name total_usage total_break
##        <dbl> <chr>             <chr>                  <dbl>       <dbl>
##  1    425794 Adam              Wainwright              1.37       40.8 
##  2    425844 Zack              Greinke                 0.55       21.2 
##  3    434378 Justin            Verlander               0.25       21.7 
##  4    434671 Anibal            Sanchez                 1.22       15.4 
##  5    445276 Kenley            Jansen                  1.73       10.2 
##  6    445926 Jesse             Chavez                  1.74       18.8 
##  7    446372 Corey             Kluber                  1.44       17.6 
##  8    450203 Charlie           Morton                  0.82       19.1 
##  9    453178 Ian               Kennedy                 0.83       13.3 
## 10    453268 Daniel            Bard                    1.52        9.63
## # ℹ 579 more rows
# Select closest matches
# Ids: 642547 = Peralta (subject), 680573 = Woods Richardson,
# 663474 = McKenzie. Only pitch types each pitcher actually threw are kept.

match_ids <- c(642547,680573,663474)

matches <- filter(league_usage, player_id %in% match_ids, count > 0)

print(matches)
##    player_id player_first_name player_last_name           team_name
## 1     663474           Triston         McKenzie Cleveland Guardians
## 2     663474           Triston         McKenzie Cleveland Guardians
## 3     663474           Triston         McKenzie Cleveland Guardians
## 4     642547            Freddy          Peralta   Milwaukee Brewers
## 5     642547            Freddy          Peralta   Milwaukee Brewers
## 6     642547            Freddy          Peralta   Milwaukee Brewers
## 7     642547            Freddy          Peralta   Milwaukee Brewers
## 8     680573            Simeon Woods Richardson     Minnesota Twins
## 9     680573            Simeon Woods Richardson     Minnesota Twins
## 10    680573            Simeon Woods Richardson     Minnesota Twins
## 11    680573            Simeon Woods Richardson     Minnesota Twins
##              pitch count usage h_break v_break  velo
## 1  4-Seam Fastball  1676  0.56   -3.01   20.55 92.48
## 2           Slider   668  0.22    5.32   10.18 87.27
## 3        Curveball   637  0.21    7.91   -7.95 79.40
## 4  4-Seam Fastball   743  0.55   -8.79   16.08 92.60
## 5           Slider   250  0.18    8.00    3.64 80.26
## 6        Curveball   219  0.16    5.62   -3.66 75.79
## 7         Changeup   141  0.10  -14.69    3.82 85.87
## 8  4-Seam Fastball    50  0.55    1.71   19.78 90.83
## 9           Slider    17  0.19    6.94    4.86 83.70
## 10       Curveball    13  0.14    0.30  -15.14 74.27
## 11        Changeup    11  0.12  -11.49   16.51 80.38

For Freddy Peralta, Triston McKenzie (CLE) and Simeon Woods Richardson (MIN) were selected as the best matches for comparison. Woods Richardson uses the same four pitch mix with nearly identical usage rates. McKenzie utilizes similarly shaped FB, SL, and CBs. McKenzie also provides a more prominent comparison with proven success.

Visualize

# Visualize pitch data
# Movement chart: one point per pitcher/pitch (colour = pitcher, shape = pitch
# type); gray paths connect the same pitch type across the three pitchers.

matches %>% 
  ggplot(aes(x = h_break, y = v_break)) +
  geom_point(aes(color = player_last_name, shape = pitch, size = 3)) +
  geom_path(aes(group = pitch), color = 'gray', alpha = 1) +
  xlab('h_break (in)') +
  ylab('v_break (in)') +
  # Line thickness uses `linewidth`; `size` on lines is deprecated since
  # ggplot2 3.4.0.
  geom_vline(xintercept = 0, linewidth = 1) +
  geom_hline(yintercept = 0, linewidth = 1) +
  xlim(-25, 25) +
  ylim(-20, 25) +
  ggtitle('Pitch Mix By Pitcher') +
  # Hide the legend created by the constant size mapping ("none" replaces the
  # deprecated FALSE).
  guides(size = "none")
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as
## of ggplot2 3.3.4.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

# Visualize usage by pitcher
# Side-by-side bars: usage share of each pitch type, one bar per pitcher.

pitcher_colors <- c('McKenzie' = 'red',
                    'Peralta' = 'green',
                    'Woods Richardson' = 'blue')

ggplot(matches, aes(x = pitch, y = usage, fill = player_last_name)) +
  geom_col(position = 'dodge') +
  scale_fill_manual(values = pitcher_colors) +
  labs(
    title = 'Pitch Usage By Pitcher',
    x = 'Pitch Type',
    y = 'Usage',
    fill = 'Pitcher'
  ) +
  theme_minimal()

Conclusion and application

These three pitchers can confidently look to one another as bases of comparison. Possible applications include advance scouting, player development, and pro scouting evaluation. This project uses Peralta as the subject, but if it were replicated for all three players, the following are examples of applications.

Peralta: Observe how McKenzie approaches certain hitters. With what pitches and sequences has McKenzie found success against Peralta’s upcoming opponents? In McKenzie’s dominant 2022 season, on what pitches and sequences did he rely most?

McKenzie: Unlike McKenzie, Peralta features a CH. Should McKenzie try to replicate Peralta’s CH?

Woods Richardson: For a young prospect, these comparisons to big leaguers can be especially valuable. Woods Richardson should follow these pitchers closely, particularly Peralta, to mimic aspects of their approach and development.