# library(tidyverse)
# library(baseballr)
# Read the Pitcher X CSV export into a data frame
full_data <- read.csv(file = "Pitcher X Data 2024.csv")
# This script explores the provided data and prepares it for Tableau.
# Usage share and mean swing / whiff probabilities for each pitch type.
# The usage denominator is every row of full_data, so rows with a missing
# PitchType count toward the total even though they are filtered out of the
# per-pitch counts.
by_pitch <- full_data %>%
  filter(!is.na(PitchType)) %>%
  group_by(PitchType) %>%
  summarize(
    usage = round(n() / nrow(full_data), 2),
    # Same treatment for all three probability columns: NA-tolerant mean,
    # rounded to three decimals
    across(
      c(swing_prob, whiff_prob_gs, whiff_prob),
      function(p) round(mean(p, na.rm = TRUE), 3)
    )
  )
by_pitch
## # A tibble: 5 × 5
## PitchType usage swing_prob whiff_prob_gs whiff_prob
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 CHANGEUP 0.17 0.545 0.398 0.149
## 2 CURVEBALL 0.08 0.361 0.423 0.122
## 3 CUTTER 0.16 0.471 0.297 0.081
## 4 FASTBALL 0.4 0.418 0.242 0.056
## 5 SLIDER 0.19 0.455 0.555 0.176
# Release-point summary by pitch type: mean and standard deviation of
# release height and release side. Only the commented-out exploratory plot
# that follows refers to this table.
# Missing measurements are skipped (na.rm = TRUE), as in the other summaries
# in this script, so a single missing reading cannot turn a pitch type's
# mean or sd into NA.
release_data <- full_data %>%
  filter(!is.na(PitchType)) %>%
  group_by(PitchType) %>%
  summarize(
    avg_rel_height = mean(ReleaseHeight, na.rm = TRUE),
    sd_rel_height = sd(ReleaseHeight, na.rm = TRUE),
    avg_rel_side = mean(ReleaseSide, na.rm = TRUE),
    sd_rel_side = sd(ReleaseSide, na.rm = TRUE)
  )
# release_data
#release_plot <- ggplot(data = release_data) +
# geom_point(aes(x = avg_rel_side, y = avg_rel_height, size = sd_rel_side), alpha = 0.5) +
# xlim(0, 3) +
# ylim(0, 8)
#release_plot
# *exploratory, not utilized
# Player X summary by pitch type, laid out with the same column names and
# order as the league table so the two can be stacked with rbind().
pitcher_x <- full_data %>%
  filter(!is.na(PitchType)) %>%
  # Group under the league-side column name directly
  group_by(pitch_name = PitchType) %>%
  summarize(
    n = n(),
    # na.rm = TRUE mirrors the league-side calculation, so a missing
    # velocity or break reading does not make Player X's average NA while
    # the league averages tolerate missing values
    avg_velo = mean(ReleaseSpeed, na.rm = TRUE),
    h_break = mean(HorzBreak, na.rm = TRUE),
    v_break = mean(InducedVertBreak, na.rm = TRUE),
    swing_prob = mean(swing_prob, na.rm = TRUE),
    whiff_prob = mean(whiff_prob, na.rm = TRUE),
    whiff_prob_gs = mean(whiff_prob_gs, na.rm = TRUE)
  ) %>%
  # Placeholder identity columns for the anonymised pitcher
  mutate(
    player_id = "xxxxxx",
    player_full_name = "Player X",
    team_name = "Seattle Mariners"
  ) %>%
  select(player_id, player_full_name, team_name, everything())
pitcher_x
## # A tibble: 5 × 11
## player_id player_full_name team_name pitch_name n avg_velo h_break v_break
## <chr> <chr> <chr> <chr> <int> <dbl> <dbl> <dbl>
## 1 xxxxxx Player X Seattle … CHANGEUP 451 85.5 13.9 4.13
## 2 xxxxxx Player X Seattle … CURVEBALL 222 79.4 -6.40 -7.54
## 3 xxxxxx Player X Seattle … CUTTER 420 89.9 -1.29 10.6
## 4 xxxxxx Player X Seattle … FASTBALL 1065 92.0 11.2 11.0
## 5 xxxxxx Player X Seattle … SLIDER 504 81.7 -10.5 1.14
## # ℹ 3 more variables: swing_prob <dbl>, whiff_prob <dbl>, whiff_prob_gs <dbl>
# One-row profile of Player X's changeup: identity columns followed by mean
# spin rate, velocity and horizontal / induced vertical break
ch_dat <- full_data %>%
  filter(PitchType == "CHANGEUP") %>%
  summarize(
    player_id = "xxxxxx",
    player_full_name = "Player X",
    team_name = "Seattle Mariners",
    avg_spin = mean(SpinRate, na.rm = TRUE),
    avg_velo = mean(ReleaseSpeed, na.rm = TRUE),
    h_break = mean(HorzBreak, na.rm = TRUE),
    v_break = mean(InducedVertBreak, na.rm = TRUE)
  )
# Load the saved 2023 Statcast workspace (expected to define Statcast2023)
load("G:/My Drive/Baseball/R Projects/Data/Statcast2023.RData")
# Keep right-handed pitchers, then drop bunt attempts, pitchouts and
# unclassified strikes
rhp_data <- Statcast2023 %>%
  filter(p_throws == "R") %>%
  filter(
    !description %in% c(
      "foul_bunt", "missed_bunt", "pitchout", "bunt_foul_tip",
      "unknown_strike"
    )
  )
# Add 1/0 indicator columns based on the pitch description:
#   whiff - swinging strike (blocked or not)
#   swing - any swing: ball in play, foul, foul tip, or swinging strike
rhp_data <- rhp_data %>%
  mutate(
    whiff = as.numeric(
      description %in% c("swinging_strike", "swinging_strike_blocked")
    ),
    swing = as.numeric(
      description %in% c(
        "hit_into_play", "foul", "foul_tip", "swinging_strike",
        "swinging_strike_blocked"
      )
    )
  )
# Create summary table: one row per pitcher and pitch type
league_metrics <- rhp_data %>%
  group_by(pitcher, pitch_name) %>%
  summarize(
    n = n(),
    avg_velo = mean(release_speed, na.rm = TRUE),
    # presumably converts feet to inches and flips the horizontal sign to
    # match the Pitcher X break convention -- TODO confirm
    h_break = mean(pfx_x, na.rm = TRUE) * -12,
    v_break = mean(pfx_z, na.rm = TRUE) * 12,
    swing_prob = mean(swing, na.rm = TRUE),
    whiff_prob = mean(whiff, na.rm = TRUE),
    whiff_prob_gs = sum(whiff) / sum(swing), # whiff prob given swing
    # Return an ungrouped tibble: without this the result stays grouped by
    # pitcher, summarise() prints a grouping message, and the grouping
    # carries into every later step
    .groups = "drop"
  ) %>%
  filter(n > 200) %>% # sample size
  rename(player_id = pitcher)
# Pull 2023 season pitching stats with baseballr; used below to look up
# player names and teams from player_id
mlb_stats_23 <- mlb_stats(
  season = 2023,
  stat_type = "season",
  stat_group = "pitching",
  player_pool = "All"
)
# Attach player names and teams to the league summary, then relabel pitch
# types to match the Pitcher X file.
# NOTE(review): if mlb_stats_23 holds more than one row for a player_id,
# this join will repeat that pitcher's rows -- confirm.
league_pitches <- league_metrics %>%
  left_join(mlb_stats_23, by = "player_id") %>%
  # Keep the identifiers plus the league_metrics columns, chosen by name.
  # everything() does not take a data frame (its only argument, `vars`, is
  # documented as a character vector of names), so the intent is spelled
  # out with all_of() instead.
  select(
    player_id, player_full_name, team_name,
    all_of(names(league_metrics))
  ) %>%
  # Match syntax of Pitcher X data; any other pitch name passes through
  mutate(pitch_name = case_when(
    pitch_name == "4-Seam Fastball" ~ "FASTBALL",
    pitch_name == "Slider" ~ "SLIDER",
    pitch_name == "Curveball" ~ "CURVEBALL",
    pitch_name == "Cutter" ~ "CUTTER",
    pitch_name == "Changeup" ~ "CHANGEUP",
    TRUE ~ pitch_name
  ))
head(league_pitches)
## # A tibble: 6 × 11
## # Groups: player_id [2]
## player_id player_full_name team_name pitch_name n avg_velo h_break v_break
## <dbl> <chr> <chr> <chr> <int> <dbl> <dbl> <dbl>
## 1 425794 Adam Wainwright St. Loui… CURVEBALL 545 71.5 -16.6 -13.6
## 2 425794 Adam Wainwright St. Loui… CUTTER 403 82.9 -5.58 7.68
## 3 425794 Adam Wainwright St. Loui… Sinker 557 86.9 13.2 11.2
## 4 425844 Zack Greinke Kansas C… FASTBALL 587 89.6 2.86 15.7
## 5 425844 Zack Greinke Kansas C… CHANGEUP 368 86.5 13.2 4.22
## 6 425844 Zack Greinke Kansas C… CURVEBALL 311 72.5 -11.5 -10.8
## # ℹ 3 more variables: swing_prob <dbl>, whiff_prob <dbl>, whiff_prob_gs <dbl>
# Stack the Player X rows on top of the league rows. player_id is a
# character placeholder for Player X and numeric for the league, so the
# combined column is expected to come out as character.
all_pitch_data <- rbind(pitcher_x, league_pitches)
# Write the combined table to disk (write.csv adds a row-name column by
# default)
write.csv(x = all_pitch_data, file = "all_pitch_data.csv")
# Averages table: mean swing / whiff rates per pitch type, with
# display-ready column names.
# NOTE(review): the input is all_pitch_data, so Player X's rows are part of
# these averages, and each pitcher counts equally regardless of pitch
# count -- confirm that is intended.
league_averages <- all_pitch_data %>%
  group_by(`Pitch Type` = pitch_name) %>%
  summarize(
    Swing = mean(swing_prob),
    Whiff = mean(whiff_prob),
    `Whiff GS` = mean(whiff_prob_gs)
  )
# Write the averages table to disk
write.csv(x = league_averages, file = "league_averages.csv")
# Changeup comparison set: per-pitcher changeup averages among the filtered
# right-handers, narrowed to a target profile
ch_league <- rhp_data %>%
  filter(pitch_name == "Changeup") %>%
  group_by(player_id = pitcher) %>%
  summarize(
    avg_spin = mean(release_spin_rate, na.rm = TRUE),
    avg_velo = mean(release_speed, na.rm = TRUE),
    h_break = mean(pfx_x, na.rm = TRUE) * -12,
    v_break = mean(pfx_z, na.rm = TRUE) * 12
  ) %>%
  # Similar to Player X plus the desired improvements; all bounds exclusive
  filter(
    avg_spin > 1800, avg_spin < 2000,
    avg_velo > 86, avg_velo < 87,
    h_break > 14, h_break < 17
  )
# Attach player names and teams (from the baseballr stats table) to the
# comparable changeups.
# NOTE(review): if mlb_stats_23 holds more than one row for a player_id,
# this join will repeat that pitcher's row -- confirm.
ch_names <- ch_league %>%
  left_join(mlb_stats_23, by = "player_id") %>%
  # Keep the identifiers plus the ch_league columns, chosen by name.
  # everything() does not take a data frame (its only argument, `vars`, is
  # documented as a character vector of names), so the intent is spelled
  # out with all_of() instead.
  select(
    player_id, player_full_name, team_name,
    all_of(names(ch_league))
  )
# Put Player X's changeup profile above the comparable league changeups
changeup_data <- rbind(ch_dat, ch_names)
# Write the comparison table to disk
write.csv(x = changeup_data, file = "changeup_data.csv")