Data Prep

# load packages and read data
library(baseballr)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.2     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the ]8;;http://conflicted.r-lib.org/conflicted package]8;; to force all conflicts to become errors
games_data <- read.csv('AnalyticsQuestionnairePitchData.csv')

# fix defective pitch id index
games_data <- games_data %>%
  arrange(GamePk, AtBatNumber, Balls + Strikes) %>%
  mutate(index = if_else(games_data$GamePk == 1,
                                row_number() + 1000,
                                row_number() + 2000)) %>%
          select(index, everything())

# create unique AtBatID
games_data$AtBatID <- if_else(games_data$GamePk == 1,
                              games_data$AtBatNumber + 1000,
                              games_data$AtBatNumber + 2000)

# create wOBA numerator column
games_data$woba_value <- 
  if_else(games_data$PitchCall == 'walk', 0.69,
   if_else(games_data$PitchCall == 'hit_by_pitch', 0.72,
    if_else(games_data$PitchCall == 'single', 0.89,
     if_else(games_data$PitchCall == 'double', 1.27,
      if_else(games_data$PitchCall == 'triple', 1.62,
       if_else(games_data$PitchCall == 'home_run', 2.10,
               0))))))

# create outs-on-play for innings pitched
games_data$outs_on_play <- games_data$PostOuts - games_data$Outs

Task

Produce self-explanatory graphs and tables to summarize the players’ performance in these two games. Summarize your findings with bullet points. Include all code/workbooks.

Data Limitations

  • In the PitchCall column, “strikeout”, “stolen_base_3b”, “stolen_base_2b”, and “wild_pitch” give no indication of swing / no swing. The lack of this information, particularly for all strikeout pitches, makes plate discipline analysis (ZO-swing, etc.) impossible.

  • The data lacks batted ball data, which could have been used to calculate metrics such as Barrel% and AEV

  • The data does not show base situations, which could have been used to calculate run value metrics

Batter Analysis

# Create a table to display relevant batter line stats
unique_batter_id <- unique(games_data$BatterId)
batter_table <- data.frame()

for (i in unique_batter_id) {

batter_data <- games_data %>%
  filter(BatterId == i)

# Calculate PA
plate_appearances <- length(unique(batter_data$AtBatID))

# Calculate wOBA using Fangraphs weights

woba <- sum(batter_data$woba_value) / plate_appearances

# Calculate K

strikeouts <- sum(batter_data$PitchCall == 'strikeout')

# Calculate BB

walks <- sum(batter_data$PitchCall == 'walk')

# Create new row for hitter

new_row <- data.frame(BatterId = i,
                      PA = plate_appearances,
                      wOBA = round(woba,3),
                      K = strikeouts,
                      BB = walks)

batter_table <- rbind(batter_table, new_row)
}

print(batter_table)
##    BatterId PA  wOBA K BB
## 1        19 10 0.178 4  0
## 2        24 10 0.356 0  0
## 3        21 10 0.514 1  1
## 4         4  9 0.240 0  0
## 5        13  9 0.310 3  1
## 6        20  5 0.420 0  0
## 7        33  5 0.000 2  0
## 8        29  5 0.000 2  0
## 9        14  5 0.508 1  0
## 10       18  7 0.226 2  1
## 11       34  9 0.531 1  2
## 12        2  5 0.000 4  0
## 13       17  5 0.316 1  1
## 14        7  5 0.356 1  0
## 15       30  4 0.762 1  0
## 16       28  4 0.000 3  0
## 17       27  3 0.000 0  0
## 18       23  2 0.790 0  1
## 19       22  1 0.000 1  0
## 20       35  1 0.690 0  1
## 21       36  2 0.000 0  0
## 22        1  2 0.445 0  0
## 23        3  1 0.890 0  0
## 24        9  1 0.000 1  0
## 25       26  5 0.000 1  0
## 26       31  4 0.318 1  0
## 27       25  3 0.000 0  0
## 28       15  4 0.000 1  0
## 29        5  4 0.517 0  3
## 30        6  4 0.222 1  0
## 31       12  4 0.222 1  0
## 32       11  3 0.000 2  0
## 33       16  3 0.000 0  0
## 34        8  2 0.000 1  0
## 35       37  1 0.000 0  0
## 36       32  1 0.000 0  0
## 37       10  1 0.000 1  0
# graph wOBA

# calculate overall wOBA to be displayed with orange
games_woba <- sum(games_data$woba_value) / length(unique(games_data$AtBatID))

# chart wOBA with overall wOBA comparison, shaded by number of PAs
batter_graph <- ggplot(data = batter_table) +
  geom_bar(aes(x = BatterId, y = wOBA, fill = PA), stat = 'identity') +
  scale_fill_gradient(low = 'lightblue', high = 'darkblue') +
  geom_hline(yintercept = games_woba, linetype = 'dashed', color = 'darkorange') +
  scale_x_continuous(breaks = batter_table$BatterId, labels =   batter_table$BatterId) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1,
                                   size = 6)) +
  labs(title = 'Batter wOBA', 
       subtitle = '(orange line = overall wOBA)')
           
print(batter_graph)

Findings

  • Batter 34 was the top offensive performer.
  • Batter 30 performed well in limited opportunities, suggesting that he might qualify for increased usage.
  • Batter 2 performed very poorly. He faced almost exclusively hard stuff (FB, CT, SI) with little success. Seeing this, teams would likely take a FB-heavy approach against him.

Pitcher Analysis

unique_pitcher_id <- unique(games_data$PitcherId)
pitcher_table <- data.frame()

for (i in unique_pitcher_id) {
pitcher_data <- games_data %>%
  filter(PitcherId == i)

# Calculate BF
batters_faced <- length(unique(pitcher_data$AtBatID))

# Calculate IP
innings_pitched <- sum(pitcher_data$outs_on_play) %/% 3 +
                    0.1 * sum(pitcher_data$outs_on_play) %% 3

# Calculate K%
k_rate <- sum(pitcher_data$PitchCall == 'strikeout') /
  batters_faced

# Calculate BB%
bb_rate <- sum(pitcher_data$PitchCall == 'walk') /
  batters_faced

# Calculate wOBA-against
woba_against <- sum(pitcher_data$woba_value) / batters_faced

new_row <- data.frame(PitcherId = i,
                      IP = innings_pitched,
                      BF = batters_faced,
                      K = round(k_rate,2),
                      BB = round(bb_rate,2),
                      wOBA_against = round(woba_against,3))
                      

pitcher_table <- rbind(pitcher_table, new_row)

}

print(pitcher_table)
##    PitcherId   IP BF    K   BB wOBA_against
## 1         11  6.0 26 0.19 0.04        0.315
## 2          1 12.0 47 0.28 0.06        0.237
## 3          6  2.0  6 0.33 0.00        0.000
## 4          2  1.0  3 0.67 0.00        0.000
## 5          4  1.0  4 0.00 0.25        0.172
## 6          5  1.0  5 0.00 0.00        0.432
## 7         13  0.2  3 0.00 0.00        0.297
## 8         16  0.1  2 0.50 0.00        0.445
## 9         12  1.0  6 0.17 0.00        0.445
## 10         3  1.1  5 0.60 0.00        0.178
## 11        14  1.0  5 0.00 0.20        0.558
## 12         8  1.0  3 0.00 0.33        0.230
## 13         7  6.0 26 0.19 0.08        0.233
## 14        15  1.0  3 0.67 0.00        0.000
## 15         9  1.0  4 0.75 0.00        0.222
## 16        10  1.0  6 0.00 0.17        0.412
## 17        17  0.2  5 0.00 0.20        0.316
# Chart wOBA with overall wOBA comparison, shaded by number of PAs
pitcher_graph <- ggplot(data = pitcher_table) +
  geom_bar(aes(x = PitcherId, y = wOBA_against, fill = BF), stat = 'identity') +
  scale_fill_gradient(low = 'lightblue', high = 'darkblue') +
  geom_hline(yintercept = games_woba, linetype = 'dashed', color = 'darkorange') +
  scale_x_continuous(breaks = pitcher_table$PitcherId, labels = pitcher_table$PitcherId) +
  labs(title = 'wOBA-against', 
       subtitle = '(orange line = overall wOBA)')
           
print(pitcher_graph)

Pitcher Visualizations

# create a function that produces desired visualizations for input pitcher and game
game_visualizations <- function(pitcherId, gamepk) {

# filter data
pitcher_data <- games_data %>%
  filter(PitcherId == pitcherId, GamePk == gamepk) %>%
  na.omit()

#  plot pitch usage 
use_plot <- ggplot(data = pitcher_data) +
  geom_bar(aes(x = PitchType, fill = PitchType)) +
  labs(title = 'Pitch Usage')


# plot pitch movement
mov_plot <- ggplot(data = pitcher_data) +
  geom_point(aes(x = TrajectoryHorizontalBreak*12, 
                 y = TrajectoryVerticalBreakInduced*12, 
                 color = PitchType,))  +
  geom_hline(yintercept = 0, color = 'black') +
  geom_vline(xintercept = 0, color = 'black') +
  xlim(-20,20) +
  ylim(-20, 20) +
  coord_fixed(ratio = 1) +
  labs(title = 'Pitch Movement',
       x = 'Horizontal Break (in)',
       y = 'Induced Vertical Break (in)')

# plot release point

rel_plot <- ggplot(data = pitcher_data) +
  geom_point(aes(x = ReleasePositionX, 
                 y = ReleasePositionY, 
                 color = PitchType))  +
  xlim(-5,5) +
  ylim(40,70) +
  labs(title = 'Release Point', 
       x = 'Horizontal Release',
       y = 'Vertical Release')

# velocity density chart
dens_plot <- ggplot(pitcher_data, aes(x = ReleaseSpeed, fill = PitchType)) +
  geom_density(alpha = 0.5) +
  scale_fill_discrete(name = 'Pitch Type') +
  theme_minimal()
  labs(title = 'Velocity by Pitch Type',
       x = 'Velocity',
       y = 'Density')

# velocity over time plot
vel_plot <- ggplot(data = pitcher_data) +
  geom_line(aes(x = row_number(pitcher_data), y = ReleaseSpeed, color = PitchType),
            size = 1.25) +
  ylim(70,105) +
  labs(title = 'Velocity over time',
       x = 'Pitch Number',
       y = 'Velocity')

print(use_plot)
print(mov_plot)
print(rel_plot)
print(dens_plot)
print(vel_plot)
}
game_visualizations(pitcherId = 11, gamepk = 1)
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

#### Findings

  • Pitcher 1 performed very well. He has very high strikeout numbers, in part because he is able to maintain 96-98 velocity deep into starts.

  • Pitcher 11 struggled. One possible explanation is the inconsistent release point. He tended to release his SL from a slightly wider release point

Question 5

# Load Statcast season data
load("G:/My Drive/Baseball/Summer 2023/R Projects/Data/Statcast2022.RData")
# Briefly collect velo and spin grades

league_pitch_summaries <- Statcast2022 %>%
              filter(pitch_type %in% c('FF', 'SL','CH')) %>%
              group_by(pitch_type) %>%
              summarize(
              avg_release_speed = mean(release_speed, na.rm = TRUE),
              sd_release_speed = sd(release_speed, na.rm = TRUE),
              avg_spin_rate = mean(release_spin_rate, na.rm = TRUE),
              sd_spin_rate = sd(release_spin_rate, na.rm = TRUE))

pitcher_table <- data.frame(
  pitch_type = c( "CH","FF", "SL"),
  pitcher_release_speed = c( 84.5, 92.7, 81.4),
  pitcher_spin_rate = c(1760, 2145, 2675))

pitcher_table <- inner_join(pitcher_table, league_pitch_summaries, by = 'pitch_type')



pitcher_table_grades <- pitcher_table %>%
          mutate(velo_grade = round(((pitcher_release_speed - avg_release_speed) /
                   sd_release_speed) * 10 + 50,1),
                spin_grade = round(((pitcher_spin_rate - avg_spin_rate) /
                   sd_spin_rate) * 10 + 50,1)) %>%
  select(pitch_type, velo_grade, spin_grade)

print(pitcher_table_grades)
##   pitch_type velo_grade spin_grade
## 1         CH       47.5       50.3
## 2         FF       45.1       42.3
## 3         SL       39.0       60.9