# load packages and read data
library(baseballr)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.2 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.3 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the ]8;;http://conflicted.r-lib.org/conflicted package]8;; to force all conflicts to become errors
games_data <- read.csv('AnalyticsQuestionnairePitchData.csv')
# fix defective pitch id index
games_data <- games_data %>%
arrange(GamePk, AtBatNumber, Balls + Strikes) %>%
mutate(index = if_else(games_data$GamePk == 1,
row_number() + 1000,
row_number() + 2000)) %>%
select(index, everything())
# create unique AtBatID
games_data$AtBatID <- if_else(games_data$GamePk == 1,
games_data$AtBatNumber + 1000,
games_data$AtBatNumber + 2000)
# create wOBA numerator column
games_data$woba_value <-
if_else(games_data$PitchCall == 'walk', 0.69,
if_else(games_data$PitchCall == 'hit_by_pitch', 0.72,
if_else(games_data$PitchCall == 'single', 0.89,
if_else(games_data$PitchCall == 'double', 1.27,
if_else(games_data$PitchCall == 'triple', 1.62,
if_else(games_data$PitchCall == 'home_run', 2.10,
0))))))
# create outs-on-play for innings pitched
games_data$outs_on_play <- games_data$PostOuts - games_data$Outs
Produce self-explanatory graphs and tables to summarize the players’ performance in these two games. Summarize your findings with bullet points. Include all code/workbooks.
In the PitchCall column, “strikeout”, “stolen_base_3b”, “stolen_base_2b”, and “wild_pitch” give no indication of swing / no swing. The lack of this information, particularly for all strikeout pitches, makes plate discipline analysis (ZO-swing, etc.) impossible.
The data lacks batted ball data, which could have been used to calculate metrics such as Barrel% and AEV
The data does not show base situations, which could have been used to calculate run value metrics
# Create a table to display relevant batter line stats
unique_batter_id <- unique(games_data$BatterId)
batter_table <- data.frame()
for (i in unique_batter_id) {
batter_data <- games_data %>%
filter(BatterId == i)
# Calculate PA
plate_appearances <- length(unique(batter_data$AtBatID))
# Calculate wOBA using Fangraphs weights
woba <- sum(batter_data$woba_value) / plate_appearances
# Calculate K
strikeouts <- sum(batter_data$PitchCall == 'strikeout')
# Calculate BB
walks <- sum(batter_data$PitchCall == 'walk')
# Create new row for hitter
new_row <- data.frame(BatterId = i,
PA = plate_appearances,
wOBA = round(woba,3),
K = strikeouts,
BB = walks)
batter_table <- rbind(batter_table, new_row)
}
print(batter_table)
## BatterId PA wOBA K BB
## 1 19 10 0.178 4 0
## 2 24 10 0.356 0 0
## 3 21 10 0.514 1 1
## 4 4 9 0.240 0 0
## 5 13 9 0.310 3 1
## 6 20 5 0.420 0 0
## 7 33 5 0.000 2 0
## 8 29 5 0.000 2 0
## 9 14 5 0.508 1 0
## 10 18 7 0.226 2 1
## 11 34 9 0.531 1 2
## 12 2 5 0.000 4 0
## 13 17 5 0.316 1 1
## 14 7 5 0.356 1 0
## 15 30 4 0.762 1 0
## 16 28 4 0.000 3 0
## 17 27 3 0.000 0 0
## 18 23 2 0.790 0 1
## 19 22 1 0.000 1 0
## 20 35 1 0.690 0 1
## 21 36 2 0.000 0 0
## 22 1 2 0.445 0 0
## 23 3 1 0.890 0 0
## 24 9 1 0.000 1 0
## 25 26 5 0.000 1 0
## 26 31 4 0.318 1 0
## 27 25 3 0.000 0 0
## 28 15 4 0.000 1 0
## 29 5 4 0.517 0 3
## 30 6 4 0.222 1 0
## 31 12 4 0.222 1 0
## 32 11 3 0.000 2 0
## 33 16 3 0.000 0 0
## 34 8 2 0.000 1 0
## 35 37 1 0.000 0 0
## 36 32 1 0.000 0 0
## 37 10 1 0.000 1 0
# graph wOBA
# calculate overall wOBA to be displayed with orange
games_woba <- sum(games_data$woba_value) / length(unique(games_data$AtBatID))
# chart wOBA with overall wOBA comparison, shaded by number of PAs
batter_graph <- ggplot(data = batter_table) +
geom_bar(aes(x = BatterId, y = wOBA, fill = PA), stat = 'identity') +
scale_fill_gradient(low = 'lightblue', high = 'darkblue') +
geom_hline(yintercept = games_woba, linetype = 'dashed', color = 'darkorange') +
scale_x_continuous(breaks = batter_table$BatterId, labels = batter_table$BatterId) +
theme(axis.text.x = element_text(angle = 45, hjust = 1,
size = 6)) +
labs(title = 'Batter wOBA',
subtitle = '(orange line = overall wOBA)')
print(batter_graph)
unique_pitcher_id <- unique(games_data$PitcherId)
pitcher_table <- data.frame()
for (i in unique_pitcher_id) {
pitcher_data <- games_data %>%
filter(PitcherId == i)
# Calculate BF
batters_faced <- length(unique(pitcher_data$AtBatID))
# Calculate IP
innings_pitched <- sum(pitcher_data$outs_on_play) %/% 3 +
0.1 * sum(pitcher_data$outs_on_play) %% 3
# Calculate K%
k_rate <- sum(pitcher_data$PitchCall == 'strikeout') /
batters_faced
# Calculate BB%
bb_rate <- sum(pitcher_data$PitchCall == 'walk') /
batters_faced
# Calculate wOBA-against
woba_against <- sum(pitcher_data$woba_value) / batters_faced
new_row <- data.frame(PitcherId = i,
IP = innings_pitched,
BF = batters_faced,
K = round(k_rate,2),
BB = round(bb_rate,2),
wOBA_against = round(woba_against,3))
pitcher_table <- rbind(pitcher_table, new_row)
}
print(pitcher_table)
## PitcherId IP BF K BB wOBA_against
## 1 11 6.0 26 0.19 0.04 0.315
## 2 1 12.0 47 0.28 0.06 0.237
## 3 6 2.0 6 0.33 0.00 0.000
## 4 2 1.0 3 0.67 0.00 0.000
## 5 4 1.0 4 0.00 0.25 0.172
## 6 5 1.0 5 0.00 0.00 0.432
## 7 13 0.2 3 0.00 0.00 0.297
## 8 16 0.1 2 0.50 0.00 0.445
## 9 12 1.0 6 0.17 0.00 0.445
## 10 3 1.1 5 0.60 0.00 0.178
## 11 14 1.0 5 0.00 0.20 0.558
## 12 8 1.0 3 0.00 0.33 0.230
## 13 7 6.0 26 0.19 0.08 0.233
## 14 15 1.0 3 0.67 0.00 0.000
## 15 9 1.0 4 0.75 0.00 0.222
## 16 10 1.0 6 0.00 0.17 0.412
## 17 17 0.2 5 0.00 0.20 0.316
# Chart wOBA with overall wOBA comparison, shaded by number of PAs
pitcher_graph <- ggplot(data = pitcher_table) +
geom_bar(aes(x = PitcherId, y = wOBA_against, fill = BF), stat = 'identity') +
scale_fill_gradient(low = 'lightblue', high = 'darkblue') +
geom_hline(yintercept = games_woba, linetype = 'dashed', color = 'darkorange') +
scale_x_continuous(breaks = pitcher_table$PitcherId, labels = pitcher_table$PitcherId) +
labs(title = 'wOBA-against',
subtitle = '(orange line = overall wOBA)')
print(pitcher_graph)
# create a function that produces desired visualizations for input pitcher and game
game_visualizations <- function(pitcherId, gamepk) {
# filter data
pitcher_data <- games_data %>%
filter(PitcherId == pitcherId, GamePk == gamepk) %>%
na.omit()
# plot pitch usage
use_plot <- ggplot(data = pitcher_data) +
geom_bar(aes(x = PitchType, fill = PitchType)) +
labs(title = 'Pitch Usage')
# plot pitch movement
mov_plot <- ggplot(data = pitcher_data) +
geom_point(aes(x = TrajectoryHorizontalBreak*12,
y = TrajectoryVerticalBreakInduced*12,
color = PitchType,)) +
geom_hline(yintercept = 0, color = 'black') +
geom_vline(xintercept = 0, color = 'black') +
xlim(-20,20) +
ylim(-20, 20) +
coord_fixed(ratio = 1) +
labs(title = 'Pitch Movement',
x = 'Horizontal Break (in)',
y = 'Induced Vertical Break (in)')
# plot release point
rel_plot <- ggplot(data = pitcher_data) +
geom_point(aes(x = ReleasePositionX,
y = ReleasePositionY,
color = PitchType)) +
xlim(-5,5) +
ylim(40,70) +
labs(title = 'Release Point',
x = 'Horizontal Release',
y = 'Vertical Release')
# velocity density chart
dens_plot <- ggplot(pitcher_data, aes(x = ReleaseSpeed, fill = PitchType)) +
geom_density(alpha = 0.5) +
scale_fill_discrete(name = 'Pitch Type') +
theme_minimal()
labs(title = 'Velocity by Pitch Type',
x = 'Velocity',
y = 'Density')
# velocity over time plot
vel_plot <- ggplot(data = pitcher_data) +
geom_line(aes(x = row_number(pitcher_data), y = ReleaseSpeed, color = PitchType),
size = 1.25) +
ylim(70,105) +
labs(title = 'Velocity over time',
x = 'Pitch Number',
y = 'Velocity')
print(use_plot)
print(mov_plot)
print(rel_plot)
print(dens_plot)
print(vel_plot)
}
game_visualizations(pitcherId = 11, gamepk = 1)
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
#### Findings
Pitcher 1 performed very well. He has very high strikeout numbers, in part because he is able to maintain 96-98 velocity deep into starts.
Pitcher 11 struggled. One possible explanation is the inconsistent release point. He tended to release his SL from a slightly wider release point
# Load Statcast season data
load("G:/My Drive/Baseball/Summer 2023/R Projects/Data/Statcast2022.RData")
# Briefly collect velo and spin grades
league_pitch_summaries <- Statcast2022 %>%
filter(pitch_type %in% c('FF', 'SL','CH')) %>%
group_by(pitch_type) %>%
summarize(
avg_release_speed = mean(release_speed, na.rm = TRUE),
sd_release_speed = sd(release_speed, na.rm = TRUE),
avg_spin_rate = mean(release_spin_rate, na.rm = TRUE),
sd_spin_rate = sd(release_spin_rate, na.rm = TRUE))
pitcher_table <- data.frame(
pitch_type = c( "CH","FF", "SL"),
pitcher_release_speed = c( 84.5, 92.7, 81.4),
pitcher_spin_rate = c(1760, 2145, 2675))
pitcher_table <- inner_join(pitcher_table, league_pitch_summaries, by = 'pitch_type')
pitcher_table_grades <- pitcher_table %>%
mutate(velo_grade = round(((pitcher_release_speed - avg_release_speed) /
sd_release_speed) * 10 + 50,1),
spin_grade = round(((pitcher_spin_rate - avg_spin_rate) /
sd_spin_rate) * 10 + 50,1)) %>%
select(pitch_type, velo_grade, spin_grade)
print(pitcher_table_grades)
## pitch_type velo_grade spin_grade
## 1 CH 47.5 50.3
## 2 FF 45.1 42.3
## 3 SL 39.0 60.9