Introduction:
In this analysis, I examine MLB teams’ pitching strategies by analyzing how often different pitch types are used. I combine pitch-percentage data to calculate overall fastball, sinker/cutter, and off-speed usage rates, then apply clustering to identify trends. Finally, I explore correlations with ERA to gain insight into each strategy's effectiveness.
# Read the two pitcher-level CSV exports from the working directory
complete_pitchers <- read.csv(file = "complete_pitchers.csv")
pitch_type_data <- read.csv(file = "pitch_type_data.csv")

# Join the two tables on pitcher name and team. merge() defaults to an inner
# join, so only pitchers present in both files are kept.
merged_data <- merge(
  x = complete_pitchers,
  y = pitch_type_data,
  by = c("Name", "Team")
)
Data Preparation
# Pitch-type percentage columns that need converting from text to numbers
percentage_columns <- c("FA.", "FC.", "FS.", "SI.", "CU.")

# Turn percentage strings such as "45.5%" into fractions (0.455).
#   col: vector of percentage values stored as text.
# Returns a numeric vector the same length as `col`; entries that cannot be
# parsed as numbers come back as NA.
clean_percentage <- function(col) {
  without_sign <- gsub("%", "", col, fixed = TRUE)
  as.numeric(without_sign) / 100
}
# Convert every percentage column to a fraction in a single pass, then treat
# a missing value as 0 usage for that pitch type.
merged_data <- merged_data |>
  mutate(
    across(
      all_of(percentage_columns),
      \(raw) {
        fraction <- clean_percentage(raw)
        ifelse(is.na(fraction), 0, fraction)
      }
    )
  )
Pitch Usage Calculations
# Derive the three usage groups in one mutate(); later expressions may refer
# to columns created earlier in the same call.
#   Fastball_Usage      = FA% + FC% + FS%
#   Sinker_Cutter_Usage = SI% + CU%
#   Offspeed_Usage      = whatever share remains after the two groups above
# NOTE(review): under FanGraphs-style pitch codes FC is the cutter and CU is
# the curveball. If this data follows that scheme, the cutter is counted in
# Fastball_Usage and Sinker_Cutter_Usage is really sinker + curveball —
# confirm against the data dictionary.
# NOTE(review): missing percentages were replaced by 0 upstream, so a pitcher
# with no pitch-type data ends up with Offspeed_Usage = 1.
merged_data <- merged_data |>
  mutate(
    Fastball_Usage = FA. + FC. + FS.,
    Sinker_Cutter_Usage = SI. + CU.,
    Offspeed_Usage = 1 - (Fastball_Usage + Sinker_Cutter_Usage)
  )
Aggregate Data by Team
# Collapse pitcher rows to one row per team. Each metric is the unweighted
# mean across that team's pitchers, ignoring missing values.
team_averages <- merged_data |>
  group_by(Team) |>
  summarise(
    Avg_ERA           = mean(ERA, na.rm = TRUE),
    Avg_Fastball      = mean(Fastball_Usage, na.rm = TRUE),
    Avg_Sinker_Cutter = mean(Sinker_Cutter_Usage, na.rm = TRUE),
    Avg_Offspeed      = mean(Offspeed_Usage, na.rm = TRUE)
  )
K-Means Clustering
# Set seed for reproducibility (k-means starts from randomly chosen centers)
set.seed(123)

# Cluster teams on average ERA paired with one usage metric.
#   team_data: data frame with an `Avg_ERA` column and the column named by
#              `usage_col`.
#   usage_col: name (string) of the usage column to pair with ERA.
#   centers:   number of clusters to fit.
#   nstart:    number of random starts; the best solution is kept.
# Returns the fitted `kmeans` object. Features are standardised first, so the
# `$centers` it reports are in z-score units rather than raw ERA / usage.
cluster_era_with_usage <- function(team_data, usage_col,
                                   centers = 3, nstart = 25) {
  # ERA and usage fractions are on different scales; without standardising,
  # Euclidean distance is driven by whichever column has the larger spread.
  features <- scale(as.matrix(team_data[, c("Avg_ERA", usage_col)]))
  kmeans(features, centers = centers, nstart = nstart)
}

# Perform clustering
fastball_clusters <- cluster_era_with_usage(team_averages, "Avg_Fastball")
sinker_cutter_clusters <- cluster_era_with_usage(team_averages, "Avg_Sinker_Cutter")
offspeed_clusters <- cluster_era_with_usage(team_averages, "Avg_Offspeed")

# Assign clusters to data (one label per team, in row order)
team_averages <- team_averages |>
  mutate(
    Fastball_Cluster = as.factor(fastball_clusters$cluster),
    Sinker_Cutter_Cluster = as.factor(sinker_cutter_clusters$cluster),
    Offspeed_Cluster = as.factor(offspeed_clusters$cluster)
  )
Correlation Analysis
# Correlate each team-level usage rate with team-average ERA, using only
# teams where both values are present, then print each rounded to 2 decimals.
fastball_correlation <- with(
  team_averages,
  cor(Avg_Fastball, Avg_ERA, use = "complete.obs")
)
sinker_cutter_correlation <- with(
  team_averages,
  cor(Avg_Sinker_Cutter, Avg_ERA, use = "complete.obs")
)
offspeed_correlation <- with(
  team_averages,
  cor(Avg_Offspeed, Avg_ERA, use = "complete.obs")
)

print(paste("Fastball Usage & ERA Correlation:", round(fastball_correlation, digits = 2)))
## [1] "Fastball Usage & ERA Correlation: 0.94"
print(paste("Sinker/Cutter Usage & ERA Correlation:", round(sinker_cutter_correlation, digits = 2)))
## [1] "Sinker/Cutter Usage & ERA Correlation: -0.75"
print(paste("Off-speed Usage & ERA Correlation:", round(offspeed_correlation, digits = 2)))
## [1] "Off-speed Usage & ERA Correlation: -0.01"
Visualization with Team Logos
# Logo image file for each team abbreviation that has one
team_logo_files <- c(
  NYY = "yankees_logo.png",
  NYM = "mets_logo.png",
  LAD = "dodgers_logo.png",
  CLE = "guardians_logo.png"
)

# Attach the logo path to each team row; teams without an entry get NA.
# as.character() makes the lookup go by name even if Team is a factor.
team_averages <- team_averages |>
  mutate(logo = unname(team_logo_files[as.character(Team)]))
# Usage columns hold fractions (0-1) but the axis titles say "(%)", so label
# the x-axis ticks in percent. Only the tick labels change; the plotted data
# and the fitted lines are untouched.
usage_percent_axis <- scale_x_continuous(
  labels = function(p) as.character(round(100 * p, 1))
)

# Each plot draws one team logo per row at (usage, ERA) and overlays a
# least-squares line without a confidence band.

# Plot: Fastball Usage vs. ERA
ggplot(team_averages, aes(x = Avg_Fastball, y = Avg_ERA)) +
  geom_image(aes(image = logo), size = 0.15) +
  geom_smooth(method = "lm", se = FALSE, color = "red") +
  usage_percent_axis +
  labs(title = "Fastball Usage vs. ERA", x = "Avg Fastball Usage (%)", y = "Avg ERA") +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

# Plot: Sinker/Cutter Usage vs. ERA
ggplot(team_averages, aes(x = Avg_Sinker_Cutter, y = Avg_ERA)) +
  geom_image(aes(image = logo), size = 0.15) +
  geom_smooth(method = "lm", se = FALSE, color = "green") +
  usage_percent_axis +
  labs(title = "Avg Sinker/Cutter Usage and Avg ERA", x = "Avg Sinker/Cutter Usage (%)", y = "Avg ERA") +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

# Plot: Offspeed usage vs. ERA
ggplot(team_averages, aes(x = Avg_Offspeed, y = Avg_ERA)) +
  geom_image(aes(image = logo), size = 0.15) +
  geom_smooth(method = "lm", se = FALSE, color = "yellow") +
  usage_percent_axis +
  labs(title = "Avg Offspeed Usage and Avg ERA", x = "Avg Offspeed Usage (%)", y = "Avg ERA") +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
Conclusion: My analysis of 32 pitchers revealed that teams with higher fastball usage tend to have higher ERAs, suggesting that over-reliance on fastballs can lead to predictability and easier reads for hitters. The Mets, with the lowest fastball usage (21%) among postseason teams, also have the lowest ERA, while the Dodgers, who use fastballs 57% of the time, have the highest ERA, indicating the risks of overuse. Conversely, teams that incorporate more sinkers and cutters tend to have lower ERAs, possibly because these pitches induce weak contact. The Mets lead in sinker/cutter usage (46%), which is consistent with their low ERA, while the Dodgers’ minimal use (8%) aligns with their struggles. Off-speed usage, however, showed almost no correlation with ERA, suggesting that the success of off-speed pitches depends on strategic, situational use by individual pitchers rather than on a consistent team-wide approach.