This is the supporting code used to create the data for my scouting dashboard. The dashboard is meant to provide all of the information that I like to collect before scouting a pitcher. All of the information is derived from raw Statcast data.
In the context of a pro scouting department, this code can be manipulated based on each scout’s preferred metrics.
One benefit of using Tableau is that it can be easily formatted for mobile use. This would be highly beneficial for in-person scouting.
#library(tidyverse)
#library(baseballr)
load("G:/My Drive/Baseball/R Projects/Data/Statcast2023.RData")
load("G:/My Drive/Baseball/R Projects/Data/Statcast2022.RData")
statcast <- rbind(Statcast2022, Statcast2023)
For the purpose of demonstration, I will use five MLB pitchers.
Using the baseballr package, collect the statcast IDs for each pitcher.
kremerid <- playerid_lookup(last_name = "Kremer", first_name = "Dean")$mlbam_id[1]
rogersid <- playerid_lookup(last_name = "Rogers", first_name = "Trevor")$mlbam_id[3]
steeleid <- playerid_lookup(last_name = "Steele", first_name = "Justin")$mlbam_id[1]
megillid <- playerid_lookup(last_name = "Megill", first_name = "Tylor")$mlbam_id[1]
greeneid <- playerid_lookup(last_name = "Greene", first_name = "Hunter")$mlbam_id[1]
pitcher_ids <- data.frame(Name = c('Dean Kremer', 'Trevor Rogers',
'Justin Steele', 'Tylor Megill',
'Hunter Greene'),
ID = c(kremerid, rogersid, steeleid, megillid,
greeneid))
# Define vectors necessary for sorting Statcast data
hits_vector <- c('single', 'double', 'triple', 'home_run')
outs_vector <- c('field_out', 'force_out', 'fielders_choice', 'sac_fly_double_play',
'strikeout_double_play', 'caught_stealing_2b', 'pickoff_1b', 'caught_stealing_3b', 'pickoff_caught_stealing_2b', 'triple_play', 'pickoff_2b',
'grounded_into_double_play', 'sac_fly', 'fielders_choice_out',
'sac_bunt', 'double_play', 'caught_stealing_home', 'other_out',
'pickoff_3b', 'pickoff_caught_stealing_home', 'strikeout')
at_bats_vector <- c('field_out', 'strikeout', 'grounded_into_double_play', 'fielders_choice', 'fielders_choice_out', 'triple_play', 'strikeout_double_play', 'double_play', 'field_error', 'force_out', hits_vector)
plate_appearances_vector <- c(at_bats_vector, 'sac_fly', 'walk', 'hit_by_pitch', 'sac_bunt', 'sac_bunt_double_play', 'sac_fly_double_play', 'catcher_interf')
###############################################################################
#################### Loop stat calcs for each pitcher #########################
###############################################################################
stats <- data.frame()
for(year in c(2022, 2023)){ # Calculate individually for each year
year_data <- filter(statcast, game_year == year)
for(pitcher_name in pitcher_ids$ID){
player_data <- filter(year_data, pitcher == pitcher_name)
# G
games <- length(unique(player_data$game_pk))
# GS
start_data <- player_data %>%
filter(inning == 1 & outs_when_up == 0)
games_started <- length(unique(start_data$game_pk))
# IP
out_count <- sum(player_data$events %in% outs_vector) +
sum(player_data$events %in% c('sac_fly_double_play',
'strikeout_double_play',
'grounded_into_double_play',
'double_play')) +
2 * sum(player_data$events == 'triple_play')
innings_pitched <- out_count / 3
# Create version of IP that uses "0.1" and "0.2" rather than "0.33" and "0.67"
innings_pitched2 <- if_else(innings_pitched %% 1 == 0, innings_pitched,
if_else(innings_pitched %% 1 > 0.66,
floor(innings_pitched) + 0.2,
floor(innings_pitched) + 0.1))
# BF/G
batters_faced <- sum(player_data$events %in% plate_appearances_vector)
bf_per_game <- batters_faced / games
# BB%
walk_count <- sum(player_data$events == 'walk')
bb_rate <- walk_count / batters_faced
# K%
strikeout_count <- sum(player_data$events == 'strikeout')
k_rate <- strikeout_count / batters_faced
# WHIP
hit_count <- sum(player_data$events %in% hits_vector)
whip <- (walk_count + hit_count) / innings_pitched
# xwOBA against
hit_by_pitch_count <- sum(player_data$events == 'hit_by_pitch')
xwoba_sum <- sum(player_data$estimated_woba_using_speedangle, na.rm = TRUE) +
0.7 * (walk_count + hit_by_pitch_count)
woba_denominator <- sum(player_data$events %in% at_bats_vector) +
walk_count +
hit_by_pitch_count +
sum(player_data$events == 'sac_fly')
xwoba <- xwoba_sum / woba_denominator
# New row for each pitcher / year
new_row <- data.frame(ID = pitcher_name,
Year = year,
Level = 'MLB',
G = games,
GS = games_started,
IP = innings_pitched2,
K = k_rate,
BB = bb_rate,
WHIP = whip,
BF_G = bf_per_game,
xWOBA = xwoba)
# Add to data frame
stats <- rbind(stats, new_row)
}
}
# Add names
stats <- stats %>%
left_join(pitcher_ids, by = 'ID') %>%
select(Name, everything(), -ID)
head(stats, 10)
## Name Year Level G GS IP K BB WHIP BF_G
## 1 Dean Kremer 2022 MLB 22 21 123.0 0.1699219 0.06640625 1.276423 23.27273
## 2 Trevor Rogers 2022 MLB 23 23 106.0 0.2222222 0.09433962 1.518868 20.73913
## 3 Justin Steele 2022 MLB 24 24 118.1 0.2441406 0.09765625 1.360563 21.33333
## 4 Tylor Megill 2022 MLB 15 9 47.2 0.2500000 0.06500000 1.237762 13.33333
## 5 Hunter Greene 2022 MLB 24 24 124.2 0.3069680 0.09039548 1.219251 22.12500
## 6 Dean Kremer 2023 MLB 33 33 172.1 0.2061580 0.07362784 1.352031 22.63636
## 7 Trevor Rogers 2023 MLB 4 4 17.2 0.2405063 0.07594937 1.245283 19.75000
## 8 Justin Steele 2023 MLB 30 30 173.0 0.2419580 0.05034965 1.173410 23.83333
## 9 Tylor Megill 2023 MLB 25 25 125.2 0.1851852 0.10229277 1.583554 22.68000
## 10 Hunter Greene 2023 MLB 22 22 111.1 0.3058350 0.09456740 1.419162 22.59091
## xWOBA
## 1 0.3263725
## 2 0.3369329
## 3 0.2933053
## 4 0.3244091
## 5 0.3082981
## 6 0.3435596
## 7 0.3089747
## 8 0.2890324
## 9 0.3638304
## 10 0.3004404
This is very similar to the previous chunk, but it includes a third nested loop for each date.
time_stats <- data.frame()
for(year in c(2022, 2023)){
year_data <- filter(statcast, game_year == year)
for(pitcher_name in pitcher_ids$ID){
player_data <- filter(year_data, pitcher == pitcher_name)
unique_dates <- unique(player_data$game_date)
##############################################################################
################## Loop for each date ########################################
##############################################################################
for(d in unique_dates){
d <- as.Date(d, origin = "1970-01-01")
# Rolling calculation for the last 50 days (approx 10 starts)
last_50 <- player_data %>%
filter(game_date <= d & game_date >= d - 50)
games <- length(unique(last_50$game_pk))
start_data <- last_50 %>%
filter(inning == 1 & outs_when_up == 0)
games_started <- length(unique(start_data$game_pk))
out_count <- sum(last_50$events %in% outs_vector) +
sum(last_50$events %in% c('sac_fly_double_play',
'strikeout_double_play',
'grounded_into_double_play',
'double_play')) +
2 * sum(last_50$events == 'triple_play')
innings_pitched <- out_count / 3
innings_pitched2 <- if_else(innings_pitched %% 1 == 0, innings_pitched,
if_else(innings_pitched %% 1 > 0.66,
floor(innings_pitched) + 0.2,
floor(innings_pitched) + 0.1))
batters_faced <- sum(last_50$events %in% plate_appearances_vector)
bf_per_game <- batters_faced / games
walk_count <- sum(last_50$events == 'walk')
bb_rate <- walk_count / batters_faced
strikeout_count <- sum(last_50$events == 'strikeout')
k_rate <- strikeout_count / batters_faced
hit_count <- sum(last_50$events %in% hits_vector)
whip <- (walk_count + hit_count) / innings_pitched
hit_by_pitch_count <- sum(player_data$events == 'hit_by_pitch')
xwoba_sum <- sum(player_data$estimated_woba_using_speedangle, na.rm = TRUE) +
0.7 * (walk_count + hit_by_pitch_count)
woba_denominator <- sum(player_data$events %in% at_bats_vector) +
walk_count +
hit_by_pitch_count +
sum(player_data$events == 'sac_fly')
xwoba <- xwoba_sum / woba_denominator
new_row <- data.frame(ID = pitcher_name,
Year = year,
Date = d,
Level = 'MLB',
G = games,
GS = games_started,
IP = innings_pitched2,
K = k_rate,
BB = bb_rate,
WHIP = whip,
BF_G = bf_per_game,
xWOBA = xwoba)
time_stats <- rbind(time_stats, new_row)
}
}
}
time_stats <- time_stats %>%
left_join(pitcher_ids, by = 'ID') %>%
select(Name, everything(), -ID)
head(time_stats, 10)
## Name Year Date Level G GS IP K BB WHIP
## 1 Dean Kremer 2022 2022-06-05 MLB 1 1 4.1 0.1578947 0.05263158 1.384615
## 2 Dean Kremer 2022 2022-06-12 MLB 2 2 9.2 0.1219512 0.07317073 1.241379
## 3 Dean Kremer 2022 2022-06-17 MLB 3 3 15.0 0.1612903 0.06451613 1.200000
## 4 Dean Kremer 2022 2022-06-23 MLB 4 4 20.1 0.1627907 0.05813953 1.278689
## 5 Dean Kremer 2022 2022-06-28 MLB 5 5 27.0 0.1711712 0.06306306 1.185185
## 6 Dean Kremer 2022 2022-07-04 MLB 6 6 31.2 0.1703704 0.06666667 1.326316
## 7 Dean Kremer 2022 2022-07-09 MLB 7 7 36.2 0.1935484 0.07741935 1.281818
## 8 Dean Kremer 2022 2022-07-16 MLB 8 8 40.2 0.1839080 0.06896552 1.352459
## 9 Dean Kremer 2022 2022-07-24 MLB 9 9 45.2 0.1938776 0.06632653 1.335766
## 10 Dean Kremer 2022 2022-07-30 MLB 9 9 45.2 0.1890547 0.06467662 1.445255
## BF_G xWOBA
## 1 19.00000 0.3005241
## 2 20.50000 0.3021921
## 3 20.66667 0.3030208
## 4 21.50000 0.3038462
## 5 22.20000 0.3054865
## 6 22.50000 0.3071134
## 7 22.14286 0.3095287
## 8 21.75000 0.3095287
## 9 21.77778 0.3103272
## 10 22.33333 0.3103272
player_data <- statcast %>%
filter(pitcher %in% pitcher_ids$ID & game_year == 2023)
# two "group by's" are necessary to calculate for each individual pitcher
pitch_usage <- player_data %>%
group_by(pitcher, pitch_name) %>%
summarize(count = n()) %>%
group_by(pitcher) %>%
mutate(total_count = sum(count)) %>%
mutate(usage = count / total_count,
# rank of usage for each pitcher
# This is necessary so that the dashboard can filter based on "Pitch 1" rather than a specific type of pitch. This allows the dashboard to display only the pitches thrown by the selected pitcher.
rank = dense_rank(desc(usage))) %>%
filter(usage > 0.04) # Only include pitches thrown over 4% of the time
## `summarise()` has grouped output by 'pitcher'. You can override using the
## `.groups` argument.
metrics <- player_data %>%
group_by(pitcher, pitch_name) %>%
summarize(velocity = mean(release_speed, na.rm = T),
spin_rate = mean(release_spin_rate, na.rm = T),
vert_break = mean(pfx_z, na.rm = T) * 12, # feet to inches
horz_break = mean(pfx_x, na.rm = T) * -12, # pitcher perspective
count = n())
## `summarise()` has grouped output by 'pitcher'. You can override using the
## `.groups` argument.
# join tables
pitch_data <- pitch_usage %>%
left_join(metrics, by = c('pitcher', 'pitch_name')) %>%
rename(ID = pitcher) %>%
left_join(pitcher_ids, by = 'ID') %>%
select(Name, pitch_name, usage, rank, velocity, spin_rate, vert_break,
horz_break, count.x, total_count) %>%
rename(count = count.x)
## Adding missing grouping variables: `ID`
head(pitch_data, 11)
## # A tibble: 11 × 11
## # Groups: ID [3]
## ID Name pitch_name usage rank velocity spin_rate vert_break horz_break
## <dbl> <chr> <chr> <dbl> <int> <dbl> <dbl> <dbl> <dbl>
## 1 656731 Tylo… 4-Seam Fa… 0.557 1 94.9 2126. 14.7 7.75
## 2 656731 Tylo… Changeup 0.149 3 87.9 1607. 6.23 14.4
## 3 656731 Tylo… Curveball 0.0859 4 77.2 2316. -13.6 -8.00
## 4 656731 Tylo… Slider 0.206 2 84.4 2225. -1.59 -4.18
## 5 657006 Just… 4-Seam Fa… 0.626 1 91.8 2402. 10.8 1.33
## 6 657006 Just… Slider 0.339 2 83.3 2658. -0.361 14.1
## 7 665152 Dean… 4-Seam Fa… 0.373 1 94.6 2304. 16.8 9.65
## 8 665152 Dean… Changeup 0.121 4 85.3 1714. 4.97 15.3
## 9 665152 Dean… Curveball 0.0897 5 78.2 2758. -10.7 -10.3
## 10 665152 Dean… Cutter 0.239 2 88.8 2435. 8.61 -3.79
## 11 665152 Dean… Sinker 0.145 3 91.9 2002. 9.79 16.0
## # ℹ 2 more variables: count <int>, total_count <int>