This is the supporting code used to create the data for my scouting dashboard. The dashboard is meant to provide all of the information that I like to collect before scouting a pitcher. All of the information is derived from raw Statcast data.

In the context of a pro scouting department, this code can be manipulated based on each scout’s preferred metrics.

Sample screenshot:

One benefit of using Tableau is that it can be easily formatted for mobile use. This would be highly beneficial for in-person scouting.

Sample mobile layout:

Code used to create data

Necessary data sets:
  • Player Information
  • Relevant line stats
  • Stats over time
  • Pitch metrics and usages
#library(tidyverse)
#library(baseballr)
load("G:/My Drive/Baseball/R Projects/Data/Statcast2023.RData")
load("G:/My Drive/Baseball/R Projects/Data/Statcast2022.RData")

statcast <- rbind(Statcast2022, Statcast2023)

For the purpose of demonstration, I will use five MLB pitchers.

Using the baseballr package, collect the statcast IDs for each pitcher.

kremerid <- playerid_lookup(last_name = "Kremer", first_name = "Dean")$mlbam_id[1]
rogersid <- playerid_lookup(last_name = "Rogers", first_name = "Trevor")$mlbam_id[3]
steeleid <- playerid_lookup(last_name = "Steele", first_name = "Justin")$mlbam_id[1]
megillid <- playerid_lookup(last_name = "Megill", first_name = "Tylor")$mlbam_id[1]
greeneid <- playerid_lookup(last_name = "Greene", first_name = "Hunter")$mlbam_id[1]

pitcher_ids <- data.frame(Name = c('Dean Kremer', 'Trevor Rogers', 
                                   'Justin Steele', 'Tylor Megill', 
                                   'Hunter Greene'),
                          ID = c(kremerid, rogersid, steeleid, megillid, 
                                 greeneid))

Relevant line stats

# Define vectors necessary for sorting Statcast data
hits_vector <- c('single', 'double', 'triple', 'home_run')
outs_vector <- c('field_out', 'force_out', 'fielders_choice', 'sac_fly_double_play',
                 'strikeout_double_play', 'caught_stealing_2b', 'pickoff_1b',  'caught_stealing_3b', 'pickoff_caught_stealing_2b', 'triple_play', 'pickoff_2b',
                 'grounded_into_double_play', 'sac_fly', 'fielders_choice_out', 
                 'sac_bunt', 'double_play', 'caught_stealing_home', 'other_out',
                 'pickoff_3b', 'pickoff_caught_stealing_home', 'strikeout')
at_bats_vector <- c('field_out', 'strikeout', 'grounded_into_double_play', 'fielders_choice', 'fielders_choice_out', 'triple_play', 'strikeout_double_play', 'double_play', 'field_error', 'force_out', hits_vector)
plate_appearances_vector <- c(at_bats_vector, 'sac_fly', 'walk', 'hit_by_pitch', 'sac_bunt', 'sac_bunt_double_play', 'sac_fly_double_play', 'catcher_interf')


###############################################################################
#################### Loop stat calcs for each pitcher #########################
###############################################################################

stats <- data.frame()

for(year in c(2022, 2023)){ # Calculate individually for each year 
  
  year_data <- filter(statcast, game_year == year)
  
  for(pitcher_name in pitcher_ids$ID){
    
  player_data <- filter(year_data, pitcher == pitcher_name)
  
  
  
# G
games <- length(unique(player_data$game_pk))




# GS
start_data <- player_data %>%
  filter(inning == 1 & outs_when_up == 0)

games_started <- length(unique(start_data$game_pk))




# IP
out_count <- sum(player_data$events %in% outs_vector) + 
               sum(player_data$events %in% c('sac_fly_double_play',
                                                  'strikeout_double_play',
                                                  'grounded_into_double_play',
                                                  'double_play')) +
                2 * sum(player_data$events == 'triple_play')

innings_pitched <- out_count / 3

          # Create version of IP that uses "0.1" and "0.2" rather than "0.33" and "0.67"
innings_pitched2 <- if_else(innings_pitched %% 1 == 0, innings_pitched,
                      if_else(innings_pitched %% 1 > 0.66,
                              floor(innings_pitched) + 0.2,
                              floor(innings_pitched) + 0.1))




# BF/G
batters_faced <- sum(player_data$events %in% plate_appearances_vector)

bf_per_game <- batters_faced / games




# BB%
walk_count <- sum(player_data$events == 'walk')
bb_rate <- walk_count / batters_faced




# K% 
strikeout_count <- sum(player_data$events == 'strikeout')
k_rate <- strikeout_count / batters_faced




# WHIP
hit_count <- sum(player_data$events %in% hits_vector)
whip <- (walk_count + hit_count) / innings_pitched





# xwOBA against
hit_by_pitch_count <- sum(player_data$events == 'hit_by_pitch')
  xwoba_sum <- sum(player_data$estimated_woba_using_speedangle, na.rm = TRUE) +
                0.7 * (walk_count + hit_by_pitch_count)
  woba_denominator <- sum(player_data$events %in% at_bats_vector) +
    walk_count +
    hit_by_pitch_count +
    sum(player_data$events == 'sac_fly')
  xwoba <- xwoba_sum / woba_denominator

  
  
# New row for each pitcher / year
new_row <- data.frame(ID = pitcher_name,
                    Year = year,
                    Level = 'MLB',
                    G = games, 
                    GS = games_started,
                    IP = innings_pitched2,
                    K = k_rate,
                    BB = bb_rate,
                    WHIP = whip,
                    BF_G = bf_per_game,
                    xWOBA = xwoba)

# Add to data frame 
stats <- rbind(stats, new_row)

  }
}

# Add names 
stats <- stats %>%
  left_join(pitcher_ids, by = 'ID') %>%
  select(Name, everything(), -ID)

head(stats, 10)
##             Name Year Level  G GS    IP         K         BB     WHIP     BF_G
## 1    Dean Kremer 2022   MLB 22 21 123.0 0.1699219 0.06640625 1.276423 23.27273
## 2  Trevor Rogers 2022   MLB 23 23 106.0 0.2222222 0.09433962 1.518868 20.73913
## 3  Justin Steele 2022   MLB 24 24 118.1 0.2441406 0.09765625 1.360563 21.33333
## 4   Tylor Megill 2022   MLB 15  9  47.2 0.2500000 0.06500000 1.237762 13.33333
## 5  Hunter Greene 2022   MLB 24 24 124.2 0.3069680 0.09039548 1.219251 22.12500
## 6    Dean Kremer 2023   MLB 33 33 172.1 0.2061580 0.07362784 1.352031 22.63636
## 7  Trevor Rogers 2023   MLB  4  4  17.2 0.2405063 0.07594937 1.245283 19.75000
## 8  Justin Steele 2023   MLB 30 30 173.0 0.2419580 0.05034965 1.173410 23.83333
## 9   Tylor Megill 2023   MLB 25 25 125.2 0.1851852 0.10229277 1.583554 22.68000
## 10 Hunter Greene 2023   MLB 22 22 111.1 0.3058350 0.09456740 1.419162 22.59091
##        xWOBA
## 1  0.3263725
## 2  0.3369329
## 3  0.2933053
## 4  0.3244091
## 5  0.3082981
## 6  0.3435596
## 7  0.3089747
## 8  0.2890324
## 9  0.3638304
## 10 0.3004404

Stats over time

This is very similar to the previous chunk, but it includes a third nested loop for each date.

time_stats <- data.frame()

for(year in c(2022, 2023)){
  
  year_data <- filter(statcast, game_year == year)
  
  for(pitcher_name in pitcher_ids$ID){
    
  player_data <- filter(year_data, pitcher == pitcher_name)
  unique_dates <- unique(player_data$game_date)
  
  
  ##############################################################################
  ################## Loop for each date ########################################
  ##############################################################################
  
  
    for(d in unique_dates){
      
    d <- as.Date(d, origin = "1970-01-01")
    # Rolling calculation for the last 50 days (approx 10 starts)
    last_50 <- player_data %>%
      filter(game_date <= d & game_date >= d - 50)

games <- length(unique(last_50$game_pk))
start_data <- last_50 %>%
  filter(inning == 1 & outs_when_up == 0)
games_started <- length(unique(start_data$game_pk))
out_count <- sum(last_50$events %in% outs_vector) + 
               sum(last_50$events %in% c('sac_fly_double_play',
                                                  'strikeout_double_play',
                                                  'grounded_into_double_play',
                                                  'double_play')) +
                2 * sum(last_50$events == 'triple_play')
innings_pitched <- out_count / 3
innings_pitched2 <- if_else(innings_pitched %% 1 == 0, innings_pitched,
                      if_else(innings_pitched %% 1 > 0.66,
                              floor(innings_pitched) + 0.2,
                              floor(innings_pitched) + 0.1))
batters_faced <- sum(last_50$events %in% plate_appearances_vector)
bf_per_game <- batters_faced / games
walk_count <- sum(last_50$events == 'walk')
bb_rate <- walk_count / batters_faced
strikeout_count <- sum(last_50$events == 'strikeout')
k_rate <- strikeout_count / batters_faced
hit_count <- sum(last_50$events %in% hits_vector)
whip <- (walk_count + hit_count) / innings_pitched
hit_by_pitch_count <- sum(player_data$events == 'hit_by_pitch')
  xwoba_sum <- sum(player_data$estimated_woba_using_speedangle, na.rm = TRUE) +
                0.7 * (walk_count + hit_by_pitch_count)
  woba_denominator <- sum(player_data$events %in% at_bats_vector) +
    walk_count +
    hit_by_pitch_count +
    sum(player_data$events == 'sac_fly')
  xwoba <- xwoba_sum / woba_denominator


new_row <- data.frame(ID = pitcher_name,
                    Year = year,
                    Date = d,
                    Level = 'MLB',
                    G = games, 
                    GS = games_started,
                    IP = innings_pitched2,
                    K = k_rate,
                    BB = bb_rate,
                    WHIP = whip,
                    BF_G = bf_per_game,
                    xWOBA = xwoba)

time_stats <- rbind(time_stats, new_row)

    }
  }
}

time_stats <- time_stats %>%
  left_join(pitcher_ids, by = 'ID') %>%
  select(Name, everything(), -ID)

head(time_stats, 10)
##           Name Year       Date Level G GS   IP         K         BB     WHIP
## 1  Dean Kremer 2022 2022-06-05   MLB 1  1  4.1 0.1578947 0.05263158 1.384615
## 2  Dean Kremer 2022 2022-06-12   MLB 2  2  9.2 0.1219512 0.07317073 1.241379
## 3  Dean Kremer 2022 2022-06-17   MLB 3  3 15.0 0.1612903 0.06451613 1.200000
## 4  Dean Kremer 2022 2022-06-23   MLB 4  4 20.1 0.1627907 0.05813953 1.278689
## 5  Dean Kremer 2022 2022-06-28   MLB 5  5 27.0 0.1711712 0.06306306 1.185185
## 6  Dean Kremer 2022 2022-07-04   MLB 6  6 31.2 0.1703704 0.06666667 1.326316
## 7  Dean Kremer 2022 2022-07-09   MLB 7  7 36.2 0.1935484 0.07741935 1.281818
## 8  Dean Kremer 2022 2022-07-16   MLB 8  8 40.2 0.1839080 0.06896552 1.352459
## 9  Dean Kremer 2022 2022-07-24   MLB 9  9 45.2 0.1938776 0.06632653 1.335766
## 10 Dean Kremer 2022 2022-07-30   MLB 9  9 45.2 0.1890547 0.06467662 1.445255
##        BF_G     xWOBA
## 1  19.00000 0.3005241
## 2  20.50000 0.3021921
## 3  20.66667 0.3030208
## 4  21.50000 0.3038462
## 5  22.20000 0.3054865
## 6  22.50000 0.3071134
## 7  22.14286 0.3095287
## 8  21.75000 0.3095287
## 9  21.77778 0.3103272
## 10 22.33333 0.3103272

Pitch data (usage and metrics)

player_data <- statcast %>%
  filter(pitcher %in% pitcher_ids$ID & game_year == 2023)

  # two "group by's" are necessary to calculate for each individual pitcher
  pitch_usage <- player_data %>%
    group_by(pitcher, pitch_name) %>%
    summarize(count = n()) %>%
    group_by(pitcher) %>%
    mutate(total_count = sum(count)) %>%
    mutate(usage = count / total_count,
     # rank of usage for each pitcher
     # This is necessary so that the dashboard can filter based on "Pitch 1" rather than a specific type of pitch. This allows the dashboard to display only the pitches thrown by the selected pitcher. 
           rank = dense_rank(desc(usage))) %>%
    filter(usage > 0.04) # Only include pitches thrown over 4% of the time
## `summarise()` has grouped output by 'pitcher'. You can override using the
## `.groups` argument.
  metrics <- player_data %>%
    group_by(pitcher, pitch_name) %>%
    summarize(velocity = mean(release_speed, na.rm = T),
            spin_rate = mean(release_spin_rate, na.rm = T),
            vert_break = mean(pfx_z, na.rm = T) * 12, # feet to inches
            horz_break = mean(pfx_x, na.rm = T) * -12, # pitcher perspective
            count = n())
## `summarise()` has grouped output by 'pitcher'. You can override using the
## `.groups` argument.
  # join tables
  pitch_data <- pitch_usage %>%
    left_join(metrics, by = c('pitcher', 'pitch_name')) %>%
    rename(ID = pitcher) %>%
    left_join(pitcher_ids, by = 'ID') %>%
    select(Name, pitch_name, usage, rank, velocity, spin_rate, vert_break, 
           horz_break, count.x, total_count) %>%
    rename(count = count.x)
## Adding missing grouping variables: `ID`
head(pitch_data, 11)
## # A tibble: 11 × 11
## # Groups:   ID [3]
##        ID Name  pitch_name  usage  rank velocity spin_rate vert_break horz_break
##     <dbl> <chr> <chr>       <dbl> <int>    <dbl>     <dbl>      <dbl>      <dbl>
##  1 656731 Tylo… 4-Seam Fa… 0.557      1     94.9     2126.     14.7         7.75
##  2 656731 Tylo… Changeup   0.149      3     87.9     1607.      6.23       14.4 
##  3 656731 Tylo… Curveball  0.0859     4     77.2     2316.    -13.6        -8.00
##  4 656731 Tylo… Slider     0.206      2     84.4     2225.     -1.59       -4.18
##  5 657006 Just… 4-Seam Fa… 0.626      1     91.8     2402.     10.8         1.33
##  6 657006 Just… Slider     0.339      2     83.3     2658.     -0.361      14.1 
##  7 665152 Dean… 4-Seam Fa… 0.373      1     94.6     2304.     16.8         9.65
##  8 665152 Dean… Changeup   0.121      4     85.3     1714.      4.97       15.3 
##  9 665152 Dean… Curveball  0.0897     5     78.2     2758.    -10.7       -10.3 
## 10 665152 Dean… Cutter     0.239      2     88.8     2435.      8.61       -3.79
## 11 665152 Dean… Sinker     0.145      3     91.9     2002.      9.79       16.0 
## # ℹ 2 more variables: count <int>, total_count <int>