#library(tidyverse)
#library(readxl)
#library(data.table)

Question 9

9a: How many pitches do not have a recorded spin axis?

pitch_data <- read.csv('pitch_data.csv')
sum(is.na(pitch_data$spin_axis))
## [1] 74

Answer: 74 pitches do not have a recorded spin axis.

9b: What is the ID of the pitcher who threw the highest percentage of fastballs (4-seam fastball + sinker) with a minimum of 30 total pitches? What is that percentage?

pitcher_table1 <- data.frame()

pitcher_list <- unique(pitch_data$pitcher_id)

for(i in pitcher_list){
  i_table <- filter(pitch_data, pitcher_id == i)
  
  pitch_count <- nrow(i_table)
  
  i_fb <- filter(i_table, pitch_name == '4-seam fastball' | pitch_name == 'Sinker')
  
  fb_count <- nrow(i_fb)
  
  fb_percentage <- fb_count/pitch_count
  
  new_row <- data.frame(pitcher_id = i,
                        total = pitch_count,
                        fb_perc = fb_percentage)
  
  pitcher_table1 <- rbind(pitcher_table1, new_row)
  
}

pitcher_table1 <- pitcher_table1 %>% 
  filter(total >= 30) %>% 
  arrange(desc(fb_perc))
head(pitcher_table1, 1)
##   pitcher_id total   fb_perc
## 1      87004    38 0.8421053

Answer: Pitcher 87004 threw the highest percentage of fastballs at 84.2%

9c: What is the ID of the pitcher who on average has the furthest break from pitching hand to glove side on a slider or sweeper? Which pitch type is it? What is the average pitching hand to glove side break on that pitcher’s pitch? Please group sliders and sweepers separately for pitchers who throw both.

pitcher_table2 <- pitch_data %>%
  filter(pitcher_id %in% pitcher_list, pitch_name %in% c('Slider', 'Sweeper')) %>%
  group_by(pitcher_id, pitch_name) %>%
  summarize(avg_break = mean(abs(pfx_x))) %>%
  pivot_wider(names_from = pitch_name, values_from = avg_break) %>%
  mutate(max = max(Slider, Sweeper, na.rm = T)) %>%
  arrange(order_by = desc(max))
## `summarise()` has grouped output by 'pitcher_id'. You can override using the
## `.groups` argument.
head(pitcher_table2, 1)
## # A tibble: 1 × 4
## # Groups:   pitcher_id [1]
##   pitcher_id Slider Sweeper   max
##        <int>  <dbl>   <dbl> <dbl>
## 1      22327     NA    1.82  1.82

Answer: Pitcher 22327 has the highest average break on SL or SW, with 1.82 ft (21.81 in) on his Sweeper.

Question 10

Determine which player (return the player ID) has the highest average speed, what that average speed is, and what date the player achieved his max speed.

speed_table1 <- read_excel('SpeedQuestion.xlsx', sheet = 'Coordinates')
speed_table2 <- read_excel('SpeedQuestion.xlsx', sheet = 'Pitches')
# separate coordinates by frame
# every play has fewer than 300 frames of recorded data, and missing frames will create NAs

frame_coords <- speed_table1 %>%
  separate(col = x, into = paste0('x', 1:300), sep = ',') %>%
  separate(col = y, into = paste0('y', 1:300), sep = ',') 
## Warning: Expected 300 pieces. Missing pieces filled with `NA` in 150000 rows [1, 2, 3,
## 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, ...].
## Expected 300 pieces. Missing pieces filled with `NA` in 150000 rows [1, 2, 3,
## 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, ...].
# pivot each x and y coordinate into individual rows
x_coords <- frame_coords %>%
  select(-starts_with('y')) %>%
  pivot_longer(cols = starts_with('x'), names_to = 'frame', values_to = 'x') %>%
  mutate(frame = str_remove_all(frame, 'x'),
         x = as.numeric(str_remove_all(x, '\\[|\\]')))
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `x = as.numeric(str_remove_all(x, "\\[|\\]"))`.
## Caused by warning:
## ! NAs introduced by coercion
y_coords <- frame_coords %>%
  select(-starts_with('x')) %>%
  pivot_longer(cols = starts_with('y'), names_to = 'frame', values_to = 'y') %>%
  mutate(frame = str_remove_all(frame, 'y'),
         y = as.numeric(str_remove_all(y, '\\[|\\]')))
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `y = as.numeric(str_remove_all(y, "\\[|\\]"))`.
## Caused by warning:
## ! NAs introduced by coercion
coords <- x_coords %>%
  left_join(y_coords, by = c('pitch_id', 'player_id', 'pos', 'inn', 'frame'))

head(coords)
## # A tibble: 6 × 7
##   pitch_id player_id   pos   inn frame     x     y
##      <dbl>     <dbl> <dbl> <dbl> <chr> <dbl> <dbl>
## 1 40851318      1282     2     1 1         0    -5
## 2 40851318      1282     2     1 2         0    -5
## 3 40851318      1282     2     1 3         0    -5
## 4 40851318      1282     2     1 4         0    -5
## 5 40851318      1282     2     1 5         0    -5
## 6 40851318      1282     2     1 6         0    -5
# calculate distance between each frame
# lag is previous location
coords2 <- coords %>%
  mutate(x_lag = shift(x, n = 5, type = "lag", fill = NA), # n = 5 means 5 frames prior (1 second)
         y_lag = shift(y, n = 5, type = "lag", fill = NA),
         fps = sqrt((x - x_lag)^2 + (y - y_lag)^2), # feet per second
         mph =  fps * 0.681818) # convert to mph

head(coords2, 40)
## # A tibble: 40 × 11
##    pitch_id player_id   pos   inn frame     x     y x_lag y_lag   fps   mph
##       <dbl>     <dbl> <dbl> <dbl> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
##  1 40851318      1282     2     1 1         0    -5    NA    NA    NA    NA
##  2 40851318      1282     2     1 2         0    -5    NA    NA    NA    NA
##  3 40851318      1282     2     1 3         0    -5    NA    NA    NA    NA
##  4 40851318      1282     2     1 4         0    -5    NA    NA    NA    NA
##  5 40851318      1282     2     1 5         0    -5    NA    NA    NA    NA
##  6 40851318      1282     2     1 6         0    -5     0    -5     0     0
##  7 40851318      1282     2     1 7         0    -5     0    -5     0     0
##  8 40851318      1282     2     1 8         0    -5     0    -5     0     0
##  9 40851318      1282     2     1 9         0    -5     0    -5     0     0
## 10 40851318      1282     2     1 10        0    -5     0    -5     0     0
## # ℹ 30 more rows

Issues

head(coords2 %>% arrange(order_by = desc(fps)), 10)
## # A tibble: 10 × 11
##    pitch_id player_id   pos   inn frame     x     y x_lag y_lag   fps   mph
##       <dbl>     <dbl> <dbl> <dbl> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
##  1 41637666     25377     4     9 47      -29    -9     9   130  144.  98.3
##  2 41637666     25377     4     9 48      -30    -9     9   128  142.  97.1
##  3 41637666     25377     4     9 49      -31   -10     9   126  142.  96.7
##  4 41637666     25377     4     9 50      -32   -10     8   124  140.  95.3
##  5 41637666     25377     4     9 51      -33   -10     8   121  137.  93.6
##  6 41074702      1623    10     8 6        -3     0   -12   121  121.  82.7
##  7 41074702      1623    10     8 7        -3     0   -12   121  121.  82.7
##  8 41074702      1623    10     8 8        -3     0   -12   121  121.  82.7
##  9 41074702      1623    10     8 9        -3     0   -11   121  121.  82.7
## 10 41074702      1623    10     8 10       -3     0   -11   121  121.  82.7

As shown above, many frames (around 200 out of 45 million) have unrealistically high speeds caused by misreads by the tracking system. Examples of errors are shown on the plots below.

issue <- filter(coords2, player_id == 25377 & pitch_id == 41637666)
plot(issue$x, issue$y)

plot(issue$frame, issue$fps, type = 'l')

issue2 <- filter(coords2, player_id == 19756 & pitch_id == 40873735)
plot(issue2$x, issue2$y)

plot(issue2$frame, issue2$fps, type = 'l')

As shown in both of these examples, the player appeares to spontaneously jump from one spot on the field to another, which creates an incorrect fps calculation. After trying several methods, I could not find a way to reliably remove all of these errors.

Also, to calculate players’ average speeds, I would need to classify frames as running, rather than include sedentary moments in the average. I am unsure of how to do this, but I will continue to do research on the topic, and I look forward to learning!