#library(tidyverse)
#library(readxl)
#library(data.table)
9a: How many pitches do not have a recorded spin axis?
pitch_data <- read.csv('pitch_data.csv')
sum(is.na(pitch_data$spin_axis))
## [1] 74
Answer: 74 pitches do not have a recorded spin axis.
9b: What is the ID of the pitcher who threw the highest percentage of fastballs (4-seam fastball + sinker) with a minimum of 30 total pitches? What is that percentage?
pitcher_table1 <- data.frame()
pitcher_list <- unique(pitch_data$pitcher_id)
for(i in pitcher_list){
i_table <- filter(pitch_data, pitcher_id == i)
pitch_count <- nrow(i_table)
i_fb <- filter(i_table, pitch_name == '4-seam fastball' | pitch_name == 'Sinker')
fb_count <- nrow(i_fb)
fb_percentage <- fb_count/pitch_count
new_row <- data.frame(pitcher_id = i,
total = pitch_count,
fb_perc = fb_percentage)
pitcher_table1 <- rbind(pitcher_table1, new_row)
}
pitcher_table1 <- pitcher_table1 %>%
filter(total >= 30) %>%
arrange(desc(fb_perc))
head(pitcher_table1, 1)
## pitcher_id total fb_perc
## 1 87004 38 0.8421053
Answer: Pitcher 87004 threw the highest percentage of fastballs at 84.2%
9c: What is the ID of the pitcher who on average has the furthest break from pitching hand to glove side on a slider or sweeper? Which pitch type is it? What is the average pitching hand to glove side break on that pitcher’s pitch? Please group sliders and sweepers separately for pitchers who throw both.
pitcher_table2 <- pitch_data %>%
filter(pitcher_id %in% pitcher_list, pitch_name %in% c('Slider', 'Sweeper')) %>%
group_by(pitcher_id, pitch_name) %>%
summarize(avg_break = mean(abs(pfx_x))) %>%
pivot_wider(names_from = pitch_name, values_from = avg_break) %>%
mutate(max = max(Slider, Sweeper, na.rm = T)) %>%
arrange(order_by = desc(max))
## `summarise()` has grouped output by 'pitcher_id'. You can override using the
## `.groups` argument.
head(pitcher_table2, 1)
## # A tibble: 1 × 4
## # Groups: pitcher_id [1]
## pitcher_id Slider Sweeper max
## <int> <dbl> <dbl> <dbl>
## 1 22327 NA 1.82 1.82
Answer: Pitcher 22327 has the highest average break on SL or SW, with 1.82 ft (21.81 in) on his Sweeper.
Determine which player (return the player ID) has the highest average speed, what that average speed is, and what date the player achieved his max speed.
speed_table1 <- read_excel('SpeedQuestion.xlsx', sheet = 'Coordinates')
speed_table2 <- read_excel('SpeedQuestion.xlsx', sheet = 'Pitches')
# separate coordinates by frame
# every play has fewer than 300 frames of recorded data, and missing frames will create NAs
frame_coords <- speed_table1 %>%
separate(col = x, into = paste0('x', 1:300), sep = ',') %>%
separate(col = y, into = paste0('y', 1:300), sep = ',')
## Warning: Expected 300 pieces. Missing pieces filled with `NA` in 150000 rows [1, 2, 3,
## 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, ...].
## Expected 300 pieces. Missing pieces filled with `NA` in 150000 rows [1, 2, 3,
## 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, ...].
# pivot each x and y coordinate into individual rows
x_coords <- frame_coords %>%
select(-starts_with('y')) %>%
pivot_longer(cols = starts_with('x'), names_to = 'frame', values_to = 'x') %>%
mutate(frame = str_remove_all(frame, 'x'),
x = as.numeric(str_remove_all(x, '\\[|\\]')))
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `x = as.numeric(str_remove_all(x, "\\[|\\]"))`.
## Caused by warning:
## ! NAs introduced by coercion
y_coords <- frame_coords %>%
select(-starts_with('x')) %>%
pivot_longer(cols = starts_with('y'), names_to = 'frame', values_to = 'y') %>%
mutate(frame = str_remove_all(frame, 'y'),
y = as.numeric(str_remove_all(y, '\\[|\\]')))
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `y = as.numeric(str_remove_all(y, "\\[|\\]"))`.
## Caused by warning:
## ! NAs introduced by coercion
coords <- x_coords %>%
left_join(y_coords, by = c('pitch_id', 'player_id', 'pos', 'inn', 'frame'))
head(coords)
## # A tibble: 6 × 7
## pitch_id player_id pos inn frame x y
## <dbl> <dbl> <dbl> <dbl> <chr> <dbl> <dbl>
## 1 40851318 1282 2 1 1 0 -5
## 2 40851318 1282 2 1 2 0 -5
## 3 40851318 1282 2 1 3 0 -5
## 4 40851318 1282 2 1 4 0 -5
## 5 40851318 1282 2 1 5 0 -5
## 6 40851318 1282 2 1 6 0 -5
# calculate distance between each frame
# lag is previous location
coords2 <- coords %>%
mutate(x_lag = shift(x, n = 5, type = "lag", fill = NA), # n = 5 means 5 frames prior (1 second)
y_lag = shift(y, n = 5, type = "lag", fill = NA),
fps = sqrt((x - x_lag)^2 + (y - y_lag)^2), # feet per second
mph = fps * 0.681818) # convert to mph
head(coords2, 40)
## # A tibble: 40 × 11
## pitch_id player_id pos inn frame x y x_lag y_lag fps mph
## <dbl> <dbl> <dbl> <dbl> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 40851318 1282 2 1 1 0 -5 NA NA NA NA
## 2 40851318 1282 2 1 2 0 -5 NA NA NA NA
## 3 40851318 1282 2 1 3 0 -5 NA NA NA NA
## 4 40851318 1282 2 1 4 0 -5 NA NA NA NA
## 5 40851318 1282 2 1 5 0 -5 NA NA NA NA
## 6 40851318 1282 2 1 6 0 -5 0 -5 0 0
## 7 40851318 1282 2 1 7 0 -5 0 -5 0 0
## 8 40851318 1282 2 1 8 0 -5 0 -5 0 0
## 9 40851318 1282 2 1 9 0 -5 0 -5 0 0
## 10 40851318 1282 2 1 10 0 -5 0 -5 0 0
## # ℹ 30 more rows
Issues
head(coords2 %>% arrange(order_by = desc(fps)), 10)
## # A tibble: 10 × 11
## pitch_id player_id pos inn frame x y x_lag y_lag fps mph
## <dbl> <dbl> <dbl> <dbl> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 41637666 25377 4 9 47 -29 -9 9 130 144. 98.3
## 2 41637666 25377 4 9 48 -30 -9 9 128 142. 97.1
## 3 41637666 25377 4 9 49 -31 -10 9 126 142. 96.7
## 4 41637666 25377 4 9 50 -32 -10 8 124 140. 95.3
## 5 41637666 25377 4 9 51 -33 -10 8 121 137. 93.6
## 6 41074702 1623 10 8 6 -3 0 -12 121 121. 82.7
## 7 41074702 1623 10 8 7 -3 0 -12 121 121. 82.7
## 8 41074702 1623 10 8 8 -3 0 -12 121 121. 82.7
## 9 41074702 1623 10 8 9 -3 0 -11 121 121. 82.7
## 10 41074702 1623 10 8 10 -3 0 -11 121 121. 82.7
As shown above, many frames (around 200 out of 45 million) have unrealistically high speeds caused by misreads by the tracking system. Examples of errors are shown on the plots below.
issue <- filter(coords2, player_id == 25377 & pitch_id == 41637666)
plot(issue$x, issue$y)
plot(issue$frame, issue$fps, type = 'l')
issue2 <- filter(coords2, player_id == 19756 & pitch_id == 40873735)
plot(issue2$x, issue2$y)
plot(issue2$frame, issue2$fps, type = 'l')
As shown in both of these examples, the player appeares to spontaneously jump from one spot on the field to another, which creates an incorrect fps calculation. After trying several methods, I could not find a way to reliably remove all of these errors.
Also, to calculate players’ average speeds, I would need to classify frames as running, rather than include sedentary moments in the average. I am unsure of how to do this, but I will continue to do research on the topic, and I look forward to learning!