throw_table <- read.csv('dataset_2024.csv')

head(throw_table)
##   throw_id team_id fielder_id fielder_position thrower_id thrower_position
## 1        3      11        400                6        400                6
## 2        6      11        228                5        390                4
## 3        7       8        415                4        415                4
## 4        8       8        308                1        308                1
## 5       10       1        314                4        300                6
## 6       11       2        312                4        312                4
##   receiver_id receiver_position exchange_time throw_pos_x throw_pos_y
## 1          63                 3         1.533  -60.116123   132.18728
## 2          63                 3         0.534   -0.562563   122.82194
## 3         143                 3         1.266    1.598751   123.03693
## 4         143                 3         1.800   25.403185    59.92459
## 5         514                 3         0.733    8.957441   126.16588
## 6         695                 3         1.200   15.075073   138.72246
##   throw_velo_x throw_velo_y throw_velo_z batter_pos_x_at_throw
## 1     60.04756   -33.596364     8.583470              26.91291
## 2     51.29392   -45.316519     8.831870              43.39641
## 3     50.04965   -41.717447     5.986785              25.30078
## 4     24.01967     3.289158    12.783662              19.69922
## 5     51.63067   -63.361730     7.811719              40.11830
## 6     41.26329   -62.250650     6.230779              39.70496
##   batter_pos_y_at_throw batter_velo_at_throw bounce_pos_x bounce_pos_y
## 1              26.55610             25.78753     38.06139     69.47463
## 2              44.29049             28.64788           NA           NA
## 3              29.97072             25.13043           NA           NA
## 4              15.24991             18.50923           NA           NA
## 5              41.43340             27.31851           NA           NA
## 6              41.80575             27.57804           NA           NA
##   bounce_velo_x bounce_velo_y bounce_velo_z receiver_pos_x receiver_pos_y
## 1      32.81451      -25.4525      7.330194       56.19932       60.18778
## 2            NA            NA            NA       59.01313       65.65794
## 3            NA            NA            NA       61.03749       64.42913
## 4            NA            NA            NA       63.14208       65.64872
## 5            NA            NA            NA       59.77040       64.04606
## 6            NA            NA            NA       59.62286       63.94086
##   receiver_dist_from_1b throw_deflected_by_receiver start_state end_state
## 1              8.202015                           0       ____1     1___1
## 2              5.047566                           0       1___1     1___2
## 3              2.719261                           0       123_1     ____3
## 4              2.069797                           0       ____1     ____2
## 5              3.890504                           0       123_1     ____3
## 6              4.028031                           0       ____0     ____1
##   runs_on_play batter_result
## 1            0         first
## 2            0         first
## 3            0           out
## 4            0           out
## 5            0           out
## 6            0           out

1. Which 5 infielders had the quickest exchange times on throws to first base?

table1 <- throw_table %>%
  group_by(thrower_id) %>%
  summarize(med_exchange = median(exchange_time),
            count = n()) %>%
  arrange(order_by = med_exchange) %>%
  filter(count > mean(count))

head(table1, 5)
## # A tibble: 5 × 3
##   thrower_id med_exchange count
##        <int>        <dbl> <int>
## 1        731        0.866    65
## 2         13        0.867    31
## 3         39        0.867   243
## 4        159        0.867   194
## 5        116        0.9      94

Answer: Thrower 731 Thrower 13 Thrower 39 Thrower 159 Thrower 116

2. The infield coach wants to see which teams made the most errant throws to first base. An errant throw is described as a throw that bounced and resulted in the runner being safe. Please create a basic visual that you would present to the infield coach to present your findings.

table2 <- throw_table %>%
  mutate(errant = if_else(!is.na(bounce_pos_x) & batter_result != 'out', 1, 0)) %>%
  filter(errant == 1) %>%
  group_by(team_id) %>%
  summarize(errant_throws = n()) %>%
  arrange(order_by = desc(errant_throws))

table2
## # A tibble: 15 × 2
##    team_id errant_throws
##      <int>         <int>
##  1       2            85
##  2       5            79
##  3       1            76
##  4       7            75
##  5       8            75
##  6       9            75
##  7      15            74
##  8      10            73
##  9      11            73
## 10       4            67
## 11       6            66
## 12      12            62
## 13      14            56
## 14       3            49
## 15      13            44
plot2 <- ggplot(data = table2, aes(x = team_id, y = errant_throws)) +
  geom_col(fill = 'royalblue') +
  labs(x = 'Team ID', 
       y = 'Errant Throws',
       title = 'Errant Throws by Team',
       subtitle = '(red line = league average)') +
  geom_hline(yintercept = mean(table2$errant_throws), color = 'darkred') 

plot2

3. Looking at all infield throws to first base, given that the distance of the throw to first base was in the top 90th percentile, what team had the best average exchange time? Which team had the largest variation in exchange time on these throws?

dist <- function(x1, y1, x2, y2){
  d <- sqrt((x2-x1)^2 + (y2 - y1)^2)
  return(d)
}

table3 <- throw_table %>%
  mutate(throw_dist = dist(x1 = throw_pos_x,
                           y1 = throw_pos_y,
                           x2 = receiver_pos_x,
                           y2 = receiver_pos_y) + receiver_dist_from_1b,
         throw_dist_perc = percent_rank(throw_dist)) %>%
  filter(throw_dist_perc >= 0.9) %>%
  group_by(team_id) %>%
  summarize(avg_exchange = mean(exchange_time),
            var_exchange = var(exchange_time))

table3 %>% arrange(order_by = avg_exchange)
## # A tibble: 15 × 3
##    team_id avg_exchange var_exchange
##      <int>        <dbl>        <dbl>
##  1       9         1.15        0.197
##  2      10         1.17        0.271
##  3      15         1.17        0.268
##  4       4         1.17        0.140
##  5       8         1.19        0.183
##  6       2         1.20        0.292
##  7       7         1.20        0.248
##  8       3         1.22        0.190
##  9       6         1.22        0.315
## 10      11         1.22        0.281
## 11       5         1.23        0.298
## 12      13         1.25        0.379
## 13       1         1.27        0.217
## 14      14         1.31        0.189
## 15      12         1.35        0.409
table3 %>% arrange(order_by = desc(var_exchange))
## # A tibble: 15 × 3
##    team_id avg_exchange var_exchange
##      <int>        <dbl>        <dbl>
##  1      12         1.35        0.409
##  2      13         1.25        0.379
##  3       6         1.22        0.315
##  4       5         1.23        0.298
##  5       2         1.20        0.292
##  6      11         1.22        0.281
##  7      10         1.17        0.271
##  8      15         1.17        0.268
##  9       7         1.20        0.248
## 10       1         1.27        0.217
## 11       9         1.15        0.197
## 12       3         1.22        0.190
## 13      14         1.31        0.189
## 14       8         1.19        0.183
## 15       4         1.17        0.140

Answer:

Team 9 had the quickest average exchange time Team 12 had the largest variation in exchange time

4. Given that a throw was made less than 100 feet from first base, is there a correlation between throw velocity and throw distance? Provide a basic visual alongside a brief explanation.

table4 <- throw_table %>% 
  mutate(throw_dist = dist(x1 = throw_pos_x,
                           y1 = throw_pos_y,
                           x2 = receiver_pos_x,
                           y2 = receiver_pos_y) + receiver_dist_from_1b) %>%
  filter(throw_dist < 100)
correlation <- cor.test(table4$throw_dist, table4$throw_velo_x)

correlation$estimate
##       cor 
## 0.4950024
correlation$p.value
## [1] 0
plot4 <- ggplot(data = table4, aes(x = throw_dist, y = throw_velo_x)) +
  geom_point(color = 'royalblue') +
  labs(title = 'Distance vs Velocity',
       x = 'Distance (ft)',
       y = 'Velocity (ft/sec)')

plot4
## Warning: Removed 5 rows containing missing values (`geom_point()`).

Answer: There appears to be a moderate positive correlation between throw distance and velocity. The correlation coefficient of 0.495 indicates a moderate correlation, and the p-value of nearly 0 supports its significance.

This means that there is somewhat of a relationship between throw distance and velocity, where the longer the throw, the higher the velocity.