throw_table <- read.csv('dataset_2024.csv')
head(throw_table)
## throw_id team_id fielder_id fielder_position thrower_id thrower_position
## 1 3 11 400 6 400 6
## 2 6 11 228 5 390 4
## 3 7 8 415 4 415 4
## 4 8 8 308 1 308 1
## 5 10 1 314 4 300 6
## 6 11 2 312 4 312 4
## receiver_id receiver_position exchange_time throw_pos_x throw_pos_y
## 1 63 3 1.533 -60.116123 132.18728
## 2 63 3 0.534 -0.562563 122.82194
## 3 143 3 1.266 1.598751 123.03693
## 4 143 3 1.800 25.403185 59.92459
## 5 514 3 0.733 8.957441 126.16588
## 6 695 3 1.200 15.075073 138.72246
## throw_velo_x throw_velo_y throw_velo_z batter_pos_x_at_throw
## 1 60.04756 -33.596364 8.583470 26.91291
## 2 51.29392 -45.316519 8.831870 43.39641
## 3 50.04965 -41.717447 5.986785 25.30078
## 4 24.01967 3.289158 12.783662 19.69922
## 5 51.63067 -63.361730 7.811719 40.11830
## 6 41.26329 -62.250650 6.230779 39.70496
## batter_pos_y_at_throw batter_velo_at_throw bounce_pos_x bounce_pos_y
## 1 26.55610 25.78753 38.06139 69.47463
## 2 44.29049 28.64788 NA NA
## 3 29.97072 25.13043 NA NA
## 4 15.24991 18.50923 NA NA
## 5 41.43340 27.31851 NA NA
## 6 41.80575 27.57804 NA NA
## bounce_velo_x bounce_velo_y bounce_velo_z receiver_pos_x receiver_pos_y
## 1 32.81451 -25.4525 7.330194 56.19932 60.18778
## 2 NA NA NA 59.01313 65.65794
## 3 NA NA NA 61.03749 64.42913
## 4 NA NA NA 63.14208 65.64872
## 5 NA NA NA 59.77040 64.04606
## 6 NA NA NA 59.62286 63.94086
## receiver_dist_from_1b throw_deflected_by_receiver start_state end_state
## 1 8.202015 0 ____1 1___1
## 2 5.047566 0 1___1 1___2
## 3 2.719261 0 123_1 ____3
## 4 2.069797 0 ____1 ____2
## 5 3.890504 0 123_1 ____3
## 6 4.028031 0 ____0 ____1
## runs_on_play batter_result
## 1 0 first
## 2 0 first
## 3 0 out
## 4 0 out
## 5 0 out
## 6 0 out
table1 <- throw_table %>%
group_by(thrower_id) %>%
summarize(med_exchange = median(exchange_time),
count = n()) %>%
arrange(order_by = med_exchange) %>%
filter(count > mean(count))
head(table1, 5)
## # A tibble: 5 × 3
## thrower_id med_exchange count
## <int> <dbl> <int>
## 1 731 0.866 65
## 2 13 0.867 31
## 3 39 0.867 243
## 4 159 0.867 194
## 5 116 0.9 94
Answer: Thrower 731 Thrower 13 Thrower 39 Thrower 159 Thrower 116
table2 <- throw_table %>%
mutate(errant = if_else(!is.na(bounce_pos_x) & batter_result != 'out', 1, 0)) %>%
filter(errant == 1) %>%
group_by(team_id) %>%
summarize(errant_throws = n()) %>%
arrange(order_by = desc(errant_throws))
table2
## # A tibble: 15 × 2
## team_id errant_throws
## <int> <int>
## 1 2 85
## 2 5 79
## 3 1 76
## 4 7 75
## 5 8 75
## 6 9 75
## 7 15 74
## 8 10 73
## 9 11 73
## 10 4 67
## 11 6 66
## 12 12 62
## 13 14 56
## 14 3 49
## 15 13 44
plot2 <- ggplot(data = table2, aes(x = team_id, y = errant_throws)) +
geom_col(fill = 'royalblue') +
labs(x = 'Team ID',
y = 'Errant Throws',
title = 'Errant Throws by Team',
subtitle = '(red line = league average)') +
geom_hline(yintercept = mean(table2$errant_throws), color = 'darkred')
plot2
dist <- function(x1, y1, x2, y2){
d <- sqrt((x2-x1)^2 + (y2 - y1)^2)
return(d)
}
table3 <- throw_table %>%
mutate(throw_dist = dist(x1 = throw_pos_x,
y1 = throw_pos_y,
x2 = receiver_pos_x,
y2 = receiver_pos_y) + receiver_dist_from_1b,
throw_dist_perc = percent_rank(throw_dist)) %>%
filter(throw_dist_perc >= 0.9) %>%
group_by(team_id) %>%
summarize(avg_exchange = mean(exchange_time),
var_exchange = var(exchange_time))
table3 %>% arrange(order_by = avg_exchange)
## # A tibble: 15 × 3
## team_id avg_exchange var_exchange
## <int> <dbl> <dbl>
## 1 9 1.15 0.197
## 2 10 1.17 0.271
## 3 15 1.17 0.268
## 4 4 1.17 0.140
## 5 8 1.19 0.183
## 6 2 1.20 0.292
## 7 7 1.20 0.248
## 8 3 1.22 0.190
## 9 6 1.22 0.315
## 10 11 1.22 0.281
## 11 5 1.23 0.298
## 12 13 1.25 0.379
## 13 1 1.27 0.217
## 14 14 1.31 0.189
## 15 12 1.35 0.409
table3 %>% arrange(order_by = desc(var_exchange))
## # A tibble: 15 × 3
## team_id avg_exchange var_exchange
## <int> <dbl> <dbl>
## 1 12 1.35 0.409
## 2 13 1.25 0.379
## 3 6 1.22 0.315
## 4 5 1.23 0.298
## 5 2 1.20 0.292
## 6 11 1.22 0.281
## 7 10 1.17 0.271
## 8 15 1.17 0.268
## 9 7 1.20 0.248
## 10 1 1.27 0.217
## 11 9 1.15 0.197
## 12 3 1.22 0.190
## 13 14 1.31 0.189
## 14 8 1.19 0.183
## 15 4 1.17 0.140
Answer:
Team 9 had the quickest average exchange time Team 12 had the largest variation in exchange time
table4 <- throw_table %>%
mutate(throw_dist = dist(x1 = throw_pos_x,
y1 = throw_pos_y,
x2 = receiver_pos_x,
y2 = receiver_pos_y) + receiver_dist_from_1b) %>%
filter(throw_dist < 100)
correlation <- cor.test(table4$throw_dist, table4$throw_velo_x)
correlation$estimate
## cor
## 0.4950024
correlation$p.value
## [1] 0
plot4 <- ggplot(data = table4, aes(x = throw_dist, y = throw_velo_x)) +
geom_point(color = 'royalblue') +
labs(title = 'Distance vs Velocity',
x = 'Distance (ft)',
y = 'Velocity (ft/sec)')
plot4
## Warning: Removed 5 rows containing missing values (`geom_point()`).
Answer: There appears to be a moderate positive correlation between throw distance and velocity. The correlation coefficient of 0.495 indicates a moderate correlation, and the p-value of nearly 0 supports its significance.
This means that there is somewhat of a relationship between throw distance and velocity, where the longer the throw, the higher the velocity.