Data was downloaded from Baseball Savant
Take a look at the structure of the data.
data %>%
head %>%
mutate(Name = paste(first_name, last_name)) %>%
dplyr::select(Name, player_id, team, position, competitive_runs,
sprint_speed, year) %>%
knitr::kable(format = "html") %>%
kableExtra::kable_styling(bootstrap_options = c("striped", "bordered", "hover"),
full_width = FALSE)
| Name | player_id | team | position | competitive_runs | sprint_speed | year |
|---|---|---|---|---|---|---|
| Torii Hunter | 116338 | MIN | RF | 240 | 26.5 | 2015 |
| David Ortiz | 120074 | BOS | DH | 148 | 23.1 | 2015 |
| David Ortiz | 120074 | BOS | DH | 153 | 23.6 | 2016 |
| Alex Rodriguez | 121347 | NYY | DH | 193 | 23.9 | 2015 |
| Alex Rodriguez | 121347 | NYY | DH | 73 | 23.9 | 2016 |
| Aramis Ramirez | 133380 | PIT | 3B | 176 | 24.3 | 2015 |
Look at trends between all positions for each player
data %>%
group_by(player_id) %>%
dplyr::filter(n() > 1) %>%
mutate(delta = sprint_speed - lag(sprint_speed)) %>%
ggplot(., aes(age, delta, weight = competitive_runs)) + geom_point() +
facet_wrap(~position) + geom_smooth() + theme_bw() +
geom_hline(aes(yintercept = 0), lty = 2)
Look at trends for the players who we have data in each year from 2015 to 2019
data %>%
group_by(player_id) %>%
dplyr::filter(n() == 5) %>%
mutate(delta = sprint_speed - dplyr::lag(sprint_speed, n = 4)) %>%
ggplot(., aes(age, delta, weight = competitive_runs)) + geom_point() +
facet_wrap(~position) + geom_smooth(method = "lm") + theme_bw() +
geom_hline(aes(yintercept = 0), lty = 2)
Now I think the most interesting thing to look at might be which players have shown:
First, create a function that look will run a linear model of sprint speed across year. We are giving the number of competitve runs a weight in the model so that years with more data poionts will be given more weight.
speed_model <- function(df) {
lm(sprint_speed ~ year, weight = competitive_runs, data = df)
}
Second, calculate the trend for each player. But we have some filters to run:
trends = data %>%
dplyr::filter(competitive_runs > 20) %>%
group_by(player_id) %>%
dplyr::filter(n() == 5) %>%
group_by(player_id) %>%
nest() %>%
mutate(model = purrr::map(data, speed_model)) %>%
mutate(slope = purrr::map(model, .f = function(df) coef(df)[2])) %>%
dplyr::select(-data, -model) %>%
unnest(cols = c(slope)) %>%
ungroup()
Now that we’ve calculated the slope for speed for each player, let’s take a look at the players who showed the most loss in their sprint speed.
decliners = trends %>%
top_n(., -10, wt = slope) %>%
pull(player_id)
p = data %>%
dplyr::filter(player_id %in% decliners) %>%
mutate(Name = paste(first_name, last_name)) %>%
ggplot(., aes(age, sprint_speed, col = Name)) + geom_line(lwd = 1.2) +
theme_bw() + ylab("Sprint speed (ft/s)") + xlab("Age") +
theme(legend.title = element_blank(),
legend.position = "bottom")
ggplotly(p, tooltip = c("colour", "sprint_speed")) %>%
layout(legend = list(orientation = "h", x = 0, y = -0.15))
Top 10 decliners:
decliners = trends %>%
top_n(., -10, wt = slope)
data %>%
dplyr::filter(player_id %in% decliners$player_id) %>%
mutate(Name = paste(first_name, last_name)) %>%
group_by(Name, player_id) %>%
summarize() %>%
left_join(decliners) %>%
arrange(slope) %>%
rename(MLBID = player_id, `Change per year` = slope) %>%
knitr::kable(format = "html") %>%
kableExtra::kable_styling(bootstrap_options = c("striped", "bordered", "hover"),
full_width = FALSE)
| Name | MLBID | Change per year |
|---|---|---|
| Brock Holt | 571788 | -0.6722737 |
| Chris Davis | 448801 | -0.5334106 |
| Yonder Alonso | 475174 | -0.4359165 |
| Brandon Belt | 474832 | -0.4258121 |
| Tim Beckham | 542921 | -0.4109107 |
| Adeiny Hechavarria | 588751 | -0.4055546 |
| Adam Jones | 430945 | -0.3950449 |
| Stephen Piscotty | 572039 | -0.3926748 |
| Eugenio Suarez | 553993 | -0.3689193 |
| Khris Davis | 501981 | -0.3631600 |
And now the players who showed the most improvement in their sprint speed.
improvers = trends %>%
top_n(., 10, wt = slope) %>%
pull(player_id)
p = data %>%
dplyr::filter(player_id %in% improvers) %>%
mutate(Name = paste(first_name, last_name)) %>%
ggplot(., aes(age, sprint_speed, col = Name)) + geom_line(lwd = 1.2) +
theme_bw() + ylab("Sprint speed (ft/s)") + xlab("Age") +
theme(legend.title = element_blank(),
legend.position = "bottom")
ggplotly(p, tooltip = c("colour", "sprint_speed")) %>%
layout(legend = list(orientation = "h", x = 0, y = -0.15))
Top 10 improvers:
improvers = trends %>%
top_n(., 10, wt = slope)
data %>%
dplyr::filter(player_id %in% improvers$player_id) %>%
mutate(Name = paste(first_name, last_name)) %>%
group_by(Name, player_id) %>%
summarize() %>%
left_join(improvers) %>%
arrange(desc(slope)) %>%
rename(MLBID = player_id, `Change per year` = slope) %>%
knitr::kable(format = "html") %>%
kableExtra::kable_styling(bootstrap_options = c("striped", "bordered", "hover"),
full_width = FALSE)
| Name | MLBID | Change per year |
|---|---|---|
| Nicholas Castellanos | 592206 | 0.3341643 |
| Jose Abreu | 547989 | 0.2899603 |
| Carlos Gonzalez | 471865 | 0.2336047 |
| Nick Markakis | 455976 | 0.1976432 |
| Corey Dickerson | 572816 | 0.1966223 |
| Freddie Freeman | 518692 | 0.1914975 |
| Matt Adams | 571431 | 0.1517382 |
| Bryce Harper | 547180 | 0.1511591 |
| Giancarlo Stanton | 519317 | 0.1407563 |
| Enrique Hernandez | 571771 | 0.1131175 |