Data Preview

Data was downloaded from Baseball Savant

Take a look at the structure of the data.

data %>% 
  head %>% 
  mutate(Name = paste(first_name, last_name)) %>% 
  dplyr::select(Name, player_id, team, position, competitive_runs, 
                sprint_speed, year) %>% 
  knitr::kable(format = "html") %>% 
  kableExtra::kable_styling(bootstrap_options = c("striped", "bordered", "hover"),
                            full_width = FALSE)
Name player_id team position competitive_runs sprint_speed year
Torii Hunter 116338 MIN RF 240 26.5 2015
David Ortiz 120074 BOS DH 148 23.1 2015
David Ortiz 120074 BOS DH 153 23.6 2016
Alex Rodriguez 121347 NYY DH 193 23.9 2015
Alex Rodriguez 121347 NYY DH 73 23.9 2016
Aramis Ramirez 133380 PIT 3B 176 24.3 2015

Analysis

Look at trends between all positions for each player

data %>% 
  group_by(player_id) %>% 
  dplyr::filter(n() > 1) %>% 
  mutate(delta = sprint_speed - lag(sprint_speed)) %>% 
  ggplot(., aes(age, delta, weight = competitive_runs)) + geom_point() + 
  facet_wrap(~position) + geom_smooth() + theme_bw() + 
  geom_hline(aes(yintercept = 0), lty = 2)

Look at trends for the players who we have data in each year from 2015 to 2019

data %>% 
  group_by(player_id) %>% 
  dplyr::filter(n() == 5) %>% 
  mutate(delta = sprint_speed - dplyr::lag(sprint_speed, n = 4)) %>% 
  ggplot(., aes(age, delta, weight = competitive_runs)) + geom_point() + 
  facet_wrap(~position) + geom_smooth(method = "lm") + theme_bw() + 
  geom_hline(aes(yintercept = 0), lty = 2)

Now I think the most interesting thing to look at might be which players have shown:

  1. The greatest speed decline over the 5 year
  2. Players who have improved their speed (conditioning?) over the same time series

Calculate the slope of the data for spring speed (players with data all 5 years)

First, create a function that look will run a linear model of sprint speed across year. We are giving the number of competitve runs a weight in the model so that years with more data poionts will be given more weight.

speed_model <- function(df) {
  lm(sprint_speed ~ year, weight = competitive_runs, data = df)
}

Second, calculate the trend for each player. But we have some filters to run:

  1. Remove any seasons where players didn’t have at least 20 competitive runs to try and remove small smaple sizes
  2. Only include players that gave data for each year
trends = data %>% 
  dplyr::filter(competitive_runs > 20) %>% 
  group_by(player_id) %>% 
  dplyr::filter(n() == 5) %>% 
  group_by(player_id) %>%
  nest() %>% 
  mutate(model = purrr::map(data, speed_model)) %>% 
  mutate(slope = purrr::map(model, .f = function(df) coef(df)[2])) %>% 
  dplyr::select(-data, -model) %>%
  unnest(cols = c(slope)) %>% 
  ungroup()

Now that we’ve calculated the slope for speed for each player, let’s take a look at the players who showed the most loss in their sprint speed.

decliners = trends %>% 
  top_n(., -10, wt = slope) %>% 
  pull(player_id)

p = data %>% 
  dplyr::filter(player_id %in% decliners) %>% 
  mutate(Name = paste(first_name, last_name)) %>% 
  ggplot(., aes(age, sprint_speed, col = Name)) + geom_line(lwd = 1.2) +
  theme_bw() + ylab("Sprint speed (ft/s)") + xlab("Age") + 
  theme(legend.title = element_blank(),
        legend.position = "bottom") 

ggplotly(p, tooltip = c("colour", "sprint_speed")) %>% 
  layout(legend = list(orientation = "h", x = 0, y = -0.15))

Top 10 decliners:

decliners = trends %>% 
  top_n(., -10, wt = slope) 

data %>% 
  dplyr::filter(player_id %in% decliners$player_id) %>% 
  mutate(Name = paste(first_name, last_name)) %>% 
  group_by(Name, player_id) %>% 
  summarize() %>% 
  left_join(decliners) %>% 
  arrange(slope) %>% 
  rename(MLBID = player_id, `Change per year` = slope) %>% 
  knitr::kable(format = "html") %>% 
  kableExtra::kable_styling(bootstrap_options = c("striped", "bordered", "hover"),
                            full_width = FALSE)
Name MLBID Change per year
Brock Holt 571788 -0.6722737
Chris Davis 448801 -0.5334106
Yonder Alonso 475174 -0.4359165
Brandon Belt 474832 -0.4258121
Tim Beckham 542921 -0.4109107
Adeiny Hechavarria 588751 -0.4055546
Adam Jones 430945 -0.3950449
Stephen Piscotty 572039 -0.3926748
Eugenio Suarez 553993 -0.3689193
Khris Davis 501981 -0.3631600

And now the players who showed the most improvement in their sprint speed.

improvers = trends %>% 
  top_n(., 10, wt = slope) %>% 
  pull(player_id)

p = data %>% 
  dplyr::filter(player_id %in% improvers) %>% 
  mutate(Name = paste(first_name, last_name)) %>% 
  ggplot(., aes(age, sprint_speed, col = Name)) + geom_line(lwd = 1.2) +
  theme_bw() + ylab("Sprint speed (ft/s)") + xlab("Age") + 
  theme(legend.title = element_blank(),
        legend.position = "bottom") 

ggplotly(p, tooltip = c("colour", "sprint_speed")) %>% 
  layout(legend = list(orientation = "h", x = 0, y = -0.15))

Top 10 improvers:

improvers = trends %>% 
  top_n(., 10, wt = slope) 

data %>% 
  dplyr::filter(player_id %in% improvers$player_id) %>% 
  mutate(Name = paste(first_name, last_name)) %>% 
  group_by(Name, player_id) %>% 
  summarize() %>% 
  left_join(improvers) %>% 
  arrange(desc(slope)) %>% 
  rename(MLBID = player_id, `Change per year` = slope) %>% 
  knitr::kable(format = "html") %>% 
  kableExtra::kable_styling(bootstrap_options = c("striped", "bordered", "hover"),
                            full_width = FALSE)
Name MLBID Change per year
Nicholas Castellanos 592206 0.3341643
Jose Abreu 547989 0.2899603
Carlos Gonzalez 471865 0.2336047
Nick Markakis 455976 0.1976432
Corey Dickerson 572816 0.1966223
Freddie Freeman 518692 0.1914975
Matt Adams 571431 0.1517382
Bryce Harper 547180 0.1511591
Giancarlo Stanton 519317 0.1407563
Enrique Hernandez 571771 0.1131175