Pull some data from the savant database. Grab 2000 batted ball by home team, which should let us extract the park effects on xwOBA. Do some extra housekeeping on the data, where I remove all batted balls with an EV less than 40 MPH.

## Rows: 3,872,210
## Columns: 91
## Delimiter: "\t"
## chr  [18]: pitch_type, player_name, events, description, des, game_type, stand, p_throws, ...
## dbl  [65]: release_speed, release_pos_x, release_pos_z, batter, pitcher, zone, hit_locatio...
## lgl  [ 7]: spin_dir, spin_rate_deprecated, break_angle_deprecated, break_length_deprecated...
## date [ 1]: game_date
## 
## Use `spec()` to retrieve the guessed column specification
## Pass a specification to the `col_types` argument to quiet this message

Create a plot showing the relationship between launch angle, exit velocity and wOBA

ggplot(df, aes(launch_speed, launch_angle, col = woba_value)) + geom_point() + theme_dark() + 
  scale_color_gradient2(midpoint = 0.7)

Step 1. Create the mesh

max.edge <- 3
locs = cbind(df$launch_speed, df$launch_angle)
boundary <- inla.nonconvex.hull(points = locs)

mesh <- inla.mesh.2d(loc = locs,
                      max.edge = c(0.5, 6) * max.edge,
                      # boundary = boundary,
                      cutoff = 2.5)

plot(mesh)

Step 2: Create a convex hull around the points to that we can limit the mesh predictions

hpts <- chull(df$launch_speed, df$launch_angle)
hpts <- c(hpts, hpts[1])

poly = df[hpts,1:2] %>% 
  dplyr::rename(x = launch_speed, y = launch_angle)
mesh_locs = data.frame(X = mesh$loc[,1], Y = mesh$loc[,2])
mesh_locs$intercept = GEOmap::inpoly(mesh_locs$X, mesh_locs$Y, POK = list(x = poly$x, y = poly$y))


mesh_locs$intercept = ifelse(mesh_locs$intercept == 0, NA, 1)

Step 3. Define the projector matrix

A <- inla.spde.make.A(mesh = mesh, 
                      loc = locs)

Step 4 Define the SPDE.

spde <- inla.spde2.pcmatern(mesh,
                            alpha = 2,
                            prior.range = c(50, 0.5), ### P(practic.range < 100) = 0.9 
                            prior.sigma = c(1, 0.1),   ### P(sigma > 5) = 0.1
                            constr = FALSE)

Step 5. Define the spatial field

field.z.idx <- inla.spde.make.index(
  name = 'x',
  n.spde = spde$n.spde)

Step 6. Create the stack

stk.fit <- inla.stack(
  tag = 'fit',
  data = list(Y = df$woba_value, link = 1),
  A = list(1, 1, A), 
  effects = list(intercept = rep(1, nrow(df)),
                 team = df$home_team,
                 field.z.idx))

# Second: Stack for Gamma model: non-out hits
stk.pred <- inla.stack(
  tag = 'pred',
  data = list(Y = matrix(NA, nrow = ncol(A)), link = 1),
  A = list(1, 1), 
  effects = list(intercept = mesh_locs$intercept,
                 field.z.idx))


stk.all = inla.stack(stk.fit, stk.pred)

Step 7: Define the formula. Here we using the mesh to create a smooth between EV and LA, and then use home team as a random effect to define home park factors

form = Y ~ -1 + 
  intercept + f(team, model = "iid") + f(x, model = spde)

step 8: Run the model

mod = inla(form,
            family = "gaussian",
            data = inla.stack.data(stk.all),    
            control.predictor = list(A = inla.stack.A(stk.all), 
                                     compute = T, link = link),    
            control.compute = list(cpo = TRUE, waic = TRUE))

Check out a summary of the model

summary(mod)

## 
## Call:
##    c("inla(formula = form, family = \"gaussian\", data = 
##    inla.stack.data(stk.all), ", " control.compute = list(cpo = TRUE, waic 
##    = TRUE), control.predictor = list(A = inla.stack.A(stk.all), ", " 
##    compute = T, link = link))") 
## Time used:
##     Pre = 8.66, Running = 175, Post = 3.96, Total = 187 
## Fixed effects:
##            mean    sd 0.025quant 0.5quant 0.975quant  mode kld
## intercept 0.307 0.183      -0.05    0.304      0.683 0.299   0
## 
## Random effects:
##   Name     Model
##     team IID model
##    x SPDE2 model
## 
## Model hyperparameters:
##                                             mean       sd 0.025quant 0.5quant
## Precision for the Gaussian observations    5.610    0.033      5.546    5.610
## Precision for team                      6850.508 2943.691   2658.027 6349.423
## Range for x                               58.581   11.667     39.912   57.053
## Stdev for x                                0.483    0.090      0.338    0.472
##                                         0.975quant     mode
## Precision for the Gaussian observations   5.67e+00    5.609
## Precision for team                        1.40e+04 5386.409
## Range for x                               8.56e+01   53.856
## Stdev for x                               6.89e-01    0.448
## 
## Expected number of effective parameters(stdev): 387.85(12.58)
## Number of equivalent replicates : 153.48 
## 
## Watanabe-Akaike information criterion (WAIC) ...: 66595.54
## Effective number of parameters .................: 315.51
## 
## Marginal log-Likelihood:  -33597.99 
## CPO and PIT are computed
## 
## Posterior marginals for the linear predictor and
##  the fitted values are computed

Park factors - look at how home team (ie the park) affects xwOBA

mod$summary.random$team %>% 
  arrange(`0.5quant`) %>% 
  mutate(ID = factor(ID, levels = ID)) %>% 
  ggplot(., aes(`0.5quant`, ID)) + geom_point() + 
  geom_errorbarh(aes(xmin = `0.025quant`, xmax = `0.975quant`)) + 
  ylab("Home Team Park") + xlab("Relative Park Effect")

Plot the marginals for some of the teams

Colorado Rockies

TEAMS = mod$summary.random$team$ID
plot.inla(mod$marginals.random$team[[which(TEAMS == "COL")]], xlab = "T")

San Francisco Giants

plot.inla(mod$marginals.random$team[[which(TEAMS == "SF")]], xlab = "T")

See how all the marginals compare on a single plot.

xx = lapply(seq_along(mod$marginals.random$team),
       FUN = function(i){
         as.data.frame(mod$marginals.random$team[[i]]) %>% 
           mutate(Team = TEAMS[i])
       })

bind_rows(xx) %>% 
  ggplot(., aes(x, y, col = Team)) + geom_line()

Take a closewr look at the Rockies and Giants

bind_rows(xx) %>% 
  dplyr::filter(Team %in% c("COL","SF")) %>% 
  ggplot(., aes(x, y, col = Team)) + geom_line() + theme_bw()

Check out the output for xwOBA

i.x = inla.stack.index(stk.all, tag = "pred")$data
prop = matrix(mod$summary.fitted.values$`0.5quant`[i.x],
              spde$n.spde)

proj = inla.mesh.projector(mesh, dims = c(300, 300))
field.proj <- inla.mesh.project(proj, prop)

loc = expand.grid(proj$x, proj$y)
colnames(loc) = c("x","y")
loc$z = as.vector(field.proj)

loc$inp = GEOmap::inpoly(loc$x, loc$y, POK = list(x = poly$x, y = poly$y))


ggplot() + 
  geom_tile(data = loc %>% dplyr::filter(inp == 1), aes(x = x, y = y, fill = z)) + 
  scale_fill_gradient2(midpoint = 0.7) + theme_dark() + 
  xlab("Exit Velocity") + ylab("Launch Angle") + ggtitle("xwOBA Predictions")

xwOBA using INLA

Colin Charles

12/11/2020