Pull some data from the savant database. Grab 2000 batted ball by home team, which should let us extract the park effects on xwOBA. Do some extra housekeeping on the data, where I remove all batted balls with an EV less than 40 MPH.
## Rows: 3,872,210
## Columns: 91
## Delimiter: "\t"
## chr [18]: pitch_type, player_name, events, description, des, game_type, stand, p_throws, ...
## dbl [65]: release_speed, release_pos_x, release_pos_z, batter, pitcher, zone, hit_locatio...
## lgl [ 7]: spin_dir, spin_rate_deprecated, break_angle_deprecated, break_length_deprecated...
## date [ 1]: game_date
##
## Use `spec()` to retrieve the guessed column specification
## Pass a specification to the `col_types` argument to quiet this message
Create a plot showing the relationship between launch angle, exit velocity and wOBA
ggplot(df, aes(launch_speed, launch_angle, col = woba_value)) + geom_point() + theme_dark() +
scale_color_gradient2(midpoint = 0.7)
max.edge <- 3
locs = cbind(df$launch_speed, df$launch_angle)
boundary <- inla.nonconvex.hull(points = locs)
mesh <- inla.mesh.2d(loc = locs,
max.edge = c(0.5, 6) * max.edge,
# boundary = boundary,
cutoff = 2.5)
plot(mesh)
hpts <- chull(df$launch_speed, df$launch_angle)
hpts <- c(hpts, hpts[1])
poly = df[hpts,1:2] %>%
dplyr::rename(x = launch_speed, y = launch_angle)
mesh_locs = data.frame(X = mesh$loc[,1], Y = mesh$loc[,2])
mesh_locs$intercept = GEOmap::inpoly(mesh_locs$X, mesh_locs$Y, POK = list(x = poly$x, y = poly$y))
mesh_locs$intercept = ifelse(mesh_locs$intercept == 0, NA, 1)
A <- inla.spde.make.A(mesh = mesh,
loc = locs)
spde <- inla.spde2.pcmatern(mesh,
alpha = 2,
prior.range = c(50, 0.5), ### P(practic.range < 100) = 0.9
prior.sigma = c(1, 0.1), ### P(sigma > 5) = 0.1
constr = FALSE)
field.z.idx <- inla.spde.make.index(
name = 'x',
n.spde = spde$n.spde)
stk.fit <- inla.stack(
tag = 'fit',
data = list(Y = df$woba_value, link = 1),
A = list(1, 1, A),
effects = list(intercept = rep(1, nrow(df)),
team = df$home_team,
field.z.idx))
# Second: Stack for Gamma model: non-out hits
stk.pred <- inla.stack(
tag = 'pred',
data = list(Y = matrix(NA, nrow = ncol(A)), link = 1),
A = list(1, 1),
effects = list(intercept = mesh_locs$intercept,
field.z.idx))
stk.all = inla.stack(stk.fit, stk.pred)
form = Y ~ -1 +
intercept + f(team, model = "iid") + f(x, model = spde)
mod = inla(form,
family = "gaussian",
data = inla.stack.data(stk.all),
control.predictor = list(A = inla.stack.A(stk.all),
compute = T, link = link),
control.compute = list(cpo = TRUE, waic = TRUE))
Check out a summary of the model
summary(mod)
##
## Call:
## c("inla(formula = form, family = \"gaussian\", data =
## inla.stack.data(stk.all), ", " control.compute = list(cpo = TRUE, waic
## = TRUE), control.predictor = list(A = inla.stack.A(stk.all), ", "
## compute = T, link = link))")
## Time used:
## Pre = 8.66, Running = 175, Post = 3.96, Total = 187
## Fixed effects:
## mean sd 0.025quant 0.5quant 0.975quant mode kld
## intercept 0.307 0.183 -0.05 0.304 0.683 0.299 0
##
## Random effects:
## Name Model
## team IID model
## x SPDE2 model
##
## Model hyperparameters:
## mean sd 0.025quant 0.5quant
## Precision for the Gaussian observations 5.610 0.033 5.546 5.610
## Precision for team 6850.508 2943.691 2658.027 6349.423
## Range for x 58.581 11.667 39.912 57.053
## Stdev for x 0.483 0.090 0.338 0.472
## 0.975quant mode
## Precision for the Gaussian observations 5.67e+00 5.609
## Precision for team 1.40e+04 5386.409
## Range for x 8.56e+01 53.856
## Stdev for x 6.89e-01 0.448
##
## Expected number of effective parameters(stdev): 387.85(12.58)
## Number of equivalent replicates : 153.48
##
## Watanabe-Akaike information criterion (WAIC) ...: 66595.54
## Effective number of parameters .................: 315.51
##
## Marginal log-Likelihood: -33597.99
## CPO and PIT are computed
##
## Posterior marginals for the linear predictor and
## the fitted values are computed
mod$summary.random$team %>%
arrange(`0.5quant`) %>%
mutate(ID = factor(ID, levels = ID)) %>%
ggplot(., aes(`0.5quant`, ID)) + geom_point() +
geom_errorbarh(aes(xmin = `0.025quant`, xmax = `0.975quant`)) +
ylab("Home Team Park") + xlab("Relative Park Effect")
Plot the marginals for some of the teams
Colorado Rockies
TEAMS = mod$summary.random$team$ID
plot.inla(mod$marginals.random$team[[which(TEAMS == "COL")]], xlab = "T")
San Francisco Giants
plot.inla(mod$marginals.random$team[[which(TEAMS == "SF")]], xlab = "T")
xx = lapply(seq_along(mod$marginals.random$team),
FUN = function(i){
as.data.frame(mod$marginals.random$team[[i]]) %>%
mutate(Team = TEAMS[i])
})
bind_rows(xx) %>%
ggplot(., aes(x, y, col = Team)) + geom_line()
Take a closewr look at the Rockies and Giants
bind_rows(xx) %>%
dplyr::filter(Team %in% c("COL","SF")) %>%
ggplot(., aes(x, y, col = Team)) + geom_line() + theme_bw()
i.x = inla.stack.index(stk.all, tag = "pred")$data
prop = matrix(mod$summary.fitted.values$`0.5quant`[i.x],
spde$n.spde)
proj = inla.mesh.projector(mesh, dims = c(300, 300))
field.proj <- inla.mesh.project(proj, prop)
loc = expand.grid(proj$x, proj$y)
colnames(loc) = c("x","y")
loc$z = as.vector(field.proj)
loc$inp = GEOmap::inpoly(loc$x, loc$y, POK = list(x = poly$x, y = poly$y))
ggplot() +
geom_tile(data = loc %>% dplyr::filter(inp == 1), aes(x = x, y = y, fill = z)) +
scale_fill_gradient2(midpoint = 0.7) + theme_dark() +
xlab("Exit Velocity") + ylab("Launch Angle") + ggtitle("xwOBA Predictions")