Start by setting up the packages to manipulate data.
suppressPackageStartupMessages({
library(tidyverse)
library(rio)
library(boot)
library(broom)
library(car)
library(GGally)
library(ggrepel)
library(lindia)
library(performance)
library(MASS)
source("aptheme.R") #Code that helps format graphs
})
Import data and create additional variables to model.
data <- import("plays.csv") %>%
mutate(converted = yardsGained >= yardsToGo,
pass = !(is.na(passLength)),
zone = pff_manZone == "Zone",
incompleted_pass = passResult == "I",
interception = passResult == "IN",
dropbackDistance = ifelse(is.na(dropbackDistance), 0, dropbackDistance),
timeToThrow = ifelse(is.na(timeToThrow), 0, timeToThrow),
timeInTackleBox = ifelse(is.na(timeInTackleBox), 0, timeInTackleBox),
score_differential = abs(preSnapHomeScore - preSnapVisitorScore),
penaltyYards = ifelse(is.na(penaltyYards), 0, penaltyYards)
)
pass_data <- data %>%
filter(pass)
Checking the distribution
hist(pass_data$yardsGained)
This distribution doesn’t look like it follows the normality requirements, so we take the log.
hist(log(pass_data$yardsGained))
## Warning in log(pass_data$yardsGained): NaNs produced
This looks better, so now we check the linearity of possible variables
to use in the model
df_long <- pass_data %>%
pivot_longer(cols = c(quarter, down, yardsToGo, dropbackDistance, timeInTackleBox, zone,
playAction, passLength, score_differential, penaltyYards),
names_to = "variable",
values_to = "value")
# open the plot in new window (upper right icon) and drag-extend window as needed
ggplot(data=df_long, aes(x = value, y = log(yardsGained))) +
geom_point(shape=1) +
facet_wrap(~variable, scales = "free_x", ncol=3) +
theme_ap(family = "sans")
## Warning in log(yardsGained): NaNs produced
## Warning: Removed 2089 rows containing missing values or values outside the scale range
## (`geom_point()`).
This shows two important things. First, the
passLength
variable needs to be transformed, since the relationship is not strictly
linear, and second there are a lot of results where the number of yards
gained is zero, and that will throw off the results of any model fitted.
We can see below that a lot of the zero yardage situations come from
incomplete passes, and interceptions (which despite the ball going the
other way, is coded as a zero yardage change).
zero_yards <- pass_data %>%
filter(yardsGained == 0 )
non_zero_yards <- pass_data %>%
filter(yardsGained != 0 )
table(zero_yards$passResult)
##
## C I IN
## 94 2884 193
table(non_zero_yards$passResult)
##
## C I R
## 5530 24 1
To handle this, we will do two things: 1) for this specific analysis we will fit a model excluding incomplete passes and interceptions and 2) going forward we will need a model to explain the behavior of pass results (but that will be a task for another data dive).
model <- glm(I(log(yardsGained + 100)) ~ quarter +
down +
yardsToGo +
offenseFormation +
dropbackDistance +
timeToThrow+
timeInTackleBox +
zone +
penaltyYards +
qbKneel +
playAction +
I(log(passLength + 100)) +
dropbackType,
data = pass_data[!(pass_data$passResult %in% c("I", "IN")),],
family = "gaussian")
plot(model$fitted.values, model$residuals)
This shows in the full model that when we remove the intercepted and
incomplete passes, the model’s residuals are randomly distributed across
the fitted values. Here we partion the data and then itterate through
the different explanatory variables to see if any can be removed.
pass_data <- pass_data[!(pass_data$passResult %in% c("I", "IN")),]
exp_var <- c("quarter" ,
"down" ,
"yardsToGo" ,
"offenseFormation" ,
"dropbackDistance" ,
"timeToThrow",
#"timeInTackleBox" , #Removed due to multicolinearity issues
"zone" ,
"penaltyYards" ,
"qbKneel" ,
"playAction" ,
"log(passLength + 100)" ,
"interception" ,
"dropbackType"
)
initial_model <- glm(as.formula(paste("I(log(yardsGained + 100)) ~ ", paste(exp_var, collapse = "+"))),
data = pass_data,
family = "gaussian")
initial_AIC <- AIC(initial_model)
new_aic <- initial_AIC -1
while(initial_AIC > new_aic){
initial_AIC <- new_aic
AIC_vec <- vector()
var_left_out <- vector()
for(i in 1:length(exp_var)){
test_formula <- as.formula(paste("I(log(yardsGained + 100)) ~ ", paste(exp_var[-i], collapse = "+")))
test_model <- glm(test_formula,
data = pass_data,
family = "gaussian")
AIC_vec[i] <- AIC(test_model)
var_left_out[i] <- exp_var[i]
}
print(paste("Dropping", exp_var[AIC_vec == min(AIC_vec)]))
exp_var <- c(exp_var[!AIC_vec == min(AIC_vec)])
new_model <- glm(as.formula(paste("I(log(yardsGained + 100)) ~ ", paste(exp_var[-i], collapse = "+"))),
data = pass_data,
family = "gaussian")
new_aic <- AIC(new_model)
}
## [1] "Dropping zone"
## [1] "Dropping offenseFormation"
## [1] "Dropping qbKneel" "Dropping interception"
new_model <- glm(as.formula(paste("I(log(yardsGained + 100)) ~ ", paste(exp_var, collapse = "+"))),
data = pass_data,
family = "gaussian")
summary(new_model)
##
## Call:
## glm(formula = as.formula(paste("I(log(yardsGained + 100)) ~ ",
## paste(exp_var, collapse = "+"))), family = "gaussian", data = pass_data)
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.9057691 0.0507240 17.857 < 2e-16 ***
## quarter -0.0015616 0.0006776 -2.304 0.021236 *
## down 0.0017499 0.0010265 1.705 0.088305 .
## yardsToGo 0.0010260 0.0001978 5.187 2.21e-07 ***
## dropbackDistance 0.0024093 0.0005103 4.722 2.40e-06 ***
## timeToThrow -0.0023330 0.0013082 -1.783 0.074581 .
## penaltyYards 0.0074256 0.0004995 14.868 < 2e-16 ***
## playActionTRUE 0.0109972 0.0020329 5.410 6.57e-08 ***
## log(passLength + 100) 0.8089704 0.0111046 72.850 < 2e-16 ***
## dropbackTypeDESIGNED_ROLLOUT_RIGHT 0.0058286 0.0062899 0.927 0.354137
## dropbackTypeSCRAMBLE 0.0149018 0.0073639 2.024 0.043056 *
## dropbackTypeSCRAMBLE_ROLLOUT_LEFT 0.0331002 0.0261987 1.263 0.206488
## dropbackTypeSCRAMBLE_ROLLOUT_RIGHT -0.0062206 0.0126592 -0.491 0.623170
## dropbackTypeTRADITIONAL 0.0192737 0.0053107 3.629 0.000287 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 0.003284715)
##
## Null deviance: 40.629 on 5624 degrees of freedom
## Residual deviance: 18.431 on 5611 degrees of freedom
## AIC: -16187
##
## Number of Fisher Scoring iterations: 2
Check for multicolinearity.
vif(new_model)
## GVIF Df GVIF^(1/(2*Df))
## quarter 1.014063 1 1.007007
## down 1.209995 1 1.099998
## yardsToGo 1.109567 1 1.053360
## dropbackDistance 1.697306 1 1.302807
## timeToThrow 2.249486 1 1.499829
## penaltyYards 1.001181 1 1.000590
## playAction 1.367304 1 1.169318
## log(passLength + 100) 1.202791 1 1.096718
## dropbackType 1.699230 5 1.054448
Since all of the variables have a value less than 5, we don’t need to be too concerned about multicoliniearity, although it should be noted that the droback type and time to throw might be related. Now we can look at some diagnostic plots
plot(new_model$fitted.values, new_model$residuals)
plot(new_model)
If we look at the diagnostic plots, we can see potentially some issues with the normality assumption from the QQ plot. It appears that for values further out (so more yards gained), the residuals are further away from the theoretical normal distribution. This is not entierly unexpected since there are explosive plays that are going to be very difficult to fit into the normal regression line. We can generalize relationships between variables from the model, but it’s not going to be very helpful in explaining those large yardage plays.
To interpret the coefficents, we look at the exponentiated values.
exp(new_model$coefficients)
## (Intercept) quarter
## 2.4738337 0.9984397
## down yardsToGo
## 1.0017514 1.0010266
## dropbackDistance timeToThrow
## 1.0024122 0.9976697
## penaltyYards playActionTRUE
## 1.0074533 1.0110579
## log(passLength + 100) dropbackTypeDESIGNED_ROLLOUT_RIGHT
## 2.2455948 1.0058457
## dropbackTypeSCRAMBLE dropbackTypeSCRAMBLE_ROLLOUT_LEFT
## 1.0150134 1.0336541
## dropbackTypeSCRAMBLE_ROLLOUT_RIGHT dropbackTypeTRADITIONAL
## 0.9937987 1.0194606
Here we can see a couple of interesting things, particularly with the drop back distance. The father a quarterback drops back in the pocket, the fewer yards a play picks up. This is compared with the time to throw variable that shows more yards picked up for every second the quarterback has to throw. Together this shows that the quarterback needs as much time as possile to work through their reads.