Data Dive Week 10 - Logistic Regression

Start by setting up the packages to manipulate data.

suppressPackageStartupMessages({
  library(tidyverse)
  library(rio)
  library(boot)
  library(broom)
  library(car)
  library(GGally)
  library(ggrepel)
  library(lindia)
  library(performance)
  library(MASS)
  source("aptheme.R") #Code that helps format graphs
  })

Import data and create additional variables to model.

data <- import("plays.csv") %>%
  mutate(converted = yardsGained >= yardsToGo, 
         pass = !(is.na(passLength)),
         zone = pff_manZone == "Zone", 
         incompleted_pass = passResult == "I", 
         interception = passResult == "IN", 
         dropbackDistance = ifelse(is.na(dropbackDistance), 0, dropbackDistance),
         timeToThrow = ifelse(is.na(timeToThrow), 0, timeToThrow),
         timeInTackleBox = ifelse(is.na(timeInTackleBox), 0, timeInTackleBox), 
         score_differential = abs(preSnapHomeScore - preSnapVisitorScore), 
         penaltyYards = ifelse(is.na(penaltyYards), 0, penaltyYards)
         ) 

pass_data <- data %>%
  filter(pass)

Checking the distribution

hist(pass_data$yardsGained)

This distribution doesn’t look like it follows the normality requirements, so we take the log.

hist(log(pass_data$yardsGained))
## Warning in log(pass_data$yardsGained): NaNs produced

This looks better, so now we check the linearity of possible variables to use in the model

df_long <- pass_data %>% 
  pivot_longer(cols = c(quarter, down, yardsToGo, dropbackDistance, timeInTackleBox, zone,
    playAction, passLength, score_differential, penaltyYards), 
               names_to = "variable", 
               values_to = "value")

# open the plot in new window (upper right icon) and drag-extend window as needed
ggplot(data=df_long, aes(x = value, y = log(yardsGained))) +
  geom_point(shape=1) +
  facet_wrap(~variable, scales = "free_x", ncol=3) +
  theme_ap(family = "sans")
## Warning in log(yardsGained): NaNs produced
## Warning: Removed 2089 rows containing missing values or values outside the scale range
## (`geom_point()`).

This shows two important things. First, the passLength variable needs to be transformed, since the relationship is not strictly linear, and second there are a lot of results where the number of yards gained is zero, and that will throw off the results of any model fitted. We can see below that a lot of the zero yardage situations come from incomplete passes, and interceptions (which despite the ball going the other way, is coded as a zero yardage change).

zero_yards <- pass_data %>%
  filter(yardsGained == 0 )
non_zero_yards <- pass_data %>%
  filter(yardsGained != 0 )
table(zero_yards$passResult)
## 
##    C    I   IN 
##   94 2884  193
table(non_zero_yards$passResult)
## 
##    C    I    R 
## 5530   24    1

To handle this, we will do two things: 1) for this specific analysis we will fit a model excluding incomplete passes and interceptions and 2) going forward we will need a model to explain the behavior of pass results (but that will be a task for another data dive).

model <- glm(I(log(yardsGained + 100))   ~ quarter + 
               down + 
               yardsToGo + 
               offenseFormation + 
               dropbackDistance + 
               timeToThrow+ 
               timeInTackleBox +
               zone + 
               penaltyYards + 
               qbKneel + 
               playAction + 
               I(log(passLength + 100)) + 
               dropbackType,
             data = pass_data[!(pass_data$passResult %in% c("I", "IN")),], 
             family = "gaussian")
plot(model$fitted.values, model$residuals)

This shows in the full model that when we remove the intercepted and incomplete passes, the model’s residuals are randomly distributed across the fitted values. Here we partion the data and then itterate through the different explanatory variables to see if any can be removed.

pass_data <- pass_data[!(pass_data$passResult %in% c("I", "IN")),]
exp_var <- c("quarter" , 
               "down" , 
               "yardsToGo" , 
               "offenseFormation" , 
               "dropbackDistance" , 
               "timeToThrow", 
               #"timeInTackleBox" , #Removed due to multicolinearity issues
               "zone" , 
               "penaltyYards" , 
               "qbKneel" , 
               "playAction" , 
               "log(passLength + 100)" , 
               "interception" , 
               "dropbackType"
               )
initial_model <- glm(as.formula(paste("I(log(yardsGained + 100)) ~ ", paste(exp_var, collapse = "+"))),
                     data = pass_data, 
                     family = "gaussian")
initial_AIC <- AIC(initial_model)
new_aic <- initial_AIC -1
while(initial_AIC > new_aic){
 initial_AIC <- new_aic
AIC_vec <- vector()
var_left_out <- vector()

for(i in 1:length(exp_var)){
  test_formula <- as.formula(paste("I(log(yardsGained + 100)) ~ ", paste(exp_var[-i], collapse = "+")))
  test_model <- glm(test_formula,
                    data = pass_data, 
                    family = "gaussian")
  AIC_vec[i] <- AIC(test_model)
  var_left_out[i] <- exp_var[i]
  
}
print(paste("Dropping", exp_var[AIC_vec == min(AIC_vec)]))
exp_var <- c(exp_var[!AIC_vec == min(AIC_vec)])

new_model <-  glm(as.formula(paste("I(log(yardsGained + 100)) ~ ", paste(exp_var[-i], collapse = "+"))),
                  data = pass_data, 
                  family = "gaussian")
new_aic <- AIC(new_model)

}
## [1] "Dropping zone"
## [1] "Dropping offenseFormation"
## [1] "Dropping qbKneel"      "Dropping interception"
new_model <-  glm(as.formula(paste("I(log(yardsGained + 100)) ~ ", paste(exp_var, collapse = "+"))),
                 data = pass_data, 
                 family = "gaussian")
summary(new_model)
## 
## Call:
## glm(formula = as.formula(paste("I(log(yardsGained + 100)) ~ ", 
##     paste(exp_var, collapse = "+"))), family = "gaussian", data = pass_data)
## 
## Coefficients:
##                                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                         0.9057691  0.0507240  17.857  < 2e-16 ***
## quarter                            -0.0015616  0.0006776  -2.304 0.021236 *  
## down                                0.0017499  0.0010265   1.705 0.088305 .  
## yardsToGo                           0.0010260  0.0001978   5.187 2.21e-07 ***
## dropbackDistance                    0.0024093  0.0005103   4.722 2.40e-06 ***
## timeToThrow                        -0.0023330  0.0013082  -1.783 0.074581 .  
## penaltyYards                        0.0074256  0.0004995  14.868  < 2e-16 ***
## playActionTRUE                      0.0109972  0.0020329   5.410 6.57e-08 ***
## log(passLength + 100)               0.8089704  0.0111046  72.850  < 2e-16 ***
## dropbackTypeDESIGNED_ROLLOUT_RIGHT  0.0058286  0.0062899   0.927 0.354137    
## dropbackTypeSCRAMBLE                0.0149018  0.0073639   2.024 0.043056 *  
## dropbackTypeSCRAMBLE_ROLLOUT_LEFT   0.0331002  0.0261987   1.263 0.206488    
## dropbackTypeSCRAMBLE_ROLLOUT_RIGHT -0.0062206  0.0126592  -0.491 0.623170    
## dropbackTypeTRADITIONAL             0.0192737  0.0053107   3.629 0.000287 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 0.003284715)
## 
##     Null deviance: 40.629  on 5624  degrees of freedom
## Residual deviance: 18.431  on 5611  degrees of freedom
## AIC: -16187
## 
## Number of Fisher Scoring iterations: 2

Check for multicolinearity.

vif(new_model)
##                           GVIF Df GVIF^(1/(2*Df))
## quarter               1.014063  1        1.007007
## down                  1.209995  1        1.099998
## yardsToGo             1.109567  1        1.053360
## dropbackDistance      1.697306  1        1.302807
## timeToThrow           2.249486  1        1.499829
## penaltyYards          1.001181  1        1.000590
## playAction            1.367304  1        1.169318
## log(passLength + 100) 1.202791  1        1.096718
## dropbackType          1.699230  5        1.054448

Since all of the variables have a value less than 5, we don’t need to be too concerned about multicoliniearity, although it should be noted that the droback type and time to throw might be related. Now we can look at some diagnostic plots

plot(new_model$fitted.values, new_model$residuals)

plot(new_model)

If we look at the diagnostic plots, we can see potentially some issues with the normality assumption from the QQ plot. It appears that for values further out (so more yards gained), the residuals are further away from the theoretical normal distribution. This is not entierly unexpected since there are explosive plays that are going to be very difficult to fit into the normal regression line. We can generalize relationships between variables from the model, but it’s not going to be very helpful in explaining those large yardage plays.

To interpret the coefficents, we look at the exponentiated values.

exp(new_model$coefficients)
##                        (Intercept)                            quarter 
##                          2.4738337                          0.9984397 
##                               down                          yardsToGo 
##                          1.0017514                          1.0010266 
##                   dropbackDistance                        timeToThrow 
##                          1.0024122                          0.9976697 
##                       penaltyYards                     playActionTRUE 
##                          1.0074533                          1.0110579 
##              log(passLength + 100) dropbackTypeDESIGNED_ROLLOUT_RIGHT 
##                          2.2455948                          1.0058457 
##               dropbackTypeSCRAMBLE  dropbackTypeSCRAMBLE_ROLLOUT_LEFT 
##                          1.0150134                          1.0336541 
## dropbackTypeSCRAMBLE_ROLLOUT_RIGHT            dropbackTypeTRADITIONAL 
##                          0.9937987                          1.0194606

Here we can see a couple of interesting things, particularly with the drop back distance. The father a quarterback drops back in the pocket, the fewer yards a play picks up. This is compared with the time to throw variable that shows more yards picked up for every second the quarterback has to throw. Together this shows that the quarterback needs as much time as possile to work through their reads.