Load data

data <- load_pbp(2000:2024) %>% select(season, yardline_100, game_seconds_remaining, 
                        down, play_type, touchdown, score_differential, wpa, epa)

Exploratory Data Analysis

df <- data %>% drop_na(down, epa) %>%
  group_by(season) %>%
  summarise(epa_per_down = mean(epa/down))

ggplot(df, aes(x = season, y = epa_per_down)) +
  geom_point()

Conclusion: EPA per down steadily increased from 2000-2010, but then doesn’t follow a noticeable pattern from 2011-2024. Use this time period to calculate LEPARD metric.

data <- load_pbp(2011:2024) %>% select(season, game_id, yardline_100, game_seconds_remaining, 
                        down, play_type, touchdown, score_differential, wpa, qb_epa)

Create categorical variables based on down, distance, score and time.

df <- data %>%
   filter(!(play_type %in% c("kickoff", "no_play", "extra_point", "qb_kneel", "qb_spike", "field_goal"))) %>%
  drop_na(play_type, down, wpa, qb_epa) %>%
  mutate(
    down_group = as_factor(down),
  yardline_group = as_factor(case_when(
    yardline_100 <= 20 ~ "red_zone",
    yardline_100 <= 43 & yardline_100 > 20 ~ "fg_range",
    yardline_100 <= 65 & yardline_100 > 43 ~ "midfield",
    yardline_100 <= 100 & yardline_100 > 65 ~ "far_away")
  ),
  time_group = as_factor( case_when(
    game_seconds_remaining <= 3600 & game_seconds_remaining > 900 ~ "Q1_to_Q3",
    game_seconds_remaining <= 900 & game_seconds_remaining >= 0 ~ "Q4"
  )),
  margin_group = as_factor(case_when(
    score_differential >= -3 & score_differential < 1 ~ "fg_wins_or_ties",
    score_differential >= -8 & score_differential < -3 ~ "td_wins_or_ties",
    score_differential >= -16 & score_differential < -8 ~ "two_scores_wins_or_ties",
    score_differential < -16 ~ "far_behind",
    score_differential > 0 & score_differential <= 16 ~ "lead",
    score_differential > 16  ~ "big_lead")),
  touchdown = if_else(touchdown == 0, "no_td", "td_scored"))

anyNA(df)
## [1] FALSE

Create a new independent variable called wpa_diff that compares the average difference in wpa when a touchdown is scored on a play versus not. This will be used to weight each play situation.

df1 <- df %>% select(touchdown, wpa, down_group, yardline_group, time_group, margin_group) %>%
  group_by(down_group, yardline_group, touchdown, time_group, margin_group) %>%
  summarize(mean_wpa = mean(wpa)) %>%
  pivot_wider(names_from = touchdown, values_from = mean_wpa) %>%
  ungroup() %>%
  mutate(wpa_diff = td_scored - no_td)
## `summarise()` has grouped output by 'down_group', 'yardline_group',
## 'touchdown', 'time_group'. You can override using the `.groups` argument.

Run an analysis on the categorical variables and their correlation with wpa_diff. Note: I iterated on the parameters of the categorical variables multiple times, however for brevity I am only showing one such iteration here. In my opinion this is the biggest change from my original thought process.

Boxplots:

ggplot(df1, aes(x = down_group, y = wpa_diff)) +
  geom_boxplot(fill = "skyblue", outlier.color = "red") +
  labs(title = "wpa_diff vs. down_group", x = "down_group", y = "wpa_diff") +
  theme_minimal()

ggplot(df1, aes(x = yardline_group, y = wpa_diff)) +
  geom_boxplot(fill = "skyblue", outlier.color = "red") +
  labs(title = "wpa_diff vs. yardline_group", x = "yardline_group", y = "wpa_diff") +
  theme_minimal()

ggplot(df1, aes(x = time_group, y = wpa_diff)) +
  geom_boxplot(fill = "skyblue", outlier.color = "red") +
  labs(title = "wpa_diff vs. time_group", x = "time_group", y = "wpa_diff") +
  theme_minimal()

ggplot(df1, aes(x = margin_group, y = wpa_diff)) +
  geom_boxplot(fill = "skyblue", outlier.color = "red") +
  labs(title = "wpa_diff vs. margin_group", x = "margin_group", y = "wpa_diff") +
  theme_minimal()

ANOVA for statistical significance

anova_result <- aov(wpa_diff ~ down_group, data = df1)
summary(anova_result)
##              Df Sum Sq Mean Sq F value Pr(>F)
## down_group    3  0.016 0.00518    0.69   0.56
## Residuals   188  1.419 0.00755
anova_result <- aov(wpa_diff ~ yardline_group, data = df1)
summary(anova_result)
##                 Df Sum Sq Mean Sq F value    Pr(>F)    
## yardline_group   3  0.201  0.0670    10.2 0.0000029 ***
## Residuals      188  1.233  0.0066                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova_result <- aov(wpa_diff ~ time_group, data = df1)
summary(anova_result)
##              Df Sum Sq Mean Sq F value Pr(>F)  
## time_group    1  0.042  0.0419    5.71  0.018 *
## Residuals   190  1.393  0.0073                 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova_result <- aov(wpa_diff ~ margin_group, data = df1)
summary(anova_result)
##               Df Sum Sq Mean Sq F value          Pr(>F)    
## margin_group   5  0.398  0.0796    14.3 0.0000000000078 ***
## Residuals    186  1.036  0.0056                            
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Regression model

set.seed(13)

model <- lm(wpa_diff ~ down_group + yardline_group + time_group + margin_group, data = df1)

summary(model)
## 
## Call:
## lm(formula = wpa_diff ~ down_group + yardline_group + time_group + 
##     margin_group, data = df1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.2577 -0.0300 -0.0033  0.0251  0.4081 
## 
## Coefficients:
##                                     Estimate Std. Error t value     Pr(>|t|)
## (Intercept)                          0.03589    0.01715    2.09      0.03785
## down_group2                         -0.01228    0.01346   -0.91      0.36270
## down_group3                          0.00879    0.01346    0.65      0.51445
## down_group4                         -0.01269    0.01346   -0.94      0.34711
## yardline_groupmidfield               0.05329    0.01346    3.96      0.00011
## yardline_groupfg_range               0.08490    0.01346    6.31 0.0000000021
## yardline_groupred_zone               0.07208    0.01346    5.36 0.0000002582
## time_groupQ4                         0.02954    0.00952    3.10      0.00222
## margin_grouptd_wins_or_ties          0.01978    0.01648    1.20      0.23169
## margin_grouplead                    -0.08901    0.01648   -5.40 0.0000002093
## margin_grouptwo_scores_wins_or_ties -0.04722    0.01648   -2.86      0.00467
## margin_groupfar_behind              -0.08749    0.01648   -5.31 0.0000003245
## margin_groupbig_lead                -0.09620    0.01648   -5.84 0.0000000244
##                                        
## (Intercept)                         *  
## down_group2                            
## down_group3                            
## down_group4                            
## yardline_groupmidfield              ***
## yardline_groupfg_range              ***
## yardline_groupred_zone              ***
## time_groupQ4                        ** 
## margin_grouptd_wins_or_ties            
## margin_grouplead                    ***
## margin_grouptwo_scores_wins_or_ties ** 
## margin_groupfar_behind              ***
## margin_groupbig_lead                ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.0659 on 179 degrees of freedom
## Multiple R-squared:  0.458,  Adjusted R-squared:  0.421 
## F-statistic: 12.6 on 12 and 179 DF,  p-value: <0.0000000000000002

Conclusion: Down is not strongly correlated with wpa_diff. Remove it. Also, having a “big lead” and being “far behind” have similar means and distributions, so combine them.

df2 <- data %>%
   filter(!(play_type %in% c("kickoff", "no_play", "extra_point", "qb_kneel", "qb_spike", "field_goal"))) %>%
  drop_na(play_type, down, wpa, qb_epa) %>%
  mutate(
  yardline_group = as_factor(case_when(
    yardline_100 <= 20 ~ "red_zone",
    yardline_100 <= 43 & yardline_100 > 20 ~ "fg_range",
    yardline_100 <= 65 & yardline_100 > 43 ~ "midfield",
    yardline_100 <= 100 & yardline_100 > 65 ~ "far_away")
  ),
  time_group = as_factor( case_when(
    game_seconds_remaining <= 3600 & game_seconds_remaining > 900 ~ "Q1_to_Q3",
    game_seconds_remaining <= 900 & game_seconds_remaining >= 0 ~ "Q4"
  )),
  margin_group = as_factor(case_when(
    score_differential >= -3 & score_differential < 1 ~ "fg_wins_or_ties",
    score_differential >= -8 & score_differential < -3 ~ "td_wins_or_ties",
    score_differential >= -16 & score_differential < -8 ~ "two_scores_wins_or_ties",
    score_differential < -16 | score_differential > 16  ~ "big_pt_dif",
    score_differential > 0 & score_differential <= 16 ~ "lead")),
  touchdown = if_else(touchdown == 0, "no_td", "td_scored")) %>%
  select(touchdown, wpa, yardline_group, time_group, margin_group) %>%
  group_by(yardline_group, touchdown, time_group, margin_group) %>%
  summarize(mean_wpa = mean(wpa)) %>%
  pivot_wider(names_from = touchdown, values_from = mean_wpa) %>%
  ungroup() %>%
  mutate(wpa_diff = td_scored - no_td)
## `summarise()` has grouped output by 'yardline_group', 'touchdown',
## 'time_group'. You can override using the `.groups` argument.
anyNA(df2)
## [1] FALSE
summary(df2)
##   yardline_group    time_group                  margin_group
##  far_away:10     Q1_to_Q3:20   fg_wins_or_ties        :8    
##  midfield:10     Q4      :20   td_wins_or_ties        :8    
##  fg_range:10                   lead                   :8    
##  red_zone:10                   two_scores_wins_or_ties:8    
##                                big_pt_dif             :8    
##                                                             
##      no_td             td_scored          wpa_diff      
##  Min.   :-0.037975   Min.   :-0.0611   Min.   :-0.0597  
##  1st Qu.:-0.002304   1st Qu.: 0.0115   1st Qu.: 0.0124  
##  Median :-0.001256   Median : 0.0386   Median : 0.0447  
##  Mean   :-0.003412   Mean   : 0.0540   Mean   : 0.0574  
##  3rd Qu.:-0.000264   3rd Qu.: 0.0798   3rd Qu.: 0.0840  
##  Max.   : 0.004218   Max.   : 0.2136   Max.   : 0.2147

Re-run regression model and compare to original

set.seed(13)

model2 <- lm(wpa_diff ~ yardline_group + time_group + margin_group, data = df2)

summary(model2)
## 
## Call:
## lm(formula = wpa_diff ~ yardline_group + time_group + margin_group, 
##     data = df2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.04940 -0.02572 -0.00331  0.02259  0.05802 
## 
## Coefficients:
##                                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                           0.0281     0.0165    1.70  0.09851 .  
## yardline_groupmidfield                0.0622     0.0156    4.00  0.00036 ***
## yardline_groupfg_range                0.0734     0.0156    4.72 0.000048 ***
## yardline_groupred_zone                0.0551     0.0156    3.54  0.00128 ** 
## time_groupQ4                          0.0270     0.0110    2.45  0.01994 *  
## margin_grouptd_wins_or_ties           0.0301     0.0174    1.73  0.09324 .  
## margin_grouplead                     -0.0704     0.0174   -4.05  0.00032 ***
## margin_grouptwo_scores_wins_or_ties  -0.0390     0.0174   -2.24  0.03218 *  
## margin_groupbig_pt_dif               -0.0803     0.0174   -4.62 0.000064 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.0348 on 31 degrees of freedom
## Multiple R-squared:  0.744,  Adjusted R-squared:  0.678 
## F-statistic: 11.3 on 8 and 31 DF,  p-value: 0.000000257

Conclusion: The categories have low p-values and the new model has a higher Adjusted R-Squared (0.678 vs 0.421). Proceed with these categorical variables.

Make and store regression predictions, standardize and create leverage factor

df3 <- df2 %>%
  mutate(predicted_wpa = predict(model2, newdata = df2))

df3 <- df3 %>% mutate(
  leverage_factor = (predicted_wpa - mean(predicted_wpa))/sd(predicted_wpa))

adj <- min(df3$leverage_factor)

df3 <- df3 %>% mutate(
  leverage_factor = leverage_factor + (adj * -1))

leverage_factors <- df3 %>% select(yardline_group, time_group, margin_group, leverage_factor)

summary(leverage_factors)
##   yardline_group    time_group                  margin_group leverage_factor
##  far_away:10     Q1_to_Q3:20   fg_wins_or_ties        :8     Min.   :0.00   
##  midfield:10     Q4      :20   td_wins_or_ties        :8     1st Qu.:1.38   
##  fg_range:10                   lead                   :8     Median :1.99   
##  red_zone:10                   two_scores_wins_or_ties:8     Mean   :2.07   
##                                big_pt_dif             :8     3rd Qu.:2.75   
##                                                              Max.   :3.98
write.csv(leverage_factors, file = "leverage_factors.csv")