Load data
data <- load_pbp(2000:2024) %>% select(season, yardline_100, game_seconds_remaining,
down, play_type, touchdown, score_differential, wpa, epa)
Exploratory Data Analysis
df <- data %>% drop_na(down, epa) %>%
group_by(season) %>%
summarise(epa_per_down = mean(epa/down))
ggplot(df, aes(x = season, y = epa_per_down)) +
geom_point()
Conclusion: EPA per down steadily increased from 2000-2010, but then
doesn’t follow a noticeable pattern from 2011-2024. Use this time period
to calculate LEPARD metric.
data <- load_pbp(2011:2024) %>% select(season, game_id, yardline_100, game_seconds_remaining,
down, play_type, touchdown, score_differential, wpa, qb_epa)
Create categorical variables based on down, distance, score and time.
df <- data %>%
filter(!(play_type %in% c("kickoff", "no_play", "extra_point", "qb_kneel", "qb_spike", "field_goal"))) %>%
drop_na(play_type, down, wpa, qb_epa) %>%
mutate(
down_group = as_factor(down),
yardline_group = as_factor(case_when(
yardline_100 <= 20 ~ "red_zone",
yardline_100 <= 43 & yardline_100 > 20 ~ "fg_range",
yardline_100 <= 65 & yardline_100 > 43 ~ "midfield",
yardline_100 <= 100 & yardline_100 > 65 ~ "far_away")
),
time_group = as_factor( case_when(
game_seconds_remaining <= 3600 & game_seconds_remaining > 900 ~ "Q1_to_Q3",
game_seconds_remaining <= 900 & game_seconds_remaining >= 0 ~ "Q4"
)),
margin_group = as_factor(case_when(
score_differential >= -3 & score_differential < 1 ~ "fg_wins_or_ties",
score_differential >= -8 & score_differential < -3 ~ "td_wins_or_ties",
score_differential >= -16 & score_differential < -8 ~ "two_scores_wins_or_ties",
score_differential < -16 ~ "far_behind",
score_differential > 0 & score_differential <= 16 ~ "lead",
score_differential > 16 ~ "big_lead")),
touchdown = if_else(touchdown == 0, "no_td", "td_scored"))
anyNA(df)
## [1] FALSE
Create a new independent variable called wpa_diff that compares the average difference in wpa when a touchdown is scored on a play versus not. This will be used to weight each play situation.
df1 <- df %>% select(touchdown, wpa, down_group, yardline_group, time_group, margin_group) %>%
group_by(down_group, yardline_group, touchdown, time_group, margin_group) %>%
summarize(mean_wpa = mean(wpa)) %>%
pivot_wider(names_from = touchdown, values_from = mean_wpa) %>%
ungroup() %>%
mutate(wpa_diff = td_scored - no_td)
## `summarise()` has grouped output by 'down_group', 'yardline_group',
## 'touchdown', 'time_group'. You can override using the `.groups` argument.
Run an analysis on the categorical variables and their correlation with wpa_diff. Note: I iterated on the parameters of the categorical variables multiple times, however for brevity I am only showing one such iteration here. In my opinion this is the biggest change from my original thought process.
Boxplots:
ggplot(df1, aes(x = down_group, y = wpa_diff)) +
geom_boxplot(fill = "skyblue", outlier.color = "red") +
labs(title = "wpa_diff vs. down_group", x = "down_group", y = "wpa_diff") +
theme_minimal()
ggplot(df1, aes(x = yardline_group, y = wpa_diff)) +
geom_boxplot(fill = "skyblue", outlier.color = "red") +
labs(title = "wpa_diff vs. yardline_group", x = "yardline_group", y = "wpa_diff") +
theme_minimal()
ggplot(df1, aes(x = time_group, y = wpa_diff)) +
geom_boxplot(fill = "skyblue", outlier.color = "red") +
labs(title = "wpa_diff vs. time_group", x = "time_group", y = "wpa_diff") +
theme_minimal()
ggplot(df1, aes(x = margin_group, y = wpa_diff)) +
geom_boxplot(fill = "skyblue", outlier.color = "red") +
labs(title = "wpa_diff vs. margin_group", x = "margin_group", y = "wpa_diff") +
theme_minimal()
ANOVA for statistical significance
anova_result <- aov(wpa_diff ~ down_group, data = df1)
summary(anova_result)
## Df Sum Sq Mean Sq F value Pr(>F)
## down_group 3 0.016 0.00518 0.69 0.56
## Residuals 188 1.419 0.00755
anova_result <- aov(wpa_diff ~ yardline_group, data = df1)
summary(anova_result)
## Df Sum Sq Mean Sq F value Pr(>F)
## yardline_group 3 0.201 0.0670 10.2 0.0000029 ***
## Residuals 188 1.233 0.0066
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova_result <- aov(wpa_diff ~ time_group, data = df1)
summary(anova_result)
## Df Sum Sq Mean Sq F value Pr(>F)
## time_group 1 0.042 0.0419 5.71 0.018 *
## Residuals 190 1.393 0.0073
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova_result <- aov(wpa_diff ~ margin_group, data = df1)
summary(anova_result)
## Df Sum Sq Mean Sq F value Pr(>F)
## margin_group 5 0.398 0.0796 14.3 0.0000000000078 ***
## Residuals 186 1.036 0.0056
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Regression model
set.seed(13)
model <- lm(wpa_diff ~ down_group + yardline_group + time_group + margin_group, data = df1)
summary(model)
##
## Call:
## lm(formula = wpa_diff ~ down_group + yardline_group + time_group +
## margin_group, data = df1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.2577 -0.0300 -0.0033 0.0251 0.4081
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.03589 0.01715 2.09 0.03785
## down_group2 -0.01228 0.01346 -0.91 0.36270
## down_group3 0.00879 0.01346 0.65 0.51445
## down_group4 -0.01269 0.01346 -0.94 0.34711
## yardline_groupmidfield 0.05329 0.01346 3.96 0.00011
## yardline_groupfg_range 0.08490 0.01346 6.31 0.0000000021
## yardline_groupred_zone 0.07208 0.01346 5.36 0.0000002582
## time_groupQ4 0.02954 0.00952 3.10 0.00222
## margin_grouptd_wins_or_ties 0.01978 0.01648 1.20 0.23169
## margin_grouplead -0.08901 0.01648 -5.40 0.0000002093
## margin_grouptwo_scores_wins_or_ties -0.04722 0.01648 -2.86 0.00467
## margin_groupfar_behind -0.08749 0.01648 -5.31 0.0000003245
## margin_groupbig_lead -0.09620 0.01648 -5.84 0.0000000244
##
## (Intercept) *
## down_group2
## down_group3
## down_group4
## yardline_groupmidfield ***
## yardline_groupfg_range ***
## yardline_groupred_zone ***
## time_groupQ4 **
## margin_grouptd_wins_or_ties
## margin_grouplead ***
## margin_grouptwo_scores_wins_or_ties **
## margin_groupfar_behind ***
## margin_groupbig_lead ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.0659 on 179 degrees of freedom
## Multiple R-squared: 0.458, Adjusted R-squared: 0.421
## F-statistic: 12.6 on 12 and 179 DF, p-value: <0.0000000000000002
Conclusion: Down is not strongly correlated with wpa_diff. Remove it. Also, having a “big lead” and being “far behind” have similar means and distributions, so combine them.
df2 <- data %>%
filter(!(play_type %in% c("kickoff", "no_play", "extra_point", "qb_kneel", "qb_spike", "field_goal"))) %>%
drop_na(play_type, down, wpa, qb_epa) %>%
mutate(
yardline_group = as_factor(case_when(
yardline_100 <= 20 ~ "red_zone",
yardline_100 <= 43 & yardline_100 > 20 ~ "fg_range",
yardline_100 <= 65 & yardline_100 > 43 ~ "midfield",
yardline_100 <= 100 & yardline_100 > 65 ~ "far_away")
),
time_group = as_factor( case_when(
game_seconds_remaining <= 3600 & game_seconds_remaining > 900 ~ "Q1_to_Q3",
game_seconds_remaining <= 900 & game_seconds_remaining >= 0 ~ "Q4"
)),
margin_group = as_factor(case_when(
score_differential >= -3 & score_differential < 1 ~ "fg_wins_or_ties",
score_differential >= -8 & score_differential < -3 ~ "td_wins_or_ties",
score_differential >= -16 & score_differential < -8 ~ "two_scores_wins_or_ties",
score_differential < -16 | score_differential > 16 ~ "big_pt_dif",
score_differential > 0 & score_differential <= 16 ~ "lead")),
touchdown = if_else(touchdown == 0, "no_td", "td_scored")) %>%
select(touchdown, wpa, yardline_group, time_group, margin_group) %>%
group_by(yardline_group, touchdown, time_group, margin_group) %>%
summarize(mean_wpa = mean(wpa)) %>%
pivot_wider(names_from = touchdown, values_from = mean_wpa) %>%
ungroup() %>%
mutate(wpa_diff = td_scored - no_td)
## `summarise()` has grouped output by 'yardline_group', 'touchdown',
## 'time_group'. You can override using the `.groups` argument.
anyNA(df2)
## [1] FALSE
summary(df2)
## yardline_group time_group margin_group
## far_away:10 Q1_to_Q3:20 fg_wins_or_ties :8
## midfield:10 Q4 :20 td_wins_or_ties :8
## fg_range:10 lead :8
## red_zone:10 two_scores_wins_or_ties:8
## big_pt_dif :8
##
## no_td td_scored wpa_diff
## Min. :-0.037975 Min. :-0.0611 Min. :-0.0597
## 1st Qu.:-0.002304 1st Qu.: 0.0115 1st Qu.: 0.0124
## Median :-0.001256 Median : 0.0386 Median : 0.0447
## Mean :-0.003412 Mean : 0.0540 Mean : 0.0574
## 3rd Qu.:-0.000264 3rd Qu.: 0.0798 3rd Qu.: 0.0840
## Max. : 0.004218 Max. : 0.2136 Max. : 0.2147
Re-run regression model and compare to original
set.seed(13)
model2 <- lm(wpa_diff ~ yardline_group + time_group + margin_group, data = df2)
summary(model2)
##
## Call:
## lm(formula = wpa_diff ~ yardline_group + time_group + margin_group,
## data = df2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.04940 -0.02572 -0.00331 0.02259 0.05802
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.0281 0.0165 1.70 0.09851 .
## yardline_groupmidfield 0.0622 0.0156 4.00 0.00036 ***
## yardline_groupfg_range 0.0734 0.0156 4.72 0.000048 ***
## yardline_groupred_zone 0.0551 0.0156 3.54 0.00128 **
## time_groupQ4 0.0270 0.0110 2.45 0.01994 *
## margin_grouptd_wins_or_ties 0.0301 0.0174 1.73 0.09324 .
## margin_grouplead -0.0704 0.0174 -4.05 0.00032 ***
## margin_grouptwo_scores_wins_or_ties -0.0390 0.0174 -2.24 0.03218 *
## margin_groupbig_pt_dif -0.0803 0.0174 -4.62 0.000064 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.0348 on 31 degrees of freedom
## Multiple R-squared: 0.744, Adjusted R-squared: 0.678
## F-statistic: 11.3 on 8 and 31 DF, p-value: 0.000000257
Conclusion: The categories have low p-values and the new model has a higher Adjusted R-Squared (0.678 vs 0.421). Proceed with these categorical variables.
Make and store regression predictions, standardize and create leverage factor
df3 <- df2 %>%
mutate(predicted_wpa = predict(model2, newdata = df2))
df3 <- df3 %>% mutate(
leverage_factor = (predicted_wpa - mean(predicted_wpa))/sd(predicted_wpa))
adj <- min(df3$leverage_factor)
df3 <- df3 %>% mutate(
leverage_factor = leverage_factor + (adj * -1))
leverage_factors <- df3 %>% select(yardline_group, time_group, margin_group, leverage_factor)
summary(leverage_factors)
## yardline_group time_group margin_group leverage_factor
## far_away:10 Q1_to_Q3:20 fg_wins_or_ties :8 Min. :0.00
## midfield:10 Q4 :20 td_wins_or_ties :8 1st Qu.:1.38
## fg_range:10 lead :8 Median :1.99
## red_zone:10 two_scores_wins_or_ties:8 Mean :2.07
## big_pt_dif :8 3rd Qu.:2.75
## Max. :3.98
write.csv(leverage_factors, file = "leverage_factors.csv")