The purpose of this analysis is to create a metric to extract drive-level performance for NFL Offenses, accounting for variables such as field position, kicker quality, time remaining, etc. This analysis introduces a metric called Points Scored Over Expected that analyzes both how often teams score Touchdowns above average and how often teams attempt field goals that exceed expected rates.
This is intended to be an exploratory analysis to provide a base for how the model could be formed.
Data Source: nflfastR
Code available below if interested. Years included from 2017 to 2025.
library(tidyverse)
# Load nflfastR play-by-play for the 2017-2025 seasons, keep rows whose
# season_type is "REG", and retain only the columns listed below.
# Each column is listed once, in the order it should appear in `games`.
games <- nflfastR::load_pbp(2017:2025) %>%
  dplyr::filter(season_type == "REG") %>%
  select(
    # Game / situation identifiers
    play_id, game_id, home_team, away_team, week, posteam, posteam_type,
    defteam, yardline_100, yards_gained, half_seconds_remaining,
    game_seconds_remaining, quarter_end, drive, sp, qtr, down, goal_to_go,
    ydstogo, ydsnet, desc, play_type,
    # Play style flags
    shotgun, no_huddle, qb_dropback, qb_kneel, qb_spike, qb_scramble,
    yards_after_catch, run_location, run_gap,
    # Kicking and conversions
    field_goal_result, field_goal_attempt, kick_distance, extra_point_result,
    extra_point_attempt, two_point_conv_result,
    # Timeouts and clock
    home_timeouts_remaining, away_timeouts_remaining, timeout, timeout_team,
    time_of_day,
    # Scoring and expected points
    td_team, td_player_name, td_prob, posteam_timeouts_remaining,
    posteam_score, posteam_score_post, defteam_score, defteam_score_post,
    score_differential, ep, epa,
    # Play outcome indicators
    punt_blocked, first_down, third_down_converted, interception, fumble,
    fumble_lost, solo_tackle, safety, penalty, tackled_for_loss,
    own_kickoff_recovery, sack, touchdown, pass_touchdown, rush_touchdown,
    return_touchdown, complete_pass, assist_tackle,
    # Players involved
    passer_player_id, passer_player_name, passing_yards,
    receiver_player_id, receiver_player_name, receiving_yards,
    rusher_player_id, rusher_player_name, rushing_yards,
    interception_player_id, interception_player_name,
    kicker_player_id, kicker_player_name,
    solo_tackle_1_player_id, solo_tackle_1_player_name,
    assist_tackle_1_player_id, assist_tackle_1_player_name,
    assist_tackle_2_player_id, assist_tackle_2_player_name,
    tackle_with_assist, fumbled_1_player_id, fumbled_1_player_name,
    sack_player_id, sack_player_name,
    # Returns, penalties, reviews
    return_team, return_yards, penalty_team, penalty_player_id,
    penalty_player_name, penalty_yards, penalty_type, replay_or_challenge,
    replay_or_challenge_result,
    # Passing model outputs, series info, environment
    cp, cpoe, series, series_result, series_success, stadium, weather,
    play_clock, play_type_nfl, play, special_teams_play,
    # Drive-level descriptors
    drive_end_transition, drive_end_yard_line, drive_ended_with_score,
    drive_first_downs, drive_play_count, drive_game_clock_start,
    drive_start_yard_line, drive_start_transition, drive_yards_penalized,
    # Betting lines, venue, misc flags
    spread_line, total_line, roof, surface, pass, rush, out_of_bounds,
    special, season
  )
# Build `df1`: one row per drive (per game / possession team), summarising the
# plays in `games` that are not timeouts and that have a drive number.
#
# The result is returned UNGROUPED (`.groups = "drop"`), so later steps do not
# silently inherit the 16 leftover grouping columns from this summary.
#
# NOTE(review): `max(..., na.rm = TRUE)` on a selection with no non-missing
# values emits a warning; for numeric input (e.g. FG_Distance on a drive with
# no field-goal attempt) the value is -Inf. That behaviour is kept as-is here
# because later steps may rely on it -- confirm before changing it to NA.
df1 <- games %>%
  filter(timeout == 0 & !is.na(drive)) %>%
  group_by(
    game_id, season, week, posteam, defteam, drive,
    drive_end_transition, drive_start_transition, drive_game_clock_start,
    drive_ended_with_score, drive_yards_penalized, drive_start_yard_line,
    drive_end_yard_line, drive_first_downs, drive_play_count, weather, roof
  ) %>%
  summarise(
    # Clock and score at the earliest point of the drive
    max_seconds_Game = max(game_seconds_remaining),
    max_seconds_half = max(half_seconds_remaining),
    Pos_Score = min(posteam_score),
    defteam_score = min(defteam_score),
    # Kicking outcomes
    extra_point_att = sum(extra_point_attempt),
    extra_point_result = max(extra_point_result, na.rm = TRUE),
    FG_att = sum(field_goal_attempt),
    FG_result = max(field_goal_result, na.rm = TRUE),
    FG_Distance = max(kick_distance[field_goal_attempt == 1], na.rm = TRUE),
    Two_PT_Result = max(two_point_conv_result, na.rm = TRUE),
    # Turnovers, safeties and touchdowns on non-special-teams plays
    SFTY = sum(safety[special_teams_play == 0]),
    INT = sum(interception),
    FUMBLE_LOST = sum(fumble_lost[special_teams_play == 0]),
    OFF_TD = sum(td_team == posteam & special_teams_play == 0, na.rm = TRUE),
    DEF_TD = sum(td_team == defteam & special_teams_play == 0, na.rm = TRUE),
    TD_PLAYER = max(td_player_name, na.rm = TRUE),
    # Passing volume
    sacks = sum(sack),
    passing_plays = sum(pass[play_type == "pass"]),
    passing_yards = sum(passing_yards, na.rm = TRUE),
    # Penalties split by the penalised team
    Penalties_Off = sum(
      penalty[special_teams_play == 0 & penalty_team == posteam],
      na.rm = TRUE
    ),
    Penalties_Def = sum(
      penalty[special_teams_play == 0 & penalty_team == defteam],
      na.rm = TRUE
    ),
    # Provisional QB for the drive (alphabetical max over dropbacks); these two
    # columns are overwritten further down in this file with the most frequent
    # passer of the drive.
    QB_id = max(passer_player_id[qb_dropback > 0], na.rm = TRUE),
    QB_name = max(passer_player_name[qb_dropback > 0], na.rm = TRUE),
    # Kicker on any field-goal or extra-point attempt in the drive
    kicker_id = max(
      kicker_player_id[field_goal_attempt + extra_point_attempt > 0],
      na.rm = TRUE
    ),
    kicker_name = max(
      kicker_player_name[field_goal_attempt + extra_point_attempt > 0],
      na.rm = TRUE
    ),
    # Special-teams touchdowns, by scoring team
    ST_OFF_TD = sum(
      touchdown[special_teams_play == 1 & td_team == posteam],
      na.rm = TRUE
    ),
    ST_DEF_TD = sum(
      touchdown[special_teams_play == 1 & td_team == defteam],
      na.rm = TRUE
    ),
    kneels = sum(qb_kneel),
    scrambles = sum(qb_scramble),
    .groups = "drop"
  ) %>%
  mutate(
    # Last two characters of the start-yard-line string, read as the yard number
    Starting_Number = as.numeric(
      substr(
        drive_start_yard_line,
        nchar(drive_start_yard_line) - 1,
        nchar(drive_start_yard_line)
      )
    ),
    # First three characters of the string, read as the side-of-field label
    STARTING_TEAM = substr(drive_start_yard_line, 1, 3),
    # Distance to the end zone being attacked:
    #   - label matches the defence  -> already in opponent territory
    #   - label is "50 "             -> midfield
    #   - any other non-missing label -> own territory, so 100 - yard number
    #   - missing label              -> sentinel 999999 (excluded by the later
    #                                   `yards_to_endzone < 100` filters)
    # NOTE(review): assumes the team code inside drive_start_yard_line uses the
    # same abbreviations as `defteam` -- TODO confirm for relocated franchises.
    yards_to_endzone = case_when(
      gsub(" ", "", STARTING_TEAM) == defteam ~ Starting_Number,
      STARTING_TEAM == "50 " ~ 50,
      !is.na(STARTING_TEAM) ~ 100 - Starting_Number,
      TRUE ~ 999999
    )
  )
# For every (game, drive, possession team), find the passer credited on the
# most plays. Rows without a passer id are ignored; ties are broken by keeping
# the first row (with_ties = FALSE). The result stays grouped by
# game_id / drive / posteam.
temp <- games %>%
  filter(!is.na(passer_player_id)) %>%
  count(
    game_id, posteam, drive, passer_player_id, passer_player_name,
    name = "plays"
  ) %>%
  group_by(game_id, drive, posteam) %>%
  slice_max(order_by = plays, n = 1, with_ties = FALSE)
# Replace the QB columns on `df1` with the primary passer found in `temp`.
# Drives are matched on a "game_id, posteam, drive" text key; drives with no
# match in `temp` receive NA.
qb_row <- match(
  paste(df1$game_id, df1$posteam, df1$drive, sep = ", "),
  paste(temp$game_id, temp$posteam, temp$drive, sep = ", ")
)
df1$QB_id <- temp$passer_player_id[qb_row]
df1$QB_name <- temp$passer_player_name[qb_row]
##
## Call:
## glm(formula = OFF_TD ~ log(yards_to_endzone) + poly(sec_capped,
## 2) + outdoors_ind + rain_ind + snow_ind + winning_late_ind,
## family = "binomial", data = df1 %>% filter(yards_to_endzone <
## 100))
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.6120 -0.7273 -0.6789 -0.2551 3.0580
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 3.20022 0.13424 23.839 < 2e-16 ***
## log(yards_to_endzone) -1.05017 0.03152 -33.314 < 2e-16 ***
## poly(sec_capped, 2)1 96.97069 4.10935 23.598 < 2e-16 ***
## poly(sec_capped, 2)2 -44.82182 3.26336 -13.735 < 2e-16 ***
## outdoors_indTRUE -0.12611 0.02489 -5.067 4.04e-07 ***
## rain_indTRUE -0.31058 0.05808 -5.347 8.94e-08 ***
## snow_indTRUE -0.16976 0.13608 -1.248 0.212
## winning_late_indTRUE -2.08436 0.17474 -11.928 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 49373 on 47028 degrees of freedom
## Residual deviance: 46714 on 47021 degrees of freedom
## AIC: 46730
##
## Number of Fisher Scoring iterations: 6
Per graphs above, Yards to Endzone and Seconds to Half are important variables in modelling TD Probability.
The model fits the observed points relatively well, but it has more difficulty matching TD probability exactly where the probability changes steeply.
Next, we will examine the probability to attempt a Field Goal for each drive.
##
## Call:
## glm(formula = FG_att ~ poly(log(yards_to_endzone), 2) + poly(sec_capped,
## 2) + outdoors_ind + winning_late_ind + within_FG, family = "binomial",
## data = df1 %>% filter(yards_to_endzone < 100))
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.2723 -0.6358 -0.5605 -0.4731 3.4306
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.57970 0.02371 -66.614 < 2e-16 ***
## poly(log(yards_to_endzone), 2)1 -59.40896 2.74157 -21.670 < 2e-16 ***
## poly(log(yards_to_endzone), 2)2 -66.74231 3.82299 -17.458 < 2e-16 ***
## poly(sec_capped, 2)1 -12.51229 2.84064 -4.405 1.06e-05 ***
## poly(sec_capped, 2)2 -11.40779 2.68984 -4.241 2.22e-05 ***
## outdoors_indTRUE -0.09272 0.02655 -3.492 0.00048 ***
## winning_late_indTRUE -2.22931 0.13904 -16.033 < 2e-16 ***
## within_FGTRUE 0.31104 0.02668 11.656 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 43691 on 47028 degrees of freedom
## Residual deviance: 42090 on 47021 degrees of freedom
## AIC: 42106
##
## Number of Fisher Scoring iterations: 5
##
## Call:
## glm(formula = FG_Make_Ind ~ poly(yards_to_endzone, 2) + poly(sec_capped,
## 1) + outdoors_ind + snow_ind, family = "binomial", data = df1 %>%
## filter(yards_to_endzone < 100 & FG_att == 1))
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.3613 0.5035 0.5594 0.5839 1.0215
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.87487 0.05740 32.666 < 2e-16 ***
## poly(yards_to_endzone, 2)1 -14.33443 3.14766 -4.554 5.26e-06 ***
## poly(yards_to_endzone, 2)2 9.60583 3.09831 3.100 0.00193 **
## poly(sec_capped, 1) 21.57369 2.46684 8.745 < 2e-16 ***
## outdoors_indTRUE -0.21482 0.06766 -3.175 0.00150 **
## snow_indTRUE -0.53567 0.32676 -1.639 0.10114
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 7138.7 on 8253 degrees of freedom
## Residual deviance: 7026.3 on 8248 degrees of freedom
## AIC: 7038.3
##
## Number of Fisher Scoring iterations: 4
# Team-level Points Scored Over Expected (PSOE).
#
# Step 1 (`df2`): per game and possession team, total the observed outcomes and
# the model-based expectations over drives with yards_to_endzone < 100.
# TD_Prob_Off, FG_Att_Prob and FG_Make_Prob are not created in the code shown
# here -- presumably they are the fitted probabilities from the three models
# summarised above; verify against the modelling chunk.
df2 <- df1 %>%
  filter(yards_to_endzone < 100) %>%
  group_by(game_id, season, posteam, defteam) %>%
  summarise(
    OFF_TDs = sum(OFF_TD),
    DEF_TDs = sum(DEF_TD),
    FG_ATTs = sum(FG_att),
    FG_Makes = sum(FG_result == "made", na.rm = TRUE),
    INTs = sum(INT),
    FUMB_Lost = sum(FUMBLE_LOST),
    ST_OFF_TDs = sum(ST_OFF_TD),
    ST_DEF_TDs = sum(ST_DEF_TD),
    xOFF_TDs = sum(TD_Prob_Off),
    drives = n(),
    xFG_Atts = sum(FG_Att_Prob),
    # Expected makes when the attempt itself is also modelled
    xFG_Makes_base = sum(FG_Make_Prob * FG_Att_Prob),
    # Expected makes weighted by the attempts the offense actually earned
    xFG_Makes_Off = sum(FG_Make_Prob * FG_att)
  )

# Step 2 (`df3`): roll games up to team-seasons and compute PSOE as
# 7 points per touchdown over expected plus 3 points per expected made
# field goal over the baseline expectation.
df3 <- df2 %>%
  group_by(season, posteam) %>%
  summarise(
    OFF_TDs = sum(OFF_TDs),
    xOFF_TDs = sum(xOFF_TDs),
    xFG_Makes_Off = sum(xFG_Makes_Off),
    xFG_Makes_base = sum(xFG_Makes_base),
    drives = sum(drives)
  ) %>%
  mutate(
    Off_PSOE = 7 * OFF_TDs - 7 * xOFF_TDs +
      3 * xFG_Makes_Off - 3 * xFG_Makes_base
  )
## # A tibble: 6 × 8
## # Groups: season [4]
## season posteam OFF_TDs xOFF_TDs xFG_Makes_Off xFG_Makes_base drives Off_PSOE
## <int> <chr> <int> <dbl> <dbl> <dbl> <int> <dbl>
## 1 2018 KC 66 38.0 22.6 23.8 170 193.
## 2 2020 GB 64 33.0 13.7 22.3 162 191.
## 3 2019 BAL 58 33.7 24.6 23.5 166 173.
## 4 2024 BAL 63 38.9 25.1 26.8 183 164.
## 5 2020 TEN 60 36.4 23.3 24.4 167 162.
## 6 2024 DET 68 44.1 24.6 29.3 182 153.
The 2018 Chiefs and 2020 Packers top the list of best offenses, with the Ravens (2019 and 2024) coming in 3rd and 4th.
## # A tibble: 6 × 8
## # Groups: season [5]
## season posteam OFF_TDs xOFF_TDs xFG_Makes_Off xFG_Makes_base drives Off_PSOE
## <int> <chr> <int> <dbl> <dbl> <dbl> <int> <dbl>
## 1 2023 NYJ 18 43.7 32.7 31.6 208 -177.
## 2 2018 ARI 24 42.4 14.5 28.8 187 -172.
## 3 2017 ARI 27 48.3 34.3 32.3 205 -143.
## 4 2024 CLE 27 44.4 22.9 29.5 201 -142.
## 5 2022 HOU 28 46.1 26.7 30.9 198 -139.
## 6 2022 IND 25 45.6 33.4 31.0 195 -137.
2023 Jets scoring 18 Offensive Touchdowns leads the list of worst offenses, with the Cardinals coming in 2nd and 3rd.
# Quarterback-level Points Scored Over Expected (PSOE).
#
# Step 1 (`df2`): same game-level totals as the team version, but additionally
# split by the drive's QB (QB_id / QB_name on `df1`). Only drives with
# yards_to_endzone < 100 are included. TD_Prob_Off, FG_Att_Prob and
# FG_Make_Prob are not created in the code shown here -- presumably the fitted
# model probabilities; verify against the modelling chunk.
df2 <- df1 %>%
  filter(yards_to_endzone < 100) %>%
  group_by(game_id, season, posteam, defteam, QB_id, QB_name) %>%
  summarise(
    OFF_TDs = sum(OFF_TD),
    DEF_TDs = sum(DEF_TD),
    FG_ATTs = sum(FG_att),
    FG_Makes = sum(FG_result == "made", na.rm = TRUE),
    INTs = sum(INT),
    FUMB_Lost = sum(FUMBLE_LOST),
    ST_OFF_TDs = sum(ST_OFF_TD),
    ST_DEF_TDs = sum(ST_DEF_TD),
    xOFF_TDs = sum(TD_Prob_Off),
    drives = n(),
    xFG_Atts = sum(FG_Att_Prob),
    # Expected makes when the attempt itself is also modelled
    xFG_Makes_base = sum(FG_Make_Prob * FG_Att_Prob),
    # Expected makes weighted by the attempts actually taken
    xFG_Makes_Off = sum(FG_Make_Prob * FG_att)
  )

# Step 2 (`df3`): drop drives with no identified QB, total everything per QB
# across all games, and compute PSOE (7 per TD over expected, 3 per expected
# made field goal over baseline).
df3 <- df2 %>%
  filter(!is.na(QB_id)) %>%
  group_by(QB_id, QB_name) %>%
  summarise(
    OFF_TDs = sum(OFF_TDs),
    xOFF_TDs = sum(xOFF_TDs),
    xFG_Makes_Off = sum(xFG_Makes_Off),
    xFG_Makes_base = sum(xFG_Makes_base),
    drives = sum(drives)
  ) %>%
  mutate(
    Off_PSOE = 7 * OFF_TDs - 7 * xOFF_TDs +
      3 * xFG_Makes_Off - 3 * xFG_Makes_base
  )
## # A tibble: 6 × 8
## # Groups: QB_id [6]
## QB_id QB_name OFF_TDs xOFF_TDs xFG_Makes_Off xFG_Makes_base drives Off_PSOE
## <chr> <chr> <int> <dbl> <dbl> <dbl> <int> <dbl>
## 1 00-0033… P.Maho… 344 238. 180. 155. 1060 820.
## 2 00-0034… L.Jack… 272 195. 154. 132. 870 608.
## 3 00-0034… J.Allen 305 227. 154. 154. 1000 549.
## 4 00-0033… J.Goff 367 295. 203. 196. 1263 526.
## 5 00-0019… T.Brady 272 224. 171. 149. 991 406.
## 6 00-0023… A.Rodg… 264 204. 129. 136. 928 402.
Patrick Mahomes tops the list of most valuable offensive players, with other notable names including Lamar Jackson, Josh Allen, Jared Goff (!), Tom Brady, and Aaron Rodgers.
## # A tibble: 6 × 8
## # Groups: QB_id [6]
## QB_id QB_name OFF_TDs xOFF_TDs xFG_Makes_Off xFG_Makes_base drives Off_PSOE
## <chr> <chr> <int> <dbl> <dbl> <dbl> <int> <dbl>
## 1 00-0037… Z.Wils… 47 74.1 49.7 51.6 343 -195.
## 2 00-0036… D.Mills 41 65.2 39.2 43.4 282 -182.
## 3 00-0034… J.Rosen 20 41.8 19.5 27.6 179 -177.
## 4 00-0034… S.Darn… 144 175. 133. 117. 773 -171.
## 5 00-0039… B.Young 46 63.9 37.7 43.3 295 -142.
## 6 00-0035… D.Jones 122 145. 106. 96.8 666 -134.
Future analysis could look into Bayesian hierarchical modelling of offensive performance (rather than an above-expectation model), multinomial modelling of drive outcomes, and turnover/drive outcome analysis.