NFL Logistic Regression Model

1. Introduction

This report aims to analyse how 32 NFL teams have evolved over 21 seasons. A logistic regression model be used to examine what factors determine if a team has a winning season or not.

library(readr)
NFL = read_csv("/Users/kennyg/Downloads/team_stats_2003_2023.csv")
head(NFL)

## # A tibble: 6 × 35
##    year team       wins losses win_loss_perc points points_opp points_diff   mov
##   <dbl> <chr>     <dbl>  <dbl>         <dbl>  <dbl>      <dbl>       <dbl> <dbl>
## 1  2003 New Engl…    14      2         0.875    348        238         110   6.9
## 2  2003 Miami Do…    10      6         0.625    311        261          50   3.1
## 3  2003 Buffalo …     6     10         0.375    243        279         -36  -2.3
## 4  2003 New York…     6     10         0.375    283        299         -16  -1  
## 5  2003 Baltimor…    10      6         0.625    391        281         110   6.9
## 6  2003 Cincinna…     8      8         0.5      346        384         -38  -2.4
## # ℹ 26 more variables: g <dbl>, total_yards <dbl>, plays_offense <dbl>,
## #   yds_per_play_offense <dbl>, turnovers <dbl>, fumbles_lost <dbl>,
## #   first_down <dbl>, pass_cmp <dbl>, pass_att <dbl>, pass_yds <dbl>,
## #   pass_td <dbl>, pass_int <dbl>, pass_net_yds_per_att <dbl>, pass_fd <dbl>,
## #   rush_att <dbl>, rush_yds <dbl>, rush_td <dbl>, rush_yds_per_att <dbl>,
## #   rush_fd <dbl>, penalties <dbl>, penalties_yds <dbl>, pen_fd <dbl>,
## #   score_pct <dbl>, turnover_pct <dbl>, exp_pts_tot <dbl>, ties <dbl>

NFL = NFL %>%
  clean_names()

# check for any loading errors
any(is.na(NFL))

## [1] TRUE

3. Exploratory Data Analysis

3.1 Summary of Statistics

summary(NFL)

##       year          team                wins            losses      
##  Min.   :2003   Length:672         Min.   : 0.000   Min.   : 0.000  
##  1st Qu.:2008   Class :character   1st Qu.: 6.000   1st Qu.: 6.000  
##  Median :2013   Mode  :character   Median : 8.000   Median : 8.000  
##  Mean   :2013                      Mean   : 8.051   Mean   : 8.051  
##  3rd Qu.:2018                      3rd Qu.:10.000   3rd Qu.:10.000  
##  Max.   :2023                      Max.   :16.000   Max.   :16.000  
##                                                                     
##  win_loss_perc        points        points_opp     points_diff     
##  Min.   :0.0000   Min.   :168.0   Min.   :201.0   Min.   :-261.00  
##  1st Qu.:0.3750   1st Qu.:305.0   1st Qu.:316.8   1st Qu.: -75.00  
##  Median :0.5000   Median :359.0   Median :356.5   Median :  -1.00  
##  Mean   :0.5002   Mean   :358.6   Mean   :358.6   Mean   :   0.00  
##  3rd Qu.:0.6250   3rd Qu.:406.0   3rd Qu.:398.0   3rd Qu.:  75.25  
##  Max.   :1.0000   Max.   :606.0   Max.   :519.0   Max.   : 315.00  
##                                                                    
##       mov                  g          total_yards   plays_offense 
##  Min.   :-16.30000   Min.   :16.00   Min.   :3587   Min.   : 865  
##  1st Qu.: -4.70000   1st Qu.:16.00   1st Qu.:5030   1st Qu.: 990  
##  Median :  0.00000   Median :16.00   Median :5468   Median :1019  
##  Mean   : -0.00114   Mean   :16.14   Mean   :5464   Mean   :1022  
##  3rd Qu.:  4.62500   3rd Qu.:16.00   3rd Qu.:5900   3rd Qu.:1053  
##  Max.   : 19.70000   Max.   :17.00   Max.   :7474   Max.   :1191  
##  NA's   :320                                                      
##  yds_per_play_offense   turnovers      fumbles_lost     first_down   
##  Min.   :3.90         Min.   : 8.00   Min.   : 2.00   Min.   :191.0  
##  1st Qu.:5.00         1st Qu.:20.00   1st Qu.: 8.00   1st Qu.:288.0  
##  Median :5.30         Median :24.00   Median :10.00   Median :313.0  
##  Mean   :5.34         Mean   :24.71   Mean   :10.06   Mean   :314.4  
##  3rd Qu.:5.70         3rd Qu.:29.00   3rd Qu.:12.00   3rd Qu.:340.0  
##  Max.   :6.80         Max.   :46.00   Max.   :26.00   Max.   :444.0  
##                                                                      
##     pass_cmp        pass_att        pass_yds       pass_td         pass_int    
##  Min.   :204.0   Min.   :358.0   Min.   :1898   Min.   : 7.00   Min.   : 2.00  
##  1st Qu.:303.0   1st Qu.:506.8   1st Qu.:3156   1st Qu.:18.00   1st Qu.:11.00  
##  Median :339.0   Median :546.0   Median :3592   Median :23.00   Median :14.00  
##  Mean   :339.3   Mean   :546.9   Mean   :3617   Mean   :23.58   Mean   :14.65  
##  3rd Qu.:372.0   3rd Qu.:590.0   3rd Qu.:4048   3rd Qu.:28.00   3rd Qu.:18.00  
##  Max.   :499.0   Max.   :751.0   Max.   :5444   Max.   :55.00   Max.   :32.00  
##                                                                                
##  pass_net_yds_per_att    pass_fd         rush_att        rush_yds   
##  Min.   :4.100        Min.   : 96.0   Min.   :304.0   Min.   :1129  
##  1st Qu.:5.700        1st Qu.:163.0   1st Qu.:401.0   1st Qu.:1606  
##  Median :6.100        Median :186.0   Median :433.5   Median :1810  
##  Mean   :6.188        Mean   :186.7   Mean   :437.8   Mean   :1847  
##  3rd Qu.:6.800        3rd Qu.:208.0   3rd Qu.:472.0   3rd Qu.:2047  
##  Max.   :8.500        Max.   :293.0   Max.   :618.0   Max.   :3296  
##                                                                     
##     rush_td      rush_yds_per_att    rush_fd         penalties    
##  Min.   : 2.00   Min.   :3.1      Min.   : 53.00   Min.   : 55.0  
##  1st Qu.:10.00   1st Qu.:3.9      1st Qu.: 84.00   1st Qu.: 91.0  
##  Median :13.00   Median :4.2      Median : 98.00   Median :102.0  
##  Mean   :13.46   Mean   :4.2      Mean   : 99.78   Mean   :101.5  
##  3rd Qu.:17.00   3rd Qu.:4.5      3rd Qu.:112.25   3rd Qu.:112.0  
##  Max.   :32.00   Max.   :5.5      Max.   :188.00   Max.   :163.0  
##                                                                   
##  penalties_yds        pen_fd        score_pct      turnover_pct  
##  Min.   : 415.0   Min.   : 9.00   Min.   :16.40   Min.   : 4.00  
##  1st Qu.: 751.8   1st Qu.:23.00   1st Qu.:29.20   1st Qu.:10.20  
##  Median : 852.5   Median :27.00   Median :33.90   Median :12.40  
##  Mean   : 849.5   Mean   :27.89   Mean   :34.09   Mean   :12.56  
##  3rd Qu.: 941.2   3rd Qu.:33.00   3rd Qu.:38.80   3rd Qu.:14.80  
##  Max.   :1358.0   Max.   :50.00   Max.   :52.70   Max.   :25.00  
##                                                                  
##   exp_pts_tot           ties        
##  Min.   :-358.27   Min.   :0.00000  
##  1st Qu.:-124.02   1st Qu.:0.00000  
##  Median : -40.97   Median :0.00000  
##  Mean   : -35.94   Mean   :0.08125  
##  3rd Qu.:  49.80   3rd Qu.:0.00000  
##  Max.   : 298.14   Max.   :1.00000  
##                    NA's   :352

# Create a heatmap
library(pheatmap)
numeric_cols = NFL[, sapply(NFL, is.numeric)]
scaled_data = scale(numeric_cols)

cols_to_keep = colSums(is.na(numeric_cols)) / nrow(numeric_cols) < 0.2
numeric_cols_clean = numeric_cols[, cols_to_keep]
scaled_data = scale(numeric_cols_clean)

pheatmap(scaled_data, 
         main = " NFL Stats 2003-2023",
         show_rownames = FALSE,
         clustering_method = "complete",
         color = colorRampPalette(c("blue", "white", "red"))(50)
         )

3.2 Distribution of the data

# Select a few variables to create a handful of histograms
off_cols = c("points", "rush_yds", "pass_yds")
def_cols = c("points_opp", "pass_int", "turnovers")

# Offensive data
NFL_OFF = NFL %>%
  select(year, all_of(off_cols)) %>%
  pivot_longer(cols = -year,
               names_to = "variable",
               values_to = "value")

NFL_OFF %>%
  filter(year %in% c(2003, 2008, 2013, 2018, 2023)) %>%
  ggplot(aes(x = value)) +
  geom_histogram(bins = 30, fill = "steelblue", colour = "black") +
  facet_grid(variable ~ year, scales = "free") +
  labs(title = "Distrbution of Offensive Stats",
       x = "value",
       y = "frequency") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1),
        strip.text.y = element_text(size = 8))

# Defensive Stats
NFL_DEF = NFL %>%
  select(year, all_of(def_cols)) %>%
  pivot_longer(cols = -year,
               names_to = "variable",
               values_to = "value")

NFL_DEF %>%
  filter(year %in% c(2003, 2008, 2013, 2018, 2023)) %>%
  ggplot(aes(x = value)) +
  geom_histogram(bins = 30, fill = "steelblue", colour = "black") +
  facet_grid(variable ~ year, scales = "free") +
  labs(title = "Distrbution of Defensive Stats",
       x = "value",
       y = "frequency") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1),
        strip.text.y = element_text(size = 8))

When looking at the offensive data, passing yards increasingly become ore left skewed going into 2023, whereas rushing rushing yards become more right skewed. Indicating teams are more reliant on passing the ball than running the ball. This strategy is proven to be more efficient with the increasing frequency of points score. This increasing reliance on passing is more evident in difference in compensation between running backs and quarterbacks. With the 15 highest paid running backs earning between 8-20 million dollars, whereas the 15 highest paid quarterbacks earn between 33-60 million dollars. Such a disparity is driven by quarterbacks being seen as more valuable due to their longevity and their ability to produce more of an output, evident by the increasing frequency of points scored in later years. This is a result of significant improvement in offensive schemes and rule changes favoring quarterbacks and receivers. For instance, rules such as defenseless receiver protection were introduced to prevent defenders from hitting receivers in the neck or head while they were attempting to catch the ball, and the enforcement of the roughing passer protection, preventing defenders from driving the quarterback into the ground, landing their body weight on them or hitting them below the knee.

In terms of the defensive statistics they remain relatively consistent, with slight shift towards fewer turnovers. The fact turnovers remain relatively stable with the ‘passing evolution’ between the mid to late 2000s shows that teams have become more efficient in protecting the ball, thus they are able to have better manamgment of the risk/reward ratio when it came to passing.

best_teams = c("New England Patriots", "Green Bay Packers", "Pittsburgh Steelers", 
               "Baltimore Ravens", "Seattle Seahawks", "Philadelphia Eagles")


# Normalize the offensive data
off_cols = c("points", "rush_yds", "pass_yds")

NFL_OFF2 = NFL %>%
  filter(team %in% best_teams) %>%
  select(team, year, all_of(off_cols)) %>%
  group_by(team) %>%
  arrange(year) %>%
  mutate(
    points_norm = (points - mean(points, na.rm = TRUE)) / sd(points, na.rm = TRUE),
    pass_yds_norm = (pass_yds - mean(pass_yds, na.rm = TRUE)) / sd(pass_yds, na.rm = TRUE),
    rush_yds_norm = (rush_yds - mean(rush_yds, na.rm = TRUE)) / sd(rush_yds, na.rm = TRUE)
  ) %>%
  ungroup() %>%
  select(team, year, points_norm, pass_yds_norm, rush_yds_norm) %>%
  pivot_longer(cols = c(points_norm, pass_yds_norm, rush_yds_norm),
               names_to = "variable",
               values_to = "values")

ggplot(NFL_OFF2, aes(x = year, y = values, color = variable)) +
  geom_line(size = 1) +
  facet_wrap(~ team, scales = "free_y") +
  geom_hline(yintercept = 0, linetype = "dashed", alpha = 0.5) +
  scale_color_manual(values = c("points_norm" = "darkred",
                                "pass_yds_norm" = "steelblue",
                                "rush_yds_norm" = "darkgreen"),
                     labels = c("Points", "Passing Yards", "Rushing Yards")) +
  labs(title = "Normalised Offensive Trends for the Top 6 Teams",
       x = "Year",
       y = "Standard Deviation",
       color = "values") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1),
        legend.position = "bottom")

# Normalise the defensive data
def_cols = c("points_opp", "pass_int", "turnovers")

NFL_DEF2 = NFL %>%
  filter(team %in% best_teams) %>%
  select(team, year, all_of(def_cols)) %>%
  group_by(team) %>%
  arrange(year) %>%
  mutate(
    points_opp_norm = (points_opp - mean(points_opp, na.rm = TRUE)) / sd(points_opp, na.rm = TRUE),
    pass_int_norm = (pass_int - mean(pass_int, na.rm = TRUE)) / sd(pass_int, na.rm = TRUE),
    turnovers_norm = (turnovers - mean(turnovers, na.rm = TRUE)) / sd(turnovers, na.rm = TRUE)
  ) %>%
  ungroup() %>%
  select(team, year, points_opp_norm, pass_int_norm, turnovers_norm) %>%
  pivot_longer(cols = c(points_opp_norm, pass_int_norm, turnovers_norm),
               names_to = "variable",
               values_to = "values")

ggplot(NFL_DEF2, aes(x = year, y = values, color = variable)) +
  geom_line(size = 1) +
  facet_wrap(~ team, scales = "free_y") +
  geom_hline(yintercept = 0, linetype = "dashed", alpha = 0.5) +
  scale_color_manual(values = c("points_opp_norm" = "darkred",
                                "pass_int_norm" = "steelblue",
                                "turnovers_norm" = "darkgreen"),
                     labels = c("Points scored by opposition", "Passing Interceptions", "Turnovers")) +
  labs(title = "Normalised Defensive Trends for the Top 6 Teams",
       x = "Year",
       y = "Standard Deviation",
       color = "values") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1),
        legend.position = "bottom")

In terms of offensive trends most peak performances occured just before 2020, with the likes of the Patriots, Packers and Eagles. With the Patriots having the highest offensive peak around the 2000s and peaking again around 2020 then sharply declining after 2020 coinciding with end of the Tom Brady era. The Eagles also peaked just before 2015 and again after 2020. The Ravens are the most consistent offensive team over the last 20 years and increase in both rushing and passing yards around 2018, coinciding with the beginning of the Lamar Jackson era who’s known for passing and running the ball. The Eagles and Steelers have shown the widest swings between the mid to late 2010s. In terms of defensive trends, the Steelers had the most consistent defense, followed by the Ravens and the Eagles. However, the Seahawks had a drastic improvement in passing interceptions in the last ten years due to the impact of Richard Sherman as their cornerback. However, the Patriots post-Brady era made an overall increase in all three defensive categories after a decline in the mid 2010s.

4. Building a Logistic Regression

A logistic regression model will be constructed to determine which offensive and defensive categories determine whether or not a team had a winning season.

4.1 Creation of a target variable

# Creating a target variable
NFL = NFL %>%
  mutate(
    games_played = ifelse(year <= 2020, 16, 17),
    win_pct = wins / games_played,
    winning_season = ifelse(win_pct > 0.5, 1, 0)
  )
# Check the distribution
table(NFL$winning_season)

## 
##   0   1 
## 367 305

prop.table(table(NFL$winning_season))

## 
##        0        1 
## 0.546131 0.453869

# Top 10 Predictors 
predictors = c(
  "points",
  "points_opp",
  "pass_yds",
  "rush_yds",
  "turnovers",
  "pass_int",
  "yds_per_play_offense",
  "penalties",
  "penalties_yds"
)

cat("Winning seasons (1):", sum(NFL$winning_season == 1), "\n")

## Winning seasons (1): 305

cat("Losing seasons (0):", sum(NFL$winning_season == 0), "\n")

## Losing seasons (0): 367

cat("Proportion winning:", mean(NFL$winning_season), "\n")

## Proportion winning: 0.453869

# Clear any rows with missing values
Beta_model = NFL %>%
  select(year, team, winning_season, all_of(predictors)) %>%
  na.omit()

4.2 Training the data

train_NFL = Beta_model %>% filter(year <= 2018)
test_NFL = Beta_model %>% filter(year >= 2019)

cat("Training set size:", nrow(train_NFL), "\n")

## Training set size: 512

cat("Test set size:", nrow(test_NFL), "\n")

## Test set size: 160

cat("Winning % in training:", mean(train_NFL$winning_season), "\n")

## Winning % in training: 0.4433594

cat("Winning % in test:", mean(test_NFL$winning_season), "\n")

## Winning % in test: 0.4875

4.3 Building the model

cor_matrix = cor(train_NFL[, predictors], use = "complete.obs")
corrplot(cor_matrix, method = "color", type = "upper", tl.cex = 0.8)

model_NFL = glm(winning_season ~ points + points_opp + pass_yds + 
                  rush_yds + turnovers + pass_int  +
                  yds_per_play_offense + penalties + penalties_yds,
                data = train_NFL,
                family = binomial)

summary(model_NFL)

## 
## Call:
## glm(formula = winning_season ~ points + points_opp + pass_yds + 
##     rush_yds + turnovers + pass_int + yds_per_play_offense + 
##     penalties + penalties_yds, family = binomial, data = train_NFL)
## 
## Coefficients:
##                        Estimate Std. Error z value Pr(>|z|)    
## (Intercept)           8.421e+00  2.970e+00   2.835  0.00458 ** 
## points                4.513e-02  6.573e-03   6.867 6.57e-12 ***
## points_opp           -5.205e-02  5.915e-03  -8.800  < 2e-16 ***
## pass_yds              1.136e-04  8.640e-04   0.131  0.89544    
## rush_yds              2.207e-05  9.988e-04   0.022  0.98237    
## turnovers            -6.874e-02  5.280e-02  -1.302  0.19292    
## pass_int              1.015e-02  7.071e-02   0.144  0.88582    
## yds_per_play_offense -9.302e-01  9.841e-01  -0.945  0.34456    
## penalties            -2.680e-02  2.660e-02  -1.007  0.31382    
## penalties_yds         2.385e-03  2.965e-03   0.804  0.42124    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 703.20  on 511  degrees of freedom
## Residual deviance: 228.65  on 502  degrees of freedom
## AIC: 248.65
## 
## Number of Fisher Scoring iterations: 7

Turnovers and points are negatively correlated, meaning more turnovers leads to less points being scored. Points to penalties have no correlation, meaning the amount of penalties accumalated have no impact on the score.

library(broom)

# Extract coefficients with confidence intervals
coef_data = tidy(model_NFL, conf.int = TRUE) %>%
  filter(term != "(Intercept)") %>%
  mutate(
    term = reorder(term, estimate),
    effect = ifelse(estimate > 0, "Increases Wins", "Decreases Wins")
  )

# Coefficient plots
ggplot(coef_data, aes(x = term, y = estimate, color = effect)) +
  geom_point(size = 3) +
  geom_errorbar(aes(ymin = conf.low, ymax = conf.high), width = 0.2) +
  geom_hline(yintercept = 0, linetype = "dashed", color = "grey50") +
  coord_flip() +
  scale_color_manual(values = c("Increases Wins" = "darkgreen",
                                "Decreases Wins" = "darkred")) +
  labs(title = "What determines a team having a winning season?",
       x = "Predictor",
       y = "Coefficient (Log-Odds)",
       color = "Effect on Winning") + 
  theme_minimal() + 
  theme(legend.position = "bottom")

# standardize the predictors
train_scaled = train_NFL %>%
  mutate(across(all_of(predictors), scale))

model_scaled = glm(winning_season ~ points + points_opp + pass_yds + 
                  rush_yds + turnovers + pass_int  +
                  yds_per_play_offense + penalties + penalties_yds,
                data = train_scaled,
                family = binomial)
coef_scaled = tidy(model_scaled) %>%
  filter(term != "(Intercept)") %>%
  mutate(term = reorder(term, abs(estimate)))

ggplot(coef_scaled, aes(x = term, y = estimate, fill = estimate > 0)) + 
  geom_bar(stat = "identity") +
  coord_flip() +
  scale_fill_manual(values = c("darkred", "darkgreen"),
                    labels = c("Decreases Wins", "Increases Wins")) +
  labs(title = " Feature Importance (Standardized Coefficients)",
       x = "Predictor",
       y = "Standardized Coefficients",
       fill = "Effect") +
  theme_minimal() +
  theme(legend.position = "bottom")

## 5. Conclusion The study has concluded points scored and points allowed by the defense are by far the most important factors in winning a game, with standardized coefficients of 2.6 and -2.5 respectively. Teams that score more points have a significantly higher chance of winning a game, whereas teams that allow fewer points have a higher chance of winning. This pretty much confirms the two fundamental aspects of team sports defense wins games and outscore your opponent. Secondary factors such as passing yards and passing interceptions increase the likelihood of winning games, compared to rushing yards, having no statistical significance, re-emphasising the NFL’s current status as a ‘passing league’.