This report aims to analyse how 32 NFL teams have evolved over 21 seasons. A logistic regression model be used to examine what factors determine if a team has a winning season or not.
library(readr)
NFL = read_csv("/Users/kennyg/Downloads/team_stats_2003_2023.csv")
head(NFL)
## # A tibble: 6 × 35
## year team wins losses win_loss_perc points points_opp points_diff mov
## <dbl> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2003 New Engl… 14 2 0.875 348 238 110 6.9
## 2 2003 Miami Do… 10 6 0.625 311 261 50 3.1
## 3 2003 Buffalo … 6 10 0.375 243 279 -36 -2.3
## 4 2003 New York… 6 10 0.375 283 299 -16 -1
## 5 2003 Baltimor… 10 6 0.625 391 281 110 6.9
## 6 2003 Cincinna… 8 8 0.5 346 384 -38 -2.4
## # ℹ 26 more variables: g <dbl>, total_yards <dbl>, plays_offense <dbl>,
## # yds_per_play_offense <dbl>, turnovers <dbl>, fumbles_lost <dbl>,
## # first_down <dbl>, pass_cmp <dbl>, pass_att <dbl>, pass_yds <dbl>,
## # pass_td <dbl>, pass_int <dbl>, pass_net_yds_per_att <dbl>, pass_fd <dbl>,
## # rush_att <dbl>, rush_yds <dbl>, rush_td <dbl>, rush_yds_per_att <dbl>,
## # rush_fd <dbl>, penalties <dbl>, penalties_yds <dbl>, pen_fd <dbl>,
## # score_pct <dbl>, turnover_pct <dbl>, exp_pts_tot <dbl>, ties <dbl>
NFL = NFL %>%
clean_names()
# check for any loading errors
any(is.na(NFL))
## [1] TRUE
summary(NFL)
## year team wins losses
## Min. :2003 Length:672 Min. : 0.000 Min. : 0.000
## 1st Qu.:2008 Class :character 1st Qu.: 6.000 1st Qu.: 6.000
## Median :2013 Mode :character Median : 8.000 Median : 8.000
## Mean :2013 Mean : 8.051 Mean : 8.051
## 3rd Qu.:2018 3rd Qu.:10.000 3rd Qu.:10.000
## Max. :2023 Max. :16.000 Max. :16.000
##
## win_loss_perc points points_opp points_diff
## Min. :0.0000 Min. :168.0 Min. :201.0 Min. :-261.00
## 1st Qu.:0.3750 1st Qu.:305.0 1st Qu.:316.8 1st Qu.: -75.00
## Median :0.5000 Median :359.0 Median :356.5 Median : -1.00
## Mean :0.5002 Mean :358.6 Mean :358.6 Mean : 0.00
## 3rd Qu.:0.6250 3rd Qu.:406.0 3rd Qu.:398.0 3rd Qu.: 75.25
## Max. :1.0000 Max. :606.0 Max. :519.0 Max. : 315.00
##
## mov g total_yards plays_offense
## Min. :-16.30000 Min. :16.00 Min. :3587 Min. : 865
## 1st Qu.: -4.70000 1st Qu.:16.00 1st Qu.:5030 1st Qu.: 990
## Median : 0.00000 Median :16.00 Median :5468 Median :1019
## Mean : -0.00114 Mean :16.14 Mean :5464 Mean :1022
## 3rd Qu.: 4.62500 3rd Qu.:16.00 3rd Qu.:5900 3rd Qu.:1053
## Max. : 19.70000 Max. :17.00 Max. :7474 Max. :1191
## NA's :320
## yds_per_play_offense turnovers fumbles_lost first_down
## Min. :3.90 Min. : 8.00 Min. : 2.00 Min. :191.0
## 1st Qu.:5.00 1st Qu.:20.00 1st Qu.: 8.00 1st Qu.:288.0
## Median :5.30 Median :24.00 Median :10.00 Median :313.0
## Mean :5.34 Mean :24.71 Mean :10.06 Mean :314.4
## 3rd Qu.:5.70 3rd Qu.:29.00 3rd Qu.:12.00 3rd Qu.:340.0
## Max. :6.80 Max. :46.00 Max. :26.00 Max. :444.0
##
## pass_cmp pass_att pass_yds pass_td pass_int
## Min. :204.0 Min. :358.0 Min. :1898 Min. : 7.00 Min. : 2.00
## 1st Qu.:303.0 1st Qu.:506.8 1st Qu.:3156 1st Qu.:18.00 1st Qu.:11.00
## Median :339.0 Median :546.0 Median :3592 Median :23.00 Median :14.00
## Mean :339.3 Mean :546.9 Mean :3617 Mean :23.58 Mean :14.65
## 3rd Qu.:372.0 3rd Qu.:590.0 3rd Qu.:4048 3rd Qu.:28.00 3rd Qu.:18.00
## Max. :499.0 Max. :751.0 Max. :5444 Max. :55.00 Max. :32.00
##
## pass_net_yds_per_att pass_fd rush_att rush_yds
## Min. :4.100 Min. : 96.0 Min. :304.0 Min. :1129
## 1st Qu.:5.700 1st Qu.:163.0 1st Qu.:401.0 1st Qu.:1606
## Median :6.100 Median :186.0 Median :433.5 Median :1810
## Mean :6.188 Mean :186.7 Mean :437.8 Mean :1847
## 3rd Qu.:6.800 3rd Qu.:208.0 3rd Qu.:472.0 3rd Qu.:2047
## Max. :8.500 Max. :293.0 Max. :618.0 Max. :3296
##
## rush_td rush_yds_per_att rush_fd penalties
## Min. : 2.00 Min. :3.1 Min. : 53.00 Min. : 55.0
## 1st Qu.:10.00 1st Qu.:3.9 1st Qu.: 84.00 1st Qu.: 91.0
## Median :13.00 Median :4.2 Median : 98.00 Median :102.0
## Mean :13.46 Mean :4.2 Mean : 99.78 Mean :101.5
## 3rd Qu.:17.00 3rd Qu.:4.5 3rd Qu.:112.25 3rd Qu.:112.0
## Max. :32.00 Max. :5.5 Max. :188.00 Max. :163.0
##
## penalties_yds pen_fd score_pct turnover_pct
## Min. : 415.0 Min. : 9.00 Min. :16.40 Min. : 4.00
## 1st Qu.: 751.8 1st Qu.:23.00 1st Qu.:29.20 1st Qu.:10.20
## Median : 852.5 Median :27.00 Median :33.90 Median :12.40
## Mean : 849.5 Mean :27.89 Mean :34.09 Mean :12.56
## 3rd Qu.: 941.2 3rd Qu.:33.00 3rd Qu.:38.80 3rd Qu.:14.80
## Max. :1358.0 Max. :50.00 Max. :52.70 Max. :25.00
##
## exp_pts_tot ties
## Min. :-358.27 Min. :0.00000
## 1st Qu.:-124.02 1st Qu.:0.00000
## Median : -40.97 Median :0.00000
## Mean : -35.94 Mean :0.08125
## 3rd Qu.: 49.80 3rd Qu.:0.00000
## Max. : 298.14 Max. :1.00000
## NA's :352
# Create a heatmap
library(pheatmap)
numeric_cols = NFL[, sapply(NFL, is.numeric)]
scaled_data = scale(numeric_cols)
cols_to_keep = colSums(is.na(numeric_cols)) / nrow(numeric_cols) < 0.2
numeric_cols_clean = numeric_cols[, cols_to_keep]
scaled_data = scale(numeric_cols_clean)
pheatmap(scaled_data,
main = " NFL Stats 2003-2023",
show_rownames = FALSE,
clustering_method = "complete",
color = colorRampPalette(c("blue", "white", "red"))(50)
)
# Select a few variables to create a handful of histograms
off_cols = c("points", "rush_yds", "pass_yds")
def_cols = c("points_opp", "pass_int", "turnovers")
# Offensive data
NFL_OFF = NFL %>%
select(year, all_of(off_cols)) %>%
pivot_longer(cols = -year,
names_to = "variable",
values_to = "value")
NFL_OFF %>%
filter(year %in% c(2003, 2008, 2013, 2018, 2023)) %>%
ggplot(aes(x = value)) +
geom_histogram(bins = 30, fill = "steelblue", colour = "black") +
facet_grid(variable ~ year, scales = "free") +
labs(title = "Distrbution of Offensive Stats",
x = "value",
y = "frequency") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1),
strip.text.y = element_text(size = 8))
# Defensive Stats
NFL_DEF = NFL %>%
select(year, all_of(def_cols)) %>%
pivot_longer(cols = -year,
names_to = "variable",
values_to = "value")
NFL_DEF %>%
filter(year %in% c(2003, 2008, 2013, 2018, 2023)) %>%
ggplot(aes(x = value)) +
geom_histogram(bins = 30, fill = "steelblue", colour = "black") +
facet_grid(variable ~ year, scales = "free") +
labs(title = "Distrbution of Defensive Stats",
x = "value",
y = "frequency") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1),
strip.text.y = element_text(size = 8))
When looking at the offensive data, passing yards increasingly become
ore left skewed going into 2023, whereas rushing rushing yards become
more right skewed. Indicating teams are more reliant on passing the ball
than running the ball. This strategy is proven to be more efficient with
the increasing frequency of points score. This increasing reliance on
passing is more evident in difference in compensation between running
backs and quarterbacks. With the 15 highest paid running backs earning
between 8-20 million dollars, whereas the 15 highest paid quarterbacks
earn between 33-60 million dollars. Such a disparity is driven by
quarterbacks being seen as more valuable due to their longevity and
their ability to produce more of an output, evident by the increasing
frequency of points scored in later years. This is a result of
significant improvement in offensive schemes and rule changes favoring
quarterbacks and receivers. For instance, rules such as defenseless
receiver protection were introduced to prevent defenders from hitting
receivers in the neck or head while they were attempting to catch the
ball, and the enforcement of the roughing passer protection, preventing
defenders from driving the quarterback into the ground, landing their
body weight on them or hitting them below the knee.
In terms of the defensive statistics they remain relatively consistent, with slight shift towards fewer turnovers. The fact turnovers remain relatively stable with the ‘passing evolution’ between the mid to late 2000s shows that teams have become more efficient in protecting the ball, thus they are able to have better manamgment of the risk/reward ratio when it came to passing.
best_teams = c("New England Patriots", "Green Bay Packers", "Pittsburgh Steelers",
"Baltimore Ravens", "Seattle Seahawks", "Philadelphia Eagles")
# Normalize the offensive data
off_cols = c("points", "rush_yds", "pass_yds")
NFL_OFF2 = NFL %>%
filter(team %in% best_teams) %>%
select(team, year, all_of(off_cols)) %>%
group_by(team) %>%
arrange(year) %>%
mutate(
points_norm = (points - mean(points, na.rm = TRUE)) / sd(points, na.rm = TRUE),
pass_yds_norm = (pass_yds - mean(pass_yds, na.rm = TRUE)) / sd(pass_yds, na.rm = TRUE),
rush_yds_norm = (rush_yds - mean(rush_yds, na.rm = TRUE)) / sd(rush_yds, na.rm = TRUE)
) %>%
ungroup() %>%
select(team, year, points_norm, pass_yds_norm, rush_yds_norm) %>%
pivot_longer(cols = c(points_norm, pass_yds_norm, rush_yds_norm),
names_to = "variable",
values_to = "values")
ggplot(NFL_OFF2, aes(x = year, y = values, color = variable)) +
geom_line(size = 1) +
facet_wrap(~ team, scales = "free_y") +
geom_hline(yintercept = 0, linetype = "dashed", alpha = 0.5) +
scale_color_manual(values = c("points_norm" = "darkred",
"pass_yds_norm" = "steelblue",
"rush_yds_norm" = "darkgreen"),
labels = c("Points", "Passing Yards", "Rushing Yards")) +
labs(title = "Normalised Offensive Trends for the Top 6 Teams",
x = "Year",
y = "Standard Deviation",
color = "values") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1),
legend.position = "bottom")
# Normalise the defensive data
def_cols = c("points_opp", "pass_int", "turnovers")
NFL_DEF2 = NFL %>%
filter(team %in% best_teams) %>%
select(team, year, all_of(def_cols)) %>%
group_by(team) %>%
arrange(year) %>%
mutate(
points_opp_norm = (points_opp - mean(points_opp, na.rm = TRUE)) / sd(points_opp, na.rm = TRUE),
pass_int_norm = (pass_int - mean(pass_int, na.rm = TRUE)) / sd(pass_int, na.rm = TRUE),
turnovers_norm = (turnovers - mean(turnovers, na.rm = TRUE)) / sd(turnovers, na.rm = TRUE)
) %>%
ungroup() %>%
select(team, year, points_opp_norm, pass_int_norm, turnovers_norm) %>%
pivot_longer(cols = c(points_opp_norm, pass_int_norm, turnovers_norm),
names_to = "variable",
values_to = "values")
ggplot(NFL_DEF2, aes(x = year, y = values, color = variable)) +
geom_line(size = 1) +
facet_wrap(~ team, scales = "free_y") +
geom_hline(yintercept = 0, linetype = "dashed", alpha = 0.5) +
scale_color_manual(values = c("points_opp_norm" = "darkred",
"pass_int_norm" = "steelblue",
"turnovers_norm" = "darkgreen"),
labels = c("Points scored by opposition", "Passing Interceptions", "Turnovers")) +
labs(title = "Normalised Defensive Trends for the Top 6 Teams",
x = "Year",
y = "Standard Deviation",
color = "values") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1),
legend.position = "bottom")
In terms of offensive trends most peak performances occured just before
2020, with the likes of the Patriots, Packers and Eagles. With the
Patriots having the highest offensive peak around the 2000s and peaking
again around 2020 then sharply declining after 2020 coinciding with end
of the Tom Brady era. The Eagles also peaked just before 2015 and again
after 2020. The Ravens are the most consistent offensive team over the
last 20 years and increase in both rushing and passing yards around
2018, coinciding with the beginning of the Lamar Jackson era who’s known
for passing and running the ball. The Eagles and Steelers have shown the
widest swings between the mid to late 2010s. In terms of defensive
trends, the Steelers had the most consistent defense, followed by the
Ravens and the Eagles. However, the Seahawks had a drastic improvement
in passing interceptions in the last ten years due to the impact of
Richard Sherman as their cornerback. However, the Patriots post-Brady
era made an overall increase in all three defensive categories after a
decline in the mid 2010s.
A logistic regression model will be constructed to determine which offensive and defensive categories determine whether or not a team had a winning season.
# Creating a target variable
NFL = NFL %>%
mutate(
games_played = ifelse(year <= 2020, 16, 17),
win_pct = wins / games_played,
winning_season = ifelse(win_pct > 0.5, 1, 0)
)
# Check the distribution
table(NFL$winning_season)
##
## 0 1
## 367 305
prop.table(table(NFL$winning_season))
##
## 0 1
## 0.546131 0.453869
# Top 10 Predictors
predictors = c(
"points",
"points_opp",
"pass_yds",
"rush_yds",
"turnovers",
"pass_int",
"yds_per_play_offense",
"penalties",
"penalties_yds"
)
cat("Winning seasons (1):", sum(NFL$winning_season == 1), "\n")
## Winning seasons (1): 305
cat("Losing seasons (0):", sum(NFL$winning_season == 0), "\n")
## Losing seasons (0): 367
cat("Proportion winning:", mean(NFL$winning_season), "\n")
## Proportion winning: 0.453869
# Clear any rows with missing values
Beta_model = NFL %>%
select(year, team, winning_season, all_of(predictors)) %>%
na.omit()
train_NFL = Beta_model %>% filter(year <= 2018)
test_NFL = Beta_model %>% filter(year >= 2019)
cat("Training set size:", nrow(train_NFL), "\n")
## Training set size: 512
cat("Test set size:", nrow(test_NFL), "\n")
## Test set size: 160
cat("Winning % in training:", mean(train_NFL$winning_season), "\n")
## Winning % in training: 0.4433594
cat("Winning % in test:", mean(test_NFL$winning_season), "\n")
## Winning % in test: 0.4875
cor_matrix = cor(train_NFL[, predictors], use = "complete.obs")
corrplot(cor_matrix, method = "color", type = "upper", tl.cex = 0.8)
model_NFL = glm(winning_season ~ points + points_opp + pass_yds +
rush_yds + turnovers + pass_int +
yds_per_play_offense + penalties + penalties_yds,
data = train_NFL,
family = binomial)
summary(model_NFL)
##
## Call:
## glm(formula = winning_season ~ points + points_opp + pass_yds +
## rush_yds + turnovers + pass_int + yds_per_play_offense +
## penalties + penalties_yds, family = binomial, data = train_NFL)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 8.421e+00 2.970e+00 2.835 0.00458 **
## points 4.513e-02 6.573e-03 6.867 6.57e-12 ***
## points_opp -5.205e-02 5.915e-03 -8.800 < 2e-16 ***
## pass_yds 1.136e-04 8.640e-04 0.131 0.89544
## rush_yds 2.207e-05 9.988e-04 0.022 0.98237
## turnovers -6.874e-02 5.280e-02 -1.302 0.19292
## pass_int 1.015e-02 7.071e-02 0.144 0.88582
## yds_per_play_offense -9.302e-01 9.841e-01 -0.945 0.34456
## penalties -2.680e-02 2.660e-02 -1.007 0.31382
## penalties_yds 2.385e-03 2.965e-03 0.804 0.42124
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 703.20 on 511 degrees of freedom
## Residual deviance: 228.65 on 502 degrees of freedom
## AIC: 248.65
##
## Number of Fisher Scoring iterations: 7
Turnovers and points are negatively correlated, meaning more turnovers leads to less points being scored. Points to penalties have no correlation, meaning the amount of penalties accumalated have no impact on the score.
library(broom)
# Extract coefficients with confidence intervals
coef_data = tidy(model_NFL, conf.int = TRUE) %>%
filter(term != "(Intercept)") %>%
mutate(
term = reorder(term, estimate),
effect = ifelse(estimate > 0, "Increases Wins", "Decreases Wins")
)
# Coefficient plots
ggplot(coef_data, aes(x = term, y = estimate, color = effect)) +
geom_point(size = 3) +
geom_errorbar(aes(ymin = conf.low, ymax = conf.high), width = 0.2) +
geom_hline(yintercept = 0, linetype = "dashed", color = "grey50") +
coord_flip() +
scale_color_manual(values = c("Increases Wins" = "darkgreen",
"Decreases Wins" = "darkred")) +
labs(title = "What determines a team having a winning season?",
x = "Predictor",
y = "Coefficient (Log-Odds)",
color = "Effect on Winning") +
theme_minimal() +
theme(legend.position = "bottom")
# standardize the predictors
train_scaled = train_NFL %>%
mutate(across(all_of(predictors), scale))
model_scaled = glm(winning_season ~ points + points_opp + pass_yds +
rush_yds + turnovers + pass_int +
yds_per_play_offense + penalties + penalties_yds,
data = train_scaled,
family = binomial)
coef_scaled = tidy(model_scaled) %>%
filter(term != "(Intercept)") %>%
mutate(term = reorder(term, abs(estimate)))
ggplot(coef_scaled, aes(x = term, y = estimate, fill = estimate > 0)) +
geom_bar(stat = "identity") +
coord_flip() +
scale_fill_manual(values = c("darkred", "darkgreen"),
labels = c("Decreases Wins", "Increases Wins")) +
labs(title = " Feature Importance (Standardized Coefficients)",
x = "Predictor",
y = "Standardized Coefficients",
fill = "Effect") +
theme_minimal() +
theme(legend.position = "bottom")
## 5. Conclusion The study has concluded points scored and points
allowed by the defense are by far the most important factors in winning
a game, with standardized coefficients of 2.6 and -2.5 respectively.
Teams that score more points have a significantly higher chance of
winning a game, whereas teams that allow fewer points have a higher
chance of winning. This pretty much confirms the two fundamental aspects
of team sports defense wins games and outscore your opponent. Secondary
factors such as passing yards and passing interceptions increase the
likelihood of winning games, compared to rushing yards, having no
statistical significance, re-emphasising the NFL’s current status as a
‘passing league’.