This analysis identifies running backs (RBs) and wide receivers (WRs) who gain more valuable yards than expected. I achieved this by modeling Expected Points Added (EPA) as a function of yards gained. The residual from this model for each player’s touch represents their impact, with their average residual serving as their overall rating.
This methodology isolates a player’s direct impact when they touch the ball, independent of their overall involvement in the offense and the yards they gain. This helps to pinpoint potentially undervalued and underutilized players who consistently make significant contributions.
I also examined which traditional statistics correlate most strongly with these ratings. Touchdowns, as expected, are the most significant factor for both RBs and WRs ratings. Touchdowns create points and therefore will always increase expected points by some margin, meaning that players who score more touchdowns per touch will do well in this rating system.
Below, you’ll find the code used for this analysis, displays of the key statistics that led to these conclusions, and tables ranking players based on their performance.
# import libraries
library(dplyr)
library(cfbfastR)
library(ggplot2)
library(reshape2)
library(gt)
library(gtExtras)
library(tidyr)
library(stringr)
library(knitr)
library(broom)
library(corrplot)
# load 2024 data and setup
Sys.setenv(CFBD_API_KEY = "5NK1DDMX7HOCY+FrWUtry5yJW65kG7YsohVHHxJmjMJXS0gl8g+XPk+UL7C+deaB")
all_data = load_cfb_pbp()
# only need these columns from the whole data set
df = all_data %>%
select(pos_team, play_text, yards_gained, EPA, rush, pass)
# only want runs and passes
df = df %>%
# plays are either rush, pass, or other
mutate(play = case_when(
rush == 1 ~ "rush",
pass == 1 ~ "pass",
rush + pass == 0 ~ "other"
)) %>%
# only looking at runs and passes
filter(play != "other") %>%
# don't need these columns anymore
select(-rush) %>%
select(-pass) %>%
# need plays with an EPA calculated
filter(!is.na(EPA))
# find key player function
extract_key_player <- function(play_text) {
# Completed passes (Pattern 1: "pass complete to [Receiver Name]")
if (str_detect(play_text, "pass complete to")) {
player <- str_extract(play_text, "(?<=pass complete to )[A-Za-z]+\\s[A-Za-z]+")
return(player)
}
# Completed passes (Pattern 2: "[Receiver Name] [Yards] Yd pass from [QB Name]")
else if (str_detect(play_text, " Yd pass from ")) {
player <- str_extract(play_text, "^[A-Za-z]+\\s[A-Za-z]+(?= \\d+ Yd pass from)")
return(player)
}
# Run plays (Pattern 1: "[Runner Name] run for")
else if (str_detect(play_text, " run for ")) {
player <- str_extract(play_text, "^[A-Za-z]+\\s[A-Za-z]+(?= run for)")
return(player)
}
# Run plays (Pattern 2: "[Runner Name] [Yards] Yd Run")
else if (str_detect(play_text, " Yd Run")) {
player <- str_extract(play_text, "^[A-Za-z]+\\s[A-Za-z]+(?= \\d+ Yd Run)")
return(player)
}
# If it's not a complete pass or a run, return NA
return(NA_character_)
}
# execute find key player function
df = df %>%
mutate(key_player = sapply(play_text, extract_key_player))
# get rid of any plays that don't have a key player
df = df %>%
filter(!is.na(key_player))
# examine yards gained and EPA relationship
# create and view linear model for yards gained and EPA relationship
ydsEPAlinear = lm(EPA ~ yards_gained, data = df)
print(tidy(ydsEPAlinear))
## # A tibble: 2 × 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) -0.432 0.00321 -134. 0
## 2 yards_gained 0.105 0.000252 418. 0
# create visual of relationship
ggplot(df, aes(x = yards_gained, y = EPA)) +
geom_point(alpha = 0.6) +
geom_smooth(method = "lm", col = "red") +
labs(title = "Relationship between Yards Gained and EPA",
x = "Yards Gained",
y = "Expected Points Added (EPA)") +
theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
# give summary statistics for relationship
summary(ydsEPAlinear)
##
## Call:
## lm(formula = EPA ~ yards_gained, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -13.6466 -0.3908 -0.1344 0.2963 9.5297
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.4318697 0.0032145 -134.4 <2e-16 ***
## yards_gained 0.1052071 0.0002518 417.9 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.9642 on 142828 degrees of freedom
## Multiple R-squared: 0.5501, Adjusted R-squared: 0.5501
## F-statistic: 1.746e+05 on 1 and 142828 DF, p-value: < 2.2e-16
# calculate residuals (EPA over/under yards gained)
# the residual represents the amount of EPA over/under expected based on how many yards the player gained on the play
df = df %>%
mutate(
predicted_epa = predict(ydsEPAlinear, newdata = .),
residual_epa = EPA - predicted_epa)
# rounding the residuals for simplicity
df = df %>%
mutate(residual_epa = round(residual_epa, digits = 5))
# create player ratings
# a player's rating is their average residual
# this means players with higher ratings were players that increased EPA more than expected given the yards they gained on the play
player_ratings = df %>%
group_by(pos_team, key_player) %>%
summarise(rating = mean(residual_epa),
plays = n())
# to get a rating a player has to have been the key player for at least 10 plays
# meaning players must have 10 involvements to be rated
player_ratings = player_ratings %>%
filter(plays > 9)
# load usage rates
player_usage = cfbd_player_usage(year = 2024)
# select only the columns I need
player_usage = player_usage %>%
select(team, name, usg_overall)
# rename columns so joining dataframes is easy
player_ratings = player_ratings %>%
rename(team = pos_team, name = key_player)
# merge usage and ratings
player_ratings = merge(player_ratings, player_usage, by=c("team", "name"))
# get player positions
all_teams = cfbd_team_roster(2024)
# combine first and last name so stats can be matched to a player and their rating
positions = all_teams %>%
select(team, firstName, lastName, position) %>%
mutate(name = paste(firstName, lastName, sep = " "))
# get rid of unnecessary columns and rows
positions = positions %>%
select(team, name, position) %>%
filter(name != " Team")
# combine rating and position so each position group can be evaluated individually
player_ratings = merge(player_ratings, positions, by=c("team", "name"))
# only looking at WRs and RBs
player_ratings = player_ratings %>%
filter(position %in% c("WR", "RB"))
# top 10 overall players table
top10 = player_ratings %>%
select(-position) %>%
arrange(desc(rating)) %>%
head(10)
t10ratingTable = top10 %>%
gt() %>%
gt_hulk_col_numeric(columns = c(rating, usg_overall)) %>%
tab_header(
title = md(paste0("**Top 10 Clutch Skill Position Players**")),
subtitle = "clutch: gets high value yards"
) %>%
cols_label(
team = md("**Team**"),
name = md("**Player**"),
rating = md("**EPA Above Expected**"),
plays = "Touches",
usg_overall = "Usage Rate"
) %>%
fmt_number(
columns = c(rating, usg_overall),
decimals = 3
) %>%
cols_align(align = "left",columns = team) %>%
cols_align(align = "right", columns = c(rating, usg_overall)) %>%
cols_align(align = "center",columns = name)
t10ratingTable
| Top 10 Clutch Skill Position Players | ||||
| clutch: gets high value yards | ||||
| Team | Player | EPA Above Expected | Touches | Usage Rate |
|---|---|---|---|---|
| West Virginia | Justin Robinson | 1.017 | 16 | 0.018 |
| UCLA | Logan Loya | 1.002 | 29 | 0.042 |
| UCF | Jacoby Jones | 0.942 | 20 | 0.033 |
| App State | Dalton Stroman | 0.888 | 20 | 0.029 |
| Florida State | Kentron Poitier | 0.779 | 10 | 0.014 |
| Washington State | Tre Shackelford | 0.685 | 10 | 0.012 |
| Florida Atlantic | Marlyn Johnson | 0.681 | 15 | 0.028 |
| UTSA | Jamel Hardy | 0.664 | 10 | 0.011 |
| Florida Atlantic | Jabari Smith | 0.664 | 17 | 0.025 |
| Northern Illinois | Cam Thompson | 0.639 | 28 | 0.040 |
# bottom 10 overall players table
bottom10 = player_ratings %>%
select(-position) %>%
arrange(rating) %>%
head(10)
b10ratingTable = bottom10 %>%
gt() %>%
gt_hulk_col_numeric(columns = c(rating, usg_overall)) %>%
tab_header(
title = md(paste0("**Bottom 10 Clutch Skill Position Players**")),
subtitle = "clutch: gets high value yards"
) %>%
cols_label(
team = md("**Team**"),
name = md("**Player**"),
rating = md("**EPA Above Expected**"),
plays = "Touches",
usg_overall = "Usage Rate"
) %>%
fmt_number(
columns = c(rating, usg_overall),
decimals = 3
) %>%
cols_align(align = "left",columns = team) %>%
cols_align(align = "right", columns = c(rating, usg_overall)) %>%
cols_align(align = "center",columns = name)
b10ratingTable
| Bottom 10 Clutch Skill Position Players | ||||
| clutch: gets high value yards | ||||
| Team | Player | EPA Above Expected | Touches | Usage Rate |
|---|---|---|---|---|
| Virginia Tech | Jeremiah Coney | −0.739 | 25 | 0.039 |
| Oklahoma | Sam Franklin | −0.580 | 23 | 0.026 |
| Kennesaw State | Gabriel Benyard | −0.574 | 23 | 0.031 |
| Wisconsin | Dilin Jones | −0.556 | 16 | 0.021 |
| North Texas | Makenzie McGill | −0.519 | 20 | 0.080 |
| BYU | Jojo Phillips | −0.517 | 10 | 0.018 |
| South Alabama | PJ Martin | −0.487 | 16 | 0.019 |
| Louisiana | Tylon Citizen | −0.473 | 10 | 0.011 |
| South Carolina | Dalevon Campbell | −0.467 | 11 | 0.012 |
| Florida Atlantic | Gemari Sands | −0.456 | 29 | 0.044 |
# load player stats
season_stats = cfbd_stats_season_player(2024)
# select only columns and rows needed
season_stats = season_stats %>%
select(team, player, rushing_yds, rushing_td, rushing_ypc, receiving_rec, receiving_yds, receiving_td) %>%
filter(player != " Team")
# set NAs to 0 so calculations can be done on stats that players never recorded
season_stats = season_stats %>%
mutate(across(c(rushing_yds, rushing_td, rushing_ypc, receiving_rec, receiving_yds, receiving_td), ~replace_na(., 0))) %>%
rename(name = player)
# merge ratings and stats to evaluate the relationship between a rating and simple season stats
stats_ratings = merge(player_ratings, season_stats, by= c("team", "name"))
# only want WRs and RBs
stats_ratings = stats_ratings %>%
filter(position %in% c("WR", "RB"))
# separate positions because each position needs to be evaluated differently
wrs = stats_ratings %>%
filter(position == "WR")
rbs = stats_ratings %>%
filter(position == "RB")
# adjust data based on position
# for WRs I don't care about their rushing stats
wrs = wrs %>%
select(team, name, rating, usg_overall, receiving_rec, receiving_yds, receiving_td)
# for RBs I want to look at receiving stats but want to prioritize rushing stats
# I keep TDs weighted the same whether they are rushing or receiving because I want total TDs to be analyzed for RBs
rbs = rbs %>%
mutate(adj_receiving_yards = receiving_yds * .5,
adj_receiving_rec = receiving_rec * .5,
total_td = rushing_td + receiving_td) %>%
select(team, name, rating, usg_overall, rushing_yds, rushing_ypc, adj_receiving_rec, adj_receiving_yards, total_td)
# combine the adjusted receiving yards and the rushing yards to get an adjusted total yards
# doing so allows the yards an RB gains to be analyzed with an emphasis on their rushing totals
rbs = rbs %>%
mutate(adj_total_yards = rushing_yds + adj_receiving_yards) %>%
select(-adj_receiving_yards) %>%
select(-rushing_yds)
# wr correlation matrix
# look at how each statistic is correlated to the others
wr_cor_cols = wrs %>%
select(rating, receiving_rec, receiving_yds, receiving_td, usg_overall) %>%
rename(rec = receiving_rec, usg = usg_overall)
wr_cor_matrix = round(cor(wr_cor_cols), 5)
corrplot(wr_cor_matrix,
method = "number",
type = "upper",
diag = FALSE,
addCoef.col = "black",
number.cex = 0.8,
tl.cex = 0.7,
col = colorRampPalette(c("purple", "green"))(200),
mar = c(0,0,0,0),
tl.col = "black"
)
# rb correlation matrix
# look at how each statistic is correlated to the others
rb_cor_cols = rbs %>%
select(rating, adj_total_yards, total_td, adj_receiving_rec, rushing_ypc, usg_overall) %>%
rename(ypc = rushing_ypc, usg = usg_overall, adj_rec = adj_receiving_rec)
rb_cor_matrix = round(cor(rb_cor_cols), 5)
corrplot(rb_cor_matrix,
method = "number",
type = "upper",
diag = FALSE,
addCoef.col = "black",
number.cex = 0.8,
tl.cex = 0.7,
col = colorRampPalette(c("purple", "green"))(200),
mar = c(0,0,0,0),
tl.col = "black"
)
# wr model
# use this to see combined impact on player rating
stdWRs = wrs %>%
mutate(srating = scale(rating), srec = scale(receiving_rec), syds = scale(receiving_yds), std = scale(receiving_td)) %>%
select(srating, srec, syds, std)
wrModel = lm(srating ~ srec + syds + std, data = stdWRs)
# the total receiving yards a WR has is the most important predictor of a player that gets high value yards
# TDs are almost as important
print(tidy(wrModel))
## # A tibble: 4 × 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) -8.18e-16 0.0437 -1.87e-14 1.00
## 2 srec 2.16e- 1 0.121 1.78e+ 0 0.0755
## 3 syds -4.95e- 1 0.134 -3.68e+ 0 0.000259
## 4 std 4.18e- 1 0.0736 5.67e+ 0 0.0000000244
# rb model
# use this to see combined impact on player rating
stdRBs = rbs %>%
mutate(srating = scale(rating), srec = scale(adj_receiving_rec), syds = scale(adj_total_yards), sypc = scale(rushing_ypc), std = scale(total_td)) %>%
select(srating, srec, syds, sypc, std)
rbModel = lm(srating ~ srec + syds + sypc + std, data = stdRBs)
# touchdowns are by far the most important factor when looking at running backs that get high value yards
print(tidy(rbModel))
## # A tibble: 5 × 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 3.20e-16 0.0451 7.10e-15 1.00
## 2 srec 7.13e- 2 0.0601 1.19e+ 0 0.236
## 3 syds -2.06e- 1 0.115 -1.79e+ 0 0.0733
## 4 sypc -8.94e- 2 0.0480 -1.86e+ 0 0.0633
## 5 std 4.72e- 1 0.105 4.50e+ 0 0.00000873