Summary

Identify Players Who Outperform Expectations

This analysis identifies running backs (RBs) and wide receivers (WRs) who gain more valuable yards than expected. I achieved this by modeling Expected Points Added (EPA) as a function of yards gained. The residual from this model for each player’s touch represents their impact, with their average residual serving as their overall rating.

Uncover Undervalued Talent

This methodology isolates a player’s direct impact when they touch the ball, independent of their overall involvement in the offense and the yards they gain. This helps to pinpoint potentially undervalued and underutilized players who consistently make significant contributions.

Key Statistical Correlations

I also examined which traditional statistics correlate most strongly with these ratings. Touchdowns, as expected, are the most significant factor for both RBs and WRs ratings. Touchdowns create points and therefore will always increase expected points by some margin, meaning that players who score more touchdowns per touch will do well in this rating system.

Supporting Data and Player Rankings

Below, you’ll find the code used for this analysis, displays of the key statistics that led to these conclusions, and tables ranking players based on their performance.

Library Imports

# import libraries
library(dplyr)
library(cfbfastR)
library(ggplot2)
library(reshape2)
library(gt)
library(gtExtras)
library(tidyr)
library(stringr)
library(knitr)
library(broom)
library(corrplot)

Data Loading and Preparation

# load 2024 data and setup
Sys.setenv(CFBD_API_KEY = "5NK1DDMX7HOCY+FrWUtry5yJW65kG7YsohVHHxJmjMJXS0gl8g+XPk+UL7C+deaB")
all_data = load_cfb_pbp()

# only need these columns from the whole data set
df = all_data %>%
  select(pos_team, play_text, yards_gained, EPA, rush, pass)
# only want runs and passes
df = df %>%

# plays are either rush, pass, or other
  mutate(play = case_when(
    rush == 1 ~ "rush",
    pass == 1 ~ "pass",
    rush + pass == 0 ~ "other"
  )) %>%
# only looking at runs and passes
  filter(play != "other") %>%

# don't need these columns anymore
  select(-rush) %>%
  select(-pass) %>%

# need plays with an EPA calculated
  filter(!is.na(EPA))
# find key player function
extract_key_player <- function(play_text) {
  # Completed passes (Pattern 1: "pass complete to [Receiver Name]")
  if (str_detect(play_text, "pass complete to")) {
    player <- str_extract(play_text, "(?<=pass complete to )[A-Za-z]+\\s[A-Za-z]+")
    return(player)
  }
  # Completed passes (Pattern 2: "[Receiver Name] [Yards] Yd pass from [QB Name]")
  else if (str_detect(play_text, " Yd pass from ")) {
    player <- str_extract(play_text, "^[A-Za-z]+\\s[A-Za-z]+(?= \\d+ Yd pass from)")
    return(player)
  }
  # Run plays (Pattern 1: "[Runner Name] run for")
  else if (str_detect(play_text, " run for ")) {
    player <- str_extract(play_text, "^[A-Za-z]+\\s[A-Za-z]+(?= run for)")
    return(player)
  }
  # Run plays (Pattern 2: "[Runner Name] [Yards] Yd Run")
  else if (str_detect(play_text, " Yd Run")) {
    player <- str_extract(play_text, "^[A-Za-z]+\\s[A-Za-z]+(?= \\d+ Yd Run)")
    return(player)
  }
  # If it's not a complete pass or a run, return NA
  return(NA_character_)
}
# execute find key player function
df = df %>%
  mutate(key_player = sapply(play_text, extract_key_player))

# get rid of any plays that don't have a key player
df = df %>%
  filter(!is.na(key_player))

Yards Gained vs EPA

# examine yards gained and EPA relationship

# create and view linear model for yards gained and EPA relationship
ydsEPAlinear = lm(EPA ~ yards_gained, data = df)

print(tidy(ydsEPAlinear))
## # A tibble: 2 × 5
##   term         estimate std.error statistic p.value
##   <chr>           <dbl>     <dbl>     <dbl>   <dbl>
## 1 (Intercept)    -0.432  0.00321      -134.       0
## 2 yards_gained    0.105  0.000252      418.       0
# create visual of relationship
ggplot(df, aes(x = yards_gained, y = EPA)) +
  geom_point(alpha = 0.6) +
  geom_smooth(method = "lm", col = "red") +
  labs(title = "Relationship between Yards Gained and EPA",
       x = "Yards Gained",
       y = "Expected Points Added (EPA)") +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

# give summary statistics for relationship
summary(ydsEPAlinear)
## 
## Call:
## lm(formula = EPA ~ yards_gained, data = df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -13.6466  -0.3908  -0.1344   0.2963   9.5297 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -0.4318697  0.0032145  -134.4   <2e-16 ***
## yards_gained  0.1052071  0.0002518   417.9   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.9642 on 142828 degrees of freedom
## Multiple R-squared:  0.5501, Adjusted R-squared:  0.5501 
## F-statistic: 1.746e+05 on 1 and 142828 DF,  p-value: < 2.2e-16

Rating Players

# calculate residuals (EPA over/under yards gained)
# the residual represents the amount of EPA over/under expected based on how many yards the player gained on the play
df = df %>%
  mutate(
    predicted_epa = predict(ydsEPAlinear, newdata = .),
    residual_epa = EPA - predicted_epa)

# rounding the residuals for simplicity
df = df %>%
  mutate(residual_epa = round(residual_epa, digits = 5))
# create player ratings
# a player's rating is their average residual
# this means players with higher ratings were players that increased EPA more than expected given the yards they gained on the play
player_ratings = df %>%
  group_by(pos_team, key_player) %>%
  summarise(rating = mean(residual_epa),
            plays = n())

# to get a rating a player has to have been the key player for at least 10 plays
# meaning players must have 10 involvements to be rated
player_ratings = player_ratings %>%
  filter(plays > 9)
# load usage rates
player_usage = cfbd_player_usage(year = 2024)

# select only the columns I need
player_usage = player_usage %>%
  select(team, name, usg_overall)

# rename columns so joining dataframes is easy
player_ratings = player_ratings %>%
  rename(team = pos_team, name = key_player)
# merge usage and ratings
player_ratings = merge(player_ratings, player_usage, by=c("team", "name"))
# get player positions
all_teams = cfbd_team_roster(2024)

# combine first and last name so stats can be matched to a player and their rating
positions = all_teams %>%
  select(team, firstName, lastName, position) %>%
  mutate(name = paste(firstName, lastName, sep = " "))

# get rid of unnecessary columns and rows
positions = positions %>%
  select(team, name, position) %>%
  filter(name != " Team")

# combine rating and position so each position group can be evaluated individually
player_ratings = merge(player_ratings, positions, by=c("team", "name"))

# only looking at WRs and RBs
player_ratings = player_ratings %>%
  filter(position %in% c("WR", "RB"))
# top 10 overall players table
top10 = player_ratings %>%
  select(-position) %>%
  arrange(desc(rating)) %>%
  head(10)

t10ratingTable = top10 %>%
  gt() %>%
  gt_hulk_col_numeric(columns = c(rating, usg_overall)) %>%
  tab_header(
    title = md(paste0("**Top 10 Clutch Skill Position Players**")),
    subtitle = "clutch: gets high value yards"
  ) %>%
  cols_label(
    team = md("**Team**"),
    name = md("**Player**"),
    rating = md("**EPA Above Expected**"),
    plays = "Touches",
    usg_overall = "Usage Rate"
  ) %>%
  fmt_number(
    columns = c(rating, usg_overall),
    decimals = 3
  ) %>%
  cols_align(align = "left",columns = team) %>%
  cols_align(align = "right", columns = c(rating, usg_overall)) %>%
  cols_align(align = "center",columns = name)
t10ratingTable
Top 10 Clutch Skill Position Players
clutch: gets high value yards
Team Player EPA Above Expected Touches Usage Rate
West Virginia Justin Robinson 1.017 16 0.018
UCLA Logan Loya 1.002 29 0.042
UCF Jacoby Jones 0.942 20 0.033
App State Dalton Stroman 0.888 20 0.029
Florida State Kentron Poitier 0.779 10 0.014
Washington State Tre Shackelford 0.685 10 0.012
Florida Atlantic Marlyn Johnson 0.681 15 0.028
UTSA Jamel Hardy 0.664 10 0.011
Florida Atlantic Jabari Smith 0.664 17 0.025
Northern Illinois Cam Thompson 0.639 28 0.040
# bottom 10 overall players table
bottom10 = player_ratings %>%
  select(-position) %>%
  arrange(rating) %>%
  head(10)

b10ratingTable = bottom10 %>%
  gt() %>%
  gt_hulk_col_numeric(columns = c(rating, usg_overall)) %>%
  tab_header(
    title = md(paste0("**Bottom 10 Clutch Skill Position Players**")),
    subtitle = "clutch: gets high value yards"
  ) %>%
  cols_label(
    team = md("**Team**"),
    name = md("**Player**"),
    rating = md("**EPA Above Expected**"),
    plays = "Touches",
    usg_overall = "Usage Rate"
  ) %>%
  fmt_number(
    columns = c(rating, usg_overall),
    decimals = 3
  ) %>%
  cols_align(align = "left",columns = team) %>%
  cols_align(align = "right", columns = c(rating, usg_overall)) %>%
  cols_align(align = "center",columns = name)
b10ratingTable
Bottom 10 Clutch Skill Position Players
clutch: gets high value yards
Team Player EPA Above Expected Touches Usage Rate
Virginia Tech Jeremiah Coney −0.739 25 0.039
Oklahoma Sam Franklin −0.580 23 0.026
Kennesaw State Gabriel Benyard −0.574 23 0.031
Wisconsin Dilin Jones −0.556 16 0.021
North Texas Makenzie McGill −0.519 20 0.080
BYU Jojo Phillips −0.517 10 0.018
South Alabama PJ Martin −0.487 16 0.019
Louisiana Tylon Citizen −0.473 10 0.011
South Carolina Dalevon Campbell −0.467 11 0.012
Florida Atlantic Gemari Sands −0.456 29 0.044

Statistical Correlation Evaluation

# load player stats
season_stats = cfbd_stats_season_player(2024)

# select only columns and rows needed
season_stats = season_stats %>%
  select(team, player, rushing_yds, rushing_td, rushing_ypc, receiving_rec, receiving_yds, receiving_td) %>%
  filter(player != " Team")
# set NAs to 0 so calculations can be done on stats that players never recorded
season_stats = season_stats %>%
    mutate(across(c(rushing_yds, rushing_td, rushing_ypc, receiving_rec, receiving_yds, receiving_td), ~replace_na(., 0))) %>%
  rename(name = player)
# merge ratings and stats to evaluate the relationship between a rating and simple season stats
stats_ratings = merge(player_ratings, season_stats, by= c("team", "name"))
# only want WRs and RBs
stats_ratings = stats_ratings %>%
  filter(position %in% c("WR", "RB"))
# separate positions because each position needs to be evaluated differently
wrs = stats_ratings %>%
  filter(position == "WR")

rbs = stats_ratings %>%
  filter(position == "RB")
# adjust data based on position
# for WRs I don't care about their rushing stats
wrs = wrs %>%
  select(team, name, rating, usg_overall, receiving_rec, receiving_yds, receiving_td)

# for RBs I want to look at receiving stats but want to prioritize rushing stats
# I keep TDs weighted the same whether they are rushing or receiving because I want total TDs to be analyzed for RBs
rbs = rbs %>%
  mutate(adj_receiving_yards = receiving_yds * .5,
         adj_receiving_rec = receiving_rec * .5,
         total_td = rushing_td + receiving_td) %>%
  select(team, name, rating, usg_overall, rushing_yds, rushing_ypc, adj_receiving_rec, adj_receiving_yards, total_td)

# combine the adjusted receiving yards and the rushing yards to get an adjusted total yards
# doing so allows the yards an RB gains to be analyzed with an emphasis on their rushing totals
rbs = rbs %>%
  mutate(adj_total_yards = rushing_yds + adj_receiving_yards) %>%
  select(-adj_receiving_yards) %>%
  select(-rushing_yds)
# wr correlation matrix
# look at how each statistic is correlated to the others
wr_cor_cols = wrs %>%
  select(rating, receiving_rec, receiving_yds, receiving_td, usg_overall) %>%
  rename(rec = receiving_rec, usg = usg_overall)

wr_cor_matrix = round(cor(wr_cor_cols), 5)

corrplot(wr_cor_matrix,
         method = "number",
         type = "upper",
         diag = FALSE,
         addCoef.col = "black",
         number.cex = 0.8,
         tl.cex = 0.7,
         col = colorRampPalette(c("purple", "green"))(200),
         mar = c(0,0,0,0),
         tl.col = "black"
         )

# rb correlation matrix
# look at how each statistic is correlated to the others
rb_cor_cols = rbs %>%
  select(rating, adj_total_yards, total_td, adj_receiving_rec, rushing_ypc, usg_overall) %>%
  rename(ypc = rushing_ypc, usg = usg_overall, adj_rec = adj_receiving_rec)

rb_cor_matrix = round(cor(rb_cor_cols), 5)

corrplot(rb_cor_matrix,
         method = "number",
         type = "upper",
         diag = FALSE,
         addCoef.col = "black",
         number.cex = 0.8,
         tl.cex = 0.7,
         col = colorRampPalette(c("purple", "green"))(200),
         mar = c(0,0,0,0),
         tl.col = "black"
         )

# wr model
# use this to see combined impact on player rating
stdWRs = wrs %>%
  mutate(srating = scale(rating), srec = scale(receiving_rec), syds = scale(receiving_yds), std = scale(receiving_td)) %>%
  select(srating, srec, syds, std)

wrModel = lm(srating ~ srec + syds + std, data = stdWRs)

# the total receiving yards a WR has is the most important predictor of a player that gets high value yards
# TDs are almost as important
print(tidy(wrModel))
## # A tibble: 4 × 5
##   term         estimate std.error statistic      p.value
##   <chr>           <dbl>     <dbl>     <dbl>        <dbl>
## 1 (Intercept) -8.18e-16    0.0437 -1.87e-14 1.00        
## 2 srec         2.16e- 1    0.121   1.78e+ 0 0.0755      
## 3 syds        -4.95e- 1    0.134  -3.68e+ 0 0.000259    
## 4 std          4.18e- 1    0.0736  5.67e+ 0 0.0000000244
# rb model
# use this to see combined impact on player rating
stdRBs = rbs %>%
  mutate(srating = scale(rating), srec = scale(adj_receiving_rec), syds = scale(adj_total_yards), sypc = scale(rushing_ypc), std = scale(total_td)) %>%
  select(srating, srec, syds, sypc, std)

rbModel = lm(srating ~ srec + syds + sypc + std, data = stdRBs)

# touchdowns are by far the most important factor when looking at running backs that get high value yards
print(tidy(rbModel))
## # A tibble: 5 × 5
##   term         estimate std.error statistic    p.value
##   <chr>           <dbl>     <dbl>     <dbl>      <dbl>
## 1 (Intercept)  3.20e-16    0.0451  7.10e-15 1.00      
## 2 srec         7.13e- 2    0.0601  1.19e+ 0 0.236     
## 3 syds        -2.06e- 1    0.115  -1.79e+ 0 0.0733    
## 4 sypc        -8.94e- 2    0.0480 -1.86e+ 0 0.0633    
## 5 std          4.72e- 1    0.105   4.50e+ 0 0.00000873