Library Imports

# import libraries
library(dplyr)
library(cfbfastR)
library(ggplot2)
library(reshape2)
library(gt)
library(gtExtras)
library(tidyr)

Data Loading and Preparation

# load data and setup
Sys.setenv(CFBD_API_KEY = "5NK1DDMX7HOCY+FrWUtry5yJW65kG7YsohVHHxJmjMJXS0gl8g+XPk+UL7C+deaB")
seasons = 2018:2024
all_data = load_cfb_pbp(seasons)

# select only the columns I want to use or refer to
miniAll = all_data %>%
  select(year, week, game_id, pos_team, play_type, play_text, down, distance, yards_to_goal, yards_gained, EPA, wpa, rush, pass)
# add play type, field position, and distance to first down groupings

# declares play as rush, pass, or other
miniAll = miniAll %>%
  mutate(play = case_when(
    rush == 1 ~ "rush",
    pass == 1 ~ "pass",
    rush + pass == 0 ~ "other"
  ),

# creates 4 zones for field position for when plays are grouped together
  field_position = case_when(
    yards_to_goal >= 75 ~ 1,
    yards_to_goal >= 50 & yards_to_goal < 75 ~ 2,
    yards_to_goal >= 25 & yards_to_goal < 50 ~ 3,
    yards_to_goal < 25 ~ 4
  ),

# creates 4 zones for yards to first down for when plays are grouped together
  dis2goGroup = case_when(
    distance <= 4 ~ 4,
    distance > 4 & distance <= 8 ~ 3,
    distance > 8 & distance <= 12 ~ 2,
    distance > 12 ~ 1
  ))

# only looking at run and pass plays
miniAll = miniAll %>%
  filter(play != "other")
# going to use 2018-23 to calculate xEPA
dfTrain = miniAll %>%
  filter(year != 2024)

# will use 2024 as year to call plays in based on the xEPA calculated from years prior
df2024 = miniAll %>%
  filter(year == 2024) %>%
  rename(actualPlay = play)

Calculating xEPA

# Calculate xEPA

# only need these columns to caclulate xEPA
xEPAprep = dfTrain %>%
  select(play, down, distance, yards_to_goal, EPA, field_position, dis2goGroup)

# xEPA is the average EPA from all the plays that fit into each group
xEPA = xEPAprep %>%
  group_by(play, down, dis2goGroup, field_position) %>%
  summarise(xEPA = mean(EPA, na.rm = TRUE),
            plays = n())

# if a group doesn't have 10 plays then EPA is set to 0 so decisions aren't being made based on small sample size
xEPA = xEPA %>%
  mutate(xEPA = case_when(
    plays < 10 ~ 0,
    plays >= 10 ~ xEPA
  ))

# get rid of any weird plays where the down is labeled as 0
xEPA = xEPA %>%
  filter(down != 0)

Decision Making

# expected decision for each 2024 play

# match each 2024 play to the rush and pass xEPA so the choice between pass and rush can be made for each play in 2024
merge = merge(df2024, xEPA, by = c("down", "field_position", "dis2goGroup"), all.x = TRUE, allow.cartesian = TRUE)

# after the merge each play has 2 instances and this picks the play call that should be called in this situation
expected_decision = merge %>%
  group_by(down, field_position, dis2goGroup) %>%
  slice_max(xEPA, n = 1, na_rm = TRUE) %>%
  ungroup()

# get rid of any extra plays that aren't being examined
# rename play to expectedPlay so the actual and expected plays can be understood
expected_decision = expected_decision %>%
  filter(actualPlay != "other") %>%
  rename(expectedPlay = play) %>%
  filter(!is.na(EPA), !is.na(wpa))

Evaluation

# Expected vs Actual Play Call

# determine if the expected and actual play call are the same
expected_decision = expected_decision %>%
  mutate(match = case_when(
    actualPlay == expectedPlay ~ 1,
    actualPlay != expectedPlay ~ 0
  )) %>%
  select(-plays)

# calculate how the play fared against how it was expected to go
expected_decision = expected_decision %>%
  mutate(diffEPA = EPA - xEPA)
# BIG IDEA: is xEPA a valid way to call plays?

# compare the EPA for plays where the expected and actual matched compared to when they didn't
bigIdea = expected_decision %>%
  group_by(match) %>%
  summarise(avgEPA = mean(EPA),
            plays = n())

# visualize the comparison made above
BItable = bigIdea %>% 
  mutate(match = case_when(
    match == 1 ~ "match",
    match == 0 ~ "no match"
  )) %>%
  gt() %>%
  gt_theme_538() %>%
  cols_align(align = "center", columns = everything()) %>%
  gt_hulk_col_numeric(c(avgEPA)) %>%
  cols_label(match = md("**Match?**"), avgEPA=md("**Average EPA**"), 
             plays = md("**Plays**")) %>%
  tab_header(title= "Comparing Average EPA for Making Expected Call", 
             subtitle = "EPA: expected points added") %>%
  opt_align_table_header(align="center") %>%
  tab_style(style = list(cell_text(weight = "bold")), 
            locations = cells_body(columns = c(match, avgEPA))) %>%
  tab_source_note(source_note = "cfbfastR | Max Basurto") %>%
  tab_style(
    style = list(cell_text(align = "right")),
    locations = cells_source_notes()
  )
BItable
Comparing Average EPA for Making Expected Call
EPA: expected points added
Match? Average EPA Plays
no match -0.03912992 106007
match 0.06380758 96649
cfbfastR | Max Basurto
# BIG IDEA: significance test

# run a 2 sample difference in means significance test to evaluate results from above
overalltResultEPA = t.test(EPA ~ match, data = expected_decision, var.equal = FALSE)
overalltResultEPA
## 
##  Welch Two Sample t-test
## 
## data:  EPA by match
## t = -15.242, df = 196707, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
##  -0.11617421 -0.08970079
## sample estimates:
## mean in group 0 mean in group 1 
##     -0.03912992      0.06380758

EPA and Winning Analysis

# How does calling the "right" play effect winning?

# compare win porbability added for plays that expected and actual calls matched vs when they didn't match
winning = expected_decision %>%
  group_by(match) %>%
  summarise(avg_wpa = mean(wpa),
            plays=n())
# Significance test

# run a 2 sample difference in means significance test to evaluate results from above
overalltResultwpa = t.test(wpa ~ match, data = expected_decision, var.equal = FALSE)
overalltResultwpa
## 
##  Welch Two Sample t-test
## 
## data:  wpa by match
## t = -9.6307, df = 199647, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
##  -0.003104182 -0.002054350
## sample estimates:
## mean in group 0 mean in group 1 
##    -0.000826765     0.001752501
# want to see if higher expected points added is correlated with higher win probability to understand if increasing expected points will improve a team's results

# select only the columns I need
# don't include blowouts because expected points is done on a drive-by-drive basis and win probability is game-by-game
# meaning that in blowouts expected points can change significantly while win probability won't
epawpaCOR = all_data %>%
  select(pos_score_diff, half, EPA, wpa) %>%
  filter(half == 1, between(pos_score_diff, -21, 21),
         !is.na(EPA), !is.na(wpa))
  
# get the r value for the relationship between EPA and wpa
correlation_coefficient = cor(epawpaCOR$EPA, epawpaCOR$wpa, method = "pearson")

# test significance of r value
cor_test_result <- cor.test(epawpaCOR$EPA, epawpaCOR$wpa, method = "pearson")

# visualize correlation
corGraph = ggplot(epawpaCOR, aes(x = EPA, y = wpa)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE, color = "red") + # Add a linear regression line
  labs(title = "EPA vs wpa",
       x = "Expected Points Added",
       y = "Win Probability Added") +
  theme_minimal()

# print values and visual
correlation_coefficient
## [1] 0.6140792
cor_test_result
## 
##  Pearson's product-moment correlation
## 
## data:  epawpaCOR$EPA and epawpaCOR$wpa
## t = 626.75, df = 648876, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.6125613 0.6155926
## sample estimates:
##       cor 
## 0.6140792
corGraph
## `geom_smooth()` using formula = 'y ~ x'

# calculate and print r squared for better understanding of the relationship
r_squared = correlation_coefficient ** 2
r_squared
## [1] 0.3770933
# use best fit line to predict wpa given EPA

# create linear model using EPA to predict wpa
wpa_given_epa = lm(wpa ~ EPA, data = epawpaCOR)
wpa_given_epa
## 
## Call:
## lm(formula = wpa ~ EPA, data = epawpaCOR)
## 
## Coefficients:
## (Intercept)          EPA  
##   0.0002909    0.0256306

Team-by-Team Evaluation

# get result fucntion
# gets the result of the game as W or L based on the teams and score
get_result <- function(home_team, away_team, home_score, away_score, target_team) {
  if (home_team == target_team) {
    # If target_team is the home team
    if (home_score > away_score) {
      return("W")
    } else {
      return("L")
    }
  } else if (away_team == target_team) {
    if (away_score > home_score) {
      return("W")
    } else {
      return("L")
    }
  } else {
    return("N/A")
  }
}
# Input a team name and get data for that specific team

# input team that you want to evaluate
team = "Rice"

# load the schedule from 2024 to get the results from each game
game_results = load_cfb_schedules()

# use get result function to get the results from each game
game_results = game_results %>%
  select(game_id, home_team, away_team, home_points, away_points) %>%
  rowwise() %>%
  mutate(result = get_result(home_team, away_team, home_points, away_points, team)) %>%
  ungroup()

# modify the table game_results for better visualization
game_results = game_results %>%
  mutate(game_score = paste(home_points, away_points, sep = "-"),
         matchup = paste(home_team, away_team, sep = " vs ")) %>%
  select(game_id, matchup, game_score, result)

# evaluate the team's season as a whole for when they called plays that matched the expected call vs the plays that didn't match
teamEval = expected_decision %>%
  filter(pos_team == team) %>%
  group_by(match) %>%
  summarise(net_EPA = sum(EPA),
            plays = n())

# weekly evaluation instead of whole season approach
weeklyEval = expected_decision %>%
  filter(pos_team == team) %>%
  group_by(game_id, match) %>%
  summarise(net_EPA = sum(EPA),
            plays = n())

# combine the team's game results and evaluation to better understand the effects of using xEPA as a way to call plays
weeklyEval = merge(weeklyEval, game_results, by=c("game_id"))

# clarify and simplify the merged table to more clearly display the data in a table format
weekEvalprocessed = weeklyEval %>%

# game by game approach
  group_by(game_id) %>%

# create new columns to understand decision making in each game
  mutate(
    total_plays_game = sum(plays),
    plays_with_match_1_game = sum(plays[match == 1]),
    percentage_match = (plays_with_match_1_game / total_plays_game) * 100
  ) %>%
  ungroup() %>%

# get the columns I need to show and reorganize data so things are more clear
  select(matchup, match, net_EPA, game_score, result, percentage_match) %>%
  pivot_wider(
    names_from = match,
    values_from = net_EPA,
    names_prefix = "netEPA_Match_",
    id_cols = c(matchup, game_score, result, percentage_match)
  ) %>%
  
# rename columns for better understanding
  dplyr::rename(
    Matchup = matchup,
    `net_EPA (Match = 0)` = netEPA_Match_0,
    `net_EPA (Match = 1)` = netEPA_Match_1,
    `Game Score` = game_score,
    `Result` = result,
    `Match %` = percentage_match
  )

# display the game-by-game evaluation data in a table 
evalTable = weekEvalprocessed %>%
  gt() %>%
  gt_hulk_col_numeric(columns = c(`net_EPA (Match = 0)`, `net_EPA (Match = 1)`, `Match %`)) %>%
  tab_header(
    title = md(paste0("**Net EPA by Matchup and Match Segment: ", team, "**")),
    subtitle = "Match %: percentage of plays where actual call matches the expected call"
  ) %>%
  cols_label(
    Matchup = md("**Matchup**"),
    `net_EPA (Match = 0)` = md("**Net EPA (No Match)**"),
    `net_EPA (Match = 1)` = md("**Net EPA (Match)**"),
    `Game Score` = md("**Game Result**"),
    `Result` = "",
    `Match %` = md("**Match %**")
  ) %>%
  fmt_number(
    columns = c(`net_EPA (Match = 0)`, `net_EPA (Match = 1)`),
    decimals = 3
  ) %>%
  cols_align(align = "left",columns = Matchup) %>%
  cols_align(align = "right", columns = c(`net_EPA (Match = 0)`, `net_EPA (Match = 1)`, `Match %`)) %>%
  cols_align(align = "center",columns = c(`Game Score`, `Result`))

evalTable
Net EPA by Matchup and Match Segment: Rice
Match %: percentage of plays where actual call matches the expected call
Matchup Game Result Match % Net EPA (No Match) Net EPA (Match)
UConn vs Rice 17-10 L 46.55172 −15.006 −14.366
Houston vs Rice 33-7 L 38.00000 −11.001 −5.528
Rice vs Sam Houston 14-34 L 60.65574 −10.259 −0.857
Rice vs Texas Southern 69-7 W 48.38710 18.202 1.638
Army vs Rice 37-14 L 76.92308 −0.918 7.750
Rice vs Charlotte 20-21 L 55.71429 −0.898 2.203
Rice vs UTSA 29-27 W 67.79661 −1.702 12.061
Tulane vs Rice 24-10 L 62.29508 −5.755 −15.132
Rice vs Navy 24-10 W 56.45161 −3.593 6.149
Memphis vs Rice 27-20 L 47.05882 −2.590 0.436
UAB vs Rice 40-14 L 51.35135 −13.459 −4.597
Rice vs South Florida 35-28 W 46.83544 −6.568 9.694