Library Imports
# import libraries
library(dplyr)
library(cfbfastR)
library(ggplot2)
library(reshape2)
library(gt)
library(gtExtras)
library(tidyr)
Data Loading and Preparation
# load data and setup
Sys.setenv(CFBD_API_KEY = "YOUR_CFBD_API_KEY") # set your own CFBD API key here rather than hardcoding a real key
seasons = 2018:2024
all_data = load_cfb_pbp(seasons)
# select only the columns I want to use or refer to
miniAll = all_data %>%
select(year, week, game_id, pos_team, play_type, play_text, down, distance, yards_to_goal, yards_gained, EPA, wpa, rush, pass)
# add play type, field position, and distance to first down groupings
# declares play as rush, pass, or other
miniAll = miniAll %>%
mutate(play = case_when(
rush == 1 ~ "rush",
pass == 1 ~ "pass",
rush + pass == 0 ~ "other"
),
# creates 4 zones for field position for when plays are grouped together
field_position = case_when(
yards_to_goal >= 75 ~ 1,
yards_to_goal >= 50 & yards_to_goal < 75 ~ 2,
yards_to_goal >= 25 & yards_to_goal < 50 ~ 3,
yards_to_goal < 25 ~ 4
),
# creates 4 zones for yards to first down for when plays are grouped together
dis2goGroup = case_when(
distance <= 4 ~ 4,
distance > 4 & distance <= 8 ~ 3,
distance > 8 & distance <= 12 ~ 2,
distance > 12 ~ 1
))
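As an optional sanity check (not part of the modeling itself), the groupings can be tabulated to see how plays are distributed across the 4 field position zones and 4 distance groups, and whether anything falls through the cracks:
# count plays by field position zone and distance-to-go group
miniAll %>%
  count(field_position, dis2goGroup)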
# only looking at run and pass plays
miniAll = miniAll %>%
filter(play != "other")
# going to use 2018-23 to calculate xEPA
dfTrain = miniAll %>%
filter(year != 2024)
# will use 2024 as the year to call plays in, based on the xEPA calculated from the prior years
df2024 = miniAll %>%
filter(year == 2024) %>%
rename(actualPlay = play)
Calculating xEPA
# Calculate xEPA
# only need these columns to calculate xEPA
xEPAprep = dfTrain %>%
select(play, down, distance, yards_to_goal, EPA, field_position, dis2goGroup)
# xEPA is the average EPA from all the plays that fit into each group
xEPA = xEPAprep %>%
group_by(play, down, dis2goGroup, field_position) %>%
summarise(xEPA = mean(EPA, na.rm = TRUE),
plays = n())
# if a group doesn't have at least 10 plays then its xEPA is set to 0 so decisions aren't being made on small sample sizes
xEPA = xEPA %>%
mutate(xEPA = case_when(
plays < 10 ~ 0,
plays >= 10 ~ xEPA
))
# get rid of any weird plays where the down is labeled as 0
xEPA = xEPA %>%
filter(down != 0)
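To see how often the 10-play floor actually kicks in (an optional check, not required for the rest of the pipeline), the zeroed-out groups can be counted:
# count how many situation groups were set to 0 because they had fewer than 10 plays
xEPA %>%
  ungroup() %>%
  summarise(zeroed_groups = sum(plays < 10),
            total_groups = n())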
Decision Making
# expected decision for each 2024 play
# match each 2024 play to the rush and pass xEPA so the choice between pass and rush can be made for each play in 2024
merged = merge(df2024, xEPA, by = c("down", "field_position", "dis2goGroup"), all.x = TRUE, allow.cartesian = TRUE)
# after the merge each 2024 play appears twice (once with the rush xEPA, once with the pass xEPA);
# this keeps the rows joined to the higher-xEPA play type, i.e. the call that should be made in that situation
expected_decision = merged %>%
group_by(down, field_position, dis2goGroup) %>%
slice_max(xEPA, n = 1, na_rm = TRUE) %>%
ungroup()
# get rid of any extra plays that aren't being examined
# rename play to expectedPlay so the actual and expected plays can be understood
expected_decision = expected_decision %>%
filter(actualPlay != "other") %>%
rename(expectedPlay = play) %>%
filter(!is.na(EPA), !is.na(wpa))
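For reference, the merged plays can be collapsed back down to the underlying decision chart, i.e. the recommended call for every down, distance group, and field zone (an optional view using the columns defined above):
# one row per situation: the play type with the higher xEPA
decision_chart = expected_decision %>%
  distinct(down, dis2goGroup, field_position, expectedPlay, xEPA) %>%
  arrange(down, dis2goGroup, field_position)
decision_chart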
Evaluation
# Expected vs Actual Play Call
# determine if the expected and actual play call are the same
expected_decision = expected_decision %>%
mutate(match = case_when(
actualPlay == expectedPlay ~ 1,
actualPlay != expectedPlay ~ 0
)) %>%
select(-plays)
# calculate how the play fared against how it was expected to go
expected_decision = expected_decision %>%
mutate(diffEPA = EPA - xEPA)
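An optional summary of diffEPA by match group shows whether plays over- or under-performed their xEPA depending on whether the actual call agreed with the expected one:
# average difference between actual EPA and xEPA, split by match
expected_decision %>%
  group_by(match) %>%
  summarise(avg_diffEPA = mean(diffEPA, na.rm = TRUE),
            plays = n())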
# BIG IDEA: is xEPA a valid way to call plays?
# compare the EPA for plays where the expected and actual matched compared to when they didn't
bigIdea = expected_decision %>%
group_by(match) %>%
summarise(avgEPA = mean(EPA),
plays = n())
# visualize the comparison made above
BItable = bigIdea %>%
mutate(match = case_when(
match == 1 ~ "match",
match == 0 ~ "no match"
)) %>%
gt() %>%
gt_theme_538() %>%
cols_align(align = "center", columns = everything()) %>%
gt_hulk_col_numeric(c(avgEPA)) %>%
cols_label(match = md("**Match?**"), avgEPA=md("**Average EPA**"),
plays = md("**Plays**")) %>%
tab_header(title= "Comparing Average EPA for Making Expected Call",
subtitle = "EPA: expected points added") %>%
opt_align_table_header(align="center") %>%
tab_style(style = list(cell_text(weight = "bold")),
locations = cells_body(columns = c(match, avgEPA))) %>%
tab_source_note(source_note = "cfbfastR | Max Basurto") %>%
tab_style(
style = list(cell_text(align = "right")),
locations = cells_source_notes()
)
BItable
Comparing Average EPA for Making Expected Call
EPA: expected points added

| Match?   | Average EPA | Plays  |
|----------|-------------|--------|
| no match | -0.03912992 | 106007 |
| match    | 0.06380758  | 96649  |

cfbfastR | Max Basurto
# BIG IDEA: significance test
# run a 2 sample difference in means significance test to evaluate results from above
overalltResultEPA = t.test(EPA ~ match, data = expected_decision, var.equal = FALSE)
overalltResultEPA
##
## Welch Two Sample t-test
##
## data: EPA by match
## t = -15.242, df = 196707, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
## -0.11617421 -0.08970079
## sample estimates:
## mean in group 0 mean in group 1
## -0.03912992 0.06380758
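If needed, the key pieces of the test can be pulled straight from the htest object rather than read off the printout:
# group means, confidence interval for the difference, and p-value
overalltResultEPA$estimate   # mean EPA for no match (group 0) and match (group 1)
overalltResultEPA$conf.int   # 95% CI for the difference in means (group 0 minus group 1)
overalltResultEPA$p.value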
EPA and Winning Analysis
# How does calling the "right" play affect winning?
# compare win probability added for plays where the expected and actual calls matched vs when they didn't match
winning = expected_decision %>%
group_by(match) %>%
summarise(avg_wpa = mean(wpa),
plays=n())
# Significance test
# run a 2 sample difference in means significance test to evaluate results from above
overalltResultwpa = t.test(wpa ~ match, data = expected_decision, var.equal = FALSE)
overalltResultwpa
##
## Welch Two Sample t-test
##
## data: wpa by match
## t = -9.6307, df = 199647, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group 0 and group 1 is not equal to 0
## 95 percent confidence interval:
## -0.003104182 -0.002054350
## sample estimates:
## mean in group 0 mean in group 1
## -0.000826765 0.001752501
# want to see if higher expected points added is correlated with higher win probability to understand if increasing expected points will improve a team's results
# select only the columns I need
# don't include blowouts because expected points is calculated drive-by-drive while win probability is game-by-game,
# meaning that in blowouts expected points can still swing significantly while win probability barely moves;
# only first-half plays within three scores are kept
epawpaCOR = all_data %>%
select(pos_score_diff, half, EPA, wpa) %>%
filter(half == 1, between(pos_score_diff, -21, 21),
!is.na(EPA), !is.na(wpa))
# get the r value for the relationship between EPA and wpa
correlation_coefficient = cor(epawpaCOR$EPA, epawpaCOR$wpa, method = "pearson")
# test significance of r value
cor_test_result <- cor.test(epawpaCOR$EPA, epawpaCOR$wpa, method = "pearson")
# visualize correlation
corGraph = ggplot(epawpaCOR, aes(x = EPA, y = wpa)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE, color = "red") + # Add a linear regression line
labs(title = "EPA vs wpa",
x = "Expected Points Added",
y = "Win Probability Added") +
theme_minimal()
# print values and visual
correlation_coefficient
## [1] 0.6140792
cor_test_result
##
## Pearson's product-moment correlation
##
## data: epawpaCOR$EPA and epawpaCOR$wpa
## t = 626.75, df = 648876, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.6125613 0.6155926
## sample estimates:
## cor
## 0.6140792
corGraph
## `geom_smooth()` using formula = 'y ~ x'
[Figure: scatter plot of EPA vs win probability added with the fitted regression line]
# calculate and print r squared for better understanding of the relationship
r_squared = correlation_coefficient ^ 2
r_squared
## [1] 0.3770933
# use best fit line to predict wpa given EPA
# create linear model using EPA to predict wpa
wpa_given_epa = lm(wpa ~ EPA, data = epawpaCOR)
wpa_given_epa
##
## Call:
## lm(formula = wpa ~ EPA, data = epawpaCOR)
##
## Coefficients:
## (Intercept) EPA
## 0.0002909 0.0256306
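To actually use the fitted line, predict() can translate a play's EPA into an approximate change in win probability (a rough illustration; the EPA values below are arbitrary):
# predicted wpa for plays worth -1, 0, and 1 expected points
predict(wpa_given_epa, newdata = data.frame(EPA = c(-1, 0, 1)))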
Team-by-Team Evaluation
# get_result function
# gets the result of the game as W or L based on the teams and score
get_result <- function(home_team, away_team, home_score, away_score, target_team) {
  if (home_team == target_team) {
    # target_team is the home team
    if (home_score > away_score) {
      return("W")
    } else {
      return("L")
    }
  } else if (away_team == target_team) {
    # target_team is the away team
    if (away_score > home_score) {
      return("W")
    } else {
      return("L")
    }
  } else {
    # target_team didn't play in this game
    return("N/A")
  }
}
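A quick check of the helper, using the Rice vs Navy score that shows up later in the evaluation table, confirms it returns the right label:
# Rice at home beating Navy 24-10 should come back as a win for Rice
get_result("Rice", "Navy", 24, 10, "Rice")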
# Input a team name and get data for that specific team
# input team that you want to evaluate
team = "Rice"
# load the schedule from 2024 to get the results from each game
game_results = load_cfb_schedules()
# use get result function to get the results from each game
game_results = game_results %>%
select(game_id, home_team, away_team, home_points, away_points) %>%
rowwise() %>%
mutate(result = get_result(home_team, away_team, home_points, away_points, team)) %>%
ungroup()
# modify the table game_results for better visualization
game_results = game_results %>%
mutate(game_score = paste(home_points, away_points, sep = "-"),
matchup = paste(home_team, away_team, sep = " vs ")) %>%
select(game_id, matchup, game_score, result)
# evaluate the team's season as a whole for when they called plays that matched the expected call vs the plays that didn't match
teamEval = expected_decision %>%
filter(pos_team == team) %>%
group_by(match) %>%
summarise(net_EPA = sum(EPA),
plays = n())
# weekly evaluation instead of whole season approach
weeklyEval = expected_decision %>%
filter(pos_team == team) %>%
group_by(game_id, match) %>%
summarise(net_EPA = sum(EPA),
plays = n())
# combine the team's game results and evaluation to better understand the effects of using xEPA as a way to call plays
weeklyEval = merge(weeklyEval, game_results, by=c("game_id"))
# clarify and simplify the merged table to more clearly display the data in a table format
weekEvalprocessed = weeklyEval %>%
# game by game approach
group_by(game_id) %>%
# create new columns to understand decision making in each game
mutate(
total_plays_game = sum(plays),
plays_with_match_1_game = sum(plays[match == 1]),
percentage_match = (plays_with_match_1_game / total_plays_game) * 100
) %>%
ungroup() %>%
# get the columns I need to show and reorganize data so things are more clear
select(matchup, match, net_EPA, game_score, result, percentage_match) %>%
pivot_wider(
names_from = match,
values_from = net_EPA,
names_prefix = "netEPA_Match_",
id_cols = c(matchup, game_score, result, percentage_match)
) %>%
# rename columns for better understanding
dplyr::rename(
Matchup = matchup,
`net_EPA (Match = 0)` = netEPA_Match_0,
`net_EPA (Match = 1)` = netEPA_Match_1,
`Game Score` = game_score,
`Result` = result,
`Match %` = percentage_match
)
# display the game-by-game evaluation data in a table
evalTable = weekEvalprocessed %>%
gt() %>%
gt_hulk_col_numeric(columns = c(`net_EPA (Match = 0)`, `net_EPA (Match = 1)`, `Match %`)) %>%
tab_header(
title = md(paste0("**Net EPA by Matchup and Match Segment: ", team, "**")),
subtitle = "Match %: percentage of plays where actual call matches the expected call"
) %>%
cols_label(
Matchup = md("**Matchup**"),
`net_EPA (Match = 0)` = md("**Net EPA (No Match)**"),
`net_EPA (Match = 1)` = md("**Net EPA (Match)**"),
`Game Score` = md("**Game Result**"),
`Result` = "",
`Match %` = md("**Match %**")
) %>%
fmt_number(
columns = c(`net_EPA (Match = 0)`, `net_EPA (Match = 1)`),
decimals = 3
) %>%
cols_align(align = "left",columns = Matchup) %>%
cols_align(align = "right", columns = c(`net_EPA (Match = 0)`, `net_EPA (Match = 1)`, `Match %`)) %>%
cols_align(align = "center",columns = c(`Game Score`, `Result`))
evalTable
Net EPA by Matchup and Match Segment: Rice
Match %: percentage of plays where actual call matches the expected call

| Matchup                | Game Score | Result | Match %  | Net EPA (No Match) | Net EPA (Match) |
|------------------------|------------|--------|----------|--------------------|-----------------|
| UConn vs Rice          | 17-10      | L      | 46.55172 | -15.006            | -14.366         |
| Houston vs Rice        | 33-7       | L      | 38.00000 | -11.001            | -5.528          |
| Rice vs Sam Houston    | 14-34      | L      | 60.65574 | -10.259            | -0.857          |
| Rice vs Texas Southern | 69-7       | W      | 48.38710 | 18.202             | 1.638           |
| Army vs Rice           | 37-14      | L      | 76.92308 | -0.918             | 7.750           |
| Rice vs Charlotte      | 20-21      | L      | 55.71429 | -0.898             | 2.203           |
| Rice vs UTSA           | 29-27      | W      | 67.79661 | -1.702             | 12.061          |
| Tulane vs Rice         | 24-10      | L      | 62.29508 | -5.755             | -15.132         |
| Rice vs Navy           | 24-10      | W      | 56.45161 | -3.593             | 6.149           |
| Memphis vs Rice        | 27-20      | L      | 47.05882 | -2.590             | 0.436           |
| UAB vs Rice            | 40-14      | L      | 51.35135 | -13.459            | -4.597          |
| Rice vs South Florida  | 35-28      | W      | 46.83544 | -6.568             | 9.694           |