

# https://www.nflfastr.com/articles/beginners_guide.html
# Install, if needed, and load necessary packages
if (!require("tidyverse")) install.packages("tidyverse")
if (!require("ggrepel")) install.packages("ggrepel")
if (!require("nflreadr")) install.packages("nflreadr")
if (!require("nflplotR")) install.packages("nflplotR")
library(tidyverse)
library(ggrepel)
library(nflreadr)
library(nflplotR)
# Session setup ----
# A very large `scipen` penalty makes R print numbers in fixed notation
# rather than scientific notation.
options(scipen = 9999)

# Load play-by-play data for the 2024 season. Change the year to analyze
# a different season.
data <- load_pbp(seasons = 2024)
# Keep only plays flagged as a rush or a pass that also have a
# non-missing expected points added (epa) value.
pbp_rp <- filter(
  data,
  rush == 1 | pass == 1,
  !is.na(epa)
)
# Build a "mydata" data frame holding only the variables needed for the
# analysis, then print a compact column-by-column overview of it.
# Note: this customizes the example code from the guide.
mydata <- pbp_rp %>%
  select(
    posteam,
    pass,
    wp,
    qtr,
    down,
    half_seconds_remaining
  )
glimpse(mydata)
# Question: "Which teams were the most pass-heavy in the first half on
# early downs with win probability between 20% and 80%, excluding the
# final 2 minutes of the half when everyone is pass-happy?"
#
# Steps:
#   1. Keep plays with 0.20 < wp < 0.80, on 1st or 2nd down, in the 1st or
#      2nd quarter, with more than 120 seconds left in the half.
#   2. Collapse to one row per possession team.
#   3. Compute "mean_pass", the average of "pass" over the plays that
#      survived the filter, and "plays", the number of such plays. "pass"
#      is coded 1 for a pass and 0 otherwise, so a mean above 0.5 means
#      more passes than runs.
#   4. Sort teams from most to least pass-heavy.
# The result is stored in "mydata_summary".
mydata_summary <- mydata %>%
  filter(
    wp > .20,
    wp < .80,
    down <= 2,
    qtr <= 2,
    half_seconds_remaining > 120
  ) %>%
  group_by(posteam) %>%
  summarise(
    mean_pass = mean(pass),
    plays = n()
  ) %>%
  arrange(desc(mean_pass))
glimpse(mydata_summary)
# Plot mean_pass by team, with teams ordered left to right from highest to
# lowest mean_pass. Each team is drawn as its abbreviation at its own
# mean_pass height, so the x-axis tick labels are hidden as redundant.
graph <- ggplot(
  mydata_summary,
  aes(
    x = reorder(posteam, -mean_pass),
    y = mean_pass
  )
) +
  # `size` is a fixed setting, not a data mapping, so it belongs outside
  # aes(). Inside aes() it would be treated as a variable, rescaled by the
  # size scale, and given a spurious "size" legend.
  geom_text(aes(label = posteam), size = 2) +
  # Replace the default axis titles (the raw aes() expressions) with
  # readable labels.
  labs(
    x = "Team",
    y = "Share of plays that were passes (mean_pass)"
  ) +
  theme(axis.text.x = element_blank())
graph