Link to access this: https://rpubs.com/tbreedy/1345623
# ---- Libraries ----
install.packages("nflreadr")
install.packages("tidyverse")
install.packages("lubridate")
install.packages("janitor")
install.packages("scales")
install.packages("gt")
install.packages("ggrepel")
install.packages("zoo")
library(nflreadr) # data access
## Warning: package 'nflreadr' was built under R version 4.4.3
library(tidyverse) # dplyr/ggplot/readr, etc.
## Warning: package 'tidyverse' was built under R version 4.4.3
## Warning: package 'lubridate' was built under R version 4.4.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(lubridate) # dates/times
library(janitor) # clean_names, tabyl
## Warning: package 'janitor' was built under R version 4.4.3
##
## Attaching package: 'janitor'
##
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
library(scales) # pretty labels
## Warning: package 'scales' was built under R version 4.4.3
##
## Attaching package: 'scales'
##
## The following object is masked from 'package:purrr':
##
## discard
##
## The following object is masked from 'package:readr':
##
## col_factor
library(gt) # presentation tables (not saved here)
## Warning: package 'gt' was built under R version 4.4.3
library(ggrepel) # labels for charts
## Warning: package 'ggrepel' was built under R version 4.4.3
library(zoo) # rolling calcs
##
## Attaching package: 'zoo'
##
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
# ---- Parameters ----
team <- "IND"
season <- 2025
# ---- 1) Identify Colts' first regular-season game in 2025 ----
sched <- load_schedules(seasons = season) |> clean_names()
colts_g1 <- sched |>
filter(game_type == "REG",
season == !!season,
home_team == !!team | away_team == !!team) |>
arrange(gameday, week) |>
slice(1)
colts_g1
## ── nflverse games and schedules ────────────────────────────────────────────────
## ℹ Data updated: 2025-09-21 13:33:11 EDT
## # A tibble: 1 × 46
## game_id season game_type week gameday weekday gametime away_team away_score
## <chr> <int> <chr> <int> <chr> <chr> <chr> <chr> <int>
## 1 2025_01_… 2025 REG 1 2025-0… Sunday 13:00 MIA 8
## # ℹ 37 more variables: home_team <chr>, home_score <int>, location <chr>,
## # result <int>, total <int>, overtime <int>, old_game_id <chr>, gsis <int>,
## # nfl_detail_id <chr>, pfr <chr>, pff <int>, espn <chr>, ftn <int>,
## # away_rest <int>, home_rest <int>, away_moneyline <int>,
## # home_moneyline <int>, spread_line <dbl>, away_spread_odds <int>,
## # home_spread_odds <int>, total_line <dbl>, under_odds <int>,
## # over_odds <int>, div_game <int>, roof <chr>, surface <chr>, temp <int>, …
load_schedule loads the scheduled data (really helpful the way nflreadr does this)
get the schedule, filter for regular season, then arrange the games in chronological order so if we select the first game of the data, we get the first game of the season.
you want to make sure the data is arranged first so the first value is the first game. If it was arranged, the first value could be any game in the season, so the first value you pull may not be the first game.
stopifnot(nrow(colts_g1) == 1)
game_id <- colts_g1$game_id
week <- colts_g1$week
gameday <- colts_g1$gameday
home_tm <- colts_g1$home_team
away_tm <- colts_g1$away_team
opp <- ifelse(home_tm == team, away_tm, home_tm)
message(glue::glue("Found Colts Week {week} ({gameday}): {away_tm} @ {home_tm} | game_id = {game_id}"))
## Found Colts Week 1 (2025-09-07): MIA @ IND | game_id = 2025_01_MIA_IND
# ---- 2) Pull core data for this game ----
pbp_g1 <- load_pbp(seasons = season) |>
clean_names() |>
filter(game_id == !!game_id)
rw_g1 <- load_rosters_weekly(seasons = season) |>
clean_names() |>
filter(week == !!week, team %in% c(team, opp))
officials_g1 <- load_officials(seasons = season) |>
clean_names() |>
filter(game_id == !!game_id)
players_master <- load_players() |> clean_names()
# ---- A) Structural sanity: dimensions & basic types ----
cat("\n--- STRUCTURE ---\n")
##
## --- STRUCTURE ---
cat("PBP rows/cols:", nrow(pbp_g1), ncol(pbp_g1), "\n")
## PBP rows/cols: 152 372
cat("Weekly roster rows/cols:", nrow(rw_g1), ncol(rw_g1), "\n")
## Weekly roster rows/cols: 3047 36
cat("Officials rows/cols:", nrow(officials_g1), ncol(officials_g1), "\n\n")
## Officials rows/cols: 0 9
# ---- B) Key constraints & duplicates ----
# Expect play-level key uniqueness by (game_id, play_id)
dup_plays <- pbp_g1 |>
count(game_id, play_id) |>
filter(n > 1)
if (nrow(dup_plays) > 0) {
warning("Duplicate plays detected in (game_id, play_id). Inspect `dup_plays`.")
} else {
cat("Key check passed: (game_id, play_id) appear unique for this game.\n")
}
## Key check passed: (game_id, play_id) appear unique for this game.
# For weekly rosters, (team, gsis_id) often repeats by position slots; check basic duplication
dup_roster <- rw_g1 |>
count(team, gsis_id, week) |>
filter(n > 1)
if (nrow(dup_roster) > 0) {
message("Note: roster has repeated (team, gsis_id, week) rows—can be normal when multiple rows exist per role/slot.")
}
# ---- C) Referential integrity (players present in master) ----
# Attempt to match weekly roster players to master by gsis_id (best available common key)
if ("gsis_id" %in% names(rw_g1) && "gsis_id" %in% names(players_master)) {
roster_master_match <- rw_g1 |>
mutate(in_master = gsis_id %in% players_master$gsis_id) |>
summarize(match_rate = mean(in_master, na.rm = TRUE))
cat("Player master match rate (weekly roster -> master by gsis_id):",
round(roster_master_match$match_rate*100, 1), "%\n")
} else {
message("gsis_id not present to check roster->player master referential integrity.")
}
## Player master match rate (weekly roster -> master by gsis_id): 93.3 %
# ---- D) Missingness (critical analytical fields) ----
na_rate <- function(x) mean(is.na(x))
na_summary <- pbp_g1 |>
summarize(
na_epa = na_rate(epa),
na_success = na_rate(success),
na_posteam = na_rate(posteam),
na_defteam = na_rate(defteam),
na_yardline_100 = na_rate(yardline_100),
na_down = na_rate(down),
na_ydstogo = na_rate(ydstogo),
na_play_type = na_rate(play_type)
)
cat("\n--- MISSINGNESS (PBP) ---\n"); print(na_summary)
##
## --- MISSINGNESS (PBP) ---
## # A tibble: 1 × 8
## na_epa na_success na_posteam na_defteam na_yardline_100 na_down na_ydstogo
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.0132 0.0132 0.0658 0.0658 0.0789 0.171 0
## # ℹ 1 more variable: na_play_type <dbl>
# ---- E) Range & logical checks ----
# yardline_100 should be within [0, 100]
bad_yardline <- pbp_g1 |> filter(!is.na(yardline_100) & (yardline_100 < 0 | yardline_100 > 100))
if (nrow(bad_yardline) > 0) warning("Out-of-range yardline_100 values observed.")
# down should be 1–4 (ignore special plays like kickoffs which may have NA)
bad_down <- pbp_g1 |> filter(!is.na(down) & !(down %in% 1:4))
if (nrow(bad_down) > 0) warning("Unexpected down values outside 1–4.")
# ydstogo should be positive for standard plays
bad_ydstogo <- pbp_g1 |> filter(!is.na(ydstogo) & ydstogo <= 0)
if (nrow(bad_ydstogo) > 0) message("Non-positive ydstogo rows exist (could be goal-to-go anomalies or data glitches).")
## Non-positive ydstogo rows exist (could be goal-to-go anomalies or data glitches).
# game_id consistency
if (length(unique(pbp_g1$game_id)) != 1) warning("Multiple game_ids in pbp_g1; filter may be off.")
# posteam/defteam should be one of the two teams (allow NA on non-plays)
valid_teams <- c(team, opp)
weird_teams <- pbp_g1 |>
filter(!is.na(posteam) & !(posteam %in% valid_teams)) |>
distinct(posteam)
if (nrow(weird_teams) > 0) warning("Found posteam not matching Colts or opponent: check `weird_teams`.")
# ---- F) Categorical consistency (team abbreviations) ----
cat("\n--- TEAM ABBREVS IN PBP ---\n")
##
## --- TEAM ABBREVS IN PBP ---
print(sort(unique(na.omit(pbp_g1$posteam))))
## [1] "IND" "MIA"
print(sort(unique(na.omit(pbp_g1$defteam))))
## [1] "IND" "MIA"
dup_plays are any plays that are duplicated in the data (each play should have one instance because it happened once).
you want to make sure that there aren’t any egregious errors in the data. It’s impossible for a value to be outside of the range 0-100, but if there is, then there is some sort of error. At a minimum, this could suggest that the yard is unknown (perhaps a -1 or NAN value), but it could also suggest that there is a pervasive issue that alters some subset of the yard data. That could be bad if say a quarter of the data was shifted 5 yards down or up, not too many of the issues would be flagged, so errors would go unnoticed.
goal-to-go anomalies or data glitches. It could potentially be a rounding error (4th and inches?), and maybe there is something strange happening like a kick off or other glitches like a penalty which messes with the value.
# ---- 3) Game overview: scoring & tempo ----
pbp_g1 <- pbp_g1 |>
mutate(
points_scored = case_when(
touchdown == 1 ~ 6, # Touchdowns
field_goal_result == "made" ~ 3, # Field goals
safety == 1 ~ 2, # Safeties
extra_point_result == "good" ~ 1, # PAT kicks
two_point_attempt == 1 & two_point_conv_result == "success" ~ 2, # 2PT conversions
TRUE ~ 0
)
)
case_when is nice for interpretability when working with so many datatypes, especially when they are conditional and non-numeric. Formulas could be made such as, say, sum(touchdowns) * 6 to get touchdown points (if a touchdown is a 1, then it counts the number of touchdowns and multiplies by 6 points). However, field_goal_made would need to be converted from “made” to the value 1, then that needs to be put into a formula. It’s complicated, but case_when makes it easier.
The scores would be understated by 2x the number of successful two point conversions omitted. This would also affect every other statistic that may be based off of the success of this play (player ratings, win/loss if calculated dependent from this score calculation, etc).
score_by_q <- pbp_g1 |>
filter(!is.na(qtr), qtr %in% 1:4) |>
group_by(qtr) |>
summarize(
ind_points = sum(if_else(posteam == "IND", points_scored, 0), na.rm = TRUE),
opp_points = sum(if_else(posteam == opp, points_scored, 0), na.rm = TRUE),
plays = n(),
.groups = "drop"
)
cat("\n--- SCORE BY QUARTER (derived from PBP points events) ---\n")
##
## --- SCORE BY QUARTER (derived from PBP points events) ---
print(score_by_q)
## # A tibble: 4 × 4
## qtr ind_points opp_points plays
## <dbl> <dbl> <dbl> <int>
## 1 1 3 0 31
## 2 2 17 0 45
## 3 3 3 0 33
## 4 4 10 8 43
You need to group by quarter first or else you’d summarize on the entire data set. In other words, you wouldn’t get the data aggregated by quarter.
You could do this many ways. You could group by halves, you could create a new variable called halves and put 1 or 2 based on the quarter value, you could even group by quarter then add the 1-2 and 3-4 quarter values together to make the first and second half values.
tempo <- pbp_g1 |>
filter(!is.na(posteam), !is.na(game_seconds_remaining)) |>
arrange(game_seconds_remaining) |>
group_by(posteam) |>
mutate(sec_between = lag(game_seconds_remaining) - game_seconds_remaining) |>
summarize(sec_per_play = median(sec_between, na.rm = TRUE), .groups = "drop") |>
filter(!is.na(sec_per_play))
cat("\n--- TEMPO (median sec/play) ---\n"); print(tempo)
##
## --- TEMPO (median sec/play) ---
## # A tibble: 2 × 2
## posteam sec_per_play
## <chr> <dbl>
## 1 IND -38.5
## 2 MIA -34.5
# ---- 4) Offensive profile: play mix & efficiency ----
off_mix <- pbp_g1 |>
filter(!is.na(posteam)) |>
mutate(play_family = case_when(
play_type %in% c("run") ~ "Run",
play_type %in% c("pass") ~ "Pass",
play_type %in% c("qb_kneel","qb_spike") ~ "Clock",
play_type %in% c("no_play","timeout") ~ "Other",
TRUE ~ "Other"
)) |>
group_by(posteam, play_family) |>
summarize(
plays = n(),
share = n()/nrow(pbp_g1),
epa_per_play = mean(epa, na.rm = TRUE),
success_rate = mean(success, na.rm = TRUE),
yards_per_play = mean(yards_gained, na.rm = TRUE),
.groups = "drop"
) |>
arrange(posteam, desc(plays))
cat("\n--- OFFENSIVE MIX & EFFICIENCY ---\n"); print(off_mix, n = 50)
##
## --- OFFENSIVE MIX & EFFICIENCY ---
## # A tibble: 6 × 7
## posteam play_family plays share epa_per_play success_rate yards_per_play
## <chr> <chr> <int> <dbl> <dbl> <dbl> <dbl>
## 1 IND Run 40 0.263 0.138 0.5 3.9
## 2 IND Pass 30 0.197 0.433 0.567 8.73
## 3 IND Other 15 0.0987 0.190 0.667 0
## 4 MIA Pass 35 0.230 -0.461 0.371 3.86
## 5 MIA Run 12 0.0789 0.339 0.583 6.5
## 6 MIA Other 10 0.0658 0.352 0.778 0
There may be different play types that we are interested in, such as qb_kneel or qb_spike, so instead of putting an or statement when looking at both, we can simply filter by clock.
sum that value and ensure it adds up to 1
# EPA distribution (visual)
epa_dist <- pbp_g1 |>
filter(!is.na(posteam), play_type %in% c("run","pass")) |>
mutate(is_colts = if_else(posteam == "IND", "Colts", "Opponent"))
ggplot(epa_dist, aes(x = epa, fill = is_colts)) +
geom_histogram(bins = 40, alpha = 0.6, position = "identity") +
geom_vline(data = epa_dist |> group_by(is_colts) |> summarize(mu = mean(epa, na.rm = TRUE)),
aes(xintercept = mu, color = is_colts), linewidth = 0.9) +
scale_x_continuous(labels = number_format(accuracy = 0.1)) +
labs(title = "EPA distribution (pass & run plays)",
x = "EPA per play", y = "Count", fill = "", color = "") +
theme_minimal()
# ---- 5) Situational analysis ----
situ <- pbp_g1 |>
filter(!is.na(down), !is.na(ydstogo), play_type %in% c("pass","run")) |>
mutate(
ytg_bucket = case_when(
ydstogo <= 2 ~ "1-2",
ydstogo <= 5 ~ "3-5",
ydstogo <= 10 ~ "6-10",
TRUE ~ "11+"
),
down_lab = paste0("Down ", down)
) |>
group_by(posteam, down_lab, ytg_bucket) |>
summarize(
plays = n(),
success_rate = mean(success, na.rm = TRUE),
epa_per_play = mean(epa, na.rm = TRUE),
.groups = "drop"
)
ggplot(situ |> filter(posteam %in% c("IND", opp)),
aes(x = ytg_bucket, y = success_rate, group = posteam, color = posteam)) +
geom_line(linewidth = 1) +
geom_point(size = 2) +
facet_wrap(~down_lab) +
scale_y_continuous(labels = percent) +
labs(title = "Success rate by down & yards-to-go",
x = "Yards to go (bucket)", y = "Success rate", color = "Offense") +
theme_minimal()
it helps turn numeric values, especially in varying ranges, into categorical variables which can easily be displayed and aggregated.
It looks like the further away a team is on downs, the higher the success rate - but with two major exceptions: this doesn’t appear to apply for 3rd downs, and there is a nike swoosh for 2nd downs where the rate is high for 1-2 yards, drops significantly for 3-5, then graudually increases from distances past there.
# Field position zones
fieldpos <- pbp_g1 |>
filter(!is.na(yardline_100), play_type %in% c("run","pass")) |>
mutate(
field_zone = case_when(
yardline_100 >= 80 ~ "Own 20-0",
yardline_100 >= 50 ~ "Own 49-21",
yardline_100 >= 21 ~ "Opp 49-21",
TRUE ~ "Red Zone (<=20)"
)
) |>
group_by(posteam, field_zone) |>
summarize(
plays = n(),
epa_per_play = mean(epa, na.rm = TRUE),
success_rate = mean(success, na.rm = TRUE),
.groups = "drop"
)
ggplot(fieldpos |> filter(posteam %in% c("IND", opp)),
aes(x = field_zone, y = epa_per_play, fill = posteam)) +
geom_col(position = position_dodge(width = 0.8)) +
coord_flip() +
labs(title = "EPA/play by field zone",
x = "", y = "EPA per play", fill = "Offense") +
theme_minimal()
# ---- 6) Drive lens ----
drive_sum <- pbp_g1 |>
filter(!is.na(drive), !is.na(posteam)) |>
group_by(posteam, drive) |>
summarize(
plays = n(),
yards = sum(yards_gained, na.rm = TRUE),
points = sum(points_scored, na.rm = TRUE), # <-- FIXED
epa = sum(epa, na.rm = TRUE),
success_rate = mean(success, na.rm = TRUE),
ended_score = any(touchdown == 1 | field_goal_result == "made" | safety == 1, na.rm = TRUE),
.groups = "drop"
) |>
arrange(posteam, desc(points), desc(yards))
the play wasn’t a drive, maybe it was another classification of play like timeout, penalty, kick off, etc.
it means the drive was bad and the team is in a worse position than they were before the drive.
# ---- 7) Player involvement ----
colts_targets <- pbp_g1 |>
filter(posteam == team, pass == 1) |>
count(receiver_player_name, sort = TRUE, name = "targets")
colts_rushers <- pbp_g1 |>
filter(posteam == team, rush == 1) |>
count(rusher_player_name, sort = TRUE, name = "rush_att")
epa_by_receiver <- pbp_g1 |>
filter(posteam == team, pass == 1, !is.na(receiver_player_name)) |>
group_by(receiver_player_name) |>
summarize(
targets = n(),
epa_sum = sum(epa, na.rm = TRUE),
epa_mean = mean(epa, na.rm = TRUE),
.groups = "drop"
) |> arrange(desc(epa_sum))
ggplot(epa_by_receiver, aes(x = targets, y = epa_sum, label = receiver_player_name)) +
geom_point(size = 3) +
geom_text_repel(min.segment.length = 0) +
labs(title = "Colts receivers: total EPA vs targets",
x = "Targets", y = "Total EPA") +
theme_minimal()
epa_by_receiver
## # A tibble: 7 × 4
## receiver_player_name targets epa_sum epa_mean
## <chr> <int> <dbl> <dbl>
## 1 M.Pittman 8 6.89 0.861
## 2 J.Taylor 3 3.09 1.03
## 3 M.Alie-Cox 1 1.36 1.36
## 4 T.Warren 9 1.19 0.132
## 5 A.Pierce 3 1.03 0.342
## 6 A.Mitchell 2 0.730 0.365
## 7 J.Downs 3 0.434 0.145
because you want to make sure the play is a pass - when it isn’t, it doesn’t matter.
M. Alie-Cox (1.36)
# ---- 8) Penalties & Special Teams ----
penalties <- pbp_g1 |>
filter(!is.na(penalty)) |>
mutate(off_def = if_else(penalty_team == posteam, "Offense", "Defense")) |>
count(penalty_team, off_def, penalty_type, sort = TRUE)
cat("\n--- PENALTIES ---\n"); print(penalties, n = 30)
##
## --- PENALTIES ---
## ── nflverse play by play data ──────────────────────────────────────────────────
## ℹ Data updated: 2025-09-21 05:19:36 EDT
## # A tibble: 6 × 4
## penalty_team off_def penalty_type n
## <chr> <chr> <chr> <int>
## 1 <NA> <NA> <NA> 140
## 2 IND Offense Offensive Holding 3
## 3 IND Defense Unnecessary Roughness 1
## 4 MIA Defense Defensive Holding 1
## 5 MIA Defense Running Into the Kicker 1
## 6 MIA Offense Offensive Too Many Men on Field 1
st_fg <- pbp_g1 |>
filter(play_type == "field_goal") |>
count(posteam, field_goal_result)
st_punt <- pbp_g1 |>
filter(play_type == "punt") |>
summarize(punts = n(), avg_punt_yards = mean(yards_gained, na.rm = TRUE))
cat("\n--- SPECIAL TEAMS ---\n"); print(list(field_goals = st_fg, punts = st_punt))
##
## --- SPECIAL TEAMS ---
## $field_goals
## ── nflverse play by play data ──────────────────────────────────────────────────
## ℹ Data updated: 2025-09-21 05:19:36 EDT
## # A tibble: 1 × 3
## posteam field_goal_result n
## <chr> <chr> <int>
## 1 IND made 4
##
## $punts
## # A tibble: 1 × 2
## punts avg_punt_yards
## <int> <dbl>
## 1 1 0
IND
If there’s a penalty such as a long 4th quarter, 4th down play being turned into an automatic 1st down, this goes from a team being unlikely to score a critical touchdown (and perhaps immediately turning into an unfavorable defensive switch), to a fighting hope to continue the run.
# ---- 9) Time-based trends ----
epa_by_q <- pbp_g1 %>%
filter(qtr %in% 1:4, play_type %in% c("pass","run")) %>%
group_by(posteam, qtr) %>%
summarize(
epa_per_play = mean(epa, na.rm = TRUE),
success_rate = mean(success, na.rm = TRUE),
.groups = "drop"
)
ggplot(epa_by_q, aes(x = factor(qtr), y = epa_per_play, group = posteam, color = posteam)) +
geom_line(linewidth = 1) + geom_point(size = 2) +
labs(title = "EPA/play by quarter", x = "Quarter", y = "EPA/play", color = "Offense") +
theme_minimal()
# Rolling pass rate (10-play window) for Colts
colts_seq <- pbp_g1 %>%
filter(posteam == team, play_type %in% c("pass","run")) %>%
mutate(play_index = row_number(),
pass_flag = if_else(play_type == "pass", 1, 0)) %>%
arrange(play_index)
if (nrow(colts_seq) >= 10) {
colts_seq_roll <- colts_seq %>%
mutate(pass_rate_10 = zoo::rollapply(pass_flag, width = 10, FUN = mean,
align = "right", fill = NA_real_))
ggplot(colts_seq_roll, aes(x = play_index, y = pass_rate_10)) +
geom_line(linewidth = 1) +
scale_y_continuous(labels = percent) +
labs(title = "Colts rolling pass rate (10-play window)",
x = "Offensive play index", y = "Pass rate") +
theme_minimal()
}
## Warning: Removed 9 rows containing missing values or values outside the scale range
## (`geom_line()`).
colts_seq_roll[c('game_half', 'pass_rate_10')] |>
group_by(game_half) |>
summarise(avg_pass_rate_10 = mean(pass_rate_10, na.rm = TRUE))
## # A tibble: 2 × 2
## game_half avg_pass_rate_10
## <chr> <dbl>
## 1 Half1 0.535
## 2 Half2 0.274