library(nflreadr)
# Load data
schedules <- load_schedules(2025)
rosters <- load_rosters(2025)
pbp <- load_pbp(2025)
officials <- load_officials(2025)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Get Colts first reg season game using arrange and slice
colts_week1 <- schedules %>%
filter((home_team == "IND" | away_team == "IND") & game_type == "REG") %>%
arrange(week) %>%
slice(1)
game_id <- colts_week1$game_id
# Get play-by-play data for this specific game
colts_dolphins_pbp <- pbp %>% filter(game_id == !!game_id)
# Get officials for this game
colts_dolphins_officials <- officials %>% filter(game_id == !!game_id)
# Key uniqueness checks
cat("=== KEY UNIQUENESS CHECKS ===\n")
## === KEY UNIQUENESS CHECKS ===
# Check game_id uniqueness
cat("Duplicate game_ids:\n")
## Duplicate game_ids:
duplicate_games <- pbp %>%
count(game_id) %>%
filter(n > 1)
print(nrow(duplicate_games))
## [1] 33
# Check play_id uniqueness
cat("\nDuplicate play_ids:\n")
##
## Duplicate play_ids:
duplicate_plays <- pbp %>%
count(play_id) %>%
filter(n > 1)
print(nrow(duplicate_plays))
## [1] 1607
cat("\n=== MISSINGNESS CHECKS ===\n")
##
## === MISSINGNESS CHECKS ===
# Check yardline_100 missingness
missing_yardline <- sum(is.na(pbp$yardline_100))
cat("Missing yardline_100 values:", missing_yardline, "\n")
## Missing yardline_100 values: 419
# Check down missingness
missing_down <- sum(is.na(pbp$down))
cat("Missing down values:", missing_down, "\n")
## Missing down values: 941
# Check play_type missingness
missing_play_type <- sum(is.na(pbp$play_type))
cat("Missing play_type values:", missing_play_type, "\n")
## Missing play_type values: 171
# Check ydstogo missingness
missing_ydstogo <- sum(is.na(pbp$ydstogo))
cat("Missing ydstogo values:", missing_ydstogo, "\n")
## Missing ydstogo values: 0
cat("\n=== RANGE CHECKS ===\n")
##
## === RANGE CHECKS ===
# Check yardline_100 range
bad_yardline <- pbp %>%
filter(!is.na(yardline_100) & (yardline_100 < 0 | yardline_100 > 100)) %>%
nrow()
cat("Yardline_100 out of range (should be 0-100):", bad_yardline, "\n")
## Yardline_100 out of range (should be 0-100): 0
# Check down range
bad_down <- pbp %>%
filter(!is.na(down) & (down < 1 | down > 4)) %>%
nrow()
cat("Down out of range (should be 1-4):", bad_down, "\n")
## Down out of range (should be 1-4): 0
nrow(pbp)
## [1] 5689
names(pbp)[1:130] # first 10 column names
## [1] "play_id" "game_id"
## [3] "old_game_id" "home_team"
## [5] "away_team" "season_type"
## [7] "week" "posteam"
## [9] "posteam_type" "defteam"
## [11] "side_of_field" "yardline_100"
## [13] "game_date" "quarter_seconds_remaining"
## [15] "half_seconds_remaining" "game_seconds_remaining"
## [17] "game_half" "quarter_end"
## [19] "drive" "sp"
## [21] "qtr" "down"
## [23] "goal_to_go" "time"
## [25] "yrdln" "ydstogo"
## [27] "ydsnet" "desc"
## [29] "play_type" "yards_gained"
## [31] "shotgun" "no_huddle"
## [33] "qb_dropback" "qb_kneel"
## [35] "qb_spike" "qb_scramble"
## [37] "pass_length" "pass_location"
## [39] "air_yards" "yards_after_catch"
## [41] "run_location" "run_gap"
## [43] "field_goal_result" "kick_distance"
## [45] "extra_point_result" "two_point_conv_result"
## [47] "home_timeouts_remaining" "away_timeouts_remaining"
## [49] "timeout" "timeout_team"
## [51] "td_team" "td_player_name"
## [53] "td_player_id" "posteam_timeouts_remaining"
## [55] "defteam_timeouts_remaining" "total_home_score"
## [57] "total_away_score" "posteam_score"
## [59] "defteam_score" "score_differential"
## [61] "posteam_score_post" "defteam_score_post"
## [63] "score_differential_post" "no_score_prob"
## [65] "opp_fg_prob" "opp_safety_prob"
## [67] "opp_td_prob" "fg_prob"
## [69] "safety_prob" "td_prob"
## [71] "extra_point_prob" "two_point_conversion_prob"
## [73] "ep" "epa"
## [75] "total_home_epa" "total_away_epa"
## [77] "total_home_rush_epa" "total_away_rush_epa"
## [79] "total_home_pass_epa" "total_away_pass_epa"
## [81] "air_epa" "yac_epa"
## [83] "comp_air_epa" "comp_yac_epa"
## [85] "total_home_comp_air_epa" "total_away_comp_air_epa"
## [87] "total_home_comp_yac_epa" "total_away_comp_yac_epa"
## [89] "total_home_raw_air_epa" "total_away_raw_air_epa"
## [91] "total_home_raw_yac_epa" "total_away_raw_yac_epa"
## [93] "wp" "def_wp"
## [95] "home_wp" "away_wp"
## [97] "wpa" "vegas_wpa"
## [99] "vegas_home_wpa" "home_wp_post"
## [101] "away_wp_post" "vegas_wp"
## [103] "vegas_home_wp" "total_home_rush_wpa"
## [105] "total_away_rush_wpa" "total_home_pass_wpa"
## [107] "total_away_pass_wpa" "air_wpa"
## [109] "yac_wpa" "comp_air_wpa"
## [111] "comp_yac_wpa" "total_home_comp_air_wpa"
## [113] "total_away_comp_air_wpa" "total_home_comp_yac_wpa"
## [115] "total_away_comp_yac_wpa" "total_home_raw_air_wpa"
## [117] "total_away_raw_air_wpa" "total_home_raw_yac_wpa"
## [119] "total_away_raw_yac_wpa" "punt_blocked"
## [121] "first_down_rush" "first_down_pass"
## [123] "first_down_penalty" "third_down_converted"
## [125] "third_down_failed" "fourth_down_converted"
## [127] "fourth_down_failed" "incomplete_pass"
## [129] "touchback" "interception"
summary(pbp$yardline_100)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 1.00 30.00 48.00 47.07 66.00 99.00 419
summary(pbp$down)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 1.000 1.000 2.000 2.016 3.000 4.000 941
library(janitor)
## Warning: package 'janitor' was built under R version 4.3.3
##
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
# ---- Parameters ----
team <- "IND"
season <- 2025
# ---- 1) Identify Colts' first regular-season game in 2025 ----
sched <- load_schedules(seasons = season) %>% clean_names()
colts_g1 <- sched %>%
filter(game_type == "REG",
season == !!season,
home_team == !!team | away_team == !!team) %>%
arrange(gameday, week) %>%
slice(1)
stopifnot(nrow(colts_g1) == 1)
game_id <- colts_g1$game_id
week <- colts_g1$week
gameday <- colts_g1$gameday
home_tm <- colts_g1$home_team
away_tm <- colts_g1$away_team
opp <- ifelse(home_tm == team, away_tm, home_tm)
message(glue::glue("Found Colts Week {week} ({gameday}): {away_tm} @ {home_tm} | game_id = {game_id}"))
## Found Colts Week 1 (2025-09-07): MIA @ IND | game_id = 2025_01_MIA_IND
# ---- 2) Pull core data for this game ----
pbp_g1 <- load_pbp(seasons = season) %>%
clean_names() %>%
filter(game_id == !!game_id)
rw_g1 <- load_rosters_weekly(seasons = season) %>%
clean_names() %>%
filter(week == !!week, team %in% c(team, opp))
officials_g1 <- load_officials(seasons = season) %>%
clean_names() %>%
filter(game_id == !!game_id)
players_master <- load_players() %>% clean_names()
# ============================================================
# EMPHASIS: DATA INTEGRITY CHECKS
# ============================================================
# ---- A) Structural sanity: dimensions & basic types ----
cat("\n--- STRUCTURE ---\n")
##
## --- STRUCTURE ---
cat("PBP rows/cols:", nrow(pbp_g1), ncol(pbp_g1), "\n")
## PBP rows/cols: 152 372
cat("Weekly roster rows/cols:", nrow(rw_g1), ncol(rw_g1), "\n")
## Weekly roster rows/cols: 3047 36
cat("Officials rows/cols:", nrow(officials_g1), ncol(officials_g1), "\n\n")
## Officials rows/cols: 0 9
# ---- B) Key constraints & duplicates ----
# Expect play-level key uniqueness by (game_id, play_id)
dup_plays <- pbp_g1 %>%
count(game_id, play_id) %>%
filter(n > 1)
if (nrow(dup_plays) > 0) {
warning("Duplicate plays detected in (game_id, play_id). Inspect `dup_plays`.")
} else {
cat("Key check passed: (game_id, play_id) appear unique for this game.\n")
}
## Key check passed: (game_id, play_id) appear unique for this game.
# For weekly rosters, (team, gsis_id) often repeats by position slots; check basic duplication
dup_roster <- rw_g1 %>%
count(team, gsis_id, week) %>%
filter(n > 1)
if (nrow(dup_roster) > 0) {
message("Note: roster has repeated (team, gsis_id, week) rows—can be normal when multiple rows exist per role/slot.")
}
# ---- C) Referential integrity (players present in master) ----
# Attempt to match weekly roster players to master by gsis_id (best available common key)
if ("gsis_id" %in% names(rw_g1) && "gsis_id" %in% names(players_master)) {
roster_master_match <- rw_g1 %>%
mutate(in_master = gsis_id %in% players_master$gsis_id) %>%
summarize(match_rate = mean(in_master, na.rm = TRUE))
cat("Player master match rate (weekly roster -> master by gsis_id):",
round(roster_master_match$match_rate*100, 1), "%\n")
} else {
message("gsis_id not present to check roster->player master referential integrity.")
}
## Player master match rate (weekly roster -> master by gsis_id): 93.3 %
# ---- D) Missingness (critical analytical fields) ----
na_rate <- function(x) mean(is.na(x))
na_summary <- pbp_g1 %>%
summarize(
na_epa = na_rate(epa),
na_success = na_rate(success),
na_posteam = na_rate(posteam),
na_defteam = na_rate(defteam),
na_yardline_100 = na_rate(yardline_100),
na_down = na_rate(down),
na_ydstogo = na_rate(ydstogo),
na_play_type = na_rate(play_type)
)
cat("\n--- MISSINGNESS (PBP) ---\n"); print(na_summary)
##
## --- MISSINGNESS (PBP) ---
## # A tibble: 1 × 8
## na_epa na_success na_posteam na_defteam na_yardline_100 na_down na_ydstogo
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.0132 0.0132 0.0658 0.0658 0.0789 0.171 0
## # ℹ 1 more variable: na_play_type <dbl>
# ---- E) Range & logical checks ----
# yardline_100 should be within [0, 100]
bad_yardline <- pbp_g1 %>% filter(!is.na(yardline_100) & (yardline_100 < 0 | yardline_100 > 100))
if (nrow(bad_yardline) > 0) warning("Out-of-range yardline_100 values observed.")
# down should be 1–4 (ignore special plays like kickoffs which may have NA)
bad_down <- pbp_g1 %>% filter(!is.na(down) & !(down %in% 1:4))
if (nrow(bad_down) > 0) warning("Unexpected down values outside 1–4.")
# ydstogo should be positive for standard plays
bad_ydstogo <- pbp_g1 %>% filter(!is.na(ydstogo) & ydstogo <= 0)
if (nrow(bad_ydstogo) > 0) message("Non-positive ydstogo rows exist (could be goal-to-go anomalies or data glitches).")
## Non-positive ydstogo rows exist (could be goal-to-go anomalies or data glitches).
# game_id consistency
if (length(unique(pbp_g1$game_id)) != 1) warning("Multiple game_ids in pbp_g1; filter may be off.")
# posteam/defteam should be one of the two teams (allow NA on non-plays)
valid_teams <- c(team, opp)
weird_teams <- pbp_g1 %>%
filter(!is.na(posteam) & !(posteam %in% valid_teams)) %>%
distinct(posteam)
if (nrow(weird_teams) > 0) warning("Found posteam not matching Colts or opponent: check `weird_teams`.")
# ---- F) Categorical consistency (team abbreviations) ----
cat("\n--- TEAM ABBREVS IN PBP ---\n")
##
## --- TEAM ABBREVS IN PBP ---
print(sort(unique(na.omit(pbp_g1$posteam))))
## [1] "IND" "MIA"
print(sort(unique(na.omit(pbp_g1$defteam))))
## [1] "IND" "MIA"
# Find rows with zero or negative ydstogo values
zero_negative_ydstogo <- pbp %>%
filter(!is.na(ydstogo) & ydstogo <= 0) %>%
select(game_id, play_id, qtr, down, ydstogo, yardline_100, goal_to_go,
play_type, desc, posteam, defteam, yards_gained, first_down_rush,
first_down_pass, first_down_penalty)
# Show how many we found
cat("Total rows with ydstogo <= 0:", nrow(zero_negative_ydstogo), "\n\n")
## Total rows with ydstogo <= 0: 941
# Show first 10 examples
cat("First 10 examples:\n")
## First 10 examples:
print(head(zero_negative_ydstogo, 10))
## ── nflverse play by play data ──────────────────────────────────────────────────
## ℹ Data updated: 2025-09-21 05:19:36 EDT
## # A tibble: 10 × 15
## game_id play_id qtr down ydstogo yardline_100 goal_to_go play_type desc
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <int> <chr> <chr>
## 1 2025_01_… 1 1 NA 0 NA 0 <NA> GAME
## 2 2025_01_… 40 1 NA 0 35 0 kickoff 19-B…
## 3 2025_01_… 665 1 NA 0 35 0 kickoff 38-C…
## 4 2025_01_… 1042 1 NA 0 NA 0 <NA> END …
## 5 2025_01_… 1124 2 NA 0 15 0 extra_po… 19-B…
## 6 2025_01_… 1139 2 NA 0 35 0 kickoff 19-B…
## 7 2025_01_… 1366 2 NA 0 15 0 extra_po… 38-C…
## 8 2025_01_… 1381 2 NA 0 35 0 kickoff 38-C…
## 9 2025_01_… 1700 2 NA 0 35 0 kickoff 19-B…
## 10 2025_01_… 2067 2 NA 0 15 0 extra_po… 38-C…
## # ℹ 6 more variables: posteam <chr>, defteam <chr>, yards_gained <dbl>,
## # first_down_rush <dbl>, first_down_pass <dbl>, first_down_penalty <dbl>
# Break down by play_type to see patterns
cat("\n\nBreakdown by play_type:\n")
##
##
## Breakdown by play_type:
ydstogo_by_type <- zero_negative_ydstogo %>%
count(play_type, sort = TRUE)
print(ydstogo_by_type)
## ── nflverse play by play data ──────────────────────────────────────────────────
## ℹ Data updated: 2025-09-21 05:19:36 EDT
## # A tibble: 6 × 2
## play_type n
## <chr> <int>
## 1 kickoff 343
## 2 no_play 264
## 3 <NA> 170
## 4 extra_point 149
## 5 pass 11
## 6 run 4
# Look at goal_to_go situations specifically
cat("\n\nGoal-to-go situations:\n")
##
##
## Goal-to-go situations:
goal_to_go_cases <- zero_negative_ydstogo %>%
filter(goal_to_go == 1) %>%
select(down, ydstogo, yardline_100, play_type, desc) %>%
head(5)
print(goal_to_go_cases)
## ── nflverse play by play data ──────────────────────────────────────────────────
## ℹ Data updated: 2025-09-21 05:19:36 EDT
## # A tibble: 0 × 5
## # ℹ 5 variables: down <dbl>, ydstogo <dbl>, yardline_100 <dbl>,
## # play_type <chr>, desc <chr>
# Get ranges for yardline_100 and down in Colts Game 1
cat("yardline_100 range:", range(pbp_g1$yardline_100, na.rm = TRUE), "\n")
## yardline_100 range: 1 88
cat("down range:", range(pbp_g1$down, na.rm = TRUE), "\n")
## down range: 1 4
# ============================================================
# EXPLORATORY ANALYSIS
# ============================================================
# ---- 3) Game overview: scoring & tempo ----
pbp_g1 <- pbp_g1 %>%
mutate(
points_scored = case_when(
touchdown == 1 ~ 6, # Touchdowns
field_goal_result == "made" ~ 3, # Field goals
safety == 1 ~ 2, # Safeties
extra_point_result == "good" ~ 1, # PAT kicks
two_point_attempt == 1 & two_point_conv_result == "success" ~ 2, # 2PT conversions
TRUE ~ 0
)
)
score_by_q <- pbp_g1 %>%
filter(!is.na(qtr), qtr %in% 1:4) %>%
group_by(qtr) %>%
summarize(
ind_points = sum(if_else(posteam == "IND", points_scored, 0), na.rm = TRUE),
opp_points = sum(if_else(posteam == opp, points_scored, 0), na.rm = TRUE),
plays = n(),
.groups = "drop"
)
cat("\n--- SCORE BY QUARTER (derived from PBP points events) ---\n")
##
## --- SCORE BY QUARTER (derived from PBP points events) ---
print(score_by_q)
## # A tibble: 4 × 4
## qtr ind_points opp_points plays
## <dbl> <dbl> <dbl> <int>
## 1 1 3 0 31
## 2 2 17 0 45
## 3 3 3 0 33
## 4 4 10 8 43
tempo <- pbp_g1 %>%
filter(!is.na(posteam), !is.na(game_seconds_remaining)) %>%
arrange(game_seconds_remaining) %>%
group_by(posteam) %>%
mutate(sec_between = lag(game_seconds_remaining) - game_seconds_remaining) %>%
summarize(sec_per_play = median(sec_between, na.rm = TRUE), .groups = "drop") %>%
filter(!is.na(sec_per_play))
cat("\n--- TEMPO (median sec/play) ---\n"); print(tempo)
##
## --- TEMPO (median sec/play) ---
## # A tibble: 2 × 2
## posteam sec_per_play
## <chr> <dbl>
## 1 IND -38.5
## 2 MIA -34.5
library(tidyverse) # dplyr/ggplot/readr, etc.
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ readr 2.1.4
## ✔ ggplot2 3.5.2 ✔ stringr 1.5.0
## ✔ lubridate 1.9.2 ✔ tibble 3.2.1
## ✔ purrr 1.0.2 ✔ tidyr 1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(lubridate) # dates/times
library(janitor) # clean_names, tabyl
library(scales) # pretty labels
## Warning: package 'scales' was built under R version 4.3.3
##
## Attaching package: 'scales'
##
## The following object is masked from 'package:purrr':
##
## discard
##
## The following object is masked from 'package:readr':
##
## col_factor
library(ggrepel) # labels for charts
## Warning: package 'ggrepel' was built under R version 4.3.2
library(zoo)
##
## Attaching package: 'zoo'
##
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
# ---- 4) Offensive profile: play mix & efficiency ----
off_mix <- pbp_g1 %>%
filter(!is.na(posteam)) %>%
mutate(play_family = case_when(
play_type %in% c("run") ~ "Run",
play_type %in% c("pass") ~ "Pass",
play_type %in% c("qb_kneel","qb_spike") ~ "Clock",
play_type %in% c("no_play","timeout") ~ "Other",
TRUE ~ "Other"
)) %>%
group_by(posteam, play_family) %>%
summarize(
plays = n(),
share = n()/nrow(pbp_g1),
epa_per_play = mean(epa, na.rm = TRUE),
success_rate = mean(success, na.rm = TRUE),
yards_per_play = mean(yards_gained, na.rm = TRUE),
.groups = "drop"
) %>%
arrange(posteam, desc(plays))
cat("\n--- OFFENSIVE MIX & EFFICIENCY ---\n"); print(off_mix, n = 50)
##
## --- OFFENSIVE MIX & EFFICIENCY ---
## # A tibble: 6 × 7
## posteam play_family plays share epa_per_play success_rate yards_per_play
## <chr> <chr> <int> <dbl> <dbl> <dbl> <dbl>
## 1 IND Run 40 0.263 0.138 0.5 3.9
## 2 IND Pass 30 0.197 0.433 0.567 8.73
## 3 IND Other 15 0.0987 0.190 0.667 0
## 4 MIA Pass 35 0.230 -0.461 0.371 3.86
## 5 MIA Run 12 0.0789 0.339 0.583 6.5
## 6 MIA Other 10 0.0658 0.352 0.778 0
# EPA distribution (visual)
epa_dist <- pbp_g1 %>%
filter(!is.na(posteam), play_type %in% c("run","pass")) %>%
mutate(is_colts = if_else(posteam == "IND", "Colts", "Opponent"))
ggplot(epa_dist, aes(x = epa, fill = is_colts)) +
geom_histogram(bins = 40, alpha = 0.6, position = "identity") +
geom_vline(data = epa_dist %>% group_by(is_colts) %>% summarize(mu = mean(epa, na.rm = TRUE)),
aes(xintercept = mu, color = is_colts), linewidth = 0.9) +
scale_x_continuous(labels = number_format(accuracy = 0.1)) +
labs(title = "EPA distribution (pass & run plays)",
x = "EPA per play", y = "Count", fill = "", color = "") +
theme_minimal()

# ---- 5) Situational analysis ----
situ <- pbp_g1 %>%
filter(!is.na(down), !is.na(ydstogo), play_type %in% c("pass","run")) %>%
mutate(
ytg_bucket = case_when(
ydstogo <= 2 ~ "1-2",
ydstogo <= 5 ~ "3-5",
ydstogo <= 10 ~ "6-10",
TRUE ~ "11+"
),
down_lab = paste0("Down ", down)
) %>%
group_by(posteam, down_lab, ytg_bucket) %>%
summarize(
plays = n(),
success_rate = mean(success, na.rm = TRUE),
epa_per_play = mean(epa, na.rm = TRUE),
.groups = "drop"
)
ggplot(situ %>% filter(posteam %in% c("IND", opp)),
aes(x = ytg_bucket, y = success_rate, group = posteam, color = posteam)) +
geom_line(linewidth = 1) +
geom_point(size = 2) +
facet_wrap(~down_lab) +
scale_y_continuous(labels = percent) +
labs(title = "Success rate by down & yards-to-go",
x = "Yards to go (bucket)", y = "Success rate", color = "Offense") +
theme_minimal()

# Field position zones
fieldpos <- pbp_g1 %>%
filter(!is.na(yardline_100), play_type %in% c("run","pass")) %>%
mutate(
field_zone = case_when(
yardline_100 >= 80 ~ "Own 20-0",
yardline_100 >= 50 ~ "Own 49-21",
yardline_100 >= 21 ~ "Opp 49-21",
TRUE ~ "Red Zone (<=20)"
)
) %>%
group_by(posteam, field_zone) %>%
summarize(
plays = n(),
epa_per_play = mean(epa, na.rm = TRUE),
success_rate = mean(success, na.rm = TRUE),
.groups = "drop"
)
ggplot(fieldpos %>% filter(posteam %in% c("IND", opp)),
aes(x = field_zone, y = epa_per_play, fill = posteam)) +
geom_col(position = position_dodge(width = 0.8)) +
coord_flip() +
labs(title = "EPA/play by field zone",
x = "", y = "EPA per play", fill = "Offense") +
theme_minimal()

# Analyze missing drive values in Colts Game 1
missing_drive_analysis <- pbp_g1 %>%
mutate(has_drive = !is.na(drive)) %>%
select(play_id, qtr, time, play_type, desc, posteam, defteam, drive, has_drive,
touchdown, field_goal_attempt, punt_attempt, kickoff_attempt,
extra_point_attempt, two_point_attempt)
# Summary of missing drive plays
cat("=== MISSING DRIVE SUMMARY ===\n")
## === MISSING DRIVE SUMMARY ===
missing_count <- sum(is.na(pbp_g1$drive))
total_count <- nrow(pbp_g1)
cat("Missing drive values:", missing_count, "out of", total_count,
"(", round(missing_count/total_count*100, 1), "%)\n\n")
## Missing drive values: 3 out of 152 ( 2 %)
# Break down missing drives by play type
cat("=== MISSING DRIVES BY PLAY TYPE ===\n")
## === MISSING DRIVES BY PLAY TYPE ===
missing_by_type <- missing_drive_analysis %>%
filter(!has_drive) %>%
count(play_type, sort = TRUE)
print(missing_by_type)
## ── nflverse play by play data ──────────────────────────────────────────────────
## ℹ Data updated: 2025-09-21 05:19:36 EDT
## # A tibble: 2 × 2
## play_type n
## <chr> <int>
## 1 <NA> 2
## 2 no_play 1
# Look at specific examples of missing drive plays
cat("\n=== EXAMPLES OF MISSING DRIVE PLAYS ===\n")
##
## === EXAMPLES OF MISSING DRIVE PLAYS ===
missing_examples <- missing_drive_analysis %>%
filter(!has_drive) %>%
select(qtr, time, play_type, desc, posteam) %>%
head(10)
print(missing_examples)
## ── nflverse play by play data ──────────────────────────────────────────────────
## ℹ Data updated: 2025-09-21 05:19:36 EDT
## # A tibble: 3 × 5
## qtr time play_type desc posteam
## <dbl> <chr> <chr> <chr> <chr>
## 1 1 15:00 <NA> GAME <NA>
## 2 2 00:00 <NA> END QUARTER 2 <NA>
## 3 4 06:21 no_play Timeout #1 by MIA at 06:21. <NA>
# Check if certain special plays are more likely to have missing drives
cat("\n=== SPECIAL SITUATIONS WITH MISSING DRIVES ===\n")
##
## === SPECIAL SITUATIONS WITH MISSING DRIVES ===
special_situations <- missing_drive_analysis %>%
filter(!has_drive) %>%
summarise(
kickoffs = sum(kickoff_attempt == 1, na.rm = TRUE),
punts = sum(punt_attempt == 1, na.rm = TRUE),
field_goals = sum(field_goal_attempt == 1, na.rm = TRUE),
extra_points = sum(extra_point_attempt == 1, na.rm = TRUE),
two_points = sum(two_point_attempt == 1, na.rm = TRUE),
touchdowns = sum(touchdown == 1, na.rm = TRUE)
)
print(special_situations)
## # A tibble: 1 × 6
## kickoffs punts field_goals extra_points two_points touchdowns
## <int> <int> <int> <int> <int> <int>
## 1 0 0 0 0 0 0
colts_targets <- pbp_g1 %>%
filter(posteam == team, pass == 1) %>%
count(receiver_player_name, sort = TRUE, name = "targets")
colts_rushers <- pbp_g1 %>%
filter(posteam == team, rush == 1) %>%
count(rusher_player_name, sort = TRUE, name = "rush_att")
epa_by_receiver <- pbp_g1 %>%
filter(posteam == team, pass == 1, !is.na(receiver_player_name)) %>%
group_by(receiver_player_name) %>%
summarize(
targets = n(),
epa_sum = sum(epa, na.rm = TRUE),
epa_mean = mean(epa, na.rm = TRUE),
.groups = "drop"
) %>% arrange(desc(epa_sum))
ggplot(epa_by_receiver, aes(x = targets, y = epa_sum, label = receiver_player_name)) +
geom_point(size = 3) +
geom_text_repel(min.segment.length = 0) +
labs(title = "Colts receivers: total EPA vs targets",
x = "Targets", y = "Total EPA") +
theme_minimal()

# Combine receiving and rushing touches for EPA per touch analysis
receiving_touches <- pbp_g1 %>%
filter(posteam == team, pass == 1, !is.na(receiver_player_name)) %>%
group_by(receiver_player_name) %>%
summarise(
touches = n(),
total_epa = sum(epa, na.rm = TRUE),
.groups = "drop"
) %>%
rename(player_name = receiver_player_name)
rushing_touches <- pbp_g1 %>%
filter(posteam == team, rush == 1, !is.na(rusher_player_name)) %>%
group_by(rusher_player_name) %>%
summarise(
touches = n(),
total_epa = sum(epa, na.rm = TRUE),
.groups = "drop"
) %>%
rename(player_name = rusher_player_name)
# Combine both types of touches
all_touches <- bind_rows(receiving_touches, rushing_touches) %>%
group_by(player_name) %>%
summarise(
total_touches = sum(touches),
total_epa = sum(total_epa),
.groups = "drop"
) %>%
mutate(epa_per_touch = total_epa / total_touches) %>%
arrange(desc(epa_per_touch))
# Print the results
cat("=== EPA PER TOUCH (Colts Players) ===\n")
## === EPA PER TOUCH (Colts Players) ===
print(all_touches, n = Inf)
## # A tibble: 11 × 4
## player_name total_touches total_epa epa_per_touch
## <chr> <int> <dbl> <dbl>
## 1 A.Dulin 1 1.49 1.49
## 2 M.Alie-Cox 1 1.36 1.36
## 3 D.Jones 4 3.79 0.947
## 4 M.Pittman 8 6.89 0.861
## 5 A.Mitchell 2 0.730 0.365
## 6 A.Pierce 3 1.03 0.342
## 7 T.Warren 10 1.67 0.167
## 8 J.Downs 3 0.434 0.145
## 9 J.Taylor 21 2.67 0.127
## 10 D.Giddens 12 0.983 0.0819
## 11 U.Bentley 1 -0.795 -0.795
# ---- Penalties Analysis ----
penalties <- pbp_g1 %>%
filter(!is.na(penalty_team)) %>%
group_by(penalty_team) %>%
summarise(
total_penalties = n(),
penalty_yards = sum(penalty_yards, na.rm = TRUE),
avg_penalty_yards = mean(penalty_yards, na.rm = TRUE),
.groups = "drop"
) %>%
arrange(desc(total_penalties))
cat(" PENALTIES BY TEAM \n")
## PENALTIES BY TEAM
print(penalties)
## # A tibble: 2 × 4
## penalty_team total_penalties penalty_yards avg_penalty_yards
## <chr> <int> <dbl> <dbl>
## 1 IND 4 45 45
## 2 MIA 3 15 15
# Penalty types breakdown
penalty_types <- pbp_g1 %>%
filter(!is.na(penalty_type)) %>%
count(penalty_team, penalty_type, sort = TRUE) %>%
head(10)
cat("\n TOP PENALTY TYPES \n")
##
## TOP PENALTY TYPES
print(penalty_types)
## ── nflverse play by play data ──────────────────────────────────────────────────
## ℹ Data updated: 2025-09-21 05:19:36 EDT
## # A tibble: 5 × 3
## penalty_team penalty_type n
## <chr> <chr> <int>
## 1 IND Offensive Holding 3
## 2 IND Unnecessary Roughness 1
## 3 MIA Defensive Holding 1
## 4 MIA Offensive Too Many Men on Field 1
## 5 MIA Running Into the Kicker 1
# ---- Field Goal Analysis ----
field_goals <- pbp_g1 %>%
filter(field_goal_attempt == 1) %>%
select(posteam, kick_distance, field_goal_result, kicker_player_name)
cat("\n FIELD GOAL ATTEMPTS \n")
##
## FIELD GOAL ATTEMPTS
if(nrow(field_goals) > 0) {
print(field_goals)
} else {
cat("No field goal attempts in this game\n")
}
## ── nflverse play by play data ──────────────────────────────────────────────────
## ℹ Data updated: 2025-09-21 05:19:36 EDT
## # A tibble: 4 × 4
## posteam kick_distance field_goal_result kicker_player_name
## <chr> <dbl> <chr> <chr>
## 1 IND 24 made S.Shrader
## 2 IND 35 made S.Shrader
## 3 IND 28 made S.Shrader
## 4 IND 48 made S.Shrader
# ---- Punt Analysis ----
punts <- pbp_g1 %>%
filter(punt_attempt == 1) %>%
select(posteam, kick_distance, punter_player_name,
touchback, return_yards) %>%
mutate(net_punt = kick_distance - coalesce(return_yards, 0))
cat("\nPUNT ATTEMPTS \n")
##
## PUNT ATTEMPTS
if(nrow(punts) > 0) {
print(punts)
cat("\nPunt Summary:\n")
punt_summary <- punts %>%
group_by(posteam) %>%
summarise(
punts = n(),
avg_distance = mean(kick_distance, na.rm = TRUE),
avg_net = mean(net_punt, na.rm = TRUE),
.groups = "drop"
)
print(punt_summary)
} else {
cat("No punts in this game\n")
}
## ── nflverse play by play data ──────────────────────────────────────────────────
## ℹ Data updated: 2025-09-21 05:19:36 EDT
## # A tibble: 1 × 6
## posteam kick_distance punter_player_name touchback return_yards net_punt
## <chr> <dbl> <chr> <dbl> <dbl> <dbl>
## 1 MIA 42 J.Bailey 0 0 42
##
## Punt Summary:
## # A tibble: 1 × 4
## posteam punts avg_distance avg_net
## <chr> <int> <dbl> <dbl>
## 1 MIA 1 42 42
# ---- 9) Time-based trends ----
epa_by_q <- pbp_g1 %>%
filter(qtr %in% 1:4, play_type %in% c("pass","run")) %>%
group_by(posteam, qtr) %>%
summarize(
epa_per_play = mean(epa, na.rm = TRUE),
success_rate = mean(success, na.rm = TRUE),
.groups = "drop"
)
ggplot(epa_by_q, aes(x = factor(qtr), y = epa_per_play, group = posteam, color = posteam)) +
geom_line(linewidth = 1) + geom_point(size = 2) +
labs(title = "EPA/play by quarter", x = "Quarter", y = "EPA/play", color = "Offense") +
theme_minimal()

# Rolling pass rate (10-play window) for Colts
colts_seq <- pbp_g1 %>%
filter(posteam == team, play_type %in% c("pass","run")) %>%
mutate(play_index = row_number(),
pass_flag = if_else(play_type == "pass", 1, 0)) %>%
arrange(play_index)
if (nrow(colts_seq) >= 10) {
colts_seq_roll <- colts_seq %>%
mutate(pass_rate_10 = zoo::rollapply(pass_flag, width = 10, FUN = mean,
align = "right", fill = NA_real_))
ggplot(colts_seq_roll, aes(x = play_index, y = pass_rate_10)) +
geom_line(linewidth = 1) +
scale_y_continuous(labels = percent) +
labs(title = "Colts rolling pass rate (10-play window)",
x = "Offensive play index", y = "Pass rate") +
theme_minimal()
}
## Warning: Removed 9 rows containing missing values or values outside the scale range
## (`geom_line()`).
