This report analyzes the Indianapolis Colts’ first regular-season
game of the 2025 NFL season. The analysis begins by programmatically
identifying the game, pulling the relevant play-by-play (PBP), roster,
and officials data using the nflreadr
package. A
significant portion of the script is dedicated to data integrity checks
before moving on to exploratory analysis of the game’s offensive,
defensive, and situational trends.
First, we install (if needed) and load the necessary R packages for data manipulation, visualization, and analysis.
packages <- c("nflreadr", "tidyverse", "lubridate", "janitor", "scales", "gt", "ggrepel", "zoo")
library(nflreadr) # data access
library(tidyverse) # dplyr/ggplot/readr, etc.
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(lubridate) # dates/times
library(janitor) # clean_names, tabyl
##
## Attaching package: 'janitor'
##
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
library(scales) # pretty labels
##
## Attaching package: 'scales'
##
## The following object is masked from 'package:purrr':
##
## discard
##
## The following object is masked from 'package:readr':
##
## col_factor
library(gt) # presentation tables (not saved here)
library(ggrepel) # labels for charts
library(zoo) # rolling calcs
##
## Attaching package: 'zoo'
##
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
We set the team of interest (IND
) and the season
(2025
).
team <- "IND"
season <- 2025
We load the 2025 schedule and filter to find the first regular-season game involving the Colts.
sched <- load_schedules(seasons = season) %>% clean_names()
colts_g1 <- sched %>%
filter(game_type == "REG",
season == !!season,
home_team == !!team | away_team == !!team) %>%
arrange(gameday, week) %>%
slice(1)
stopifnot(nrow(colts_g1) == 1)
game_id <- colts_g1$game_id
week <- colts_g1$week
gameday <- colts_g1$gameday
home_tm <- colts_g1$home_team
away_tm <- colts_g1$away_team
opp <- ifelse(home_tm == team, away_tm, home_tm)
message(glue::glue("Found Colts Week {week} ({gameday}): {away_tm} @ {home_tm} | game_id = {game_id}"))
## Found Colts Week 1 (2025-09-07): MIA @ IND | game_id = 2025_01_MIA_IND
Using the game_id
from the previous step, we pull the
play-by-play data, weekly rosters, officials, and the master player
list.
pbp_g1 <- load_pbp(seasons = season) %>%
clean_names() %>%
filter(game_id == !!game_id)
rw_g1 <- load_rosters_weekly(seasons = season) %>%
clean_names() %>%
filter(week == !!week, team %in% c(team, opp))
officials_g1 <- load_officials(seasons = season) %>%
clean_names() %>%
filter(game_id == !!game_id)
players_master <- load_players() %>% clean_names()
This section is dedicated to ensuring the data is sound before analysis. We check for structural issues, key constraints, referential integrity, missingness, and logical consistency.
# ---- A) Structural sanity ----
cat("--- STRUCTURE ---\n")
## --- STRUCTURE ---
cat("PBP rows/cols:", nrow(pbp_g1), ncol(pbp_g1), "\n")
## PBP rows/cols: 152 372
cat("Weekly roster rows/cols:", nrow(rw_g1), ncol(rw_g1), "\n")
## Weekly roster rows/cols: 3047 36
cat("Officials rows/cols:", nrow(officials_g1), ncol(officials_g1), "\n\n")
## Officials rows/cols: 0 9
# ---- B) Key constraints & duplicates ----
dup_plays <- pbp_g1 %>% count(game_id, play_id) %>% filter(n > 1)
if (nrow(dup_plays) > 0) {
warning("Duplicate plays detected in (game_id, play_id).")
} else {
cat("Key check passed: (game_id, play_id) appear unique for this game.\n")
}
## Key check passed: (game_id, play_id) appear unique for this game.
# ---- C) Referential integrity ----
if ("gsis_id" %in% names(rw_g1) && "gsis_id" %in% names(players_master)) {
roster_master_match <- rw_g1 %>%
mutate(in_master = gsis_id %in% players_master$gsis_id) %>%
summarize(match_rate = mean(in_master, na.rm = TRUE))
cat("Player master match rate (weekly roster -> master by gsis_id):",
round(roster_master_match$match_rate*100, 1), "%\n")
}
## Player master match rate (weekly roster -> master by gsis_id): 93.3 %
# ---- D) Missingness (critical analytical fields) ----
na_rate <- function(x) mean(is.na(x))
na_summary <- pbp_g1 %>%
summarize(
na_epa = na_rate(epa),
na_success = na_rate(success),
na_posteam = na_rate(posteam),
na_defteam = na_rate(defteam),
na_yardline_100 = na_rate(yardline_100),
na_down = na_rate(down),
na_ydstogo = na_rate(ydstogo),
na_play_type = na_rate(play_type)
)
cat("\n--- MISSINGNESS (PBP) ---\n"); print(na_summary)
##
## --- MISSINGNESS (PBP) ---
## # A tibble: 1 × 8
## na_epa na_success na_posteam na_defteam na_yardline_100 na_down na_ydstogo
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.0132 0.0132 0.0658 0.0658 0.0789 0.171 0
## # ℹ 1 more variable: na_play_type <dbl>
# ---- E) Range & logical checks ----
# yardline_100 should be within [0, 100]
bad_yardline <- pbp_g1 %>% filter(!is.na(yardline_100) & (yardline_100 < 0 | yardline_100 > 100))
if (nrow(bad_yardline) > 0) warning("Out-of-range yardline_100 values observed.")
# down should be 1–4 (ignore special plays like kickoffs which may have NA)
bad_down <- pbp_g1 %>% filter(!is.na(down) & !(down %in% 1:4))
if (nrow(bad_down) > 0) warning("Unexpected down values outside 1–4.")
# ydstogo should be positive for standard plays
bad_ydstogo <- pbp_g1 %>% filter(!is.na(ydstogo) & ydstogo <= 0)
if (nrow(bad_ydstogo) > 0) message("Non-positive ydstogo rows exist (could be goal-to-go anomalies or data glitches).")
## Non-positive ydstogo rows exist (could be goal-to-go anomalies or data glitches).
# game_id consistency
if (length(unique(pbp_g1$game_id)) != 1) warning("Multiple game_ids in pbp_g1; filter may be off.")
# posteam/defteam should be one of the two teams (allow NA on non-plays)
valid_teams <- c(team, opp)
weird_teams <- pbp_g1 %>%
filter(!is.na(posteam) & !(posteam %in% valid_teams)) %>%
distinct(posteam)
if (nrow(weird_teams) > 0) warning("Found posteam not matching Colts or opponent: check `weird_teams`.")
# ---- F) Categorical consistency ----
cat("\n--- TEAM ABBREVS IN PBP ---\n")
##
## --- TEAM ABBREVS IN PBP ---
print(sort(unique(na.omit(pbp_g1$posteam))))
## [1] "IND" "MIA"
print(sort(unique(na.omit(pbp_g1$defteam))))
## [1] "IND" "MIA"
We first derive a points_scored
column from various PBP
events to summarize scoring by quarter.
pbp_g1 <- pbp_g1 %>%
mutate(
points_scored = case_when(
touchdown == 1 ~ 6,
field_goal_result == "made" ~ 3,
safety == 1 ~ 2,
extra_point_result == "good" ~ 1,
two_point_attempt == 1 & two_point_conv_result == "success" ~ 2,
TRUE ~ 0
)
)
score_by_q <- pbp_g1 %>%
filter(!is.na(qtr), qtr %in% 1:4) %>%
group_by(qtr) %>%
summarize(
ind_points = sum(if_else(posteam == "IND", points_scored, 0), na.rm = TRUE),
opp_points = sum(if_else(posteam == opp, points_scored, 0), na.rm = TRUE),
.groups = "drop"
)
cat("\n--- SCORE BY QUARTER (derived from PBP points events) ---\n")
##
## --- SCORE BY QUARTER (derived from PBP points events) ---
print(score_by_q)
## # A tibble: 4 × 3
## qtr ind_points opp_points
## <dbl> <dbl> <dbl>
## 1 1 3 0
## 2 2 17 0
## 3 3 3 0
## 4 4 10 8
#knitr::kable(score_by_q, caption = "Score by Quarter")
tempo <- pbp_g1 %>%
filter(!is.na(posteam), !is.na(game_seconds_remaining)) %>%
arrange(game_seconds_remaining) %>%
group_by(posteam) %>%
mutate(sec_between = lag(game_seconds_remaining) - game_seconds_remaining) %>%
summarize(sec_per_play = median(sec_between, na.rm = TRUE), .groups = "drop") %>%
filter(!is.na(sec_per_play))
cat("\n--- TEMPO (median sec/play) ---\n"); print(tempo)
##
## --- TEMPO (median sec/play) ---
## # A tibble: 2 × 2
## posteam sec_per_play
## <chr> <dbl>
## 1 IND -38.5
## 2 MIA -34.5
off_mix <- pbp_g1 %>%
filter(!is.na(posteam)) %>%
mutate(play_family = case_when(
play_type %in% c("run") ~ "Run",
play_type %in% c("pass") ~ "Pass",
play_type %in% c("qb_kneel","qb_spike") ~ "Clock",
play_type %in% c("no_play","timeout") ~ "Other",
TRUE ~ "Other"
)) %>%
group_by(posteam, play_family) %>%
summarize(
plays = n(),
share = n()/nrow(pbp_g1),
epa_per_play = mean(epa, na.rm = TRUE),
success_rate = mean(success, na.rm = TRUE),
yards_per_play = mean(yards_gained, na.rm = TRUE),
.groups = "drop"
) %>%
arrange(posteam, desc(plays))
cat("\n--- OFFENSIVE MIX & EFFICIENCY ---\n"); print(off_mix, n = 50)
##
## --- OFFENSIVE MIX & EFFICIENCY ---
## # A tibble: 6 × 7
## posteam play_family plays share epa_per_play success_rate yards_per_play
## <chr> <chr> <int> <dbl> <dbl> <dbl> <dbl>
## 1 IND Run 40 0.263 0.138 0.5 3.9
## 2 IND Pass 30 0.197 0.433 0.567 8.73
## 3 IND Other 15 0.0987 0.190 0.667 0
## 4 MIA Pass 35 0.230 -0.461 0.371 3.86
## 5 MIA Run 12 0.0789 0.339 0.583 6.5
## 6 MIA Other 10 0.0658 0.352 0.778 0
Here we visualize the distribution of Expected Points Added (EPA) for both teams on run and pass plays.
epa_dist <- pbp_g1 %>%
filter(!is.na(posteam), play_type %in% c("run","pass")) %>%
mutate(is_colts = if_else(posteam == "IND", "Colts", "Opponent"))
ggplot(epa_dist, aes(x = epa, fill = is_colts)) +
geom_histogram(bins = 40, alpha = 0.6, position = "identity") +
geom_vline(data = epa_dist %>% group_by(is_colts) %>% summarize(mu = mean(epa, na.rm = TRUE)),
aes(xintercept = mu, color = is_colts), linewidth = 0.9) +
scale_x_continuous(labels = number_format(accuracy = 0.1)) +
labs(title = "EPA distribution (pass & run plays)",
x = "EPA per play", y = "Count", fill = "", color = "") +
theme_minimal()
This chart breaks down offensive success rate by down and the distance required for a first down.
situ <- pbp_g1 %>%
filter(!is.na(down), !is.na(ydstogo), play_type %in% c("pass","run")) %>%
mutate(
ytg_bucket = case_when(
ydstogo <= 2 ~ "1-2", ydstogo <= 5 ~ "3-5",
ydstogo <= 10 ~ "6-10", TRUE ~ "11+"
),
down_lab = paste0("Down ", down)
) %>%
group_by(posteam, down_lab, ytg_bucket) %>%
summarize(plays = n(), success_rate = mean(success, na.rm = TRUE),epa_per_play = mean(epa, na.rm = TRUE), .groups = "drop")
ggplot(situ %>% filter(posteam %in% c("IND", opp)),
aes(x = ytg_bucket, y = success_rate, group = posteam, color = posteam)) +
geom_line(linewidth = 1) +
geom_point(size = 2) +
facet_wrap(~down_lab) +
scale_y_continuous(labels = percent) +
labs(title = "Success rate by down & yards-to-go",
x = "Yards to go (bucket)", y = "Success rate", color = "Offense") +
theme_minimal()
fieldpos <- pbp_g1 %>%
filter(!is.na(yardline_100), play_type %in% c("run","pass")) %>%
mutate(
field_zone = case_when(
yardline_100 >= 80 ~ "Own 20-0",
yardline_100 >= 50 ~ "Own 49-21",
yardline_100 >= 21 ~ "Opp 49-21",
TRUE ~ "Red Zone (<=20)"
)
) %>%
group_by(posteam, field_zone) %>%
summarize(
plays = n(),
epa_per_play = mean(epa, na.rm = TRUE),
success_rate = mean(success, na.rm = TRUE),
.groups = "drop"
)
ggplot(fieldpos %>% filter(posteam %in% c("IND", opp)),
aes(x = field_zone, y = epa_per_play, fill = posteam)) +
geom_col(position = position_dodge(width = 0.8)) +
coord_flip() +
labs(title = "EPA/play by field zone",
x = "", y = "EPA per play", fill = "Offense") +
theme_minimal()
drive_sum <- pbp_g1 %>%
filter(!is.na(drive), !is.na(posteam)) %>%
group_by(posteam, drive) %>%
summarize(
plays = n(),
yards = sum(yards_gained, na.rm = TRUE),
points = sum(points_scored, na.rm = TRUE), # <-- FIXED
epa = sum(epa, na.rm = TRUE),
success_rate = mean(success, na.rm = TRUE),
ended_score = any(touchdown == 1 | field_goal_result == "made" | safety == 1, na.rm = TRUE),
.groups = "drop"
) %>%
arrange(posteam, desc(points), desc(yards))
print(drive_sum)
## # A tibble: 14 × 8
## posteam drive plays yards points epa success_rate ended_score
## <chr> <dbl> <int> <dbl> <dbl> <dbl> <dbl> <lgl>
## 1 IND 3 16 84 7 6.16 0.625 TRUE
## 2 IND 11 19 84 7 5.21 0.579 TRUE
## 3 IND 5 6 42 7 3.76 0.667 TRUE
## 4 IND 7 18 67 3 2.62 0.389 TRUE
## 5 IND 1 10 62 3 1.41 0.6 TRUE
## 6 IND 9 8 58 3 0.445 0.625 TRUE
## 7 IND 13 8 21 3 1.77 0.5 TRUE
## 8 MIA 12 12 72 8 6.78 0.583 TRUE
## 9 MIA 10 8 44 0 -2.88 0.5 FALSE
## 10 MIA 14 11 35 0 -1.46 0.4 FALSE
## 11 MIA 6 10 22 0 -1.55 0.4 FALSE
## 12 MIA 8 7 19 0 -3.49 0.571 FALSE
## 13 MIA 2 6 15 0 -1.94 0.333 FALSE
## 14 MIA 4 3 6 0 -4.37 0.667 FALSE
colts_targets <- pbp_g1 %>%
filter(posteam == team, pass == 1) %>%
count(receiver_player_name, sort = TRUE, name = "targets")
colts_rushers <- pbp_g1 %>%
filter(posteam == team, rush == 1) %>%
count(rusher_player_name, sort = TRUE, name = "rush_att")
epa_by_receiver <- pbp_g1 %>%
filter(posteam == team, pass == 1, !is.na(receiver_player_name)) %>%
group_by(receiver_player_name) %>%
summarize(
targets = n(),
epa_sum = sum(epa, na.rm = TRUE),
epa_mean = mean(epa, na.rm = TRUE),
.groups = "drop"
) %>% arrange(desc(epa_sum))
ggplot(epa_by_receiver, aes(x = targets, y = epa_sum, label = receiver_player_name)) +
geom_point(size = 3) +
geom_text_repel(min.segment.length = 0) +
labs(title = "Colts receivers: total EPA vs targets",
x = "Targets", y = "Total EPA") +
theme_minimal()
penalties <- pbp_g1 %>%
filter(!is.na(penalty)) %>%
mutate(off_def = if_else(penalty_team == posteam, "Offense", "Defense")) %>%
count(penalty_team, off_def, penalty_type, sort = TRUE)
cat("\n--- PENALTIES ---\n"); print(penalties, n = 30)
##
## --- PENALTIES ---
## ── nflverse play by play data ──────────────────────────────────────────────────
## ℹ Data updated: 2025-09-21 05:19:36 EDT
## # A tibble: 6 × 4
## penalty_team off_def penalty_type n
## <chr> <chr> <chr> <int>
## 1 <NA> <NA> <NA> 140
## 2 IND Offense Offensive Holding 3
## 3 IND Defense Unnecessary Roughness 1
## 4 MIA Defense Defensive Holding 1
## 5 MIA Defense Running Into the Kicker 1
## 6 MIA Offense Offensive Too Many Men on Field 1
st_fg <- pbp_g1 %>%
filter(play_type == "field_goal") %>%
count(posteam, field_goal_result)
st_punt <- pbp_g1 %>%
filter(play_type == "punt") %>%
summarize(punts = n(), avg_punt_yards = mean(yards_gained, na.rm = TRUE))
cat("\n--- SPECIAL TEAMS ---\n"); print(list(field_goals = st_fg, punts = st_punt))
##
## --- SPECIAL TEAMS ---
## $field_goals
## ── nflverse play by play data ──────────────────────────────────────────────────
## ℹ Data updated: 2025-09-21 05:19:36 EDT
## # A tibble: 1 × 3
## posteam field_goal_result n
## <chr> <chr> <int>
## 1 IND made 4
##
## $punts
## # A tibble: 1 × 2
## punts avg_punt_yards
## <int> <dbl>
## 1 1 0
epa_by_q <- pbp_g1 %>%
filter(qtr %in% 1:4, play_type %in% c("pass","run")) %>%
group_by(posteam, qtr) %>%
summarize(
epa_per_play = mean(epa, na.rm = TRUE),
success_rate = mean(success, na.rm = TRUE),
.groups = "drop"
)
ggplot(epa_by_q, aes(x = factor(qtr), y = epa_per_play, group = posteam, color = posteam)) +
geom_line(linewidth = 1) + geom_point(size = 2) +
labs(title = "EPA/play by quarter", x = "Quarter", y = "EPA/play", color = "Offense") +
theme_minimal()
Finally, we analyze the Colts’ offensive play-calling tendency by calculating their pass rate over a 10-play rolling window.
colts_seq <- pbp_g1 %>%
filter(posteam == team, play_type %in% c("pass","run")) %>%
mutate(play_index = row_number(),
pass_flag = if_else(play_type == "pass", 1, 0)) %>%
arrange(play_index)
if (nrow(colts_seq) >= 10) {
colts_seq_roll <- colts_seq %>%
mutate(pass_rate_10 = zoo::rollapply(pass_flag, width = 10, FUN = mean,
align = "right", fill = NA_real_))
ggplot(colts_seq_roll, aes(x = play_index, y = pass_rate_10)) +
geom_line(linewidth = 1) +
scale_y_continuous(labels = percent) +
labs(title = "Colts rolling pass rate (10-play window)",
x = "Offensive play index", y = "Pass rate") +
theme_minimal()
}
colts_off <- pbp_g1 %>%
filter(posteam == team) %>%
summarize(
plays = n(),
pass_plays = sum(play_type == "pass", na.rm = TRUE),
rush_plays = sum(play_type == "run", na.rm = TRUE),
total_epa = sum(epa, na.rm = TRUE),
mean_epa = mean(epa, na.rm = TRUE),
success_rate = mean(success, na.rm = TRUE)
)
plays_by_team_type <- pbp_g1 %>%
filter(!is.na(posteam)) %>%
count(posteam, play_type, sort = TRUE)
cat("\n--- SUMMARY ---\n")
##
## --- SUMMARY ---
print(list(
game_header = tibble(
season = season, week = week, gameday = gameday,
home = home_tm, away = away_tm, game_id = game_id, opp_for_colts = opp
),
pbp_rows = nrow(pbp_g1),
roster_rows = nrow(rw_g1),
officials_rows = nrow(officials_g1),
tempo_sec_per_play= tempo,
plays_by_team_type= plays_by_team_type,
colts_off_summary = colts_off
))
## $game_header
## # A tibble: 1 × 7
## season week gameday home away game_id opp_for_colts
## <dbl> <int> <chr> <chr> <chr> <chr> <chr>
## 1 2025 1 2025-09-07 IND MIA 2025_01_MIA_IND MIA
##
## $pbp_rows
## [1] 152
##
## $roster_rows
## [1] 3047
##
## $officials_rows
## [1] 0
##
## $tempo_sec_per_play
## # A tibble: 2 × 2
## posteam sec_per_play
## <chr> <dbl>
## 1 IND -38.5
## 2 MIA -34.5
##
## $plays_by_team_type
## ── nflverse play by play data ──────────────────────────────────────────────────
## ℹ Data updated: 2025-09-21 05:19:36 EDT
## # A tibble: 13 × 3
## posteam play_type n
## <chr> <chr> <int>
## 1 IND run 40
## 2 MIA pass 35
## 3 IND pass 30
## 4 MIA run 12
## 5 MIA kickoff 7
## 6 IND field_goal 4
## 7 IND no_play 4
## 8 IND extra_point 3
## 9 IND kickoff 2
## 10 IND <NA> 2
## 11 MIA no_play 1
## 12 MIA punt 1
## 13 MIA <NA> 1
##
## $colts_off_summary
## # A tibble: 1 × 6
## plays pass_plays rush_plays total_epa mean_epa success_rate
## <int> <int> <int> <dbl> <dbl> <dbl>
## 1 85 30 40 21.4 0.251 0.553
# Helper: summarize receiver efficiency by team
receiver_efficiency <- function(pbp_g1) {
pbp_g1 %>%
filter(!is.na(receiver), receiver != "") %>%
group_by(team = posteam, receiver) %>%
summarise(
targets = n(),
receptions = sum(as.integer(complete_pass) %in% 1, na.rm = TRUE),
yards = sum(yards_gained, na.rm = TRUE),
epa_per_tgt = mean(epa, na.rm = TRUE),
.groups = "drop"
) %>%
mutate(
yards_per_target = ifelse(targets > 0, yards/targets, NA_real_),
catch_pct = ifelse(targets > 0, receptions/targets, NA_real_)
) %>%
# keep receivers with meaningful sample size
filter(targets >= 3)
}
# Main plotting function
plot_receiver_efficiency <- function(pbp, team = NULL) {
df <- receiver_efficiency(pbp)
if (!is.null(team)) df <- dplyr::filter(df, team %in% team)
p <- ggplot(
df,
aes(x = targets, y = yards_per_target, color = team, size = receptions)
) +
geom_point(alpha = 0.8) +
# If you want labels for top performers, uncomment:
# ggrepel::geom_text_repel(
# data = df |>
# group_by(team) |>
# slice_max(order_by = yards_per_target, n = 1, with_ties = FALSE),
# aes(label = receiver),
# size = 3, show.legend = FALSE, max.overlaps = 100
# ) +
scale_size_continuous(name = "Receptions") +
labs(
title = "Receiver Efficiency by Team",
subtitle = "Yards per Target vs Targets (min 3 targets)",
x = "Targets",
y = "Yards per Target",
color = "Team"
) +
theme_minimal(base_size = 12)
# For team-by-team panels, uncomment this instead of filtering:
# p <- p + facet_wrap(~ team)
p
}
# EXAMPLE USAGE
# 1) League-wide view colored by team
plot_receiver_efficiency(pbp_g1)
# 2) Single-team view (e.g., Colts only)
# plot_receiver_efficiency(pbp, team = "IND")