Introduction

This report analyzes the Indianapolis Colts’ first regular-season game of the 2025 NFL season. The analysis begins by programmatically identifying the game, pulling the relevant play-by-play (PBP), roster, and officials data using the nflreadr package. A significant portion of the script is dedicated to data integrity checks before moving on to exploratory analysis of the game’s offensive, defensive, and situational trends.

1. Environment Setup

Libraries

First, we install (if needed) and load the necessary R packages for data manipulation, visualization, and analysis.

packages <- c("nflreadr", "tidyverse", "lubridate", "janitor", "scales", "gt", "ggrepel", "zoo")

library(nflreadr)    # data access
library(tidyverse)   # dplyr/ggplot/readr, etc.
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.4     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(lubridate)   # dates/times
library(janitor)     # clean_names, tabyl
## 
## Attaching package: 'janitor'
## 
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test
library(scales)      # pretty labels
## 
## Attaching package: 'scales'
## 
## The following object is masked from 'package:purrr':
## 
##     discard
## 
## The following object is masked from 'package:readr':
## 
##     col_factor
library(gt)          # presentation tables (not saved here)
library(ggrepel)     # labels for charts
library(zoo)         # rolling calcs
## 
## Attaching package: 'zoo'
## 
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric

Parameters

We set the team of interest (IND) and the season (2025).

team <- "IND"
season <- 2025

2. Data Acquisition

Identify Colts’ First Game

We load the 2025 schedule and filter to find the first regular-season game involving the Colts.

sched <- load_schedules(seasons = season) %>% clean_names()

colts_g1 <- sched %>%
  filter(game_type == "REG",
         season == !!season,
         home_team == !!team | away_team == !!team) %>%
  arrange(gameday, week) %>%
  slice(1)

stopifnot(nrow(colts_g1) == 1)

game_id <- colts_g1$game_id
week    <- colts_g1$week
gameday <- colts_g1$gameday
home_tm <- colts_g1$home_team
away_tm <- colts_g1$away_team
opp     <- ifelse(home_tm == team, away_tm, home_tm)

message(glue::glue("Found Colts Week {week} ({gameday}): {away_tm} @ {home_tm} | game_id = {game_id}"))
## Found Colts Week 1 (2025-09-07): MIA @ IND | game_id = 2025_01_MIA_IND

Pull Game-Specific Data

Using the game_id from the previous step, we pull the play-by-play data, weekly rosters, officials, and the master player list.

pbp_g1 <- load_pbp(seasons = season) %>%
  clean_names() %>%
  filter(game_id == !!game_id)

rw_g1 <- load_rosters_weekly(seasons = season) %>%
  clean_names() %>%
  filter(week == !!week, team %in% c(team, opp))

officials_g1 <- load_officials(seasons = season) %>%
  clean_names() %>%
  filter(game_id == !!game_id)

players_master <- load_players() %>% clean_names()

3. Data Integrity Checks

This section is dedicated to ensuring the data is sound before analysis. We check for structural issues, key constraints, referential integrity, missingness, and logical consistency.

# ---- A) Structural sanity ----
cat("--- STRUCTURE ---\n")
## --- STRUCTURE ---
cat("PBP rows/cols:", nrow(pbp_g1), ncol(pbp_g1), "\n")
## PBP rows/cols: 152 372
cat("Weekly roster rows/cols:", nrow(rw_g1), ncol(rw_g1), "\n")
## Weekly roster rows/cols: 3047 36
cat("Officials rows/cols:", nrow(officials_g1), ncol(officials_g1), "\n\n")
## Officials rows/cols: 0 9
# ---- B) Key constraints & duplicates ----
dup_plays <- pbp_g1 %>% count(game_id, play_id) %>% filter(n > 1)
if (nrow(dup_plays) > 0) {
  warning("Duplicate plays detected in (game_id, play_id).")
} else {
  cat("Key check passed: (game_id, play_id) appear unique for this game.\n")
}
## Key check passed: (game_id, play_id) appear unique for this game.
# ---- C) Referential integrity ----
if ("gsis_id" %in% names(rw_g1) && "gsis_id" %in% names(players_master)) {
  roster_master_match <- rw_g1 %>%
    mutate(in_master = gsis_id %in% players_master$gsis_id) %>%
    summarize(match_rate = mean(in_master, na.rm = TRUE))
  cat("Player master match rate (weekly roster -> master by gsis_id):",
      round(roster_master_match$match_rate*100, 1), "%\n")
}
## Player master match rate (weekly roster -> master by gsis_id): 93.3 %
# ---- D) Missingness (critical analytical fields) ----
na_rate <- function(x) mean(is.na(x))
na_summary <- pbp_g1 %>%
  summarize(
    na_epa          = na_rate(epa),
    na_success      = na_rate(success),
    na_posteam      = na_rate(posteam),
    na_defteam      = na_rate(defteam),
    na_yardline_100 = na_rate(yardline_100),
    na_down         = na_rate(down),
    na_ydstogo      = na_rate(ydstogo),
    na_play_type    = na_rate(play_type)
  )
cat("\n--- MISSINGNESS (PBP) ---\n"); print(na_summary)
## 
## --- MISSINGNESS (PBP) ---
## # A tibble: 1 × 8
##   na_epa na_success na_posteam na_defteam na_yardline_100 na_down na_ydstogo
##    <dbl>      <dbl>      <dbl>      <dbl>           <dbl>   <dbl>      <dbl>
## 1 0.0132     0.0132     0.0658     0.0658          0.0789   0.171          0
## # ℹ 1 more variable: na_play_type <dbl>
# ---- E) Range & logical checks ----
# yardline_100 should be within [0, 100]
bad_yardline <- pbp_g1 %>% filter(!is.na(yardline_100) & (yardline_100 < 0 | yardline_100 > 100))
if (nrow(bad_yardline) > 0) warning("Out-of-range yardline_100 values observed.")

# down should be 1–4 (ignore special plays like kickoffs which may have NA)
bad_down <- pbp_g1 %>% filter(!is.na(down) & !(down %in% 1:4))
if (nrow(bad_down) > 0) warning("Unexpected down values outside 1–4.")

# ydstogo should be positive for standard plays
bad_ydstogo <- pbp_g1 %>% filter(!is.na(ydstogo) & ydstogo <= 0)
if (nrow(bad_ydstogo) > 0) message("Non-positive ydstogo rows exist (could be goal-to-go anomalies or data glitches).")
## Non-positive ydstogo rows exist (could be goal-to-go anomalies or data glitches).
# game_id consistency
if (length(unique(pbp_g1$game_id)) != 1) warning("Multiple game_ids in pbp_g1; filter may be off.")

# posteam/defteam should be one of the two teams (allow NA on non-plays)
valid_teams <- c(team, opp)
weird_teams <- pbp_g1 %>%
  filter(!is.na(posteam) & !(posteam %in% valid_teams)) %>%
  distinct(posteam)
if (nrow(weird_teams) > 0) warning("Found posteam not matching Colts or opponent: check `weird_teams`.")
# ---- F) Categorical consistency ----
cat("\n--- TEAM ABBREVS IN PBP ---\n")
## 
## --- TEAM ABBREVS IN PBP ---
print(sort(unique(na.omit(pbp_g1$posteam))))
## [1] "IND" "MIA"
print(sort(unique(na.omit(pbp_g1$defteam))))
## [1] "IND" "MIA"

4. Exploratory Analysis

Game Overview: Scoring & Tempo

We first derive a points_scored column from various PBP events to summarize scoring by quarter.

pbp_g1 <- pbp_g1 %>%
  mutate(
    points_scored = case_when(
      touchdown == 1 ~ 6,
      field_goal_result == "made" ~ 3,
      safety == 1 ~ 2,
      extra_point_result == "good" ~ 1,
      two_point_attempt == 1 & two_point_conv_result == "success" ~ 2,
      TRUE ~ 0
    )
  )

score_by_q <- pbp_g1 %>%
  filter(!is.na(qtr), qtr %in% 1:4) %>%
  group_by(qtr) %>%
  summarize(
    ind_points = sum(if_else(posteam == "IND", points_scored, 0), na.rm = TRUE),
    opp_points = sum(if_else(posteam == opp,   points_scored, 0), na.rm = TRUE),
    .groups = "drop"
  )
cat("\n--- SCORE BY QUARTER (derived from PBP points events) ---\n")
## 
## --- SCORE BY QUARTER (derived from PBP points events) ---
print(score_by_q)
## # A tibble: 4 × 3
##     qtr ind_points opp_points
##   <dbl>      <dbl>      <dbl>
## 1     1          3          0
## 2     2         17          0
## 3     3          3          0
## 4     4         10          8
#knitr::kable(score_by_q, caption = "Score by Quarter")
tempo <- pbp_g1 %>%
  filter(!is.na(posteam), !is.na(game_seconds_remaining)) %>%
  arrange(game_seconds_remaining) %>%
  group_by(posteam) %>%
  mutate(sec_between = lag(game_seconds_remaining) - game_seconds_remaining) %>%
  summarize(sec_per_play = median(sec_between, na.rm = TRUE), .groups = "drop") %>%
  filter(!is.na(sec_per_play))
cat("\n--- TEMPO (median sec/play) ---\n"); print(tempo)
## 
## --- TEMPO (median sec/play) ---
## # A tibble: 2 × 2
##   posteam sec_per_play
##   <chr>          <dbl>
## 1 IND            -38.5
## 2 MIA            -34.5

Offensive Profile: Play mix & Efficiency

off_mix <- pbp_g1 %>%
  filter(!is.na(posteam)) %>%
  mutate(play_family = case_when(
    play_type %in% c("run") ~ "Run",
    play_type %in% c("pass") ~ "Pass",
    play_type %in% c("qb_kneel","qb_spike") ~ "Clock",
    play_type %in% c("no_play","timeout") ~ "Other",
    TRUE ~ "Other"
  )) %>%
  group_by(posteam, play_family) %>%
  summarize(
    plays          = n(),
    share          = n()/nrow(pbp_g1),
    epa_per_play   = mean(epa, na.rm = TRUE),
    success_rate   = mean(success, na.rm = TRUE),
    yards_per_play = mean(yards_gained, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  arrange(posteam, desc(plays))
cat("\n--- OFFENSIVE MIX & EFFICIENCY ---\n"); print(off_mix, n = 50)
## 
## --- OFFENSIVE MIX & EFFICIENCY ---
## # A tibble: 6 × 7
##   posteam play_family plays  share epa_per_play success_rate yards_per_play
##   <chr>   <chr>       <int>  <dbl>        <dbl>        <dbl>          <dbl>
## 1 IND     Run            40 0.263         0.138        0.5             3.9 
## 2 IND     Pass           30 0.197         0.433        0.567           8.73
## 3 IND     Other          15 0.0987        0.190        0.667           0   
## 4 MIA     Pass           35 0.230        -0.461        0.371           3.86
## 5 MIA     Run            12 0.0789        0.339        0.583           6.5 
## 6 MIA     Other          10 0.0658        0.352        0.778           0

Offensive Profile: EPA Distribution

Here we visualize the distribution of Expected Points Added (EPA) for both teams on run and pass plays.

epa_dist <- pbp_g1 %>%
  filter(!is.na(posteam), play_type %in% c("run","pass")) %>%
  mutate(is_colts = if_else(posteam == "IND", "Colts", "Opponent"))

ggplot(epa_dist, aes(x = epa, fill = is_colts)) +
  geom_histogram(bins = 40, alpha = 0.6, position = "identity") +
  geom_vline(data = epa_dist %>% group_by(is_colts) %>% summarize(mu = mean(epa, na.rm = TRUE)),
             aes(xintercept = mu, color = is_colts), linewidth = 0.9) +
  scale_x_continuous(labels = number_format(accuracy = 0.1)) +
  labs(title = "EPA distribution (pass & run plays)",
       x = "EPA per play", y = "Count", fill = "", color = "") +
  theme_minimal()

Situational Analysis: Success Rate

This chart breaks down offensive success rate by down and the distance required for a first down.

situ <- pbp_g1 %>%
  filter(!is.na(down), !is.na(ydstogo), play_type %in% c("pass","run")) %>%
  mutate(
    ytg_bucket = case_when(
      ydstogo <= 2 ~ "1-2", ydstogo <= 5 ~ "3-5",
      ydstogo <= 10 ~ "6-10", TRUE ~ "11+"
    ),
    down_lab = paste0("Down ", down)
  ) %>%
  group_by(posteam, down_lab, ytg_bucket) %>%
  summarize(plays = n(), success_rate = mean(success, na.rm = TRUE),epa_per_play = mean(epa, na.rm = TRUE), .groups = "drop")

ggplot(situ %>% filter(posteam %in% c("IND", opp)),
       aes(x = ytg_bucket, y = success_rate, group = posteam, color = posteam)) +
  geom_line(linewidth = 1) +
  geom_point(size = 2) +
  facet_wrap(~down_lab) +
  scale_y_continuous(labels = percent) +
  labs(title = "Success rate by down & yards-to-go",
       x = "Yards to go (bucket)", y = "Success rate", color = "Offense") +
  theme_minimal()

Field position zones

fieldpos <- pbp_g1 %>%
  filter(!is.na(yardline_100), play_type %in% c("run","pass")) %>%
  mutate(
    field_zone = case_when(
      yardline_100 >= 80 ~ "Own 20-0",
      yardline_100 >= 50 ~ "Own 49-21",
      yardline_100 >= 21 ~ "Opp 49-21",
      TRUE ~ "Red Zone (<=20)"
    )
  ) %>%
  group_by(posteam, field_zone) %>%
  summarize(
    plays = n(),
    epa_per_play = mean(epa, na.rm = TRUE),
    success_rate = mean(success, na.rm = TRUE),
    .groups = "drop"
  )
ggplot(fieldpos %>% filter(posteam %in% c("IND", opp)),
       aes(x = field_zone, y = epa_per_play, fill = posteam)) +
  geom_col(position = position_dodge(width = 0.8)) +
  coord_flip() +
  labs(title = "EPA/play by field zone",
       x = "", y = "EPA per play", fill = "Offense") +
  theme_minimal()

Drive lens

drive_sum <- pbp_g1 %>%
  filter(!is.na(drive), !is.na(posteam)) %>%
  group_by(posteam, drive) %>%
  summarize(
    plays        = n(),
    yards        = sum(yards_gained, na.rm = TRUE),
    points       = sum(points_scored, na.rm = TRUE),   # <-- FIXED
    epa          = sum(epa, na.rm = TRUE),
    success_rate = mean(success, na.rm = TRUE),
    ended_score  = any(touchdown == 1 | field_goal_result == "made" | safety == 1, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  arrange(posteam, desc(points), desc(yards))
print(drive_sum)
## # A tibble: 14 × 8
##    posteam drive plays yards points    epa success_rate ended_score
##    <chr>   <dbl> <int> <dbl>  <dbl>  <dbl>        <dbl> <lgl>      
##  1 IND         3    16    84      7  6.16         0.625 TRUE       
##  2 IND        11    19    84      7  5.21         0.579 TRUE       
##  3 IND         5     6    42      7  3.76         0.667 TRUE       
##  4 IND         7    18    67      3  2.62         0.389 TRUE       
##  5 IND         1    10    62      3  1.41         0.6   TRUE       
##  6 IND         9     8    58      3  0.445        0.625 TRUE       
##  7 IND        13     8    21      3  1.77         0.5   TRUE       
##  8 MIA        12    12    72      8  6.78         0.583 TRUE       
##  9 MIA        10     8    44      0 -2.88         0.5   FALSE      
## 10 MIA        14    11    35      0 -1.46         0.4   FALSE      
## 11 MIA         6    10    22      0 -1.55         0.4   FALSE      
## 12 MIA         8     7    19      0 -3.49         0.571 FALSE      
## 13 MIA         2     6    15      0 -1.94         0.333 FALSE      
## 14 MIA         4     3     6      0 -4.37         0.667 FALSE

Player Involvement

colts_targets <- pbp_g1 %>%
  filter(posteam == team, pass == 1) %>%
  count(receiver_player_name, sort = TRUE, name = "targets")

colts_rushers <- pbp_g1 %>%
  filter(posteam == team, rush == 1) %>%
  count(rusher_player_name, sort = TRUE, name = "rush_att")

epa_by_receiver <- pbp_g1 %>%
  filter(posteam == team, pass == 1, !is.na(receiver_player_name)) %>%
  group_by(receiver_player_name) %>%
  summarize(
    targets = n(),
    epa_sum = sum(epa, na.rm = TRUE),
    epa_mean = mean(epa, na.rm = TRUE),
    .groups = "drop"
  ) %>% arrange(desc(epa_sum))

ggplot(epa_by_receiver, aes(x = targets, y = epa_sum, label = receiver_player_name)) +
  geom_point(size = 3) +
  geom_text_repel(min.segment.length = 0) +
  labs(title = "Colts receivers: total EPA vs targets",
       x = "Targets", y = "Total EPA") +
  theme_minimal()

Penalties & Special Teams

penalties <- pbp_g1 %>%
  filter(!is.na(penalty)) %>%
  mutate(off_def = if_else(penalty_team == posteam, "Offense", "Defense")) %>%
  count(penalty_team, off_def, penalty_type, sort = TRUE)
cat("\n--- PENALTIES ---\n"); print(penalties, n = 30)
## 
## --- PENALTIES ---
## ── nflverse play by play data ──────────────────────────────────────────────────
## ℹ Data updated: 2025-09-21 05:19:36 EDT
## # A tibble: 6 × 4
##   penalty_team off_def penalty_type                        n
##   <chr>        <chr>   <chr>                           <int>
## 1 <NA>         <NA>    <NA>                              140
## 2 IND          Offense Offensive Holding                   3
## 3 IND          Defense Unnecessary Roughness               1
## 4 MIA          Defense Defensive Holding                   1
## 5 MIA          Defense Running Into the Kicker             1
## 6 MIA          Offense Offensive Too Many Men on Field     1
st_fg <- pbp_g1 %>%
  filter(play_type == "field_goal") %>%
  count(posteam, field_goal_result)
st_punt <- pbp_g1 %>%
  filter(play_type == "punt") %>%
  summarize(punts = n(), avg_punt_yards = mean(yards_gained, na.rm = TRUE))
cat("\n--- SPECIAL TEAMS ---\n"); print(list(field_goals = st_fg, punts = st_punt))
## 
## --- SPECIAL TEAMS ---
## $field_goals
## ── nflverse play by play data ──────────────────────────────────────────────────
## ℹ Data updated: 2025-09-21 05:19:36 EDT
## # A tibble: 1 × 3
##   posteam field_goal_result     n
##   <chr>   <chr>             <int>
## 1 IND     made                  4
## 
## $punts
## # A tibble: 1 × 2
##   punts avg_punt_yards
##   <int>          <dbl>
## 1     1              0

Compact console summary

colts_off <- pbp_g1 %>%
  filter(posteam == team) %>%
  summarize(
    plays        = n(),
    pass_plays   = sum(play_type == "pass", na.rm = TRUE),
    rush_plays   = sum(play_type == "run",  na.rm = TRUE),
    total_epa    = sum(epa, na.rm = TRUE),
    mean_epa     = mean(epa, na.rm = TRUE),
    success_rate = mean(success, na.rm = TRUE)
  )

plays_by_team_type <- pbp_g1 %>%
  filter(!is.na(posteam)) %>%
  count(posteam, play_type, sort = TRUE)

cat("\n--- SUMMARY ---\n")
## 
## --- SUMMARY ---
print(list(
  game_header = tibble(
    season = season, week = week, gameday = gameday,
    home = home_tm, away = away_tm, game_id = game_id, opp_for_colts = opp
  ),
  pbp_rows          = nrow(pbp_g1),
  roster_rows       = nrow(rw_g1),
  officials_rows    = nrow(officials_g1),
  tempo_sec_per_play= tempo,
  plays_by_team_type= plays_by_team_type,
  colts_off_summary = colts_off
))
## $game_header
## # A tibble: 1 × 7
##   season  week gameday    home  away  game_id         opp_for_colts
##    <dbl> <int> <chr>      <chr> <chr> <chr>           <chr>        
## 1   2025     1 2025-09-07 IND   MIA   2025_01_MIA_IND MIA          
## 
## $pbp_rows
## [1] 152
## 
## $roster_rows
## [1] 3047
## 
## $officials_rows
## [1] 0
## 
## $tempo_sec_per_play
## # A tibble: 2 × 2
##   posteam sec_per_play
##   <chr>          <dbl>
## 1 IND            -38.5
## 2 MIA            -34.5
## 
## $plays_by_team_type
## ── nflverse play by play data ──────────────────────────────────────────────────
## ℹ Data updated: 2025-09-21 05:19:36 EDT
## # A tibble: 13 × 3
##    posteam play_type       n
##    <chr>   <chr>       <int>
##  1 IND     run            40
##  2 MIA     pass           35
##  3 IND     pass           30
##  4 MIA     run            12
##  5 MIA     kickoff         7
##  6 IND     field_goal      4
##  7 IND     no_play         4
##  8 IND     extra_point     3
##  9 IND     kickoff         2
## 10 IND     <NA>            2
## 11 MIA     no_play         1
## 12 MIA     punt            1
## 13 MIA     <NA>            1
## 
## $colts_off_summary
## # A tibble: 1 × 6
##   plays pass_plays rush_plays total_epa mean_epa success_rate
##   <int>      <int>      <int>     <dbl>    <dbl>        <dbl>
## 1    85         30         40      21.4    0.251        0.553
# Helper: summarize receiver efficiency by team
receiver_efficiency <- function(pbp_g1) {
  pbp_g1 %>%
    filter(!is.na(receiver), receiver != "") %>%
    group_by(team = posteam, receiver) %>%
    summarise(
      targets      = n(),
      receptions   = sum(as.integer(complete_pass) %in% 1, na.rm = TRUE),
      yards        = sum(yards_gained, na.rm = TRUE),
      epa_per_tgt  = mean(epa, na.rm = TRUE),
      .groups = "drop"
    ) %>%
    mutate(
      yards_per_target = ifelse(targets > 0, yards/targets, NA_real_),
      catch_pct        = ifelse(targets > 0, receptions/targets, NA_real_)
    ) %>%
    # keep receivers with meaningful sample size
    filter(targets >= 3)
}

# Main plotting function
plot_receiver_efficiency <- function(pbp, team = NULL) {
  df <- receiver_efficiency(pbp)
  if (!is.null(team)) df <- dplyr::filter(df, team %in% team)

  p <- ggplot(
    df,
    aes(x = targets, y = yards_per_target, color = team, size = receptions)
  ) +
    geom_point(alpha = 0.8) +
    # If you want labels for top performers, uncomment:
    # ggrepel::geom_text_repel(
    #   data = df |>
    #     group_by(team) |>
    #     slice_max(order_by = yards_per_target, n = 1, with_ties = FALSE),
    #   aes(label = receiver),
    #   size = 3, show.legend = FALSE, max.overlaps = 100
    # ) +
    scale_size_continuous(name = "Receptions") +
    labs(
      title = "Receiver Efficiency by Team",
      subtitle = "Yards per Target vs Targets (min 3 targets)",
      x = "Targets",
      y = "Yards per Target",
      color = "Team"
    ) +
    theme_minimal(base_size = 12)

  # For team-by-team panels, uncomment this instead of filtering:
  # p <- p + facet_wrap(~ team)

  p
}

# EXAMPLE USAGE
# 1) League-wide view colored by team
plot_receiver_efficiency(pbp_g1)

# 2) Single-team view (e.g., Colts only)
# plot_receiver_efficiency(pbp, team = "IND")