library(nflreadr)

# Load data
schedules <- load_schedules(2025)
rosters <- load_rosters(2025)
pbp <- load_pbp(2025)
officials <- load_officials(2025)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Get Colts first reg season game using arrange and slice
colts_week1 <- schedules %>% 
  filter((home_team == "IND" | away_team == "IND") & game_type == "REG") %>%
  arrange(week) %>%
  slice(1)

game_id <- colts_week1$game_id

# Get play-by-play data for this specific game
colts_dolphins_pbp <- pbp %>% filter(game_id == !!game_id)

# Get officials for this game
colts_dolphins_officials <- officials %>% filter(game_id == !!game_id)
# Key uniqueness checks
cat("=== KEY UNIQUENESS CHECKS ===\n")
## === KEY UNIQUENESS CHECKS ===
# Check game_id uniqueness
cat("Duplicate game_ids:\n")
## Duplicate game_ids:
duplicate_games <- pbp %>% 
  count(game_id) %>% 
  filter(n > 1)
print(nrow(duplicate_games))
## [1] 33
# Check play_id uniqueness
cat("\nDuplicate play_ids:\n")
## 
## Duplicate play_ids:
duplicate_plays <- pbp %>% 
  count(play_id) %>% 
  filter(n > 1)
print(nrow(duplicate_plays))
## [1] 1607
cat("\n=== MISSINGNESS CHECKS ===\n")
## 
## === MISSINGNESS CHECKS ===
# Check yardline_100 missingness
missing_yardline <- sum(is.na(pbp$yardline_100))
cat("Missing yardline_100 values:", missing_yardline, "\n")
## Missing yardline_100 values: 419
# Check down missingness
missing_down <- sum(is.na(pbp$down))
cat("Missing down values:", missing_down, "\n")
## Missing down values: 941
# Check play_type missingness
missing_play_type <- sum(is.na(pbp$play_type))
cat("Missing play_type values:", missing_play_type, "\n")
## Missing play_type values: 171
# Check ydstogo missingness
missing_ydstogo <- sum(is.na(pbp$ydstogo))
cat("Missing ydstogo values:", missing_ydstogo, "\n")
## Missing ydstogo values: 0
cat("\n=== RANGE CHECKS ===\n")
## 
## === RANGE CHECKS ===
# Check yardline_100 range
bad_yardline <- pbp %>% 
  filter(!is.na(yardline_100) & (yardline_100 < 0 | yardline_100 > 100)) %>%
  nrow()
cat("Yardline_100 out of range (should be 0-100):", bad_yardline, "\n")
## Yardline_100 out of range (should be 0-100): 0
# Check down range
bad_down <- pbp %>% 
  filter(!is.na(down) & (down < 1 | down > 4)) %>%
  nrow()
cat("Down out of range (should be 1-4):", bad_down, "\n")
## Down out of range (should be 1-4): 0
nrow(pbp)
## [1] 5689
names(pbp)[1:130]  # first 10 column names
##   [1] "play_id"                    "game_id"                   
##   [3] "old_game_id"                "home_team"                 
##   [5] "away_team"                  "season_type"               
##   [7] "week"                       "posteam"                   
##   [9] "posteam_type"               "defteam"                   
##  [11] "side_of_field"              "yardline_100"              
##  [13] "game_date"                  "quarter_seconds_remaining" 
##  [15] "half_seconds_remaining"     "game_seconds_remaining"    
##  [17] "game_half"                  "quarter_end"               
##  [19] "drive"                      "sp"                        
##  [21] "qtr"                        "down"                      
##  [23] "goal_to_go"                 "time"                      
##  [25] "yrdln"                      "ydstogo"                   
##  [27] "ydsnet"                     "desc"                      
##  [29] "play_type"                  "yards_gained"              
##  [31] "shotgun"                    "no_huddle"                 
##  [33] "qb_dropback"                "qb_kneel"                  
##  [35] "qb_spike"                   "qb_scramble"               
##  [37] "pass_length"                "pass_location"             
##  [39] "air_yards"                  "yards_after_catch"         
##  [41] "run_location"               "run_gap"                   
##  [43] "field_goal_result"          "kick_distance"             
##  [45] "extra_point_result"         "two_point_conv_result"     
##  [47] "home_timeouts_remaining"    "away_timeouts_remaining"   
##  [49] "timeout"                    "timeout_team"              
##  [51] "td_team"                    "td_player_name"            
##  [53] "td_player_id"               "posteam_timeouts_remaining"
##  [55] "defteam_timeouts_remaining" "total_home_score"          
##  [57] "total_away_score"           "posteam_score"             
##  [59] "defteam_score"              "score_differential"        
##  [61] "posteam_score_post"         "defteam_score_post"        
##  [63] "score_differential_post"    "no_score_prob"             
##  [65] "opp_fg_prob"                "opp_safety_prob"           
##  [67] "opp_td_prob"                "fg_prob"                   
##  [69] "safety_prob"                "td_prob"                   
##  [71] "extra_point_prob"           "two_point_conversion_prob" 
##  [73] "ep"                         "epa"                       
##  [75] "total_home_epa"             "total_away_epa"            
##  [77] "total_home_rush_epa"        "total_away_rush_epa"       
##  [79] "total_home_pass_epa"        "total_away_pass_epa"       
##  [81] "air_epa"                    "yac_epa"                   
##  [83] "comp_air_epa"               "comp_yac_epa"              
##  [85] "total_home_comp_air_epa"    "total_away_comp_air_epa"   
##  [87] "total_home_comp_yac_epa"    "total_away_comp_yac_epa"   
##  [89] "total_home_raw_air_epa"     "total_away_raw_air_epa"    
##  [91] "total_home_raw_yac_epa"     "total_away_raw_yac_epa"    
##  [93] "wp"                         "def_wp"                    
##  [95] "home_wp"                    "away_wp"                   
##  [97] "wpa"                        "vegas_wpa"                 
##  [99] "vegas_home_wpa"             "home_wp_post"              
## [101] "away_wp_post"               "vegas_wp"                  
## [103] "vegas_home_wp"              "total_home_rush_wpa"       
## [105] "total_away_rush_wpa"        "total_home_pass_wpa"       
## [107] "total_away_pass_wpa"        "air_wpa"                   
## [109] "yac_wpa"                    "comp_air_wpa"              
## [111] "comp_yac_wpa"               "total_home_comp_air_wpa"   
## [113] "total_away_comp_air_wpa"    "total_home_comp_yac_wpa"   
## [115] "total_away_comp_yac_wpa"    "total_home_raw_air_wpa"    
## [117] "total_away_raw_air_wpa"     "total_home_raw_yac_wpa"    
## [119] "total_away_raw_yac_wpa"     "punt_blocked"              
## [121] "first_down_rush"            "first_down_pass"           
## [123] "first_down_penalty"         "third_down_converted"      
## [125] "third_down_failed"          "fourth_down_converted"     
## [127] "fourth_down_failed"         "incomplete_pass"           
## [129] "touchback"                  "interception"
summary(pbp$yardline_100)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    1.00   30.00   48.00   47.07   66.00   99.00     419
summary(pbp$down)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   1.000   1.000   2.000   2.016   3.000   4.000     941
library(janitor)
## Warning: package 'janitor' was built under R version 4.3.3
## 
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test
# ---- Parameters ----
team   <- "IND"
season <- 2025

# ---- 1) Identify Colts' first regular-season game in 2025 ----
sched <- load_schedules(seasons = season) %>% clean_names()

colts_g1 <- sched %>%
  filter(game_type == "REG",
         season == !!season,
         home_team == !!team | away_team == !!team) %>%
  arrange(gameday, week) %>%
  slice(1)

stopifnot(nrow(colts_g1) == 1)

game_id <- colts_g1$game_id
week    <- colts_g1$week
gameday <- colts_g1$gameday
home_tm <- colts_g1$home_team
away_tm <- colts_g1$away_team
opp     <- ifelse(home_tm == team, away_tm, home_tm)

message(glue::glue("Found Colts Week {week} ({gameday}): {away_tm} @ {home_tm} | game_id = {game_id}"))
## Found Colts Week 1 (2025-09-07): MIA @ IND | game_id = 2025_01_MIA_IND
# ---- 2) Pull core data for this game ----
pbp_g1 <- load_pbp(seasons = season) %>%
  clean_names() %>%
  filter(game_id == !!game_id)

rw_g1 <- load_rosters_weekly(seasons = season) %>%
  clean_names() %>%
  filter(week == !!week, team %in% c(team, opp))

officials_g1 <- load_officials(seasons = season) %>%
  clean_names() %>%
  filter(game_id == !!game_id)

players_master <- load_players() %>% clean_names()

# ============================================================
#               EMPHASIS: DATA INTEGRITY CHECKS
# ============================================================

# ---- A) Structural sanity: dimensions & basic types ----
cat("\n--- STRUCTURE ---\n")
## 
## --- STRUCTURE ---
cat("PBP rows/cols:", nrow(pbp_g1), ncol(pbp_g1), "\n")
## PBP rows/cols: 152 372
cat("Weekly roster rows/cols:", nrow(rw_g1), ncol(rw_g1), "\n")
## Weekly roster rows/cols: 3047 36
cat("Officials rows/cols:", nrow(officials_g1), ncol(officials_g1), "\n\n")
## Officials rows/cols: 0 9
# ---- B) Key constraints & duplicates ----
# Expect play-level key uniqueness by (game_id, play_id)
dup_plays <- pbp_g1 %>%
  count(game_id, play_id) %>%
  filter(n > 1)

if (nrow(dup_plays) > 0) {
  warning("Duplicate plays detected in (game_id, play_id). Inspect `dup_plays`.")
} else {
  cat("Key check passed: (game_id, play_id) appear unique for this game.\n")
}
## Key check passed: (game_id, play_id) appear unique for this game.
# For weekly rosters, (team, gsis_id) often repeats by position slots; check basic duplication
dup_roster <- rw_g1 %>%
  count(team, gsis_id, week) %>%
  filter(n > 1)
if (nrow(dup_roster) > 0) {
  message("Note: roster has repeated (team, gsis_id, week) rows—can be normal when multiple rows exist per role/slot.")
}

# ---- C) Referential integrity (players present in master) ----
# Attempt to match weekly roster players to master by gsis_id (best available common key)
if ("gsis_id" %in% names(rw_g1) && "gsis_id" %in% names(players_master)) {
  roster_master_match <- rw_g1 %>%
    mutate(in_master = gsis_id %in% players_master$gsis_id) %>%
    summarize(match_rate = mean(in_master, na.rm = TRUE))
  cat("Player master match rate (weekly roster -> master by gsis_id):",
      round(roster_master_match$match_rate*100, 1), "%\n")
} else {
  message("gsis_id not present to check roster->player master referential integrity.")
}
## Player master match rate (weekly roster -> master by gsis_id): 93.3 %
# ---- D) Missingness (critical analytical fields) ----
na_rate <- function(x) mean(is.na(x))
na_summary <- pbp_g1 %>%
  summarize(
    na_epa          = na_rate(epa),
    na_success      = na_rate(success),
    na_posteam      = na_rate(posteam),
    na_defteam      = na_rate(defteam),
    na_yardline_100 = na_rate(yardline_100),
    na_down         = na_rate(down),
    na_ydstogo      = na_rate(ydstogo),
    na_play_type    = na_rate(play_type)
  )
cat("\n--- MISSINGNESS (PBP) ---\n"); print(na_summary)
## 
## --- MISSINGNESS (PBP) ---
## # A tibble: 1 × 8
##   na_epa na_success na_posteam na_defteam na_yardline_100 na_down na_ydstogo
##    <dbl>      <dbl>      <dbl>      <dbl>           <dbl>   <dbl>      <dbl>
## 1 0.0132     0.0132     0.0658     0.0658          0.0789   0.171          0
## # ℹ 1 more variable: na_play_type <dbl>
# ---- E) Range & logical checks ----
# yardline_100 should be within [0, 100]
bad_yardline <- pbp_g1 %>% filter(!is.na(yardline_100) & (yardline_100 < 0 | yardline_100 > 100))
if (nrow(bad_yardline) > 0) warning("Out-of-range yardline_100 values observed.")

# down should be 1–4 (ignore special plays like kickoffs which may have NA)
bad_down <- pbp_g1 %>% filter(!is.na(down) & !(down %in% 1:4))
if (nrow(bad_down) > 0) warning("Unexpected down values outside 1–4.")

# ydstogo should be positive for standard plays
bad_ydstogo <- pbp_g1 %>% filter(!is.na(ydstogo) & ydstogo <= 0)
if (nrow(bad_ydstogo) > 0) message("Non-positive ydstogo rows exist (could be goal-to-go anomalies or data glitches).")
## Non-positive ydstogo rows exist (could be goal-to-go anomalies or data glitches).
# game_id consistency
if (length(unique(pbp_g1$game_id)) != 1) warning("Multiple game_ids in pbp_g1; filter may be off.")

# posteam/defteam should be one of the two teams (allow NA on non-plays)
valid_teams <- c(team, opp)
weird_teams <- pbp_g1 %>%
  filter(!is.na(posteam) & !(posteam %in% valid_teams)) %>%
  distinct(posteam)
if (nrow(weird_teams) > 0) warning("Found posteam not matching Colts or opponent: check `weird_teams`.")

# ---- F) Categorical consistency (team abbreviations) ----
cat("\n--- TEAM ABBREVS IN PBP ---\n")
## 
## --- TEAM ABBREVS IN PBP ---
print(sort(unique(na.omit(pbp_g1$posteam))))
## [1] "IND" "MIA"
print(sort(unique(na.omit(pbp_g1$defteam))))
## [1] "IND" "MIA"
# Find rows with zero or negative ydstogo values
zero_negative_ydstogo <- pbp %>%
  filter(!is.na(ydstogo) & ydstogo <= 0) %>%
  select(game_id, play_id, qtr, down, ydstogo, yardline_100, goal_to_go, 
         play_type, desc, posteam, defteam, yards_gained, first_down_rush, 
         first_down_pass, first_down_penalty)

# Show how many we found
cat("Total rows with ydstogo <= 0:", nrow(zero_negative_ydstogo), "\n\n")
## Total rows with ydstogo <= 0: 941
# Show first 10 examples
cat("First 10 examples:\n")
## First 10 examples:
print(head(zero_negative_ydstogo, 10))
## ── nflverse play by play data ──────────────────────────────────────────────────
## ℹ Data updated: 2025-09-21 05:19:36 EDT
## # A tibble: 10 × 15
##    game_id   play_id   qtr  down ydstogo yardline_100 goal_to_go play_type desc 
##    <chr>       <dbl> <dbl> <dbl>   <dbl>        <dbl>      <int> <chr>     <chr>
##  1 2025_01_…       1     1    NA       0           NA          0 <NA>      GAME 
##  2 2025_01_…      40     1    NA       0           35          0 kickoff   19-B…
##  3 2025_01_…     665     1    NA       0           35          0 kickoff   38-C…
##  4 2025_01_…    1042     1    NA       0           NA          0 <NA>      END …
##  5 2025_01_…    1124     2    NA       0           15          0 extra_po… 19-B…
##  6 2025_01_…    1139     2    NA       0           35          0 kickoff   19-B…
##  7 2025_01_…    1366     2    NA       0           15          0 extra_po… 38-C…
##  8 2025_01_…    1381     2    NA       0           35          0 kickoff   38-C…
##  9 2025_01_…    1700     2    NA       0           35          0 kickoff   19-B…
## 10 2025_01_…    2067     2    NA       0           15          0 extra_po… 38-C…
## # ℹ 6 more variables: posteam <chr>, defteam <chr>, yards_gained <dbl>,
## #   first_down_rush <dbl>, first_down_pass <dbl>, first_down_penalty <dbl>
# Break down by play_type to see patterns
cat("\n\nBreakdown by play_type:\n")
## 
## 
## Breakdown by play_type:
ydstogo_by_type <- zero_negative_ydstogo %>%
  count(play_type, sort = TRUE)
print(ydstogo_by_type)
## ── nflverse play by play data ──────────────────────────────────────────────────
## ℹ Data updated: 2025-09-21 05:19:36 EDT
## # A tibble: 6 × 2
##   play_type       n
##   <chr>       <int>
## 1 kickoff       343
## 2 no_play       264
## 3 <NA>          170
## 4 extra_point   149
## 5 pass           11
## 6 run             4
# Look at goal_to_go situations specifically
cat("\n\nGoal-to-go situations:\n")
## 
## 
## Goal-to-go situations:
goal_to_go_cases <- zero_negative_ydstogo %>%
  filter(goal_to_go == 1) %>%
  select(down, ydstogo, yardline_100, play_type, desc) %>%
  head(5)
print(goal_to_go_cases)
## ── nflverse play by play data ──────────────────────────────────────────────────
## ℹ Data updated: 2025-09-21 05:19:36 EDT
## # A tibble: 0 × 5
## # ℹ 5 variables: down <dbl>, ydstogo <dbl>, yardline_100 <dbl>,
## #   play_type <chr>, desc <chr>
# Get ranges for yardline_100 and down in Colts Game 1
cat("yardline_100 range:", range(pbp_g1$yardline_100, na.rm = TRUE), "\n")
## yardline_100 range: 1 88
cat("down range:", range(pbp_g1$down, na.rm = TRUE), "\n")
## down range: 1 4
# ============================================================
#                      EXPLORATORY ANALYSIS
# ============================================================

# ---- 3) Game overview: scoring & tempo ----

pbp_g1 <- pbp_g1 %>%
  mutate(
    points_scored = case_when(
      touchdown == 1 ~ 6,                               # Touchdowns
      field_goal_result == "made" ~ 3,                  # Field goals
      safety == 1 ~ 2,                                  # Safeties
      extra_point_result == "good" ~ 1,                 # PAT kicks
      two_point_attempt == 1 & two_point_conv_result == "success" ~ 2, # 2PT conversions
      TRUE ~ 0
    )
  )

score_by_q <- pbp_g1 %>%
  filter(!is.na(qtr), qtr %in% 1:4) %>%
  group_by(qtr) %>%
  summarize(
    ind_points = sum(if_else(posteam == "IND", points_scored, 0), na.rm = TRUE),
    opp_points = sum(if_else(posteam == opp,   points_scored, 0), na.rm = TRUE),
    plays = n(),
    .groups = "drop"
  )
cat("\n--- SCORE BY QUARTER (derived from PBP points events) ---\n")
## 
## --- SCORE BY QUARTER (derived from PBP points events) ---
print(score_by_q)
## # A tibble: 4 × 4
##     qtr ind_points opp_points plays
##   <dbl>      <dbl>      <dbl> <int>
## 1     1          3          0    31
## 2     2         17          0    45
## 3     3          3          0    33
## 4     4         10          8    43
tempo <- pbp_g1 %>%
  filter(!is.na(posteam), !is.na(game_seconds_remaining)) %>%
  arrange(game_seconds_remaining) %>%
  group_by(posteam) %>%
  mutate(sec_between = lag(game_seconds_remaining) - game_seconds_remaining) %>%
  summarize(sec_per_play = median(sec_between, na.rm = TRUE), .groups = "drop") %>%
  filter(!is.na(sec_per_play))
cat("\n--- TEMPO (median sec/play) ---\n"); print(tempo)
## 
## --- TEMPO (median sec/play) ---
## # A tibble: 2 × 2
##   posteam sec_per_play
##   <chr>          <dbl>
## 1 IND            -38.5
## 2 MIA            -34.5
library(tidyverse)   # dplyr/ggplot/readr, etc.
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ readr     2.1.4
## ✔ ggplot2   3.5.2     ✔ stringr   1.5.0
## ✔ lubridate 1.9.2     ✔ tibble    3.2.1
## ✔ purrr     1.0.2     ✔ tidyr     1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(lubridate)   # dates/times
library(janitor)     # clean_names, tabyl
library(scales)      # pretty labels
## Warning: package 'scales' was built under R version 4.3.3
## 
## Attaching package: 'scales'
## 
## The following object is masked from 'package:purrr':
## 
##     discard
## 
## The following object is masked from 'package:readr':
## 
##     col_factor
library(ggrepel)     # labels for charts
## Warning: package 'ggrepel' was built under R version 4.3.2
library(zoo)    
## 
## Attaching package: 'zoo'
## 
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
# ---- 4) Offensive profile: play mix & efficiency ----
off_mix <- pbp_g1 %>%
  filter(!is.na(posteam)) %>%
  mutate(play_family = case_when(
    play_type %in% c("run") ~ "Run",
    play_type %in% c("pass") ~ "Pass",
    play_type %in% c("qb_kneel","qb_spike") ~ "Clock",
    play_type %in% c("no_play","timeout") ~ "Other",
    TRUE ~ "Other"
  )) %>%
  group_by(posteam, play_family) %>%
  summarize(
    plays          = n(),
    share          = n()/nrow(pbp_g1),
    epa_per_play   = mean(epa, na.rm = TRUE),
    success_rate   = mean(success, na.rm = TRUE),
    yards_per_play = mean(yards_gained, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  arrange(posteam, desc(plays))
cat("\n--- OFFENSIVE MIX & EFFICIENCY ---\n"); print(off_mix, n = 50)
## 
## --- OFFENSIVE MIX & EFFICIENCY ---
## # A tibble: 6 × 7
##   posteam play_family plays  share epa_per_play success_rate yards_per_play
##   <chr>   <chr>       <int>  <dbl>        <dbl>        <dbl>          <dbl>
## 1 IND     Run            40 0.263         0.138        0.5             3.9 
## 2 IND     Pass           30 0.197         0.433        0.567           8.73
## 3 IND     Other          15 0.0987        0.190        0.667           0   
## 4 MIA     Pass           35 0.230        -0.461        0.371           3.86
## 5 MIA     Run            12 0.0789        0.339        0.583           6.5 
## 6 MIA     Other          10 0.0658        0.352        0.778           0
# EPA distribution (visual)
epa_dist <- pbp_g1 %>%
  filter(!is.na(posteam), play_type %in% c("run","pass")) %>%
  mutate(is_colts = if_else(posteam == "IND", "Colts", "Opponent"))
ggplot(epa_dist, aes(x = epa, fill = is_colts)) +
  geom_histogram(bins = 40, alpha = 0.6, position = "identity") +
  geom_vline(data = epa_dist %>% group_by(is_colts) %>% summarize(mu = mean(epa, na.rm = TRUE)),
             aes(xintercept = mu, color = is_colts), linewidth = 0.9) +
  scale_x_continuous(labels = number_format(accuracy = 0.1)) +
  labs(title = "EPA distribution (pass & run plays)",
       x = "EPA per play", y = "Count", fill = "", color = "") +
  theme_minimal()

# ---- 5) Situational analysis ----
situ <- pbp_g1 %>%
  filter(!is.na(down), !is.na(ydstogo), play_type %in% c("pass","run")) %>%
  mutate(
    ytg_bucket = case_when(
      ydstogo <= 2 ~ "1-2",
      ydstogo <= 5 ~ "3-5",
      ydstogo <= 10 ~ "6-10",
      TRUE ~ "11+"
    ),
    down_lab = paste0("Down ", down)
  ) %>%
  group_by(posteam, down_lab, ytg_bucket) %>%
  summarize(
    plays = n(),
    success_rate = mean(success, na.rm = TRUE),
    epa_per_play = mean(epa, na.rm = TRUE),
    .groups = "drop"
  )
ggplot(situ %>% filter(posteam %in% c("IND", opp)),
       aes(x = ytg_bucket, y = success_rate, group = posteam, color = posteam)) +
  geom_line(linewidth = 1) +
  geom_point(size = 2) +
  facet_wrap(~down_lab) +
  scale_y_continuous(labels = percent) +
  labs(title = "Success rate by down & yards-to-go",
       x = "Yards to go (bucket)", y = "Success rate", color = "Offense") +
  theme_minimal()

# Field position zones
fieldpos <- pbp_g1 %>%
  filter(!is.na(yardline_100), play_type %in% c("run","pass")) %>%
  mutate(
    field_zone = case_when(
      yardline_100 >= 80 ~ "Own 20-0",
      yardline_100 >= 50 ~ "Own 49-21",
      yardline_100 >= 21 ~ "Opp 49-21",
      TRUE ~ "Red Zone (<=20)"
    )
  ) %>%
  group_by(posteam, field_zone) %>%
  summarize(
    plays = n(),
    epa_per_play = mean(epa, na.rm = TRUE),
    success_rate = mean(success, na.rm = TRUE),
    .groups = "drop"
  )
ggplot(fieldpos %>% filter(posteam %in% c("IND", opp)),
       aes(x = field_zone, y = epa_per_play, fill = posteam)) +
  geom_col(position = position_dodge(width = 0.8)) +
  coord_flip() +
  labs(title = "EPA/play by field zone",
       x = "", y = "EPA per play", fill = "Offense") +
  theme_minimal()

# Analyze missing drive values in Colts Game 1
missing_drive_analysis <- pbp_g1 %>%
  mutate(has_drive = !is.na(drive)) %>%
  select(play_id, qtr, time, play_type, desc, posteam, defteam, drive, has_drive, 
         touchdown, field_goal_attempt, punt_attempt, kickoff_attempt, 
         extra_point_attempt, two_point_attempt)

# Summary of missing drive plays
cat("=== MISSING DRIVE SUMMARY ===\n")
## === MISSING DRIVE SUMMARY ===
missing_count <- sum(is.na(pbp_g1$drive))
total_count <- nrow(pbp_g1)
cat("Missing drive values:", missing_count, "out of", total_count, 
    "(", round(missing_count/total_count*100, 1), "%)\n\n")
## Missing drive values: 3 out of 152 ( 2 %)
# Break down missing drives by play type
cat("=== MISSING DRIVES BY PLAY TYPE ===\n")
## === MISSING DRIVES BY PLAY TYPE ===
missing_by_type <- missing_drive_analysis %>%
  filter(!has_drive) %>%
  count(play_type, sort = TRUE)
print(missing_by_type)
## ── nflverse play by play data ──────────────────────────────────────────────────
## ℹ Data updated: 2025-09-21 05:19:36 EDT
## # A tibble: 2 × 2
##   play_type     n
##   <chr>     <int>
## 1 <NA>          2
## 2 no_play       1
# Look at specific examples of missing drive plays
cat("\n=== EXAMPLES OF MISSING DRIVE PLAYS ===\n")
## 
## === EXAMPLES OF MISSING DRIVE PLAYS ===
missing_examples <- missing_drive_analysis %>%
  filter(!has_drive) %>%
  select(qtr, time, play_type, desc, posteam) %>%
  head(10)
print(missing_examples)
## ── nflverse play by play data ──────────────────────────────────────────────────
## ℹ Data updated: 2025-09-21 05:19:36 EDT
## # A tibble: 3 × 5
##     qtr time  play_type desc                        posteam
##   <dbl> <chr> <chr>     <chr>                       <chr>  
## 1     1 15:00 <NA>      GAME                        <NA>   
## 2     2 00:00 <NA>      END QUARTER 2               <NA>   
## 3     4 06:21 no_play   Timeout #1 by MIA at 06:21. <NA>
# Check if certain special plays are more likely to have missing drives
cat("\n=== SPECIAL SITUATIONS WITH MISSING DRIVES ===\n")
## 
## === SPECIAL SITUATIONS WITH MISSING DRIVES ===
special_situations <- missing_drive_analysis %>%
  filter(!has_drive) %>%
  summarise(
    kickoffs = sum(kickoff_attempt == 1, na.rm = TRUE),
    punts = sum(punt_attempt == 1, na.rm = TRUE),
    field_goals = sum(field_goal_attempt == 1, na.rm = TRUE),
    extra_points = sum(extra_point_attempt == 1, na.rm = TRUE),
    two_points = sum(two_point_attempt == 1, na.rm = TRUE),
    touchdowns = sum(touchdown == 1, na.rm = TRUE)
  )
print(special_situations)
## # A tibble: 1 × 6
##   kickoffs punts field_goals extra_points two_points touchdowns
##      <int> <int>       <int>        <int>      <int>      <int>
## 1        0     0           0            0          0          0
colts_targets <- pbp_g1 %>%
  filter(posteam == team, pass == 1) %>%
  count(receiver_player_name, sort = TRUE, name = "targets")

colts_rushers <- pbp_g1 %>%
  filter(posteam == team, rush == 1) %>%
  count(rusher_player_name, sort = TRUE, name = "rush_att")

epa_by_receiver <- pbp_g1 %>%
  filter(posteam == team, pass == 1, !is.na(receiver_player_name)) %>%
  group_by(receiver_player_name) %>%
  summarize(
    targets = n(),
    epa_sum = sum(epa, na.rm = TRUE),
    epa_mean = mean(epa, na.rm = TRUE),
    .groups = "drop"
  ) %>% arrange(desc(epa_sum))

ggplot(epa_by_receiver, aes(x = targets, y = epa_sum, label = receiver_player_name)) +
  geom_point(size = 3) +
  geom_text_repel(min.segment.length = 0) +
  labs(title = "Colts receivers: total EPA vs targets",
       x = "Targets", y = "Total EPA") +
  theme_minimal()

# Combine receiving and rushing touches for EPA per touch analysis
receiving_touches <- pbp_g1 %>%
  filter(posteam == team, pass == 1, !is.na(receiver_player_name)) %>%
  group_by(receiver_player_name) %>%
  summarise(
    touches = n(),
    total_epa = sum(epa, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  rename(player_name = receiver_player_name)

rushing_touches <- pbp_g1 %>%
  filter(posteam == team, rush == 1, !is.na(rusher_player_name)) %>%
  group_by(rusher_player_name) %>%
  summarise(
    touches = n(),
    total_epa = sum(epa, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  rename(player_name = rusher_player_name)

# Combine both types of touches
all_touches <- bind_rows(receiving_touches, rushing_touches) %>%
  group_by(player_name) %>%
  summarise(
    total_touches = sum(touches),
    total_epa = sum(total_epa),
    .groups = "drop"
  ) %>%
  mutate(epa_per_touch = total_epa / total_touches) %>%
  arrange(desc(epa_per_touch))

# Print the results
cat("=== EPA PER TOUCH (Colts Players) ===\n")
## === EPA PER TOUCH (Colts Players) ===
print(all_touches, n = Inf)
## # A tibble: 11 × 4
##    player_name total_touches total_epa epa_per_touch
##    <chr>               <int>     <dbl>         <dbl>
##  1 A.Dulin                 1     1.49         1.49  
##  2 M.Alie-Cox              1     1.36         1.36  
##  3 D.Jones                 4     3.79         0.947 
##  4 M.Pittman               8     6.89         0.861 
##  5 A.Mitchell              2     0.730        0.365 
##  6 A.Pierce                3     1.03         0.342 
##  7 T.Warren               10     1.67         0.167 
##  8 J.Downs                 3     0.434        0.145 
##  9 J.Taylor               21     2.67         0.127 
## 10 D.Giddens              12     0.983        0.0819
## 11 U.Bentley               1    -0.795       -0.795
# ---- Penalties Analysis ----
penalties <- pbp_g1 %>%
  filter(!is.na(penalty_team)) %>%
  group_by(penalty_team) %>%
  summarise(
    total_penalties = n(),
    penalty_yards = sum(penalty_yards, na.rm = TRUE),
    avg_penalty_yards = mean(penalty_yards, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  arrange(desc(total_penalties))

cat(" PENALTIES BY TEAM \n")
##  PENALTIES BY TEAM
print(penalties)
## # A tibble: 2 × 4
##   penalty_team total_penalties penalty_yards avg_penalty_yards
##   <chr>                  <int>         <dbl>             <dbl>
## 1 IND                        4            45                45
## 2 MIA                        3            15                15
# Penalty types breakdown
penalty_types <- pbp_g1 %>%
  filter(!is.na(penalty_type)) %>%
  count(penalty_team, penalty_type, sort = TRUE) %>%
  head(10)

cat("\n TOP PENALTY TYPES \n")
## 
##  TOP PENALTY TYPES
print(penalty_types)
## ── nflverse play by play data ──────────────────────────────────────────────────
## ℹ Data updated: 2025-09-21 05:19:36 EDT
## # A tibble: 5 × 3
##   penalty_team penalty_type                        n
##   <chr>        <chr>                           <int>
## 1 IND          Offensive Holding                   3
## 2 IND          Unnecessary Roughness               1
## 3 MIA          Defensive Holding                   1
## 4 MIA          Offensive Too Many Men on Field     1
## 5 MIA          Running Into the Kicker             1
# ---- Field Goal Analysis ----
field_goals <- pbp_g1 %>%
  filter(field_goal_attempt == 1) %>%
  select(posteam, kick_distance, field_goal_result, kicker_player_name)

cat("\n FIELD GOAL ATTEMPTS \n")
## 
##  FIELD GOAL ATTEMPTS
if(nrow(field_goals) > 0) {
  print(field_goals)
} else {
  cat("No field goal attempts in this game\n")
}
## ── nflverse play by play data ──────────────────────────────────────────────────
## ℹ Data updated: 2025-09-21 05:19:36 EDT
## # A tibble: 4 × 4
##   posteam kick_distance field_goal_result kicker_player_name
##   <chr>           <dbl> <chr>             <chr>             
## 1 IND                24 made              S.Shrader         
## 2 IND                35 made              S.Shrader         
## 3 IND                28 made              S.Shrader         
## 4 IND                48 made              S.Shrader
# ---- Punt Analysis ----
punts <- pbp_g1 %>%
  filter(punt_attempt == 1) %>%
  select(posteam, kick_distance, punter_player_name, 
         touchback, return_yards) %>%
  mutate(net_punt = kick_distance - coalesce(return_yards, 0))

cat("\nPUNT ATTEMPTS \n")
## 
## PUNT ATTEMPTS
if(nrow(punts) > 0) {
  print(punts)
  cat("\nPunt Summary:\n")
  punt_summary <- punts %>%
    group_by(posteam) %>%
    summarise(
      punts = n(),
      avg_distance = mean(kick_distance, na.rm = TRUE),
      avg_net = mean(net_punt, na.rm = TRUE),
      .groups = "drop"
    )
  print(punt_summary)
} else {
  cat("No punts in this game\n")
}
## ── nflverse play by play data ──────────────────────────────────────────────────
## ℹ Data updated: 2025-09-21 05:19:36 EDT
## # A tibble: 1 × 6
##   posteam kick_distance punter_player_name touchback return_yards net_punt
##   <chr>           <dbl> <chr>                  <dbl>        <dbl>    <dbl>
## 1 MIA                42 J.Bailey                   0            0       42
## 
## Punt Summary:
## # A tibble: 1 × 4
##   posteam punts avg_distance avg_net
##   <chr>   <int>        <dbl>   <dbl>
## 1 MIA         1           42      42
# ---- 9) Time-based trends ----
epa_by_q <- pbp_g1 %>%
  filter(qtr %in% 1:4, play_type %in% c("pass","run")) %>%
  group_by(posteam, qtr) %>%
  summarize(
    epa_per_play = mean(epa, na.rm = TRUE),
    success_rate = mean(success, na.rm = TRUE),
    .groups = "drop"
  )
ggplot(epa_by_q, aes(x = factor(qtr), y = epa_per_play, group = posteam, color = posteam)) +
  geom_line(linewidth = 1) + geom_point(size = 2) +
  labs(title = "EPA/play by quarter", x = "Quarter", y = "EPA/play", color = "Offense") +
  theme_minimal()

# Rolling pass rate (10-play window) for Colts
colts_seq <- pbp_g1 %>%
  filter(posteam == team, play_type %in% c("pass","run")) %>%
  mutate(play_index = row_number(),
         pass_flag  = if_else(play_type == "pass", 1, 0)) %>%
  arrange(play_index)
if (nrow(colts_seq) >= 10) {
  colts_seq_roll <- colts_seq %>%
    mutate(pass_rate_10 = zoo::rollapply(pass_flag, width = 10, FUN = mean,
                                         align = "right", fill = NA_real_))
  ggplot(colts_seq_roll, aes(x = play_index, y = pass_rate_10)) +
    geom_line(linewidth = 1) +
    scale_y_continuous(labels = percent) +
    labs(title = "Colts rolling pass rate (10-play window)",
         x = "Offensive play index", y = "Pass rate") +
    theme_minimal()
}
## Warning: Removed 9 rows containing missing values or values outside the scale range
## (`geom_line()`).