library(dplyr)
library(ggplot2)
library(devtools)
install_local('/Users/cincy/OneDrive/Documents/Sam R Work/BDBCleanR')
library(BDBCleanR)
library(DT)
library(knitr)
library(gganimate)
library(gifski)
Let’s load in the data. We are only going to load in a single week of tracking data because each file is about 1GB and we don’t need to use it all for EDA. We will only look at the data provided to us for the time being.
# Read the four supporting tables that ship with the competition data.
games <- read.csv("~/Sam R Work/BDB_2025/games.csv")
player_play <- read.csv("~/Sam R Work/BDB_2025/player_play.csv")
players <- read.csv("~/Sam R Work/BDB_2025/players.csv")
plays <- read.csv("~/Sam R Work/BDB_2025/plays.csv")

# Only week 1 of tracking is loaded. The file's `club` column is exposed as
# `team` for the rest of the analysis.
track_sample <- rename(
  read.csv("~/Sam R Work/BDB_2025/tracking_week_1.csv"),
  team = club
)
I love to use the str() function in R to get a feel for a dataset so I will do that first with each of these files. It looks like we have a pretty good data set this year again with 9 weeks of data and 136 games.
# Print the structure of the games table: one line per column showing its
# type and first few values.
str(games)
## 'data.frame': 136 obs. of 9 variables:
## $ gameId : int 2022090800 2022091100 2022091101 2022091102 2022091103 2022091104 2022091105 2022091106 2022091107 2022091109 ...
## $ season : int 2022 2022 2022 2022 2022 2022 2022 2022 2022 2022 ...
## $ week : int 1 1 1 1 1 1 1 1 1 1 ...
## $ gameDate : chr "9/8/2022" "9/11/2022" "9/11/2022" "9/11/2022" ...
## $ gameTimeEastern : chr "20:20:00" "13:00:00" "13:00:00" "13:00:00" ...
## $ homeTeamAbbr : chr "LA" "ATL" "CAR" "CHI" ...
## $ visitorTeamAbbr : chr "BUF" "NO" "CLE" "SF" ...
## $ homeFinalScore : int 10 26 24 19 20 35 20 20 9 28 ...
## $ visitorFinalScore: int 31 27 26 10 23 38 20 7 24 22 ...
# Number of distinct game ids in the games table.
print(n_distinct(games$gameId))
## [1] 136
# Distinct week numbers, in order of first appearance.
print(unique(games$week))
## [1] 1 2 3 4 5 6 7 8 9
Now looking at the plays file, we see it has a lot more data than the previous one. Each row is a single play, meaning we have more than 16k plays to work with, and a summary of plays per game shows a median and mean of around 118 plays. The team at the NFL running the Big Data Bowl surely started Christmas early with this Big Data Haul.
The interactive play table lets you peruse 20 rows to see the various columns available and what that data may look like. I think there are some interesting new columns this year.
receiverAlignment
targetX, targetY
dropbackDistance
passLocationType
timeToThrow
timeInTackleBox
passTippedAtLine
unblockedPressure
timeToSack
pff_runConceptPrimary
pff_runConceptSecondary
pff_runPassOption
rushLocationType
### plays EDA
# Print the structure of the plays table: one line per column showing its
# type and first few values.
str(plays)
## 'data.frame': 16124 obs. of 50 variables:
## $ gameId : int 2022102302 2022091809 2022103004 2022110610 2022102700 2022100205 2022110605 2022100203 2022091104 2022100204 ...
## $ playId : int 2655 3698 3146 348 2799 2314 3861 3994 3662 1422 ...
## $ playDescription : chr "(1:54) (Shotgun) J.Burrow pass short middle to T.Boyd to CIN 30 for 9 yards (J.Hawkins)." "(2:13) (Shotgun) J.Burrow pass short right to H.Hurst to CIN 12 for 4 yards (L.Vander Esch)." "(2:00) (Shotgun) D.Mills pass short right to D.Pierce to HST 26 for 6 yards (D.Walker)." "(9:28) (Shotgun) P.Mahomes pass short left to I.Pacheco to TEN 19 for 4 yards (Z.Cunningham)." ...
## $ quarter : int 3 4 4 1 3 3 4 4 4 2 ...
## $ down : int 1 1 3 2 2 2 1 3 3 3 ...
## $ yardsToGo : int 10 10 12 10 8 6 10 12 12 8 ...
## $ possessionTeam : chr "CIN" "CIN" "HOU" "KC" ...
## $ defensiveTeam : chr "ATL" "DAL" "TEN" "TEN" ...
## $ yardlineSide : chr "CIN" "CIN" "HOU" "TEN" ...
## $ yardlineNumber : int 21 8 20 23 27 29 40 28 35 35 ...
## $ gameClock : chr "01:54" "02:13" "02:00" "09:28" ...
## $ preSnapHomeScore : int 35 17 3 0 10 15 26 16 28 6 ...
## $ preSnapVisitorScore : int 17 17 17 0 10 31 3 26 38 7 ...
## $ playNullifiedByPenalty : chr "N" "N" "N" "N" ...
## $ absoluteYardlineNumber : int 31 18 30 33 37 39 50 82 45 45 ...
## $ preSnapHomeTeamWinProbability : num 0.98202 0.42436 0.00629 0.88422 0.41037 ...
## $ preSnapVisitorTeamWinProbability: num 0.018 0.576 0.994 0.116 0.59 ...
## $ expectedPoints : num 0.719 0.608 -0.291 4.249 3.928 ...
## $ offenseFormation : chr "EMPTY" "EMPTY" "SHOTGUN" "SHOTGUN" ...
## $ receiverAlignment : chr "3x2" "3x2" "2x2" "2x2" ...
## $ playClockAtSnap : int 10 9 12 11 8 15 18 2 3 12 ...
## $ passResult : chr "C" "C" "C" "C" ...
## $ passLength : int 6 4 -4 -6 NA NA NA NA -6 15 ...
## $ targetX : num 36.7 20.8 26 39 NA ...
## $ targetY : num 16.5 20.5 17.6 14.2 NA ...
## $ playAction : logi FALSE FALSE FALSE FALSE TRUE FALSE ...
## $ dropbackType : chr "TRADITIONAL" "TRADITIONAL" "TRADITIONAL" "TRADITIONAL" ...
## $ dropbackDistance : num 2.4 1.14 3.2 3.02 2.03 ...
## $ passLocationType : chr "INSIDE_BOX" "INSIDE_BOX" "INSIDE_BOX" "INSIDE_BOX" ...
## $ timeToThrow : num 2.99 1.84 2.24 2.2 NA ...
## $ timeInTackleBox : num 2.99 1.84 2.24 2.2 NA ...
## $ timeToSack : num NA NA NA NA NA NA NA NA NA NA ...
## $ passTippedAtLine : logi FALSE FALSE FALSE FALSE NA NA ...
## $ unblockedPressure : logi FALSE FALSE FALSE FALSE NA NA ...
## $ qbSpike : logi FALSE FALSE FALSE FALSE NA NA ...
## $ qbKneel : int 0 0 0 0 0 0 0 1 0 0 ...
## $ qbSneak : logi NA NA NA NA FALSE FALSE ...
## $ rushLocationType : chr NA NA NA NA ...
## $ penaltyYards : int NA NA NA NA NA NA NA NA NA NA ...
## $ prePenaltyYardsGained : int 9 4 6 4 -1 3 5 -1 0 15 ...
## $ yardsGained : int 9 4 6 4 -1 3 5 -1 0 15 ...
## $ homeTeamWinProbabilityAdded : num 0.004634 0.002847 0.000205 -0.001308 0.027141 ...
## $ visitorTeamWinProbilityAdded : num -0.004634 -0.002847 -0.000205 0.001308 -0.027141 ...
## $ expectedPointsAdded : num 0.703 -0.241 -0.218 -0.428 -0.639 ...
## $ isDropback : logi TRUE TRUE TRUE TRUE FALSE FALSE ...
## $ pff_runConceptPrimary : chr NA NA NA NA ...
## $ pff_runConceptSecondary : chr NA NA NA NA ...
## $ pff_runPassOption : int 0 0 0 0 0 0 0 0 0 0 ...
## $ pff_passCoverage : chr "Cover-3" "Quarters" "Quarters" "Quarters" ...
## $ pff_manZone : chr "Zone" "Zone" "Zone" "Zone" ...
## Preview the plays table - each row is a single play (the observation level)
# Render the first 20 plays as a compact, scrollable widget, then set a
# smaller font on every column of `plays`.
# NOTE(review): fixedHeader usually needs the FixedHeader extension to be
# enabled - confirm that the option has any effect here.
formatStyle(
  datatable(
    head(plays, n = 20),
    class = "compact", # compact table style (smaller text)
    options = list(
      scrollX = TRUE, # horizontal scrolling
      scrollY = "400px", # height of the vertical scroll area
      fixedHeader = TRUE, # keep the header row in place when scrolling
      pageLength = 20, # 20 rows per page
      # centre-align the text in every column
      columnDefs = list(list(targets = "_all", className = "dt-center"))
    )
  ),
  columns = names(plays),
  fontSize = "12px" # reduced font size
)
## plays per game - looks like pretty much all the plays
# Tally the plays in each game. `count()` aggregates straight to one row per
# gameId rather than attaching the group size to every play row and then
# de-duplicating. Rows come back sorted by gameId, which does not change the
# summary printed below.
play_data <- plays %>%
  count(gameId, name = "play_count")

# Distribution of plays per game (min, quartiles, mean, max).
summary(play_data)
## gameId play_count
## Min. :2.022e+09 Min. : 67.0
## 1st Qu.:2.022e+09 1st Qu.:113.0
## Median :2.022e+09 Median :118.0
## Mean :2.022e+09 Mean :118.6
## 3rd Qu.:2.022e+09 3rd Qu.:125.0
## Max. :2.022e+09 Max. :156.0
This is a new addition! If you thought the added information in the plays.csv was Christmas, this data set is Hanukkah because you get 8 days of presents. This file gives us player-level information on the play and tells us a lot about what happened and what each player did on the play. This is not at the frame level, but it makes our lives easier. Here are some great new columns we get in this data set.
# Print the structure of the player_play table: one line per column showing
# its type and first few values.
str(player_play)
## 'data.frame': 354727 obs. of 50 variables:
## $ gameId : int 2022090800 2022090800 2022090800 2022090800 2022090800 2022090800 2022090800 2022090800 2022090800 2022090800 ...
## $ playId : int 56 56 56 56 56 56 56 56 56 56 ...
## $ nflId : int 35472 42392 42489 44875 44985 46076 47857 47879 48512 52536 ...
## $ teamAbbr : chr "BUF" "BUF" "BUF" "BUF" ...
## $ hadRushAttempt : int 0 0 0 0 0 0 0 0 0 0 ...
## $ rushingYards : int 0 0 0 0 0 0 0 0 0 0 ...
## $ hadDropback : int 0 0 0 0 0 1 0 0 0 0 ...
## $ passingYards : int 0 0 0 0 0 6 0 0 0 0 ...
## $ sackYardsAsOffense : int 0 0 0 0 0 0 0 0 0 0 ...
## $ hadPassReception : int 0 0 1 0 0 0 0 0 0 0 ...
## $ receivingYards : int 0 0 6 0 0 0 0 0 0 0 ...
## $ wasTargettedReceiver : int 0 0 1 0 0 0 0 0 0 0 ...
## $ yardageGainedAfterTheCatch : int 0 0 1 0 0 0 0 0 0 0 ...
## $ fumbles : int 0 0 0 0 0 0 0 0 0 0 ...
## $ fumbleLost : int 0 0 0 0 0 0 0 0 0 0 ...
## $ fumbleOutOfBounds : int 0 0 0 0 0 0 0 0 0 0 ...
## $ assistedTackle : int 0 0 0 0 0 0 0 0 0 0 ...
## $ forcedFumbleAsDefense : int 0 0 0 0 0 0 0 0 0 0 ...
## $ halfSackYardsAsDefense : int 0 0 0 0 0 0 0 0 0 0 ...
## $ passDefensed : int 0 0 0 0 0 0 0 0 0 0 ...
## $ quarterbackHit : int 0 0 0 0 0 0 0 0 0 0 ...
## $ sackYardsAsDefense : int 0 0 0 0 0 0 0 0 0 0 ...
## $ safetyAsDefense : int 0 0 0 0 0 0 0 0 0 0 ...
## $ soloTackle : int 0 0 0 0 0 0 0 0 0 0 ...
## $ tackleAssist : int 0 0 0 0 0 0 0 0 0 0 ...
## $ tackleForALoss : int 0 0 0 0 0 0 0 0 0 0 ...
## $ tackleForALossYardage : int 0 0 0 0 0 0 0 0 0 0 ...
## $ hadInterception : int 0 0 0 0 0 0 0 0 0 0 ...
## $ interceptionYards : int 0 0 0 0 0 0 0 0 0 0 ...
## $ fumbleRecoveries : int 0 0 0 0 0 0 0 0 0 0 ...
## $ fumbleRecoveryYards : int 0 0 0 0 0 0 0 0 0 0 ...
## $ penaltyYards : int 0 0 0 0 0 0 0 0 0 0 ...
## $ penaltyNames : chr NA NA NA NA ...
## $ wasInitialPassRusher : int NA NA NA NA NA NA NA NA NA NA ...
## $ causedPressure : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ timeToPressureAsPassRusher : num NA NA NA NA NA NA NA NA NA NA ...
## $ getOffTimeAsPassRusher : num NA NA NA NA NA NA NA NA NA NA ...
## $ inMotionAtBallSnap : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ shiftSinceLineset : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ motionSinceLineset : logi FALSE FALSE TRUE FALSE FALSE FALSE ...
## $ wasRunningRoute : int NA NA 1 NA 1 NA NA 1 NA 1 ...
## $ routeRan : chr NA NA "IN" NA ...
## $ blockedPlayerNFLId1 : int 47917 47917 NA 43335 NA NA NA NA 41239 NA ...
## $ blockedPlayerNFLId2 : int NA NA NA NA NA NA NA NA NA NA ...
## $ blockedPlayerNFLId3 : int NA NA NA NA NA NA NA NA NA NA ...
## $ pressureAllowedAsBlocker : int 0 0 NA 0 NA NA NA NA 0 NA ...
## $ timeToPressureAllowedAsBlocker : num NA NA NA NA NA NA NA NA NA NA ...
## $ pff_defensiveCoverageAssignment : chr NA NA NA NA ...
## $ pff_primaryDefensiveCoverageMatchupNflId : int NA NA NA NA NA NA NA NA NA NA ...
## $ pff_secondaryDefensiveCoverageMatchupNflId: int NA NA NA NA NA NA NA NA NA NA ...
# Show the first 20 player_play rows in a compact, scrollable widget with a
# 12px font applied to every column of `player_play`.
formatStyle(
  datatable(
    head(player_play, n = 20),
    class = "compact",
    options = list(
      scrollX = TRUE,
      scrollY = "400px",
      fixedHeader = TRUE,
      pageLength = 20,
      # centre-align the text in every column
      columnDefs = list(list(targets = "_all", className = "dt-center"))
    )
  ),
  columns = names(player_play),
  fontSize = "12px"
)
YAWN!!!! Pretty much the same thing we’ve seen in prior years.
library(tidyr)
## Warning: package 'tidyr' was built under R version 4.2.2
# Re-read the players table and express height as one number of inches.
players <- read.csv("~/Sam R Work/BDB_2025/players.csv")

# `height` is split on "-" into a feet part and an inches part;
# convert = TRUE lets tidyr turn the two pieces into numbers.
players <- separate(
  players, height,
  into = c("feet", "inches"), sep = "-", convert = TRUE
)

# Total height in inches, appended as the last column.
players <- mutate(players, height_in_inches = feet * 12 + inches)

# The temporary feet / inches columns are no longer needed.
players <- select(players, -feet, -inches)
### Players EDA - same data as last year. Need to add nflfastR data to this
# Print the structure of the players table after the height conversion: one
# line per column showing its type and first few values.
str(players)
## 'data.frame': 1697 obs. of 7 variables:
## $ nflId : int 25511 29550 29851 30842 33084 33099 33107 33130 33131 33138 ...
## $ weight : int 225 328 225 267 217 245 315 175 300 222 ...
## $ birthDate : chr "1977-08-03" "1982-01-22" "1983-12-02" "1984-05-19" ...
## $ collegeName : chr "Michigan" "Arkansas" "California" "UCLA" ...
## $ position : chr "QB" "T" "QB" "TE" ...
## $ displayName : chr "Tom Brady" "Jason Peters" "Aaron Rodgers" "Marcedes Lewis" ...
## $ height_in_inches: num 76 76 74 78 76 78 76 70 80 75 ...
I think we can jazz this up though. I know we are going to need some nflfastR identifiers and eventually I want to get those sweet headshots for some headshot tables. To do this, I’ve taken a function I wrote last year and put it in a local package I am calling BDBCleanR. I’ll use some of these more in later iterations.
# add_player_identifiers() is not defined in this file (presumably it comes
# from the local BDBCleanR package - confirm). Its result replaces `players`.
players <- add_player_identifiers(players)

# Show the first 20 players in a compact, scrollable widget with a 12px font
# applied to every column of `players`.
formatStyle(
  datatable(
    head(players, n = 20),
    class = "compact",
    options = list(
      scrollX = TRUE,
      scrollY = "400px",
      fixedHeader = TRUE,
      pageLength = 20,
      # centre-align the text in every column
      columnDefs = list(list(targets = "_all", className = "dt-center"))
    )
  ),
  columns = names(players),
  fontSize = "12px"
)
This is where we get into the real meat on the bones. Working with this tracking data is what keeps me coming back year after year and I am excited about the possibilities this data is serving up because for the first time, maybe ever, we have full plays. There is a new column called frameType that identifies if it is pre-snap, snap, or post-snap. We have a lot more events so let’s take a look at those and get a feel for how the plays are set up, their length, and what we have to work with here.
### tracking data EDA
## new column called frameType that helps you define before snap, snap, after snap.
# Print the structure of the week-1 tracking sample: one line per column
# showing its type and first few values.
str(track_sample)
## 'data.frame': 7104700 obs. of 18 variables:
## $ gameId : int 2022091200 2022091200 2022091200 2022091200 2022091200 2022091200 2022091200 2022091200 2022091200 2022091200 ...
## $ playId : int 64 64 64 64 64 64 64 64 64 64 ...
## $ nflId : int 35459 35459 35459 35459 35459 35459 35459 35459 35459 35459 ...
## $ displayName : chr "Kareem Jackson" "Kareem Jackson" "Kareem Jackson" "Kareem Jackson" ...
## $ frameId : int 1 2 3 4 5 6 7 8 9 10 ...
## $ frameType : chr "BEFORE_SNAP" "BEFORE_SNAP" "BEFORE_SNAP" "BEFORE_SNAP" ...
## $ time : chr "2022-09-13 00:16:03.5" "2022-09-13 00:16:03.6" "2022-09-13 00:16:03.7" "2022-09-13 00:16:03.8" ...
## $ jerseyNumber : int 22 22 22 22 22 22 22 22 22 22 ...
## $ team : chr "DEN" "DEN" "DEN" "DEN" ...
## $ playDirection: chr "right" "right" "right" "right" ...
## $ x : num 51.1 51.1 51.2 51.3 51.3 ...
## $ y : num 28.6 28.6 28.6 28.6 28.6 ...
## $ s : num 0.72 0.71 0.69 0.67 0.65 0.62 0.61 0.61 0.62 0.61 ...
## $ a : num 0.37 0.36 0.23 0.22 0.34 0.4 0.42 0.49 0.46 0.4 ...
## $ dis : num 0.07 0.07 0.07 0.07 0.07 0.06 0.06 0.06 0.06 0.06 ...
## $ o : num 246 245 244 244 246 ...
## $ dir : num 68.3 71.2 69.9 68 62.8 ...
## $ event : chr "huddle_break_offense" NA NA NA ...
# Frequency of each tracking event label, most common first. Rows whose
# event is NA form their own group. `count()` aggregates directly, so the
# per-row count column never has to be built, sorted and de-duplicated
# across the full tracking table. Ties in `n` are ordered by event.
events <- track_sample %>%
  count(event, sort = TRUE)

datatable(events)
## look at a play
# Keep the tracking rows for one play. gameId and playId are integer
# columns, so both are compared against numbers; comparing gameId with a
# quoted string would coerce the whole column to character first.
sample_play <- track_sample %>%
  filter(gameId == 2022091200, playId == 64)

# Show the first 20 rows of that play in a compact, scrollable widget with a
# 12px font applied to every column.
datatable(
  sample_play %>% head(n = 20),
  options = list(
    scrollX = TRUE,
    scrollY = "400px",
    fixedHeader = TRUE,
    pageLength = 50,
    # centre-align the text in every column
    columnDefs = list(list(targets = "_all", className = "dt-center"))
  ),
  class = "compact"
) %>%
  formatStyle(
    columns = names(sample_play),
    fontSize = "12px"
  )
## Animate the same play
# The renderer writes the finished gif to a path relative to the working
# directory.
play_gif_renderer <- gifski_renderer("fig/play_animation.gif")

# animate_play() is not defined in this file (presumably it comes from the
# local BDBCleanR package - confirm).
animation <- animate_play(
  gameid = 2022091200,
  playid = 64,
  track_df = track_sample
)

# Render 100 frames at 10 fps into an 800 x 400 image.
animate(
  animation,
  nframes = 100,
  fps = 10,
  width = 800,
  height = 400,
  renderer = play_gif_renderer
)

# NOTE(review): the gif is written to a relative path but embedded from an
# absolute one - these only match when the working directory is
# ~/Sam R Work/BDB_2025; confirm.
include_graphics("~/Sam R Work/BDB_2025/fig/play_animation.gif")
# Frame-position statistics for each event label over every tracking row
# (rows with an NA event form their own group). summarise() yields one row
# per event directly, sorted by event, instead of computing the statistics
# on every row and de-duplicating afterwards.
track_sample %>%
  group_by(event) %>%
  summarise(
    avg_frame = round(mean(frameId), 2),
    min_frame = min(frameId),
    median_frame = median(frameId),
    max_frame = max(frameId)
  ) %>%
  datatable(
    options = list(
      scrollX = TRUE,
      scrollY = "400px",
      fixedHeader = TRUE,
      pageLength = 50,
      # centre-align the text in every column
      columnDefs = list(list(targets = "_all", className = "dt-center"))
    ),
    class = "compact"
  ) %>%
  formatStyle(
    # Pass the column names themselves: names() of an unnamed character
    # vector is NULL, which would leave the font size applied to no column.
    columns = c("event", "avg_frame", "min_frame", "median_frame", "max_frame"),
    fontSize = "12px"
  )
### Let's look at the ordering of events
# One row per (game, play, frame, event) that carries an event label, with
# the position of that frame among the play's event frames.
event_rank <- track_sample %>%
  select(gameId, playId, frameId, event) %>%
  # Test for missing labels explicitly; `event != 'NA'` only dropped them
  # because a comparison with NA evaluates to NA.
  filter(!is.na(event)) %>%
  # De-duplicate before ranking: dense_rank() gives tied frameIds the same
  # rank, so the result is unchanged and fewer rows are ranked.
  distinct() %>%
  group_by(gameId, playId) %>%
  mutate(frame_rank = dense_rank(frameId)) %>%
  ungroup()
# For each event label: statistics of its within-play rank and how many
# times it occurs in event_rank. summarise() yields one row per event
# directly, sorted by event, instead of computing the statistics on every
# row and de-duplicating afterwards.
event_rank %>%
  group_by(event) %>%
  summarise(
    avg_frame = round(mean(frame_rank), 2),
    min_frame = min(frame_rank),
    median_frame = median(frame_rank),
    max_frame = max(frame_rank),
    n = n()
  ) %>%
  datatable(
    options = list(
      scrollX = TRUE,
      scrollY = "400px",
      fixedHeader = TRUE,
      pageLength = 50,
      # centre-align the text in every column
      columnDefs = list(list(targets = "_all", className = "dt-center"))
    ),
    class = "compact"
  ) %>%
  formatStyle(
    # Pass the column names themselves: names() of an unnamed character
    # vector is NULL, which would leave the font size applied to no column.
    columns = c(
      "event", "avg_frame", "min_frame", "median_frame", "max_frame", "n"
    ),
    fontSize = "12px"
  )