Summary of today

  1. Tidyverse and NFL analytics

  2. Sample visualizations using ggplot2

  3. Big Data Bowl insight

Tidyverse and NFL analytics

Packages

Favorite packages to use for manipulation, tidying, data viz, analysis

  1. tidyverse : Data manipulation and graphing
  2. lubridate : Handling dates
  3. ggbeeswarm : Fun plots
  4. ggridges : Fun plots
  5. gganimate : Fun animations
  6. tidyr : Tidy data (wide to long/long to wide)
  7. nflscrapr : Public win probabilities and expected points
  8. caret : Machine learning tools
  9. lme4 : Statistical modeling
  10. teamcolors: Team specific hex codes
#install.packages("tidyverse")
library(tidyverse) 
library(lubridate) 
library(beeswarm)  
library(gganimate) 
library(ggridges)  
library(tidyr)     

Reading in data

library(tidyverse)
df.games <- read_csv("prodb/dbo.Game.csv")
df.plays <- read_csv("prodb/dbo.VideoDirectorReport.csv")

Joining data sets

Online cheat-sheet: https://stat545.com/bit001_dplyr-cheatsheet.html

left_join:

inner_join:

right_join:

anti_join:

df.plays.left.join <- df.plays %>% left_join(df.games, by = c("GameKey" = "GameKey"))
df.plays.inner.join <- df.plays %>% inner_join(df.games, by = c("GameKey" = "GameKey"))
dim(df.plays.left.join)
## [1] 2288290      75
dim(df.plays.inner.join)
## [1] 2287952      75

Data manipulation using the tidyverse

Online cheat-sheet: https://www.rstudio.com/wp-content/uploads/2015/02/data-wrangling-cheatsheet.pdf

Key commands:

  1. select()

  2. head()

  3. tail()

  4. filter()

df.plays.keep <- df.plays %>% 
  filter(HomeTeamFile == 1) %>% 
  select(GameKey, HomeClubCode, VisitorClubCode, Quarter, 
        PossessionTeam, PlayNullifiedByPenalty, 
        SpecialTeamsPlayType, PlayResult, Down, Distance, PlayDescription)
df.games <- df.games %>% select(GameKey, Season, Season_Type, Week, LeagueType)

df.plays.keep <- df.plays.keep %>% left_join(df.games, by = c("GameKey" = "GameKey"))
df.plays.keep %>% head()
df.plays.keep %>% tail()
  1. More filter(): Categorical variables
df.plays.keep %>% 
  filter(Down == 4, Quarter == 1, Distance == 12, 
         PlayNullifiedByPenalty == "N", SpecialTeamsPlayType == "NULL") 
df.plays.keep <- df.plays.keep  %>% 
  filter(Season_Type == "Reg", Season >= 2005, Season <= 2018, LeagueType == "NFL")

What data set did we create?

  1. arrange()
df.plays.keep %>% 
  filter(PossessionTeam == "MIN", Season == 2018) %>% 
  arrange(-PlayResult) %>% 
  head()

What plays are these??

  1. mutate()
df.plays.keep <- df.plays.keep %>% 
  mutate(is.first.down = PlayResult >= Distance, 
         scrimmage.play = SpecialTeamsPlayType == "NULL")

df.plays.keep %>% 
  filter(PossessionTeam == "MIN", Season == 2018, Down == 4, scrimmage.play, Quarter == 1) 

What do these four plays represent?

  1. group_by()

  2. summarize()

df.plays.keep %>% 
  filter(scrimmage.play, Season == 2018) %>% 
  group_by(PossessionTeam) %>% 
  summarise(ave.yds.gained = mean(PlayResult)) %>% 
  arrange(-ave.yds.gained) %>% 
  head()

Identify the leaderboard above:

df.plays.keep %>% 
  filter(scrimmage.play) %>% 
  group_by(PossessionTeam, Season) %>% 
  summarise(ave.yds.gained = mean(PlayResult)) %>% 
  arrange(-ave.yds.gained) %>% 
  head()

Identify the leaderboard above:

df.plays.keep %>% 
  filter(scrimmage.play) %>% 
  group_by(PossessionTeam, Season) %>% 
  summarise(ave.yds.gained = mean(PlayResult)) %>% 
  arrange(-ave.yds.gained) %>% 
  tail()

Identify the leaderboard above:

Data visualization

Sample Q1: How often do teams go for it on 4th-short in each season?

fourth.down.rates <- df.plays.keep %>% 
  filter(Down == 4, Distance <= 2) %>% 
  group_by(PossessionTeam, Season) %>% 
  summarise(go.forit.rate = mean(scrimmage.play), 
            n.chances = n())


fourth.down.rates %>% 
  arrange(-go.forit.rate) %>% 
  head()
fourth.down.rates %>% 
  arrange(go.forit.rate) %>% 
  head()
library(ggbeeswarm)
fourth.down.rates %>% 
  ggplot(aes(x = Season, y = go.forit.rate, fill = go.forit.rate)) + 
  geom_quasirandom(pch = 21, size = 3) + 
  scale_fill_viridis_c("",  guide = FALSE) + 
  theme_classic(15) +  
  scale_x_continuous(labels = 2005:2018, breaks = 2005:2018) + 
  scale_y_continuous(labels = scales::percent) + 
  labs(title = "Go-for-it rate, 4th-1 or 4th-2", y = "", x = "")

Sample Q2: What’s going on on the punt play?

df.punts.keep <- df.plays %>% 
  filter(HomeTeamFile == 1, SpecialTeamsPlayType == "Punt"|SpecialTeamsPlayType == "Punt Return") %>% 
  select(GameKey, HomeClubCode, VisitorClubCode, Quarter, KickoffResult,
        PossessionTeam, PlayNullifiedByPenalty, 
        SpecialTeamsPlayType, PlayResult, Down, Distance, PlayDescription)

df.punts.keep <- df.punts.keep %>% left_join(df.games, by = c("GameKey" = "GameKey"))

df.punts.keep %>% 
  mutate(is.penalty = grepl("PENALTY", PlayDescription)) %>% 
  summarise(p.rate = mean(is.penalty))
df.punts.keep %>% 
  filter(Season >= 2001) %>% 
  ggplot(aes(Season, fill = KickoffResult)) + 
  geom_bar(position = "fill") + 
  scale_fill_brewer(palette = "Set1") + 
  scale_y_continuous(labels = scales::percent, "Rate") + 
  labs(title = "Punt outcomes over time")

Other charts of interest

Examples from league office

  1. Tyler Lockett
df.lockett <- data.frame(n.targets = 70, pass.rtg = 158.33, 
                         url = "http://static.nfl.com/static/content/public/static/img/fantasy/transparent/200x200/LOC420915.png")
library(ggimage)
p.lockett <- wr.yearly.sum %>% 
  ggplot(aes(n.targets, pass.rtg)) + 
  geom_point(colour = "grey") + 
  geom_image(data = df.lockett, aes(image = url), size = 0.1, hjust = 0) + 
  geom_point(data = df.lockett, colour = "#002244", size = 2) +
  theme_minimal(14) + 
  labs(title = "Passer rating by receiver targeted, 2002 - 2018", 
       x = "# of targets", y = "Passer rating") + 
  scale_y_continuous(breaks = c(0, 50, 100, 150, 158.33)) + 
  theme(panel.grid.minor.y = element_blank())

  1. Maps

  1. Team logos with ggimage: link

  1. Competition committee work: resampling plays to estimate overtime outcomes

What can we do with Next Gen Stats?

General processes for NGS data

  1. Start small. Identify play-type of interest and find 5-10 examples using playId/gameId
  2. Scrape data using https://docs.ngs.nfl.com and personal credentials
  3. Build animation/summary metrics within the sample of plays
  4. Re-center data using playDirection
  5. Re-orgin data using location where (player/ball) started
  6. Cross-check with video
  7. Expand across larger sample of plays
  8. Sample large sample of plays to cross-check for accuracy

Play animation

file.tracking <- "https://raw.githubusercontent.com/nfl-football-ops/Big-Data-Bowl/master/Data/tracking_gameId_2017090700.csv"
tracking.example <- read_csv(file.tracking)

file.game <- "https://raw.githubusercontent.com/nfl-football-ops/Big-Data-Bowl/master/Data/games.csv"
games.sum <- read_csv(file.game) 

file.plays <- "https://raw.githubusercontent.com/nfl-football-ops/Big-Data-Bowl/master/Data/plays.csv"
plays.sum <- read_csv(file.plays) 

tracking.example.merged <- tracking.example %>% inner_join(games.sum) %>% inner_join(plays.sum) 

example.play <- tracking.example.merged %>% filter(playId == 938)
example.play %>% select(playDescription) %>% slice(1)
library(gganimate)
library(cowplot)

## General field boundaries
xmin <- 0
xmax <- 160/3
hash.right <- 38.35
hash.left <- 12
hash.width <- 3.3


## Specific boundaries for a given play
ymin <- max(round(min(example.play$x, na.rm = TRUE) - 10, -1), 0)
ymax <- min(round(max(example.play$x, na.rm = TRUE) + 10, -1), 120)
df.hash <- expand.grid(x = c(0, 23.36667, 29.96667, xmax), y = (10:110))
df.hash <- df.hash %>% filter(!(floor(y %% 5) == 0))
df.hash <- df.hash %>% filter(y < ymax, y > ymin)

animate.play <- ggplot() +
  scale_size_manual(values = c(6, 4, 6), guide = FALSE) + 
  scale_shape_manual(values = c(21, 16, 21), guide = FALSE) +
  scale_fill_manual(values = c("#e31837", "#654321", "#002244"), guide = FALSE) + 
  scale_colour_manual(values = c("black", "#654321", "#c60c30"), guide = FALSE) + 
  annotate("text", x = df.hash$x[df.hash$x < 55/2], 
           y = df.hash$y[df.hash$x < 55/2], label = "_", hjust = 0, vjust = -0.2) + 
  annotate("text", x = df.hash$x[df.hash$x > 55/2], 
           y = df.hash$y[df.hash$x > 55/2], label = "_", hjust = 1, vjust = -0.2) + 
  annotate("segment", x = xmin, 
           y = seq(max(10, ymin), min(ymax, 110), by = 5), 
           xend =  xmax, 
           yend = seq(max(10, ymin), min(ymax, 110), by = 5)) + 
  annotate("text", x = rep(hash.left, 11), y = seq(10, 110, by = 10), 
                    label = c("G   ", seq(10, 50, by = 10), rev(seq(10, 40, by = 10)), "   G"), 
                    angle = 270, size = 4) + 
  annotate("text", x = rep((xmax - hash.left), 11), y = seq(10, 110, by = 10), 
           label = c("   G", seq(10, 50, by = 10), rev(seq(10, 40, by = 10)), "G   "), 
           angle = 90, size = 4) + 
  annotate("segment", x = c(xmin, xmin, xmax, xmax), 
           y = c(ymin, ymax, ymax, ymin), 
           xend = c(xmin, xmax, xmax, xmin), 
           yend = c(ymax, ymax, ymin, ymin), colour = "black") + 
  geom_point(data = example.play, aes(x = (xmax-y), y = x, shape = team,
                                 fill = team, group = nflId, size = team, colour = team), alpha = 0.7) + 
  geom_text(data = example.play, aes(x = (xmax-y), y = x, label = jerseyNumber), colour = "white", 
            vjust = 0.36, size = 3.5) + 
  ylim(ymin, ymax) + 
  coord_fixed() +  
  theme_nothing() + 
  transition_time(frame.id)  +
  ease_aes('linear') + 
  NULL

## Ensure timing of play matches 10 frames-per-second
play.length.ex <- length(unique(example.play$frame.id))
#animate(animate.play, fps = 10, nframe = play.length.ex)

Win probability

anim_pit_no <- df.pbp %>% 
  ggplot(aes(seconds.into, home_wp)) + 
  geom_line(colour = "black", lwd = 1) + 
  geom_line(aes(y = 1-home_wp), colour = "#FFB612", lwd = 1.2)+ 
  geom_image(data = df.logos, aes(image = url), size = 0.08, by = "height") + 
  scale_size_identity() + 
  geom_segment(aes(x = 200, xend =400, y = 0.95, yend = 0.95), colour = "black", lwd = 1.2) + 
  geom_segment(aes(x = 200, xend =400, y = 0.05, yend = 0.05), colour = "#FFB612", lwd = 1.2) + 
  annotate("point", x = df.start$seconds.into[1], y = df.start$home_wp[1], colour = "black", size = 2) + 
  annotate("point", x = df.start$seconds.into[1], y = 1-df.start$home_wp[1], colour = "#FFB612", size = 2) +
  scale_x_continuous(breaks = c(0, 900, 1800, 2700, 3600), 
                     lim = c(0, 4000),
                     labels = c("Start \n Q1", "Start \n Q2", "Start \n Q3", "Start \n Q4", "End \n NO \n 31-28"), "") +   
  scale_y_continuous(breaks = c(0, 0.25, 0.5, 0.75, 1), lim = c(0, 1), labels = scales::percent, "") + 
  theme_classic(30) +  
  theme(plot.title = element_text(hjust = 0.5), plot.subtitle = element_text(hjust = 0.5)) + 
  labs(title = "PITTSBURGH @ NEW ORLEANS", subtitle = "Game winning chances: Week 16, 2018") + 
  geom_label(data = df.add[1,], aes(label = label), hjust = 0.5, size = 5.5) + 
  geom_label(data = df.add[2,], aes(label = label), hjust = 0.5, size = 5.5) + 
  transition_reveal(seconds.into)

Onside kicks

df.anim <- df.onside.ave %>% 
  ggplot(aes(ave.x.start, ave.y, fill =  Season.dir)) +
  theme_classic(23) + 
  scale_fill_manual(values = c("#e41a1c","#377eb8"), labels = c(" 2017", " 2018"), "") +
  labs(title = "ONSIDE KICKS: RIGHT") + 
  scale_y_continuous("Yards downfield") +
  geom_hline(aes(yintercept = c(0)), lwd = 1.2)+ 
  geom_hline(aes(yintercept = c(10)), lwd = 1.1)+ 
  annotate("text", x = df.hash$x[df.hash$x < 55/2], 
           y = df.hash$y[df.hash$x < 55/2], label = "_", hjust = 0.5, vjust = -0.2) +
  geom_point(pch = 21, size = 5, colour = "black") + 
  geom_point(data = ball.x, size = 5, pch = 19, colour = "#624a2e", fill = "#624a2e") + 
  geom_point(data = ball.x, size = 3, pch = 124, colour = "white", fill = "white") + 
  transition_time(time) 

Shown to participants

Spray charts/qb paths

RB locations

RB maps

Shown to participants

Big Data Bowl

Background

  1. Football Ops website with rules and winners: https://operations.nfl.com/the-game/big-data-bowl/

  2. (Static) Github page that hosted the data: https://github.com/nfl-football-ops/Big-Data-Bowl

  3. Forthcoming link: each submission, linked with participants and select resumes