R for NFL analytics

Summary of today

Tidyverse and NFL analytics
Sample visualizations using ggplot2
Big Data Bowl insight

Tidyverse and NFL analytics

Packages

Favorite packages to use for manipulation, tidying, data viz, analysis

tidyverse : Data manipulation and graphing
lubridate : Handling dates
ggbeeswarm : Fun plots
ggridges : Fun plots
gganimate : Fun animations
tidyr : Tidy data (wide to long/long to wide)
nflscrapr : Public win probabilities and expected points
caret : Machine learning tools
lme4 : Statistical modeling
teamcolors: Team specific hex codes

#install.packages("tidyverse")
library(tidyverse) 
library(lubridate) 
library(beeswarm)  
library(gganimate) 
library(ggridges)  
library(tidyr)

Reading in data

library(tidyverse)
df.games <- read_csv("prodb/dbo.Game.csv")
df.plays <- read_csv("prodb/dbo.VideoDirectorReport.csv")

Joining data sets

Online cheat-sheet: https://stat545.com/bit001_dplyr-cheatsheet.html

left_join:

inner_join:

right_join:

anti_join:

df.plays.left.join <- df.plays %>% left_join(df.games, by = c("GameKey" = "GameKey"))
df.plays.inner.join <- df.plays %>% inner_join(df.games, by = c("GameKey" = "GameKey"))
dim(df.plays.left.join)

## [1] 2288290      75

dim(df.plays.inner.join)

## [1] 2287952      75

Data manipulation using the tidyverse

Online cheat-sheet: https://www.rstudio.com/wp-content/uploads/2015/02/data-wrangling-cheatsheet.pdf

Key commands:

select()
head()
tail()
filter()

df.plays.keep <- df.plays %>% 
  filter(HomeTeamFile == 1) %>% 
  select(GameKey, HomeClubCode, VisitorClubCode, Quarter, 
        PossessionTeam, PlayNullifiedByPenalty, 
        SpecialTeamsPlayType, PlayResult, Down, Distance, PlayDescription)
df.games <- df.games %>% select(GameKey, Season, Season_Type, Week, LeagueType)

df.plays.keep <- df.plays.keep %>% left_join(df.games, by = c("GameKey" = "GameKey"))
df.plays.keep %>% head()

df.plays.keep %>% tail()

More filter(): Categorical variables

df.plays.keep %>% 
  filter(Down == 4, Quarter == 1, Distance == 12, 
         PlayNullifiedByPenalty == "N", SpecialTeamsPlayType == "NULL")

df.plays.keep <- df.plays.keep  %>% 
  filter(Season_Type == "Reg", Season >= 2005, Season <= 2018, LeagueType == "NFL")

What data set did we create?

arrange()

df.plays.keep %>% 
  filter(PossessionTeam == "MIN", Season == 2018) %>% 
  arrange(-PlayResult) %>% 
  head()

What plays are these??

mutate()

df.plays.keep <- df.plays.keep %>% 
  mutate(is.first.down = PlayResult >= Distance, 
         scrimmage.play = SpecialTeamsPlayType == "NULL")

df.plays.keep %>% 
  filter(PossessionTeam == "MIN", Season == 2018, Down == 4, scrimmage.play, Quarter == 1)

What do these four plays represent?

group_by()
summarize()

df.plays.keep %>% 
  filter(scrimmage.play, Season == 2018) %>% 
  group_by(PossessionTeam) %>% 
  summarise(ave.yds.gained = mean(PlayResult)) %>% 
  arrange(-ave.yds.gained) %>% 
  head()

Identify the leaderboard above:

df.plays.keep %>% 
  filter(scrimmage.play) %>% 
  group_by(PossessionTeam, Season) %>% 
  summarise(ave.yds.gained = mean(PlayResult)) %>% 
  arrange(-ave.yds.gained) %>% 
  head()

Identify the leaderboard above:

df.plays.keep %>% 
  filter(scrimmage.play) %>% 
  group_by(PossessionTeam, Season) %>% 
  summarise(ave.yds.gained = mean(PlayResult)) %>% 
  arrange(-ave.yds.gained) %>% 
  tail()

Identify the leaderboard above:

Data visualization

Sample Q1: How often do teams go for it on 4th-short in each season?

fourth.down.rates <- df.plays.keep %>% 
  filter(Down == 4, Distance <= 2) %>% 
  group_by(PossessionTeam, Season) %>% 
  summarise(go.forit.rate = mean(scrimmage.play), 
            n.chances = n())


fourth.down.rates %>% 
  arrange(-go.forit.rate) %>% 
  head()

fourth.down.rates %>% 
  arrange(go.forit.rate) %>% 
  head()

library(ggbeeswarm)
fourth.down.rates %>% 
  ggplot(aes(x = Season, y = go.forit.rate, fill = go.forit.rate)) + 
  geom_quasirandom(pch = 21, size = 3) + 
  scale_fill_viridis_c("",  guide = FALSE) + 
  theme_classic(15) +  
  scale_x_continuous(labels = 2005:2018, breaks = 2005:2018) + 
  scale_y_continuous(labels = scales::percent) + 
  labs(title = "Go-for-it rate, 4th-1 or 4th-2", y = "", x = "")

Sample Q2: What’s going on on the punt play?

df.punts.keep <- df.plays %>% 
  filter(HomeTeamFile == 1, SpecialTeamsPlayType == "Punt"|SpecialTeamsPlayType == "Punt Return") %>% 
  select(GameKey, HomeClubCode, VisitorClubCode, Quarter, KickoffResult,
        PossessionTeam, PlayNullifiedByPenalty, 
        SpecialTeamsPlayType, PlayResult, Down, Distance, PlayDescription)

df.punts.keep <- df.punts.keep %>% left_join(df.games, by = c("GameKey" = "GameKey"))

df.punts.keep %>% 
  mutate(is.penalty = grepl("PENALTY", PlayDescription)) %>% 
  summarise(p.rate = mean(is.penalty))

df.punts.keep %>% 
  filter(Season >= 2001) %>% 
  ggplot(aes(Season, fill = KickoffResult)) + 
  geom_bar(position = "fill") + 
  scale_fill_brewer(palette = "Set1") + 
  scale_y_continuous(labels = scales::percent, "Rate") + 
  labs(title = "Punt outcomes over time")

Other charts of interest

Examples from league office

Tyler Lockett

df.lockett <- data.frame(n.targets = 70, pass.rtg = 158.33, 
                         url = "http://static.nfl.com/static/content/public/static/img/fantasy/transparent/200x200/LOC420915.png")
library(ggimage)
p.lockett <- wr.yearly.sum %>% 
  ggplot(aes(n.targets, pass.rtg)) + 
  geom_point(colour = "grey") + 
  geom_image(data = df.lockett, aes(image = url), size = 0.1, hjust = 0) + 
  geom_point(data = df.lockett, colour = "#002244", size = 2) +
  theme_minimal(14) + 
  labs(title = "Passer rating by receiver targeted, 2002 - 2018", 
       x = "# of targets", y = "Passer rating") + 
  scale_y_continuous(breaks = c(0, 50, 100, 150, 158.33)) + 
  theme(panel.grid.minor.y = element_blank())

Maps

Team logos with ggimage: link

Competition committee work: resampling plays to estimate overtime outcomes

What can we do with Next Gen Stats?

General processes for NGS data

Start small. Identify play-type of interest and find 5-10 examples using playId/gameId
Scrape data using https://docs.ngs.nfl.com and personal credentials
Build animation/summary metrics within the sample of plays
Re-center data using playDirection
Re-orgin data using location where (player/ball) started
Cross-check with video
Expand across larger sample of plays
Sample large sample of plays to cross-check for accuracy

Play animation

file.tracking <- "https://raw.githubusercontent.com/nfl-football-ops/Big-Data-Bowl/master/Data/tracking_gameId_2017090700.csv"
tracking.example <- read_csv(file.tracking)

file.game <- "https://raw.githubusercontent.com/nfl-football-ops/Big-Data-Bowl/master/Data/games.csv"
games.sum <- read_csv(file.game) 

file.plays <- "https://raw.githubusercontent.com/nfl-football-ops/Big-Data-Bowl/master/Data/plays.csv"
plays.sum <- read_csv(file.plays) 

tracking.example.merged <- tracking.example %>% inner_join(games.sum) %>% inner_join(plays.sum) 

example.play <- tracking.example.merged %>% filter(playId == 938)
example.play %>% select(playDescription) %>% slice(1)

library(gganimate)
library(cowplot)

## General field boundaries
xmin <- 0
xmax <- 160/3
hash.right <- 38.35
hash.left <- 12
hash.width <- 3.3


## Specific boundaries for a given play
ymin <- max(round(min(example.play$x, na.rm = TRUE) - 10, -1), 0)
ymax <- min(round(max(example.play$x, na.rm = TRUE) + 10, -1), 120)
df.hash <- expand.grid(x = c(0, 23.36667, 29.96667, xmax), y = (10:110))
df.hash <- df.hash %>% filter(!(floor(y %% 5) == 0))
df.hash <- df.hash %>% filter(y < ymax, y > ymin)

animate.play <- ggplot() +
  scale_size_manual(values = c(6, 4, 6), guide = FALSE) + 
  scale_shape_manual(values = c(21, 16, 21), guide = FALSE) +
  scale_fill_manual(values = c("#e31837", "#654321", "#002244"), guide = FALSE) + 
  scale_colour_manual(values = c("black", "#654321", "#c60c30"), guide = FALSE) + 
  annotate("text", x = df.hash$x[df.hash$x < 55/2], 
           y = df.hash$y[df.hash$x < 55/2], label = "_", hjust = 0, vjust = -0.2) + 
  annotate("text", x = df.hash$x[df.hash$x > 55/2], 
           y = df.hash$y[df.hash$x > 55/2], label = "_", hjust = 1, vjust = -0.2) + 
  annotate("segment", x = xmin, 
           y = seq(max(10, ymin), min(ymax, 110), by = 5), 
           xend =  xmax, 
           yend = seq(max(10, ymin), min(ymax, 110), by = 5)) + 
  annotate("text", x = rep(hash.left, 11), y = seq(10, 110, by = 10), 
                    label = c("G   ", seq(10, 50, by = 10), rev(seq(10, 40, by = 10)), "   G"), 
                    angle = 270, size = 4) + 
  annotate("text", x = rep((xmax - hash.left), 11), y = seq(10, 110, by = 10), 
           label = c("   G", seq(10, 50, by = 10), rev(seq(10, 40, by = 10)), "G   "), 
           angle = 90, size = 4) + 
  annotate("segment", x = c(xmin, xmin, xmax, xmax), 
           y = c(ymin, ymax, ymax, ymin), 
           xend = c(xmin, xmax, xmax, xmin), 
           yend = c(ymax, ymax, ymin, ymin), colour = "black") + 
  geom_point(data = example.play, aes(x = (xmax-y), y = x, shape = team,
                                 fill = team, group = nflId, size = team, colour = team), alpha = 0.7) + 
  geom_text(data = example.play, aes(x = (xmax-y), y = x, label = jerseyNumber), colour = "white", 
            vjust = 0.36, size = 3.5) + 
  ylim(ymin, ymax) + 
  coord_fixed() +  
  theme_nothing() + 
  transition_time(frame.id)  +
  ease_aes('linear') + 
  NULL

## Ensure timing of play matches 10 frames-per-second
play.length.ex <- length(unique(example.play$frame.id))
#animate(animate.play, fps = 10, nframe = play.length.ex)

Target maps

Shiny app built in R: https://www.cmusportsanalytics.com/introduction-to-next-gen-scrapy/, https://sarahmallepalle.shinyapps.io/next-gen-scrapy/

Win probability

anim_pit_no <- df.pbp %>% 
  ggplot(aes(seconds.into, home_wp)) + 
  geom_line(colour = "black", lwd = 1) + 
  geom_line(aes(y = 1-home_wp), colour = "#FFB612", lwd = 1.2)+ 
  geom_image(data = df.logos, aes(image = url), size = 0.08, by = "height") + 
  scale_size_identity() + 
  geom_segment(aes(x = 200, xend =400, y = 0.95, yend = 0.95), colour = "black", lwd = 1.2) + 
  geom_segment(aes(x = 200, xend =400, y = 0.05, yend = 0.05), colour = "#FFB612", lwd = 1.2) + 
  annotate("point", x = df.start$seconds.into[1], y = df.start$home_wp[1], colour = "black", size = 2) + 
  annotate("point", x = df.start$seconds.into[1], y = 1-df.start$home_wp[1], colour = "#FFB612", size = 2) +
  scale_x_continuous(breaks = c(0, 900, 1800, 2700, 3600), 
                     lim = c(0, 4000),
                     labels = c("Start \n Q1", "Start \n Q2", "Start \n Q3", "Start \n Q4", "End \n NO \n 31-28"), "") +   
  scale_y_continuous(breaks = c(0, 0.25, 0.5, 0.75, 1), lim = c(0, 1), labels = scales::percent, "") + 
  theme_classic(30) +  
  theme(plot.title = element_text(hjust = 0.5), plot.subtitle = element_text(hjust = 0.5)) + 
  labs(title = "PITTSBURGH @ NEW ORLEANS", subtitle = "Game winning chances: Week 16, 2018") + 
  geom_label(data = df.add[1,], aes(label = label), hjust = 0.5, size = 5.5) + 
  geom_label(data = df.add[2,], aes(label = label), hjust = 0.5, size = 5.5) + 
  transition_reveal(seconds.into)

Onside kicks

df.anim <- df.onside.ave %>% 
  ggplot(aes(ave.x.start, ave.y, fill =  Season.dir)) +
  theme_classic(23) + 
  scale_fill_manual(values = c("#e41a1c","#377eb8"), labels = c(" 2017", " 2018"), "") +
  labs(title = "ONSIDE KICKS: RIGHT") + 
  scale_y_continuous("Yards downfield") +
  geom_hline(aes(yintercept = c(0)), lwd = 1.2)+ 
  geom_hline(aes(yintercept = c(10)), lwd = 1.1)+ 
  annotate("text", x = df.hash$x[df.hash$x < 55/2], 
           y = df.hash$y[df.hash$x < 55/2], label = "_", hjust = 0.5, vjust = -0.2) +
  geom_point(pch = 21, size = 5, colour = "black") + 
  geom_point(data = ball.x, size = 5, pch = 19, colour = "#624a2e", fill = "#624a2e") + 
  geom_point(data = ball.x, size = 3, pch = 124, colour = "white", fill = "white") + 
  transition_time(time)

Shown to participants

Spray charts/qb paths

RB locations

RB maps

Shown to participants

Big Data Bowl

Background

Football Ops website with rules and winners: https://operations.nfl.com/the-game/big-data-bowl/
(Static) Github page that hosted the data: https://github.com/nfl-football-ops/Big-Data-Bowl
Forthcoming link: each submission, linked with participants and select resumes

Sample entries and code:

Kyle Burris:https://github.com/burrisk/Big-Data-Bowl

SFU college:https://operations.nfl.com/media/3670/big-data-bowl-sfu.pdf

R for NFL analytics

Michael Lopez, Director of Football Data & Analytics

Summary of today

Tidyverse and NFL analytics

Packages

Reading in data

Joining data sets

Data manipulation using the tidyverse

Data visualization

Sample Q1: How often do teams go for it on 4th-short in each season?

Sample Q2: What’s going on on the punt play?

Other charts of interest

Examples from league office

What can we do with Next Gen Stats?

Play animation

Target maps

Win probability

Onside kicks

Spray charts/qb paths

RB locations

RB maps

Big Data Bowl

Background

Sample entries and code: