Tidyverse and NFL analytics
Sample visualizations using ggplot2
Big Data Bowl insight
Favorite packages to use for manipulation, tidying, data viz, analysis
tidyverse : Data manipulation and graphinglubridate : Handling datesggbeeswarm : Fun plotsggridges : Fun plotsgganimate : Fun animationstidyr : Tidy data (wide to long/long to wide)nflscrapr : Public win probabilities and expected pointscaret : Machine learning toolslme4 : Statistical modelingteamcolors: Team specific hex codes#install.packages("tidyverse")
library(tidyverse)
library(lubridate)
library(beeswarm)
library(gganimate)
library(ggridges)
library(tidyr)
library(tidyverse)
df.games <- read_csv("prodb/dbo.Game.csv")
df.plays <- read_csv("prodb/dbo.VideoDirectorReport.csv")
Online cheat-sheet: https://stat545.com/bit001_dplyr-cheatsheet.html
left_join:
inner_join:
right_join:
anti_join:
df.plays.left.join <- df.plays %>% left_join(df.games, by = c("GameKey" = "GameKey"))
df.plays.inner.join <- df.plays %>% inner_join(df.games, by = c("GameKey" = "GameKey"))
dim(df.plays.left.join)
## [1] 2288290 75
dim(df.plays.inner.join)
## [1] 2287952 75
Online cheat-sheet: https://www.rstudio.com/wp-content/uploads/2015/02/data-wrangling-cheatsheet.pdf
Key commands:
select()
head()
tail()
filter()
df.plays.keep <- df.plays %>%
filter(HomeTeamFile == 1) %>%
select(GameKey, HomeClubCode, VisitorClubCode, Quarter,
PossessionTeam, PlayNullifiedByPenalty,
SpecialTeamsPlayType, PlayResult, Down, Distance, PlayDescription)
df.games <- df.games %>% select(GameKey, Season, Season_Type, Week, LeagueType)
df.plays.keep <- df.plays.keep %>% left_join(df.games, by = c("GameKey" = "GameKey"))
df.plays.keep %>% head()
df.plays.keep %>% tail()
filter(): Categorical variablesdf.plays.keep %>%
filter(Down == 4, Quarter == 1, Distance == 12,
PlayNullifiedByPenalty == "N", SpecialTeamsPlayType == "NULL")
df.plays.keep <- df.plays.keep %>%
filter(Season_Type == "Reg", Season >= 2005, Season <= 2018, LeagueType == "NFL")
What data set did we create?
arrange()df.plays.keep %>%
filter(PossessionTeam == "MIN", Season == 2018) %>%
arrange(-PlayResult) %>%
head()
What plays are these??
mutate()df.plays.keep <- df.plays.keep %>%
mutate(is.first.down = PlayResult >= Distance,
scrimmage.play = SpecialTeamsPlayType == "NULL")
df.plays.keep %>%
filter(PossessionTeam == "MIN", Season == 2018, Down == 4, scrimmage.play, Quarter == 1)
What do these four plays represent?
group_by()
summarize()
df.plays.keep %>%
filter(scrimmage.play, Season == 2018) %>%
group_by(PossessionTeam) %>%
summarise(ave.yds.gained = mean(PlayResult)) %>%
arrange(-ave.yds.gained) %>%
head()
Identify the leaderboard above:
df.plays.keep %>%
filter(scrimmage.play) %>%
group_by(PossessionTeam, Season) %>%
summarise(ave.yds.gained = mean(PlayResult)) %>%
arrange(-ave.yds.gained) %>%
head()
Identify the leaderboard above:
df.plays.keep %>%
filter(scrimmage.play) %>%
group_by(PossessionTeam, Season) %>%
summarise(ave.yds.gained = mean(PlayResult)) %>%
arrange(-ave.yds.gained) %>%
tail()
Identify the leaderboard above:
fourth.down.rates <- df.plays.keep %>%
filter(Down == 4, Distance <= 2) %>%
group_by(PossessionTeam, Season) %>%
summarise(go.forit.rate = mean(scrimmage.play),
n.chances = n())
fourth.down.rates %>%
arrange(-go.forit.rate) %>%
head()
fourth.down.rates %>%
arrange(go.forit.rate) %>%
head()
library(ggbeeswarm)
fourth.down.rates %>%
ggplot(aes(x = Season, y = go.forit.rate, fill = go.forit.rate)) +
geom_quasirandom(pch = 21, size = 3) +
scale_fill_viridis_c("", guide = FALSE) +
theme_classic(15) +
scale_x_continuous(labels = 2005:2018, breaks = 2005:2018) +
scale_y_continuous(labels = scales::percent) +
labs(title = "Go-for-it rate, 4th-1 or 4th-2", y = "", x = "")
df.punts.keep <- df.plays %>%
filter(HomeTeamFile == 1, SpecialTeamsPlayType == "Punt"|SpecialTeamsPlayType == "Punt Return") %>%
select(GameKey, HomeClubCode, VisitorClubCode, Quarter, KickoffResult,
PossessionTeam, PlayNullifiedByPenalty,
SpecialTeamsPlayType, PlayResult, Down, Distance, PlayDescription)
df.punts.keep <- df.punts.keep %>% left_join(df.games, by = c("GameKey" = "GameKey"))
df.punts.keep %>%
mutate(is.penalty = grepl("PENALTY", PlayDescription)) %>%
summarise(p.rate = mean(is.penalty))
df.punts.keep %>%
filter(Season >= 2001) %>%
ggplot(aes(Season, fill = KickoffResult)) +
geom_bar(position = "fill") +
scale_fill_brewer(palette = "Set1") +
scale_y_continuous(labels = scales::percent, "Rate") +
labs(title = "Punt outcomes over time")
df.lockett <- data.frame(n.targets = 70, pass.rtg = 158.33,
url = "http://static.nfl.com/static/content/public/static/img/fantasy/transparent/200x200/LOC420915.png")
library(ggimage)
p.lockett <- wr.yearly.sum %>%
ggplot(aes(n.targets, pass.rtg)) +
geom_point(colour = "grey") +
geom_image(data = df.lockett, aes(image = url), size = 0.1, hjust = 0) +
geom_point(data = df.lockett, colour = "#002244", size = 2) +
theme_minimal(14) +
labs(title = "Passer rating by receiver targeted, 2002 - 2018",
x = "# of targets", y = "Passer rating") +
scale_y_continuous(breaks = c(0, 50, 100, 150, 158.33)) +
theme(panel.grid.minor.y = element_blank())
ggimage: linkGeneral processes for NGS data
playId/gameIdplayDirectionfile.tracking <- "https://raw.githubusercontent.com/nfl-football-ops/Big-Data-Bowl/master/Data/tracking_gameId_2017090700.csv"
tracking.example <- read_csv(file.tracking)
file.game <- "https://raw.githubusercontent.com/nfl-football-ops/Big-Data-Bowl/master/Data/games.csv"
games.sum <- read_csv(file.game)
file.plays <- "https://raw.githubusercontent.com/nfl-football-ops/Big-Data-Bowl/master/Data/plays.csv"
plays.sum <- read_csv(file.plays)
tracking.example.merged <- tracking.example %>% inner_join(games.sum) %>% inner_join(plays.sum)
example.play <- tracking.example.merged %>% filter(playId == 938)
example.play %>% select(playDescription) %>% slice(1)
library(gganimate)
library(cowplot)
## General field boundaries
xmin <- 0
xmax <- 160/3
hash.right <- 38.35
hash.left <- 12
hash.width <- 3.3
## Specific boundaries for a given play
ymin <- max(round(min(example.play$x, na.rm = TRUE) - 10, -1), 0)
ymax <- min(round(max(example.play$x, na.rm = TRUE) + 10, -1), 120)
df.hash <- expand.grid(x = c(0, 23.36667, 29.96667, xmax), y = (10:110))
df.hash <- df.hash %>% filter(!(floor(y %% 5) == 0))
df.hash <- df.hash %>% filter(y < ymax, y > ymin)
animate.play <- ggplot() +
scale_size_manual(values = c(6, 4, 6), guide = FALSE) +
scale_shape_manual(values = c(21, 16, 21), guide = FALSE) +
scale_fill_manual(values = c("#e31837", "#654321", "#002244"), guide = FALSE) +
scale_colour_manual(values = c("black", "#654321", "#c60c30"), guide = FALSE) +
annotate("text", x = df.hash$x[df.hash$x < 55/2],
y = df.hash$y[df.hash$x < 55/2], label = "_", hjust = 0, vjust = -0.2) +
annotate("text", x = df.hash$x[df.hash$x > 55/2],
y = df.hash$y[df.hash$x > 55/2], label = "_", hjust = 1, vjust = -0.2) +
annotate("segment", x = xmin,
y = seq(max(10, ymin), min(ymax, 110), by = 5),
xend = xmax,
yend = seq(max(10, ymin), min(ymax, 110), by = 5)) +
annotate("text", x = rep(hash.left, 11), y = seq(10, 110, by = 10),
label = c("G ", seq(10, 50, by = 10), rev(seq(10, 40, by = 10)), " G"),
angle = 270, size = 4) +
annotate("text", x = rep((xmax - hash.left), 11), y = seq(10, 110, by = 10),
label = c(" G", seq(10, 50, by = 10), rev(seq(10, 40, by = 10)), "G "),
angle = 90, size = 4) +
annotate("segment", x = c(xmin, xmin, xmax, xmax),
y = c(ymin, ymax, ymax, ymin),
xend = c(xmin, xmax, xmax, xmin),
yend = c(ymax, ymax, ymin, ymin), colour = "black") +
geom_point(data = example.play, aes(x = (xmax-y), y = x, shape = team,
fill = team, group = nflId, size = team, colour = team), alpha = 0.7) +
geom_text(data = example.play, aes(x = (xmax-y), y = x, label = jerseyNumber), colour = "white",
vjust = 0.36, size = 3.5) +
ylim(ymin, ymax) +
coord_fixed() +
theme_nothing() +
transition_time(frame.id) +
ease_aes('linear') +
NULL
## Ensure timing of play matches 10 frames-per-second
play.length.ex <- length(unique(example.play$frame.id))
#animate(animate.play, fps = 10, nframe = play.length.ex)
Shiny app built in R: https://www.cmusportsanalytics.com/introduction-to-next-gen-scrapy/, https://sarahmallepalle.shinyapps.io/next-gen-scrapy/
anim_pit_no <- df.pbp %>%
ggplot(aes(seconds.into, home_wp)) +
geom_line(colour = "black", lwd = 1) +
geom_line(aes(y = 1-home_wp), colour = "#FFB612", lwd = 1.2)+
geom_image(data = df.logos, aes(image = url), size = 0.08, by = "height") +
scale_size_identity() +
geom_segment(aes(x = 200, xend =400, y = 0.95, yend = 0.95), colour = "black", lwd = 1.2) +
geom_segment(aes(x = 200, xend =400, y = 0.05, yend = 0.05), colour = "#FFB612", lwd = 1.2) +
annotate("point", x = df.start$seconds.into[1], y = df.start$home_wp[1], colour = "black", size = 2) +
annotate("point", x = df.start$seconds.into[1], y = 1-df.start$home_wp[1], colour = "#FFB612", size = 2) +
scale_x_continuous(breaks = c(0, 900, 1800, 2700, 3600),
lim = c(0, 4000),
labels = c("Start \n Q1", "Start \n Q2", "Start \n Q3", "Start \n Q4", "End \n NO \n 31-28"), "") +
scale_y_continuous(breaks = c(0, 0.25, 0.5, 0.75, 1), lim = c(0, 1), labels = scales::percent, "") +
theme_classic(30) +
theme(plot.title = element_text(hjust = 0.5), plot.subtitle = element_text(hjust = 0.5)) +
labs(title = "PITTSBURGH @ NEW ORLEANS", subtitle = "Game winning chances: Week 16, 2018") +
geom_label(data = df.add[1,], aes(label = label), hjust = 0.5, size = 5.5) +
geom_label(data = df.add[2,], aes(label = label), hjust = 0.5, size = 5.5) +
transition_reveal(seconds.into)
df.anim <- df.onside.ave %>%
ggplot(aes(ave.x.start, ave.y, fill = Season.dir)) +
theme_classic(23) +
scale_fill_manual(values = c("#e41a1c","#377eb8"), labels = c(" 2017", " 2018"), "") +
labs(title = "ONSIDE KICKS: RIGHT") +
scale_y_continuous("Yards downfield") +
geom_hline(aes(yintercept = c(0)), lwd = 1.2)+
geom_hline(aes(yintercept = c(10)), lwd = 1.1)+
annotate("text", x = df.hash$x[df.hash$x < 55/2],
y = df.hash$y[df.hash$x < 55/2], label = "_", hjust = 0.5, vjust = -0.2) +
geom_point(pch = 21, size = 5, colour = "black") +
geom_point(data = ball.x, size = 5, pch = 19, colour = "#624a2e", fill = "#624a2e") +
geom_point(data = ball.x, size = 3, pch = 124, colour = "white", fill = "white") +
transition_time(time)
Shown to participants
Shown to participants
Football Ops website with rules and winners: https://operations.nfl.com/the-game/big-data-bowl/
(Static) Github page that hosted the data: https://github.com/nfl-football-ops/Big-Data-Bowl
Forthcoming link: each submission, linked with participants and select resumes