setwd("C:/Users/finco/Documents/IS470/RDirec")
library(dplyr)
# ---------- supplied functions start here -----------------------------------------------------------------
# ---------------------------------------------------------------------------
# --------------------------- Load Libraries --------------------------------
# ---------------------------------------------------------------------------
# supply a vector of desired packages. Packages will be installed if not
# already installed and available to RStudio
load_packages <- function(packages = c()) {
if (length(packages) == 0) {
print('You did not specify any packages/libraries to load')
}
else {
for (i in packages){
if(! i %in% installed.packages()){
install.packages(i, dependencies = TRUE)
}
suppressMessages(suppressWarnings(library(i, character.only=T)))
}
}
}
# -----------------------------------------------------------------------------------------------
# ------------ Checks for a valid playId and gameId; optionally check for valid frameId ---------
# -----------------------------------------------------------------------------------------------
check_playId_gameId <- function(game_Id,
play_Id,
frame_Id = c(),
data) {
if (!game_Id %in% unique(data$gameId)) {
return("the game_Id you have used is not in the data frame.")
}
if (!play_Id %in% unique(data$playId)) {
return("the play_Id you have used is not in the data frame.")
}
if(!missing(frame_Id)) {
if (!frame_Id %in% unique(data$frameId)) {
return("the frame_Id you have used is not in the data frame.")
}
}
return("ok")
}
# ---------------------------------------------------------------------------
# ------------------ Visualize an Individual Play (with player) -------------
# ---------------------------------------------------------------------------
# provide a merged data frame, a game Id, a play Id, and a player.
visualize_play <- function(df,
df_team1,
df_team2,
playerNumber = NA)
{
source('https://raw.githubusercontent.com/mlfurman3/gg_field/main/gg_field.R')
g <- ggplot(data = df, aes(x = x, y = y)) +
# customize colors, shapes, and sizes of players and the football
scale_size_manual(values = c(6, 4, 6), guide = "none") +
scale_shape_manual(values = c(21, 16, 21), guide = "none") +
scale_fill_manual(values = c("dodgerblue1", "#663300", "firebrick1"), guide = "none") +
scale_colour_manual(values = c("black", "#663300", "black"), guide = "none") +
gg_field(yardmin = -5, yardmax = 125) +
# add home team logo to the center point
geom_image(data = df_team1, aes(x = 59, y = 53.33/2, image = team_logo_wikipedia), size = 0.15) +
# add end zone team names
annotate("text", x = 4, y = 27, label = df_team2$team_name, angle = 90, size = 5, fontface = 2, color = "white") +
annotate("text", x = 115, y = 27, label = df_team1$team_name, angle = 270, size = 5, fontface = 2, color = "white") +
# add informational text at the top of the field
annotate("text", x = -3, y = 56, label = "Traffic Meter Scale:", colour = "white", vjust = 0, hjust = 0 ) +
annotate("text", x = 38, y = 56, label = "Value:" , colour = "white", vjust = 0, hjust = 0 ) +
annotate("text", x = 52, y = 56, label = "Distance traveled by receiver (yards):", colour = "white", vjust = 0, hjust = 0 ) +
annotate("text", x = 92.5, y = 56, label = "Speed (yards / sec):", colour = "white", vjust = 0, hjust = 0 ) +
# add convex hulls for both teams
geom_encircle( data = df %>% filter(team != "football"),
aes(fill = team,
group = team,
color = team),
s_shape = 1,
expand = 0,
alpha = 0.5,
show.legend = TRUE) +
# add halo around receiver
geom_point(data = df %>% filter(jerseyNumber == playerNumber),
aes(x = x, y = y, alpha = alpha),
shape = 21,
size = 10,
stroke = 1, # width of the circle border
fill = "black",
colour = "black") +
# add points to plot for all players and the football
geom_point(data = df, aes(x = x, y = y, shape = team, colour = team, size = team, fill = team) ) +
# insert jersey number for each player
geom_text( data = df %>% filter(team != "football"),
aes(x = x, y = y,
label = jerseyNumber), colour = "white", size = 3.5, vjust = 0.36 ) +
# report out key metrics (total yards ran by receiver)
geom_text( data = df %>% filter(jerseyNumber == playerNumber),
aes(x = 85.5, y = 56, label = format(dist_running_total, nsmall = 2),
vjust = 0, hjust = 0), color = "white" ) +
# report out key metrics (mean dist to player)
geom_text( data = df %>% filter(jerseyNumber == playerNumber),
aes(x = 44.5, y = 56, label = format(round(100 * (1 - (mean_dist_to_player/113.33)),2), nsmall = 2),
vjust = 0, hjust = 0), color = "white" ) +
# report out key metrics (speed of receiver)
geom_text( data = df %>% filter(jerseyNumber == playerNumber),
aes(x = 111.9, y = 56, label = s, vjust = 0, hjust = 0), color = "white" ) +
# add some labels to report the play description
labs(title = df$playDescription) +
# set the theme to dark green to color the areas beyond the end zones
theme(panel.background = element_rect(fill = "forestgreen",
color = "forestgreen"), panel.grid = element_blank()) +
annotate("rect", xmin = 15, xmax = 35, ymin = 56, ymax = 57, color = "black", fill = "white") +
geom_rect(data = df %>% filter(jerseyNumber == playerNumber),
aes(xmin = 15, xmax = 15 + (alpha*20), ymin = 56, ymax = 57,
alpha = alpha), color = "black", fill = "black" ) +
guides(alpha = "none") +
transition_time(frameId)
g
}
In this assignment, we’re tasked with improving upon Paul’s “dist” metric using the NFL’s 2022 Big Data Bowl tracking data.
Our solution: Time to tackle.
load_packages(c("ggplot2", "ggalt", "ggforce", "hms", "gganimate", "data.table", "dplyr", "nflfastR", "gifski", "png", "ggimage"))
df_tracking <- fread("Data/NFLBDB2022/NFL2022/tracking2020.csv")
df_plays <- fread("Data/NFLBDB2022/NFL2022/plays.csv")
df_games <- fread("Data/NFLBDB2022/NFL2022/games.csv")
# you are creating a metric for a player in a play (playId) in a game (gameId)
my_gameId <- 2021010300
my_playId <- 1586
my_playerNumber <- 19
#check_playId_gameId(game_Id = my_gameId, play_Id = my_playId, frame_Id = 11111, data = df)
#check_playId_gameId(my_gameId, my_playId, 1, df)
#check_playId_gameId(my_gameId, my_playId, df) # will throw an error for missing argument
#check_playId_gameId(game_Id = my_gameId, play_Id = my_playId, data = df)
df <- df_tracking %>%
filter(gameId == my_gameId & playId == my_playId ) %>%
left_join(df_plays, by = c("playId" = "playId", "gameId" = "gameId")) %>%
select(x, y, displayName, jerseyNumber, team, gameId, playId, frameId, time,
nflId, dis, playDescription, s) %>%
data.frame()
team1 <- as.character( df_games[df_games$gameId == my_gameId, "homeTeamAbbr"] )
team2 <- as.character( df_games[df_games$gameId == my_gameId, "visitorTeamAbbr"] )
df_team1 <- teams_colors_logos %>%
filter(team_abbr == team1)
df_team2 <- teams_colors_logos %>%
filter(team_abbr == team2)
df_player1 <- df %>%
filter(jerseyNumber == my_playerNumber) %>%
select(gameId, playId, frameId, x, y, dis, team) %>%
data.frame()
df <- df %>%
left_join(df_player1, by = c( "playId" = "playId", "gameId" = "gameId", "frameId" = "frameId"))
# ----------------------------- here is the code that I created in class -----------------------------
df <- df %>%
mutate(dist = ifelse( team.x != team.y & team.x != "football",
sqrt( (x.x - x.y)^2 + (y.x - y.y)^2),
NA)) %>%
group_by(gameId, playId, frameId) %>%
# compute the average of all distances
mutate(mean_dist_to_player = mean(dist, na.rm = TRUE ),
alphaOld = 1 - (mean_dist_to_player/113.33) ) %>%
ungroup()
Time to tackle was developed using other metrics in the data.
First, we take develop a metric called “relative speed”. This metric measures the speed at which a defender and ball carrier are moving towards each other. If the defender is in front of a ball carrier, their speeds are added, otherwise the difference of their speeds is taken.
From there, we take Paul’s “dist” metric and divide it by relative speed. This gives us time to tackle. A rough measurement (in seconds) of the time until the defender and ball carrier are expected to “collide” for a tackle.
Finally, we take the minimum time to tackle on each frame, as this is the most relevant value to the ball carrier.
# Create a lookup table for ball carrier speeds within each frame
ball_carrier_speed <- df %>%
filter(jerseyNumber == my_playerNumber) %>%
select(gameId, playId, frameId, s) %>%
rename(ball_carrier_speed = s)
# Join the ball carrier speeds back to the main dataframe based on gameId, playId, and frameId
df <- df %>%
left_join(ball_carrier_speed, by = c("gameId", "playId", "frameId")) %>%
# Calculate relative speed and timeToTackle using df$dist directly
mutate(
relative_speed = ifelse(team.x == "away" & !is.na(ball_carrier_speed),
ifelse(x.x < x.y, s + ball_carrier_speed, s - ball_carrier_speed),
NA),
relative_speed = abs(as.numeric(relative_speed)), # Ensure numeric
timeToTackle = ifelse(!is.na(relative_speed) & relative_speed != 0,
df$dist / relative_speed, # Use df$dist directly
NA) # Avoid division by NA or zero
) %>%
data.frame()
df <- df %>%
# Group by gameId, playId, frameId
group_by(gameId, playId, frameId) %>%
# Recalculate minimum and average timeToTackle values
mutate(
min_time_to_tackle = min(timeToTackle, na.rm = TRUE),
) %>%
ungroup()
ecdf_function <- ecdf(df$min_time_to_tackle)
# Apply the ecdf function to each value in min_time_to_tackle to get percentiles from 0 to 1
df <- df %>%
mutate(alpha = 1 - ecdf_function(min_time_to_tackle)) %>%
data.frame()
# compute a running total of the distance traveled
df <- df%>%
arrange(nflId, frameId) %>%
group_by(nflId) %>%
mutate( dist_running_total = cumsum(dis.x)) %>%
rename( "x" = "x.x",
"y" = "y.x",
"team" = "team.x",
"dis" = "dis.x") %>%
data.frame()
# run the line below if you want to visualize the play
visualize_play(df, df_team1, df_team2, playerNumber = 19)
To evaluate time to tackle and dist, we’ll build multiple regression models, where “dis” is the response value. This way, we will evaluate which metric does a better job analyzing the “traffic” the ball carrier is in, and by extension how many yards the ball carrier is expected to travel.
# Filter the data
analysisDF <- df %>%
filter(jerseyNumber == 19)
# Fit the mean distance model
meanDistModel <- lm(dis.y ~ mean_dist_to_player, data = analysisDF)
# Print the summary to the document
meanDistModelSummary <- summary(meanDistModel)
print(meanDistModelSummary)
# Fit the minimum time to tackle model
minTimeToTackleModel <- lm(dis.y ~ min_time_to_tackle, data = analysisDF)
# Print the summary to the document
minTimeToTackleModelSummary <- summary(minTimeToTackleModel)
print(minTimeToTackleModelSummary)
##
## Call:
## lm(formula = dis.y ~ mean_dist_to_player, data = analysisDF)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.39459 -0.05934 0.01012 0.10396 0.22170
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.9833833 0.0165023 59.59 <2e-16 ***
## mean_dist_to_player -0.0172711 0.0005144 -33.57 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1344 on 229 degrees of freedom
## Multiple R-squared: 0.8311, Adjusted R-squared: 0.8304
## F-statistic: 1127 on 1 and 229 DF, p-value: < 2.2e-16
##
##
## Call:
## lm(formula = dis.y ~ min_time_to_tackle, data = analysisDF)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.45859 -0.30283 0.08024 0.25714 0.36954
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.5721355 0.0201908 28.336 < 2e-16 ***
## min_time_to_tackle -0.0077854 0.0009578 -8.129 2.71e-14 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2881 on 229 degrees of freedom
## Multiple R-squared: 0.2239, Adjusted R-squared: 0.2205
## F-statistic: 66.08 on 1 and 229 DF, p-value: 2.715e-14
As you can see, mean_dist_to_player did a much better job predicting dis than minTimeToTackle. While both metrics were statistically significant predictors, mean_dist_to_player was far better at predicting variance in dis. The mean_dist_to_player regression model explained 83% of the variance in dis, while the minTimeToTackle model explained only 22%.
So, is time to tackle a bad metric? Not exactly. Time to tackle can be refined, but the main reason for the massive discrepancy in our model is our usage of the minimum time to tackle. This was a practical necessity to make the code simpler, but it clearly sacrificed predictive effectiveness. Because we are taking the lowest time to tackle out of 11 defenders, this value is prone to massive swings if the ball carrier can break past defenders. At the same time, the ball carrier will still be moving at a somewhat constant speed. The result is a very poor regression model.
Time to tackle is a promising metric, but it needs to be refined and used properly to be effective as a predictor.