Sports Analytics Assignment 2

# here are the library lines and working directory
# NOTE(review): the fread() calls further down use relative "Data/..." paths,
# so they resolve against this directory; the absolute path below is specific
# to one machine and has to be adjusted before running elsewhere.
setwd("/Users/jose/Desktop/Data Viz")

suppressMessages(library(data.table))
suppressMessages(library(dplyr)) # of choice strip cols and rows
suppressMessages(library(lubridate))
suppressMessages(library(httr))
suppressMessages(library(DescTools))
suppressMessages(library(ggplot2))
suppressMessages(library(scales))
suppressMessages(library(RColorBrewer))
suppressMessages(library(tidytext))

suppressMessages(library(ggplot2))
suppressMessages(library(ggalt))
suppressMessages(library(ggforce))
suppressMessages(library(hms))
suppressMessages(library(gganimate))
suppressMessages(library(data.table))
suppressMessages(library(dplyr))

# Load the four source tables; the trailing comments list the join keys
# each table contributes to the merge below.
games       <- fread("Data/NFLBDB2022/NFL2022/games.csv")           # gameId
plays       <- fread("Data/NFLBDB2022/NFL2022/plays.csv")           # gameId, playId, kickerId
players     <- fread("Data/NFLBDB2022/NFL2022/players.csv")         # nflId
PFFScouting <- fread("Data/NFLBDB2022/NFL2022/pffScoutingData.csv") # gameId, playId

# Start from the plays and attach, in order:
#   1. the kicker's player record (plays.kickerId -> players.nflId)
#   2. the game record            (gameId)
#   3. the PFF scouting record    (gameId + playId)
# Every join is a left join, so no play is dropped by the merge.
my_df <- plays %>%
  left_join(players, by = c("kickerId" = "nflId")) %>%
  left_join(games, by = "gameId") %>%
  left_join(PFFScouting, by = c("gameId", "playId"))

# Ranking data frame: the (up to) ten punters with the highest fair-catch
# ratio among punters with more than 40 punt attempts.
#   total_attempts_go   - punts whose specialTeamsResult is "Fair Catch"
#   total_attempts_fa   - punts with any other specialTeamsResult
#   total_attempt_count - sum of the two counts above
#   ratio_of_success    - total_attempts_go / total_attempt_count, formatted
#                         as a percent string by scales::percent()
#   ranking             - 1 = highest ratio
df <- my_df %>%
  # Only punts that could be matched to a kicker name
  filter(specialTeamsPlayType == "Punt" & !is.na(displayName)) %>%
  mutate(good_att = ifelse(specialTeamsResult == "Fair Catch", 1, 0),
         fail_att = ifelse(specialTeamsResult != "Fair Catch", 1, 0)) %>%
  group_by(displayName) %>%
  summarise(total_attempts_fa = sum(fail_att),
            total_attempts_go = sum(good_att),
            .groups = "drop") %>%
  mutate(total_attempt_count = total_attempts_fa + total_attempts_go,
         ratio_of_success = total_attempts_go / total_attempt_count) %>%
  # Minimum sample size so that low-volume punters do not dominate the ranking
  filter(total_attempt_count > 40) %>%
  select(displayName, total_attempts_fa, total_attempts_go, total_attempt_count, ratio_of_success) %>%
  # top_n() keeps ties and can therefore return more than 10 rows (and returns
  # fewer when fewer than 10 punters qualify); either case made the former
  # hard-coded `ranking = 1:10` fail. Sort deterministically (name breaks
  # ties), keep at most 10 rows, and number whatever rows remain.
  arrange(desc(ratio_of_success), displayName) %>%
  slice_head(n = 10) %>%
  mutate(ratio_of_success = percent(ratio_of_success),
         ranking = row_number()) %>%
  data.frame()

# Names of the ranked punters, used below to subset the play-level data
df_names <- df[["displayName"]]
  
# Play-level punt data for the ranked punters only, one row per punt with a
# known kick length, labelled "good" (fair catch) or "bad" (anything else)
# and carrying the punter's ranking columns from df.
tot_att_df <- my_df %>%
  filter(
    specialTeamsPlayType == "Punt",
    displayName %in% df_names,
    !is.na(kickLength)
  ) %>%
  mutate(attempt_success = ifelse(specialTeamsResult == "Fair Catch", "good", "bad")) %>%
  select(kickLength, displayName, attempt_success, playId, gameId) %>%
  left_join(df, by = "displayName") %>%
  arrange(ranking, attempt_success) %>%
  data.frame()

# Number of punts per punter and outcome ("good" / "bad"); these counts are
# later attached to the plot data and shown in the plot annotations.
tot_att_counts_to_player <- tot_att_df %>%
  count(displayName, attempt_success, name = "count_of_att") %>%
  data.frame()

# Final plot data: every punt row plus the count of punts sharing its
# punter and outcome.
tot_att_df_final <- data.frame(
  left_join(
    tot_att_df,
    tot_att_counts_to_player,
    by = c("displayName", "attempt_success")
  )
)

# One row per ranked punter for the "good" punts: attempt count, ranking and
# mean kick length rounded to two decimals. tot_att_df_final already holds
# tot_att_df joined with the per-outcome counts, so it is used as the source.
good <- tot_att_df_final %>%
  filter(attempt_success == "good") %>%
  group_by(displayName, count_of_att, ranking, attempt_success) %>%
  summarise(avg_kick = round(mean(kickLength), digits = 2), .groups = "keep") %>%
  data.frame()

# One row per ranked punter for the "bad" punts: attempt count, ranking and
# mean kick length rounded to two decimals. tot_att_df_final already holds
# tot_att_df joined with the per-outcome counts, so it is used as the source.
bad <- tot_att_df_final %>%
  filter(attempt_success == "bad") %>%
  group_by(displayName, count_of_att, ranking, attempt_success) %>%
  summarise(avg_kick = round(mean(kickLength), digits = 2), .groups = "keep") %>%
  data.frame()

# Put each punter's good and bad summaries side by side and derive the
# share of each outcome among the plotted punts.
merge_to_ratio <- good %>%
  # Columns present in both tables get "_good" (from good) or "_bad" (from
  # bad) appended: count_of_att, attempt_success, avg_kick.
  left_join(bad, by = c("displayName", "ranking"), suffix = c("_good", "_bad")) %>%
  arrange(ranking) %>%
  mutate(
    sum_of_att = count_of_att_bad + count_of_att_good,
    good_ratio = percent(count_of_att_good / sum_of_att),
    bad_ratio  = percent(count_of_att_bad / sum_of_att)
  ) %>%
  data.frame()

# Label data for the "bad" geom_text layer: one row per punter with the bad
# count, average kick length, bad share and the combined attempt total.
ready_bad <- left_join(bad, merge_to_ratio, by = c("displayName", "ranking")) %>%
  select(displayName, sum_of_att, count_of_att_bad, avg_kick, bad_ratio, ranking, attempt_success) %>%
  arrange(ranking) %>%
  data.frame()

# Label data for the "good" geom_text layer: one row per punter with the good
# count, average kick length, good share and the combined attempt total.
ready_good <- left_join(good, merge_to_ratio, by = c("displayName", "ranking")) %>%
  select(displayName, sum_of_att, count_of_att_good, avg_kick, good_ratio, ranking, attempt_success) %>%
  arrange(ranking) %>%
  data.frame()

Introduction

To determine the best kicker, I decided to focus on punts, defining a favorable punt as one whose special teams result was a fair catch. The key metric was how consistently a player's punts ended in a fair catch. To complement this metric, a box plot of the yards traveled on each play helps identify the most favorable kicks.

Description of Project

I merged the plays, players, games, and PFFScouting data to create a new data frame that outputs my top 10 punters. The top ten punters require a minimum number of kicks, both to favor more experienced kickers and to filter out those who would create outliers in kick accuracy. For this project, a threshold of 40 was enough to favor more experienced kickers and to remove some outliers, such as the one at a total attempt count of 30 that can be seen in the scatter plot of ratio of success. This was enough to create a simple box plot of the top 10, but more detail was required. In the aes function I chose to use “fill = attempt_success”, a new variable I created to help distinguish between a favorable kick outcome and an unfavorable one. This created two box plots for each player: one that plots kick length in yards when attempt_success equals good and another when it equals bad. Further data manipulation was required to create a table where I could store metrics on the count of kick attempts, the average kick length, and the ratio for each attempt type, along with the combined count of good and bad attempts to complement the ratio. I chose not to include possession team in the grouping for my metrics, because doing so splits a player's data across teams whenever the player has moved between teams, which reduces the amount of data in each group.

Data Visualization

For this project two plots are required. The first plot is used to check for outliers and to verify that there is a good spread of data for analysis. It consists of an x-axis showing the total attempt counts, complemented by a y-axis that maps the ratio of success for each total attempt count. After planning the filters, a new data frame was used as a subset of the data to output only the top ten punters where the outcome is favorable and the number of attempts is greater than 40. These were arranged by the ratio created by the favorable metric and passed into ggplot to create the base for the box plot. To add more detail, including the count of attempts, the average kick length, and the ratio, I decided to create more subsets of the original data, using filters such as the names of the players from the top 10 punters data frame.

# Fair-catch ratio for every punter (no attempt threshold, no top-10 cut);
# this is the data behind the scatter plot of ratio vs. attempt count.
df2 <- my_df %>%
  select(specialTeamsPlayType, specialTeamsResult, displayName, possessionTeam, gameId, playId, kickLength, hangTime, kickDirectionActual, kickDirectionIntended) %>%
  filter(specialTeamsPlayType == "Punt", !is.na(displayName)) %>%
  group_by(displayName) %>%
  # Each punt contributes 1 to exactly one of the two tallies
  summarise(
    total_attempts_fa = sum(ifelse(specialTeamsResult != "Fair Catch", 1, 0)),
    total_attempts_go = sum(ifelse(specialTeamsResult == "Fair Catch", 1, 0)),
    .groups = "drop"
  ) %>%
  mutate(
    total_attempt_count = total_attempts_fa + total_attempts_go,
    ratio_of_success = total_attempts_go / total_attempt_count
  ) %>%
  select(displayName, total_attempts_fa, total_attempts_go, total_attempt_count, ratio_of_success) %>%
  arrange(desc(ratio_of_success)) %>%
  data.frame()

# Scatter plot of each punter's success ratio against their number of punt
# attempts; x-axis ticks are placed every 5 attempts starting at 0.
ggplot(data = df2, mapping = aes(total_attempt_count, ratio_of_success)) +
  geom_point(size = 3, color = "blue") +
  scale_x_continuous(
    breaks = seq(from = 0, to = max(df2$total_attempt_count), by = 5)
  ) +
  labs(
    title = "Scatter Plot of Ratio of Success by Attempt Count",
    x = "Total Attempt Count",
    y = "Ratio of Success"
  ) +
  theme(
    plot.title  = element_text(hjust = 0.5, size = 18, face = "bold"),
    axis.title  = element_text(size = 18, face = "bold"),
    axis.text.x = element_text(size = 12),
    axis.text.y = element_text(size = 12)
  )

# Box plot of kick length for each ranked punter, split by outcome
# (attempt_success), with a text summary per punter and outcome.
ggplot(
  tot_att_df_final,
  aes(reorder(displayName, -ranking), kickLength, fill = attempt_success)
) +
  geom_boxplot() +

  # Summary text for the good punts, anchored at y = 130 on the kick-length
  # axis; the negative vjust offsets it to the opposite side of the bad text.
  geom_text(
    data = ready_good,
    mapping = aes(
      x = reorder(displayName, -ranking),
      y = 130,
      label = paste("Count:", count_of_att_good,
                    "|| Avg Kick:", avg_kick,
                    "|| Good Kick Stat:", good_ratio,
                    "of", sum_of_att),
      colour = attempt_success
    ),
    size = 2.5,
    hjust = .9,
    vjust = -1,
    show.legend = FALSE
  ) +

  # Summary text for the bad punts, same anchor with a positive vjust
  geom_text(
    data = ready_bad,
    mapping = aes(
      x = reorder(displayName, -ranking),
      y = 130,
      label = paste("Count:", count_of_att_bad,
                    "|| Avg Kick:", avg_kick,
                    "|| Bad Kick Stat:", bad_ratio,
                    "of", sum_of_att),
      colour = attempt_success
    ),
    size = 2.5,
    hjust = .9,
    vjust = 1.75,
    show.legend = FALSE
  ) +

  # Punters on the vertical axis; clipping is off so text may extend past
  # the panel
  coord_flip(clip = "off") +

  # Scales: extra padding around the punter axis, a kick-length tick every
  # 5 units across the observed range, and a legend title for the fill
  scale_x_discrete(expand = expansion(mult = c(0.1, 0.1))) +
  scale_y_continuous(
    breaks = seq(
      from = min(tot_att_df_final$kickLength, na.rm = TRUE),
      to   = max(tot_att_df_final$kickLength, na.rm = TRUE),
      by   = 5
    )
  ) +
  scale_fill_discrete(name = "Success Group") +

  # Titles
  labs(
    title = "Ranking of Punters with higher than 40 attempts \nbased on probability of a good kick",
    x = "Players (By Rank)",
    y = "Kick Length"
  ) +

  # Green panel with white/grey grid lines, plus text sizing
  theme(
    panel.background = element_rect(fill = "darkgreen", color = "darkgreen"),
    panel.grid.major = element_line(color = "white", linewidth = 0.5),
    panel.grid.minor = element_line(color = "grey", linetype = "dashed", linewidth = 0.25),
    plot.title       = element_text(hjust = 0.5, size = 18, face = "bold"),
    axis.title       = element_text(size = 18, face = "bold"),
    axis.text.x      = element_text(size = 12),
    axis.text.y      = element_text(size = 12)
  )

Conclusion

From this project I learned how important planning is when creating a list of the best in a category. Although some ideas might seem simple or straightforward while planning, they might require extra steps to carry out, especially if the correct order of steps is not followed. This appeared during steps such as grouping and summarizing, among others. Planning was important for this project, but there will always be some oversights, especially when working with many different libraries and functions that have rules that must be followed.

Sports Analytics Assignment 2

Jose Ojea

09/14/2024

Introduction

Description of Project

Data Visualization

Conclusion