For assignment #2, I merged four CSV files: players, plays, games, and PFF scouting data. I kept only punt plays by subsetting the plays data to rows whose special-teams play type is "Punt", and selected specific columns such as kickLength to investigate how consistent each punter's kick length is.
I removed rows with missing values, grouped the data by season and player, and set any remaining NA values in the summary to 0.
I calculated the mean and the standard deviation of the punt kick length for each player in each season. The best kicker was defined as the player with the smallest standard deviation around the mean kick length: a small standard deviation means the player was fairly consistent in the length of his punts.
# Directory against which the relative CSV file names used below are resolved.
# NOTE(review): absolute, machine-specific path -- the script only runs where
# this directory exists; confirm before running on another machine.
my_path <-"C:/Users/rande/Rprogram/IS470SportsAnalytics/"
# Switch the working directory so read.csv("plays.csv") etc. find their files.
setwd(my_path)
library(httr)
library(dplyr)
library(RColorBrewer)
library(ggplot2)
library(data.table)
library(tidyverse)
library(tidytext)
# ---- Load the raw data and build one row per (player, punt) ----

# Plays: keep punts only; every later summary is about punt length.
dfplays <- read.csv("plays.csv") %>%
  filter(specialTeamsPlayType %in% "Punt")

dfplayers <- read.csv("players.csv")
dfgames <- read.csv("games.csv")
dfscouting <- read.csv("PFFScoutingData.csv")

# Attach each punt to its kicker. all.x = TRUE keeps players without any punt
# as rows of NAs.
df_all <- merge(
  dfplayers, dfplays,
  by.x = "nflId", by.y = "kickerId",
  all.x = TRUE
)

# Join keys are spelled out with `by =`. Giving only `by.x` leaves `by.y` to
# default to every column name the two frames share, which silently changes
# (or errors) as soon as they share an extra column.
df_all <- merge(df_all, dfgames, by = "gameId", all.x = TRUE)
df_all <- merge(df_all, dfscouting, by = c("gameId", "playId"), all.x = TRUE)

# Columns carried into the analysis frame.
cols_to_use <- c(
  "season", "displayName", "kickLength", "specialTeamsPlayType", "hangTime"
)
df_f <- df_all[cols_to_use]
### part 1 ----
# df_f (the selected analysis columns) is built above from df_all.
# Drop every row that has a missing value in any of those columns.
df_f <- na.omit(df_f)

# Mean and standard deviation of punt length per player and season.
# - `.groups` (plural) is the real summarize() argument; the former `.group`
#   was taken as a new column named ".group".
# - Player names are left intact: trimming 7 characters is only valid after
#   tidytext::reorder_within(), which appends "___<season>"; after a plain
#   reorder() it cut the end off every name.
dft1 <- df_f %>%
  group_by(season, displayName) %>%
  summarize(
    kl_m = round(mean(kickLength), 2),
    kl_sd = round(sd(kickLength), 2),
    .groups = "drop"
  ) %>%
  data.frame()

# sd() is NA for a player with a single punt in a season; those become 0.
# NOTE(review): this makes one-punt players look perfectly consistent --
# confirm that is intended, or filter on a minimum number of punts.
dft1[is.na(dft1)] <- 0
# Bar chart: mean punt length per player, one panel per season.
# Bar height = mean kick length, fill colour and text label = standard deviation.
ggplot(
  dft1,
  # reorder_within() sorts the bars inside each season panel; a plain
  # reorder() computes one global order across all seasons.
  aes(x = reorder_within(displayName, -kl_m, season), y = kl_m, fill = kl_sd)
) +
  geom_col() +
  geom_text(aes(label = paste0(kl_sd)), vjust = -0.5, size = 3) +
  # Strips the "___<season>" suffix that reorder_within() adds to axis labels.
  scale_x_reordered() +
  # 10% headroom so the SD labels above the tallest bar are not clipped.
  scale_y_continuous(limits = c(0, max(dft1$kl_m) * 1.1)) +
  scale_fill_continuous(
    # Limits start at 0 to match the breaks; with a lower limit of 1, SD values
    # below 1 (including the 0 assigned to NA SDs) were drawn in grey.
    limits = c(0, 15),
    breaks = seq(0, 15, 1),
    labels = paste0(seq(0, 15, 1)),
    low = "red",
    high = "dark green"
  ) +
  # One column of panels; the row count follows the number of seasons.
  facet_wrap(~season, ncol = 1, scales = "free") +
  labs(
    x = "Player Name",
    y = "mean KickLength of punt",
    title = "Mean kicklength of Punt by Player"
  ) +
  theme(
    axis.text.x = element_text(angle = 90, vjust = .5, hjust = 1),
    plot.title = element_text(hjust = 0.5)
  )
# Bump chart: players are ranked within each season by the standard deviation around their mean kick length, to evaluate how consistent each punter's kick length is.
# Per-season consistency ranking used by the bump chart.
# Rank 1 = smallest standard deviation of punt length = most consistent punter.
dft1 <- df_f %>%
  group_by(season, displayName) %>%
  summarize(
    kl_m = round(mean(kickLength), 2),
    kl_sd = round(sd(kickLength), 2),
    # Must stay grouped by season so rank() below is computed within a season.
    # ("keep" would leave one row per group and give every player rank 1; the
    # former `.group = 'keep'` was a typo that only added a ".group" column.)
    .groups = "drop_last"
  ) %>%
  mutate(
    # Ascending rank: lowest SD gets rank 1. rank(-kl_sd) ranked the *least*
    # consistent player first. Players with an NA SD (a single punt) are
    # placed last by rank()'s default na.last = TRUE.
    rank = rank(kl_sd),
    # Plain character names; the earlier reorder_within() + substr(-7) pair
    # only appended and then removed the "___<season>" suffix.
    displayName = as.character(displayName)
  ) %>%
  data.frame()
# Colour palette: one interpolated "Set2" colour per distinct player.
# Sized from the data because scale_color_manual() fails when there are more
# players than colours (the previous fixed size was 53).
cols <- colorRampPalette(brewer.pal(8, "Set2"))
myPal <- cols(length(unique(dft1$displayName)))

# Seasons present in the data, used for the x-axis breaks and labels.
season_range <- min(dft1$season):max(dft1$season)

# Bump chart: one line per player tracking his rank across seasons.
ggplot(dft1, aes(x = season, y = rank, group = displayName)) +
  geom_line(aes(color = displayName), size = 2) +
  geom_point(shape = 21, size = 4, fill = "blue") +
  # Reversed axis so rank 1 sits at the top of the chart.
  scale_y_reverse(breaks = seq(max(dft1$rank), 1, -1)) +
  # Player names to the left of the first season ...
  geom_text(
    data = dft1 %>% filter(season == min(season)),
    aes(x = season - .05, y = rank, label = displayName),
    size = 3,
    hjust = 1
  ) +
  # ... and to the right of the last season.
  geom_text(
    data = dft1 %>% filter(season == max(season)),
    aes(x = season + 0.05, y = rank, label = displayName),
    size = 3,
    hjust = 0
  ) +
  scale_x_continuous(
    breaks = season_range,
    labels = as.character(season_range)
  ) +
  scale_color_manual(values = myPal) +
  labs(
    title = "Bump chart for Players by Standard deviation around mean kicklength- ranking by lowest SD",
    x = "Season",
    y = "Rank",
    colour = "Players"
  ) +
  theme(plot.title = element_text(hjust = 0.5))
Johnny Hekker in 2018, Jake Bailey in 2019, Corey Bojorquez in 2020.