For this visualization project, I sought to display how the New York Knicks (My favorite team) shoots as a unit relative to league average. For this goal, I have created what is sometimes called a “snowflake chart.”
All data are from the 2021-22 season.
I give all the credit to Owen Philips whose code and tutorials made it possible for me to pursue this project.
Code can be found below.
setwd("C:/Users/shasu/OneDrive/Desktop/R Files")
library(tidyverse)
library(metR)
library(nbastatR)
library(extrafont)
library(teamcolors)
library(cowplot)
# Custom theme
theme_owen <- function () {
theme_minimal(base_size=11, base_family="Consolas") %+replace%
theme(
panel.grid.minor = element_blank(),
plot.background = element_rect(fill = 'floralwhite', color = "floralwhite")
)
}
# Turn off scientific notation
options(scipen=999)
# Get NBA team names
Sys.setenv(VROOM_CONNECTION_SIZE=500072)
tms <- nba_teams()
tms <- tms %>% filter(isNonNBATeam == 0) %>% select(nameTeam, idTeam, slugTeam)
# Get primary team colors
tm.colors <- teamcolors::teamcolors
tm.colors <- tm.colors %>% filter(league == "nba")
tm.colors <- tm.colors %>% select(name, secondary)
# Manually adjust a few team colors
tm.colors <- tm.colors %>%
mutate(secondary = case_when(
name == "Golden State Warriors" ~ "#1D428A",
name == "Indiana Pacers" ~ "#002D62",
name == "Los Angeles Lakers" ~ "#552583",
name == "San Antonio Spurs" ~ "#000000",
name == "Oklahoma City Thunder" ~ "#EF3B24",
name == "Charlotte Hornets" ~ "#00788C",
name == "Utah Jazz" ~ "#00471B",
name == "New Orleans Pelicans" ~ "#0C2340",
TRUE ~ secondary
))
# Load NBA court dimensions from Github
devtools::source_url("https://github.com/Henryjean/NBA-Court/blob/main/CourtDimensions.R?raw=TRUE")
df <- read.csv("shotData23.csv")
# Merge shot data with team names
df <- left_join(df, tms)
# clean up the shot data a bit to fit our court dimensions
df <- df %>% mutate(locationX = as.numeric(as.character(locationX)) / 10,
locationY = as.numeric(as.character(locationY)) / 10 + hoop_center_y)
# Horizontally flip the data
df$locationX <- df$locationX * -1
# Clean up the Clippers team name for merging purposes
df <- df %>%
mutate(nameTeam = case_when(
nameTeam == "LA Clippers" ~ "Los Angeles Clippers",
TRUE ~ nameTeam
)) %>%
mutate(slugTeam = case_when(
nameTeam == "Los Angeles Clippers" ~ "LAC",
TRUE ~ slugTeam
))
# Filter out any shots greater than 35 feet (too noisy beyond that)
df <- df %>% filter(distanceShot <= 35)
get_density <- function(x, y, ...) {
density_out <- MASS::kde2d(x, y, ...)
int_x <- findInterval(x, density_out$x)
int_y <- findInterval(y, density_out$y)
comb_int <- cbind(int_x, int_y)
return(density_out$z[comb_int])
}
# Pick a team of interest
team1 <- "New York Knicks"
# Decide how granular we want to go w/ our density calculation. I've found that 300 works pretty well, but note that it runs a bit slow
n <- 300
# filter data to our team of interest, assign it to tm1
tm1 <- df %>%
select(locationX, locationY, nameTeam) %>%
filter(nameTeam == team1)
# filter data for every team other than our team of interest, assign it to tm2
tm2 <- df %>%
select(locationX, locationY, nameTeam)%>%
filter(nameTeam != team1)
# get x/y coords as vectors
tm1_x <- pull(tm1, locationX)
tm1_y <- pull(tm1, locationY)
# get x/y coords as vectors
tm2_x <- pull(tm2, locationX)
tm2_y <- pull(tm2, locationY)
# get x and y range to compute comparisons across
x_rng = range(c(-25, 25))
y_rng = range(c(0, 52))
# Explicitly calculate bandwidth for future use
bandwidth_x <- MASS::bandwidth.nrd(c(tm1_x, tm2_x))
bandwidth_y <- MASS::bandwidth.nrd(c(tm1_y, tm2_y))
bandwidth_calc <- c(bandwidth_x, bandwidth_y)
# Calculate the density estimate over the specified x and y range
d2_tm1 = MASS::kde2d(tm1_x, tm1_y, h = bandwidth_calc, n=n, lims=c(x_rng, y_rng))
d2_tm2 = MASS::kde2d(tm2_x, tm2_y, h = bandwidth_calc, n=n, lims=c(x_rng, y_rng))
# Create a new dataframe that contains the difference in shot density between our two dataframes
df_diff <- d2_tm1
# matrix subtraction density from tm1 from league average
df_diff$z <- d2_tm1$z - d2_tm2$z
# add matrix col names
colnames(df_diff$z) <- df_diff$y
# Convert list to dataframe with relevant variables and columns
df_diff <- df_diff$z %>%
as.data.frame() %>%
mutate(x_coord = df_diff$x) %>%
pivot_longer(-x_coord, names_to = "y_coord", values_to = "z") %>%
mutate(y_coord = as.double(y_coord),
name = team1) %>%
ungroup()
##
df_diff <- left_join(df_diff, tm.colors, by = "name")
##
p <- ggplot() +
geom_contour_fill(data = df_diff %>% filter(z >= mean(z)), aes(x = x_coord, y = y_coord, z = sqrt(z))) +
geom_contour_tanaka(data = df_diff, aes(x = x_coord, y = y_coord, z = sqrt(z)), bins = 5) +
coord_fixed(clip = 'off') +
theme_owen() +
scale_fill_gradient2(low = 'floralwhite', mid = 'floralwhite', high = df_diff$secondary[1]) +
scale_y_continuous(limits = c(-2.5, 41)) +
scale_x_continuous(limits = c(-30, 30)) +
theme(legend.position = 'none',
line = element_blank(),
axis.title.x = element_blank(),
axis.title.y = element_blank(),
axis.text.x = element_blank(),
axis.text.y = element_blank(),
plot.margin = margin(.25, .25, .5, .25, "lines"),
plot.title = element_text(size = 24, hjust= .5, face = 'bold', margin = margin(t = -25), color = '#006bb6'),
plot.subtitle = element_text(size = 10, hjust= .5, face = 'bold', margin = margin(t = 15)),
plot.caption = element_text(hjust = .5, size = 12)) +
geom_path(data = court_points,
aes(x = x, y = y, group = desc, linetype = dash),
color = "black", size = .25) +
labs(title = df_diff$name[1],
subtitle = "Where the Knicks tend to shoot from relative to league average",
caption = "Darker areas indicate a higher frequency of shot attempts from that location relative to league average") +
geom_curve(aes(x = 0, y = 32.5, xend = 2.5, yend = 14), curvature = -0.2, arrow = arrow(length = unit(0.03, "npc"))) +
annotate(geom = 'label', x = 0, y = 33.5, hjust = .5, label = "The Knicks like to shoot in the area between midrange and the rim", family = "Consolas", size = 3)
p <- cowplot::ggdraw(p) +
theme(plot.background = element_rect(fill="floralwhite", color = NA))
p