I’ve scraped data from https://www.basketball-reference.com/ for 4 stat
categories for all seasons from 2001 - 2022
- Advanced
- Per
Game
- Per 100 Possession
- Shooting
I removed players who averaged less than 12 minutes per game during any given season. I may potentially remove players who played less than 20 mins per game.
library(tidymodels) # broom, dials, parsnip, tune, workflows, yardstick
library(tidyverse) #ggplot2, dplyr, tidyr, readr, purr, tibble, stringr, lubridate
library(tidytext)
library(knitr)
library(stringi)
library(corrr)
library(tidyclust)
NBACleanData <- read_csv("Data/model0623.csv",show_col_types = FALSE) %>% select(-1)
model_df <- NBACleanData %>%
mutate(Year = as.character(Year)) %>%
mutate(across(where(is.numeric), ~as.numeric(scale(.))))
nba_clust <- kmeans(model_df %>% select(-c(1:4,80)), centers = 3) # removes und needed variables
tidy(nba_clust)[, 1:8] # centers of the clusters
## # A tibble: 3 × 8
## FG_pp FGA_pp FG_pct_pp ThrP_pp ThrPA_pp ThrP_pct_pp TwoP_pp TwoPA_pp
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 -0.403 -0.190 -0.516 0.426 0.464 0.393 -0.603 -0.536
## 2 -0.139 -0.539 0.885 -1.04 -1.09 -1.07 0.446 0.302
## 3 1.16 1.14 0.166 0.241 0.219 0.365 0.927 0.940
#augment shows all the data and includes the cluster number
head(augment(nba_clust, model_df)[, c(81,1:9)],5)
## # A tibble: 5 × 10
## .cluster Year Player Pos Tm FG_pp FGA_pp FG_pct_pp ThrP_pp ThrPA_pp
## <fct> <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2 2001 AC Green PF MIA -0.874 -0.899 -0.0991 -1.16 -1.20
## 2 1 2001 AJ Guyt… PG CHI -0.384 -0.0956 -0.681 0.533 0.393
## 3 3 2001 Aaron M… SG PHI -0.0274 -0.226 0.345 -0.274 -0.192
## 4 2 2001 Aaron W… PF NJN -0.429 -0.508 0.100 -1.16 -1.25
## 5 2 2001 Adam Ke… PF GSW -1.63 -1.55 -0.727 -1.08 -1.20
augment(nba_clust, model_df) %>%
ggplot(aes(FG_pp, FGA_pp, color = .cluster)) + # the aes values can be changed if needed
geom_point()
#glance gives us other useful metrics from the model
glance(nba_clust)
## # A tibble: 1 × 4
## totss tot.withinss betweenss iter
## <dbl> <dbl> <dbl> <int>
## 1 599100 401355. 197745. 4
kclusts <-
tibble(k = 1:15) %>%
mutate(
kclust = map(k, ~ kmeans(model_df %>% select(-c(1:4)), .x)),
tidied = map(kclust, tidy),
glanced = map(kclust, glance),
augmented = map(kclust, augment, model_df %>% select(-c(1:4)))
)
kclusts %>%
unnest(glanced) %>%
ggplot(aes(k, tot.withinss)) +
geom_line(alpha = 0.8) +
geom_point(size = 2)
The elbow looks like it’s at 2 or 3. It may be beneficial to plot the
change for each k from k to k+1, k+2, and k+3
This shows the difference from k to k+1, k to k+2, and k to k+3
kclusts %>%
unnest(glanced) %>%
ggplot(aes(k, tot.withinss)) +
#change from k = 1 to k = 2
geom_line(aes(k, tot.withinss-lead(tot.withinss), color = "k+1"), alpha = 0.8 ) +
geom_point(aes(k, tot.withinss-lead(tot.withinss)),size = 2,color = "red") +
#change from k = 1 to k = 3
geom_line(aes(k, tot.withinss-lead(tot.withinss,n=2),color = "k+2"), alpha = 0.8) +
geom_point(aes(k, tot.withinss-lead(tot.withinss,n=2)),size = 2, color = "green") +
#change from k = 1 to k = 4
geom_line(aes(k, tot.withinss-lead(tot.withinss,n=3),color = "k+3"), alpha = 0.8) +
geom_point(aes(k, tot.withinss-lead(tot.withinss,n=3)),size = 2, color = "blue") +
labs(
x = "Number of Clusters (k)",
y = "tot.withinss", color = "",
title = "Where does the data level off?"
)
It’s a bit difficult to determine the optimal number of k. An argument
could be made for 3, but it seems unreasonable to have just 3 positions
for NBA players. It could be beneficial to have a team with players that
all fall in the “best” cluster, but more analysis would have to be done
on historically good teams.
An argument could be made for 6 or 7 when looking at the difference in tot.withinss from k to k+1/k+2/k+3 - it levels off around there.
The data is fairly high dimensional, so maybe reducing the dimensionality will yield more interpretable results.
Reducing the dimensionality using PCA.
#create a recipe
nba_rec <- recipe(~ ., data = model_df) %>%
update_role(Year, Player, Pos, Tm, new_role = "id") %>% #might need these variables later
step_normalize(all_predictors()) %>% #normalizes the data
step_pca(all_predictors(), num_comp = 20) #default for PCs is 5
#prep the data
nba_prep <- prep(nba_rec)
#shows just a few PCs
tidied_pca <- tidy(nba_prep, 2) # 2 refers to the step number. This provides the step_pca
# Plots the first few PCs a
tidied_pca %>%
filter(component %in% paste0("PC", 1:9)) %>%
group_by(component) %>%
top_n(10, abs(value)) %>%
ungroup() %>%
mutate(terms = reorder_within(terms, abs(value), component)) %>%
ggplot(aes(abs(value), terms, fill = value > 0)) +
geom_col() +
facet_wrap(~component, scales = "free_y") +
scale_y_reordered() +
labs(
x = "Absolute value of contribution",
y = NULL, fill = "Positive?"
)
sdev <- nba_prep$steps[[2]]$res$sdev
percent_variation <- sdev^2 / sum(sdev^2)
tibble(component = unique(tidied_pca$component),
percent_var = percent_variation) %>%
filter(component %in% paste0("PC", 1:20)) %>%
mutate(component = fct_inorder(component)) %>%
ggplot(aes(component, percent_var)) +
geom_col(position = "stack") +
geom_text(aes(label = scales::percent((round(percent_var,4))),
vjust = -0.5), size = 3, color = "red"
) +
geom_text(aes(label = scales::percent(cumsum(round(percent_var,4))),
vjust = -2), size = 4
) +
scale_y_continuous(labels = scales::percent_format(), limits = c(0,.35))
- The cummulative vairance is listed at the top of each bar. The
variance captured by each PC is below it
- The first 5 PCs capture
70% of the variance, the first 9 capture 80%, the first 16 capture
90%
Capturing 80% of the variance might be a good place to start
nba_rec_80 <- recipe(~ ., data = model_df) %>%
update_role(Year, Player, Pos, Tm, new_role = "id") %>%
step_normalize(all_predictors()) %>%
step_pca(all_predictors(), num_comp = 9)
nba_prep_80 <- prep(nba_rec_80)
Start with 3 clusters just to see how the data looks
# the template field stores the PCA variables
pca_80 <- nba_prep_80$template %>%
mutate(Year = as.character(Year),
Player = as.character(Player),
Pos = as.character(Pos),
Tm = as.character(Tm)) %>%
as_tibble()
set.seed(525)
#start by looking at 3 clusters
kclust <- kmeans(pca_80 %>% select(-c(1:4)), centers = 3)
#The tidy function gives us the centers from each variable (PCs)
tidy(kclust)
## # A tibble: 3 × 12
## PC1 PC2 PC3 PC4 PC5 PC6 PC7 PC8 PC9 size
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <int>
## 1 -3.18 1.34 0.260 -0.0175 0.0824 0.0673 -0.0175 0.0107 -0.108 4172
## 2 5.62 3.59 -0.681 -0.121 0.137 0.0203 0.101 -0.146 -0.00777 1693
## 3 1.77 -5.50 0.0327 0.131 -0.271 -0.148 -0.0459 0.0957 0.218 2124
## # ℹ 2 more variables: withinss <dbl>, cluster <fct>
#augment shows all the data and includes the cluster number
head(augment(kclust, pca_80),5)
## # A tibble: 5 × 14
## Year Player Pos Tm PC1 PC2 PC3 PC4 PC5 PC6 PC7
## <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2001 AC Green PF MIA -1.97 -4.55 -1.27 0.324 0.271 2.94 1.79
## 2 2001 AJ Guyton PG CHI -6.09 2.77 -0.551 2.40 -1.39 1.66 -0.866
## 3 2001 Aaron McKie SG PHI 1.69 2.68 -0.924 -3.82 -0.536 1.50 -1.52
## 4 2001 Aaron Willi… PF NJN 3.80 -5.72 -1.69 -0.0535 1.05 -1.11 1.34
## 5 2001 Adam Keefe PF GSW -4.55 -5.52 -2.11 -0.275 0.475 0.665 1.36
## # ℹ 3 more variables: PC8 <dbl>, PC9 <dbl>, .cluster <fct>
augment(kclust, pca_80) %>%
ggplot(aes(PC1, PC2, color = .cluster)) + # the PCx values can be changed as needed
geom_point()
#glance gives us other useful metrics from the model
glance(kclust)
## # A tibble: 1 × 4
## totss tot.withinss betweenss iter
## <dbl> <dbl> <dbl> <int>
## 1 488899. 291292. 197607. 4
kclusts <-
tibble(k = 1:15) %>%
mutate(
kclust = map(k, ~ kmeans(pca_80 %>% select(-c(1:4)), .x)),
tidied = map(kclust, tidy),
glanced = map(kclust, glance),
augmented = map(kclust, augment, pca_80 %>% select(-c(1:4)))
)
kclusts %>%
unnest(glanced) %>%
ggplot(aes(k, tot.withinss)) +
geom_line(alpha = 0.8) +
geom_point(size = 2)
The elbow looks like it’s at 3. It may be beneficial to plot the change
for each k from k to k+1, k+2, and k+3
This shows the difference from k to k+1, k to k+2, and k to k+3
kclusts %>%
unnest(glanced) %>%
ggplot(aes(k, tot.withinss)) +
#change from k = 1 to k = 2
geom_line(aes(k, tot.withinss-lead(tot.withinss), color = "k+1"), alpha = 0.8 ) +
geom_point(aes(k, tot.withinss-lead(tot.withinss)),size = 2,color = "red") +
#change from k = 1 to k = 3
geom_line(aes(k, tot.withinss-lead(tot.withinss,n=2),color = "k+2"), alpha = 0.8) +
geom_point(aes(k, tot.withinss-lead(tot.withinss,n=2)),size = 2, color = "green") +
#change from k = 1 to k = 4
geom_line(aes(k, tot.withinss-lead(tot.withinss,n=3),color = "k+3"), alpha = 0.8) +
geom_point(aes(k, tot.withinss-lead(tot.withinss,n=3)),size = 2, color = "blue") +
labs(
x = "Number of Clusters (k)",
y = "tot.withinss", color = "",
title = "Where does the data level off?"
)
It’s a bit difficult to determine the optimal number of k. An argument
could be made for 3 (similar to the above just using k), but it seems
unreasonable to have just 3 positions for NBA players. It could be
beneficial to have a team with players that all fall in the “best”
cluster, but more analysis would have to be done on historically good
teams.
An argument could be made for 7 or 8 when looking at the difference in tot.withinss from k to k+1/k+2/k+3 - it levels off around there.
Filtering down the data to include less players might make things a bit easier to interpret
From the initial cleaning, players with mpg of less than 12 minutes per game were removed. Removing players with less than 20 mins per game could make things more clear.
model_df_mpg20 <- NBACleanData %>%
filter(MP_pg >= 20)
nba_rec_mpg20 <- recipe(~ ., data = model_df_mpg20) %>%
update_role(Year, Player, Pos, Tm, new_role = "id") %>%
step_normalize(all_predictors()) %>%
step_pca(all_predictors())
nba_prep_mpg20 <- prep(nba_rec_mpg20)
tidied_pca_mpg20 <- tidy(nba_prep_mpg20, 2)
sdev <- nba_prep_mpg20$steps[[2]]$res$sdev
percent_variation <- sdev^2 / sum(sdev^2)
tibble(component = unique(tidied_pca_mpg20$component),
percent_var = percent_variation) %>%
filter(component %in% paste0("PC", 1:20)) %>%
mutate(component = fct_inorder(component)) %>%
ggplot(aes(component, percent_var)) +
geom_col(position = "stack") +
geom_text(aes(label = scales::percent((round(percent_var,4))),
vjust = -0.5), size = 3, color = "red"
) +
geom_text(aes(label = scales::percent(cumsum(round(percent_var,4))),
vjust = -2), size = 4
) +
scale_y_continuous(labels = scales::percent_format(), limits = c(0,.35))
The cummulative vairance is listed at the top of each bar. The variance captured by each PC is below
Initial method: The first 5 PCs capture 70% of the variance, the first 9 capture 80%, the first 16 capture 90%
Reduced data Method: The first 5 PCs capture 70% of the variance, the first 8 capture 80%, the first 14 capture 90%
More or less the same, it takes one less PC to capture 80% of the variance
I was hoping for a bit more of a change in the number of variables that accounted for 80% of the variance, but that doesn’t appear to be the case
There were 4 stat categories - Per Game, Per 100 possessions, Shooting, and Advanced. Maybe running PCA on those will provide clearer results
model_df_adv <- NBACleanData %>%
select(Year, Player, Pos, Tm,contains("_adv"))
nba_rec_adv <- recipe(~ ., data = model_df_adv) %>%
update_role(Year, Player, Pos, Tm, new_role = "id") %>%
step_normalize(all_predictors()) %>%
step_pca(all_predictors())
nba_prep_adv <- prep(nba_rec_adv)
tidied_pca_adv <- tidy(nba_prep_adv, 2)
sdev <- nba_prep_adv$steps[[2]]$res$sdev
percent_variation <- sdev^2 / sum(sdev^2)
tibble(component = unique(tidied_pca_adv$component),
percent_var = percent_variation) %>%
filter(component %in% paste0("PC", 1:10)) %>%
mutate(component = fct_inorder(component)) %>%
ggplot(aes(component, percent_var)) +
geom_col(position = "stack") +
geom_text(aes(label = scales::percent((round(percent_var,4))),
vjust = -0.5), size = 4, color = "red"
) +
geom_text(aes(label = scales::percent(cumsum(round(percent_var,4))),
vjust = -2), size = 5
) +
scale_y_continuous(labels = scales::percent_format(), limits = c(0,.50))
3 PCs account for 70%, 4 PCs 80%, 6 PCs 90%
model_df_pp <- NBACleanData %>%
select(Year, Player, Pos, Tm,contains("_pp"))
nba_rec_ppp <- recipe(~ ., data = model_df_pp) %>%
update_role(Year, Player, Pos, Tm, new_role = "id") %>%
step_normalize(all_predictors()) %>%
step_pca(all_predictors())
nba_prep_pp <- prep(nba_rec_ppp)
tidied_pca_pp <- tidy(nba_prep_pp, 2)
sdev <- nba_prep_pp$steps[[2]]$res$sdev
percent_variation <- sdev^2 / sum(sdev^2)
tibble(component = unique(tidied_pca_pp$component),
percent_var = percent_variation) %>%
filter(component %in% paste0("PC", 1:10)) %>%
mutate(component = fct_inorder(component)) %>%
ggplot(aes(component, percent_var)) +
geom_col(position = "stack") +
geom_text(aes(label = scales::percent((round(percent_var,4))),
vjust = -0.5), size = 4, color = "red"
) +
geom_text(aes(label = scales::percent(cumsum(round(percent_var,4))),
vjust = -2), size = 5
) +
scale_y_continuous(labels = scales::percent_format(), limits = c(0,.35))
4 PCs account for 70%, 5 PCs 80%,9 PCs 90%
model_df_sho <- NBACleanData %>%
select(Year, Player, Pos, Tm,contains("_sho"))
nba_rec_sho <- recipe(~ ., data = model_df_sho) %>%
update_role(Year, Player, Pos, Tm, new_role = "id") %>%
step_normalize(all_predictors()) %>%
step_pca(all_predictors())
nba_prep_sho <- prep(nba_rec_sho)
tidied_pca_psho <- tidy(nba_prep_sho, 2)
sdev <- nba_prep_sho$steps[[2]]$res$sdev
percent_variation <- sdev^2 / sum(sdev^2)
tibble(component = unique(tidied_pca_psho$component),
percent_var = percent_variation) %>%
filter(component %in% paste0("PC", 1:10)) %>%
mutate(component = fct_inorder(component)) %>%
ggplot(aes(component, percent_var)) +
geom_col(position = "stack") +
geom_text(aes(label = scales::percent((round(percent_var,4))),
vjust = -0.5), size = 3, color = "red"
) +
geom_text(aes(label = scales::percent(cumsum(round(percent_var,4))),
vjust = -2), size = 4
) +
scale_y_continuous(labels = scales::percent_format(), limits = c(0,.35))
5 PCs account for 70%, 7 PCs 80%,9 PCs 90%
model_df_pg <- NBACleanData %>%
select(Year, Player, Pos, Tm,contains("_pg"))
nba_rec_pg <- recipe(~ ., data = model_df_pg) %>%
update_role(Year, Player, Pos, Tm, new_role = "id") %>%
step_normalize(all_predictors()) %>%
step_pca(all_predictors())
nba_prep_pg <- prep(nba_rec_pg)
tidied_pca_pg <- tidy(nba_prep_pg, 2)
sdev <- nba_prep_pg$steps[[2]]$res$sdev
percent_variation <- sdev^2 / sum(sdev^2)
tibble(component = unique(tidied_pca_pg$component),
percent_var = percent_variation) %>%
filter(component %in% paste0("PC", 1:10)) %>%
mutate(component = fct_inorder(component)) %>%
ggplot(aes(component, percent_var)) +
geom_col(position = "stack") +
geom_text(aes(label = scales::percent((round(percent_var,4))),
vjust = -0.5), size = 3, color = "red"
) +
geom_text(aes(label = scales::percent(cumsum(round(percent_var,4))),
vjust = -2), size = 4
) +
scale_y_continuous(labels = scales::percent_format(), limits = c(0,.65))
2 PCs account for 70%, 3 PCs 80%,6 PCs 90%
Per game stats account for the most variance with the least amount of data. We’ll take a look at kmeans for only per game statistics
# the template stores the PCA variables
pca_pg <- nba_prep_pg$template %>%
mutate(Year = as.character(Year),
Player = as.character(Player),
Pos = as.character(Pos),
Tm = as.character(Tm)) %>%
as_tibble()
set.seed(525)
kclust <- kmeans(pca_pg %>% select(-c(1:4)), centers = 3, nstart = 1000, iter.max = 10000)
kclusts <-
tibble(k = 1:15) %>%
mutate(
kclust = map(k, ~ kmeans(pca_pg %>% select(-c(1:4)), .x)),
tidied = map(kclust, tidy),
glanced = map(kclust, glance),
augmented = map(kclust, augment, pca_pg %>% select(-c(1:4)))
)
kclusts %>%
unnest(glanced) %>%
ggplot(aes(k, tot.withinss)) +
geom_line(alpha = 0.8) +
geom_point(size = 2)
The elbow looks like it’s at 2. It may be beneficial to plot the change
for each k from k to k+1, k+2, and k+3
This shows the difference from k to k+1, k to k+2, and k to k+3
kclusts %>%
unnest(glanced) %>%
ggplot(aes(k, tot.withinss)) +
#change from k = 1 to k = 2
geom_line(aes(k, tot.withinss-lead(tot.withinss), color = "k+1"), alpha = 0.8 ) +
geom_point(aes(k, tot.withinss-lead(tot.withinss)),size = 2,color = "red") +
#change from k = 1 to k = 3
geom_line(aes(k, tot.withinss-lead(tot.withinss,n=2),color = "k+2"), alpha = 0.8) +
geom_point(aes(k, tot.withinss-lead(tot.withinss,n=2)),size = 2, color = "green") +
#change from k = 1 to k = 4
geom_line(aes(k, tot.withinss-lead(tot.withinss,n=3),color = "k+3"), alpha = 0.8) +
geom_point(aes(k, tot.withinss-lead(tot.withinss,n=3)),size = 2, color = "blue") +
labs(
x = "Number of Clusters (k)",
y = "tot.withinss", color = "",
title = "Where does the data level off?"
)
I’ll use PCA for hierarchical clustering as well to reduce the dimensionality of the data
This plot is the same as the initial plot using all the data, just as a reminder
# recipe
nba_rec_hier <- recipe(~ ., data = model_df) %>%
update_role(Year, Player, Pos, Tm, new_role = "id") %>%
step_normalize(all_predictors()) %>%
step_pca(all_predictors(), num_comp = 20)
#prep - needed for tidied_pca and plot
nba_prep_hier <- prep(nba_rec_hier)
tidied_pca_hier <- tidy(nba_prep_hier,2)
sdev <- nba_prep_hier$steps[[2]]$res$sdev
percent_variation <- sdev^2 / sum(sdev^2)
tibble(component = unique(tidied_pca_hier$component),
percent_var = percent_variation) %>%
filter(component %in% paste0("PC", 1:20)) %>%
mutate(component = fct_inorder(component)) %>%
ggplot(aes(component, percent_var)) +
geom_col(position = "stack") +
geom_text(aes(label = scales::percent((round(percent_var,4))),
vjust = -0.5), size = 3, color = "red"
) +
geom_text(aes(label = scales::percent(cumsum(round(percent_var,4))),
vjust = -2), size = 4
) +
scale_y_continuous(labels = scales::percent_format(), limits = c(0,.35))
# recipe - using 9 PCs instead of 20
nba_rec_hier <- recipe(~ ., data = model_df) %>%
update_role(Year, Player, Pos, Tm, new_role = "id") %>%
step_normalize(all_predictors()) %>%
step_pca(all_predictors(), num_comp = 9)
# new dataframe
nba_bake_hier <- bake(nba_prep_hier, new_data = NULL) %>%
mutate(across(where(is.factor), as.character))
# the dendrograms take too long to generate if all the data is included, so a sample is taken
model_df_hc <- nba_bake_hier %>%
sample_n(1000) %>%
select(-c(1:4)) %>%
select(c(1:9)) #selects the first 9 PCs
set.seed(42069)
# Distance between observations matrix
d <- dist(x=model_df_hc, method = "euclidean")
# Hierarchical clustering using Complete Linkage
nba_hclust_complete <- hclust(d, method = "complete")
# Hierarchical clustering using Average Linkage
nba_hclust_average <- hclust(d, method = "average")
# Hierarchical clustering using Ward Linkage
nba_hclust_ward <- hclust(d, method = "ward.D2")
library(factoextra)
fviz_dend(nba_hclust_complete, main = "Complete")
fviz_dend(nba_hclust_average, main = "Average Linkage")
fviz_dend(nba_hclust_ward, main = "Ward")
The below looks at the agglomerative coefficient, which measures clustering structure of the dataset. A value closer to 1 means more balanced.
library(cluster)
ac_metric <- list(
complete_ac = agnes(model_df_hc, metric = "euclidean", method = "complete")$ac,
average_ac = agnes(model_df_hc, metric = "euclidean", method = "average")$ac,
ward_ac = agnes(model_df_hc, metric = "euclidean", method = "ward")$ac
)
ac_metric
## $complete_ac
## [1] 0.8994221
##
## $average_ac
## [1] 0.7981196
##
## $ward_ac
## [1] 0.9805245
Ward’s is the closest to 1, so that’s the method that we’ll use that from
Look at how the sample looks from a clustering standpoint
fviz_dend(nba_hclust_ward, k = 6)
fviz_dend(nba_hclust_ward, k = 7)
fviz_dend(nba_hclust_ward, k = 8)
7 clusters looks to be the most balanced
K-means w/o PCA: 3 is likely optimal, but 6 or 7 show that it’s
flattening out
K-means w/ PCA all data: 3 may be optimal, but 7 or
8 show that it’s flattening out
K-means w/ PCA 20 minutes per game
cutoff (instead of 12): More or less the same as with all data
K-means w/ PCA using only per game stats: 2 may be optimal, levels off
around 6 or 7
Hierarchical w/ PCA: 2 may be optimal, levels off
around 6, 7 or 8. 7 or 8 are likely more balanced than 6
I’ve decide to go with 7 clusters for the final analysis.