Introduction

I’ve scraped data from https://www.basketball-reference.com/ for four stat categories, covering every season from 2001 to 2022:
- Advanced
- Per Game
- Per 100 Possessions
- Shooting

I removed players who averaged fewer than 12 minutes per game in any given season. I may later also remove players who played fewer than 20 minutes per game.
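In dplyr terms, the minutes cut looks roughly like this (a sketch; `scraped_data` is a hypothetical stand-in for the raw scraped table, and `MP_pg` is the minutes-per-game column used later in this analysis):

# Sketch of the minutes cut applied during cleaning (scraped_data is hypothetical)
scraped_data %>%
  filter(MP_pg >= 12) # keep player seasons averaging at least 12 minutes per game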

Libraries and Data

library(tidymodels) # broom, dials, parsnip, tune, workflows, yardstick
library(tidyverse) # ggplot2, dplyr, tidyr, readr, purrr, tibble, stringr, lubridate
library(tidytext)
library(knitr)
library(stringi)
library(corrr)
library(tidyclust)
NBACleanData <- read_csv("Data/model0623.csv", show_col_types = FALSE) %>% select(-1)
model_df <-  NBACleanData %>%
  mutate(Year = as.character(Year)) %>%
  mutate(across(where(is.numeric), ~as.numeric(scale(.))))

Modeling

K-Means

nba_clust <- kmeans(model_df %>% select(-c(1:4,80)), centers = 3) # removes unneeded variables
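As an aside, since tidyclust is loaded above, the same model could also be specified through its parsnip-style interface. A sketch, not used for the rest of the analysis:

# Equivalent tidyclust specification (the "stats" engine wraps kmeans())
kmeans_spec <- k_means(num_clusters = 3) %>%
  set_engine("stats")

kmeans_fit <- kmeans_spec %>%
  fit(~ ., data = model_df %>% select(-c(1:4, 80)))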

Tidy the data

tidy(nba_clust)[, 1:8] # centers of the clusters
## # A tibble: 3 × 8
##    FG_pp FGA_pp FG_pct_pp ThrP_pp ThrPA_pp ThrP_pct_pp TwoP_pp TwoPA_pp
##    <dbl>  <dbl>     <dbl>   <dbl>    <dbl>       <dbl>   <dbl>    <dbl>
## 1 -0.403 -0.190    -0.516   0.426    0.464       0.393  -0.603   -0.536
## 2 -0.139 -0.539     0.885  -1.04    -1.09       -1.07    0.446    0.302
## 3  1.16   1.14      0.166   0.241    0.219       0.365   0.927    0.940

Augment and plot the data

#augment shows all the data and includes the cluster number
head(augment(nba_clust, model_df)[, c(81,1:9)],5) 
## # A tibble: 5 × 10
##   .cluster Year  Player   Pos   Tm      FG_pp  FGA_pp FG_pct_pp ThrP_pp ThrPA_pp
##   <fct>    <chr> <chr>    <chr> <chr>   <dbl>   <dbl>     <dbl>   <dbl>    <dbl>
## 1 2        2001  AC Green PF    MIA   -0.874  -0.899    -0.0991  -1.16    -1.20 
## 2 1        2001  AJ Guyt… PG    CHI   -0.384  -0.0956   -0.681    0.533    0.393
## 3 3        2001  Aaron M… SG    PHI   -0.0274 -0.226     0.345   -0.274   -0.192
## 4 2        2001  Aaron W… PF    NJN   -0.429  -0.508     0.100   -1.16    -1.25 
## 5 2        2001  Adam Ke… PF    GSW   -1.63   -1.55     -0.727   -1.08    -1.20

Plot

augment(nba_clust, model_df) %>%
  ggplot(aes(FG_pp, FGA_pp, color = .cluster)) + # the aes values can be changed if needed
  geom_point()

Glance the data

#glance gives us other useful metrics from the model
glance(nba_clust)
## # A tibble: 1 × 4
##    totss tot.withinss betweenss  iter
##    <dbl>        <dbl>     <dbl> <int>
## 1 599100      401355.   197745.     4
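One quick way to read these numbers: betweenss / totss is the share of the total sum of squares explained by the clustering, about 33% here.

# Proportion of total sum of squares explained by the 3-cluster model
glance(nba_clust) %>%
  mutate(pct_explained = betweenss / totss)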

Elbow Plot: find the optimal k

kclusts <- 
  tibble(k = 1:15) %>%
  mutate(
    kclust = map(k, ~ kmeans(model_df %>% select(-c(1:4)), .x)),
    tidied = map(kclust, tidy),
    glanced = map(kclust, glance),
    augmented = map(kclust, augment, model_df %>% select(-c(1:4)))
  )

kclusts %>%
  unnest(glanced) %>%
  ggplot(aes(k, tot.withinss)) +
  geom_line(alpha = 0.8) +
  geom_point(size = 2)

The elbow looks like it’s at 2 or 3. It may help to plot the change from each k to k+1, k+2, and k+3.

Elbow plots - k+1, k+2, k+3

This shows the difference from k to k+1, k to k+2, and k to k+3.

kclusts %>%
  unnest(glanced) %>%
  ggplot(aes(k, tot.withinss)) +

  # change from each k to k+1
  geom_line(aes(k, tot.withinss - lead(tot.withinss), color = "k+1"), alpha = 0.8) +
  geom_point(aes(k, tot.withinss - lead(tot.withinss)), size = 2, color = "red") +

  # change from each k to k+2
  geom_line(aes(k, tot.withinss - lead(tot.withinss, n = 2), color = "k+2"), alpha = 0.8) +
  geom_point(aes(k, tot.withinss - lead(tot.withinss, n = 2)), size = 2, color = "green") +

  # change from each k to k+3
  geom_line(aes(k, tot.withinss - lead(tot.withinss, n = 3), color = "k+3"), alpha = 0.8) +
  geom_point(aes(k, tot.withinss - lead(tot.withinss, n = 3)), size = 2, color = "blue") +
  
  labs(
    x = "Number of Clusters (k)",
    y = "tot.withinss", color = "",
    title = "Where does the data level off?"
  )

It’s a bit difficult to determine the optimal k. An argument could be made for 3, but it seems unreasonable to have just 3 positions for NBA players. It could be beneficial to have a team with players that all fall in the “best” cluster, but more analysis would have to be done on historically good teams.

An argument could be made for 6 or 7 when looking at the difference in tot.withinss from k to k+1/k+2/k+3 - it levels off around there.

The data is fairly high dimensional, so maybe reducing the dimensionality will yield more interpretable results.
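Before moving to PCA, another diagnostic worth a look alongside the elbow plot is the average silhouette width (higher is better). A sketch using cluster::silhouette() on a sample of the data, so the distance matrix stays manageable (the cluster package is also used later for agnes()):

library(cluster) # for silhouette()

set.seed(525)
sil_df <- model_df %>% select(-c(1:4, 80)) %>% sample_n(2000)
d_sil <- dist(sil_df)

# Average silhouette width for k = 2:10
tibble(k = 2:10) %>%
  mutate(
    kclust  = map(k, ~ kmeans(sil_df, centers = .x)),
    avg_sil = map_dbl(kclust, ~ mean(silhouette(.x$cluster, d_sil)[, "sil_width"]))
  ) %>%
  ggplot(aes(k, avg_sil)) +
  geom_line(alpha = 0.8) +
  geom_point(size = 2) +
  labs(y = "Average silhouette width")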

K-Means w/ PCA

Reducing the dimensionality using PCA.

Recipe

#create a recipe
nba_rec <- recipe(~ ., data = model_df) %>%
  update_role(Year, Player, Pos, Tm, new_role = "id") %>% #might need these variables later
  step_normalize(all_predictors()) %>% #normalizes the data
  step_pca(all_predictors(), num_comp = 20) #default for PCs is 5

#prep the data
nba_prep <- prep(nba_rec)

PCs - Tidied and Plotted

# extract the PCA loadings
tidied_pca <- tidy(nba_prep, 2) # 2 refers to the step number; this pulls out the step_pca results

# Plot the top 10 contributing variables for each of the first 9 PCs
tidied_pca %>%
  filter(component %in% paste0("PC", 1:9)) %>%
  group_by(component) %>%
  top_n(10, abs(value)) %>%
  ungroup() %>%
  mutate(terms = reorder_within(terms, abs(value), component)) %>%
  ggplot(aes(abs(value), terms, fill = value > 0)) +
  geom_col() +
  facet_wrap(~component, scales = "free_y") +
  scale_y_reordered() +
  labs(
    x = "Absolute value of contribution",
    y = NULL, fill = "Positive?"
  )

How much variance is being captured?

sdev <- nba_prep$steps[[2]]$res$sdev 

percent_variation <- sdev^2 / sum(sdev^2)

tibble(component = unique(tidied_pca$component),
       percent_var = percent_variation) %>%
  filter(component %in% paste0("PC", 1:20)) %>%
  mutate(component = fct_inorder(component)) %>%
  ggplot(aes(component, percent_var)) +
  geom_col(position = "stack") +
  geom_text(aes(label = scales::percent((round(percent_var,4))),
            vjust = -0.5), size = 3, color = "red"
            ) +
  geom_text(aes(label = scales::percent(cumsum(round(percent_var,4))),
            vjust = -2), size = 4
            ) +
  scale_y_continuous(labels = scales::percent_format(), limits = c(0,.35))

- The cumulative variance is listed at the top of each bar; the variance captured by each PC is below it
- The first 5 PCs capture 70% of the variance, the first 9 capture 80%, and the first 16 capture 90%

Capturing 80% of the variance might be a good place to start.
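As an alternative to hard-coding the component count (the next recipe uses num_comp = 9 based on the plot above), step_pca() can keep however many components are needed to hit a variance target via its threshold argument:

# Equivalent recipe: keep enough components to capture ~80% of the variance
nba_rec_80_alt <- recipe(~ ., data = model_df) %>%
  update_role(Year, Player, Pos, Tm, new_role = "id") %>%
  step_normalize(all_predictors()) %>%
  step_pca(all_predictors(), threshold = 0.8)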

Recipe that captures 80% variance

nba_rec_80 <- recipe(~ ., data = model_df) %>%
  update_role(Year, Player, Pos, Tm, new_role = "id") %>%
  step_normalize(all_predictors()) %>%
  step_pca(all_predictors(), num_comp = 9)

nba_prep_80 <- prep(nba_rec_80)

Start with 3 clusters just to see how the data looks

# the template field stores the PCA variables
pca_80 <- nba_prep_80$template %>%
    mutate(Year = as.character(Year),
         Player = as.character(Player),
         Pos = as.character(Pos),
         Tm = as.character(Tm)) %>%
  as_tibble()
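Reaching into $template works, but bake() with new_data = NULL is the supported way to pull the processed training data out of a prepped recipe; an equivalent one-liner:

# Equivalent: returns the id columns plus the 9 PCA scores as a tibble
pca_80_baked <- bake(nba_prep_80, new_data = NULL)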

set.seed(525)

#start by looking at 3 clusters
kclust <- kmeans(pca_80 %>% select(-c(1:4)), centers = 3)

Tidy the data

#The tidy function gives us the cluster centers for each variable (the PCs)
tidy(kclust)
## # A tibble: 3 × 12
##     PC1   PC2     PC3     PC4     PC5     PC6     PC7     PC8      PC9  size
##   <dbl> <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>    <dbl> <int>
## 1 -3.18  1.34  0.260  -0.0175  0.0824  0.0673 -0.0175  0.0107 -0.108    4172
## 2  5.62  3.59 -0.681  -0.121   0.137   0.0203  0.101  -0.146  -0.00777  1693
## 3  1.77 -5.50  0.0327  0.131  -0.271  -0.148  -0.0459  0.0957  0.218    2124
## # ℹ 2 more variables: withinss <dbl>, cluster <fct>

Augment and plot the data

#augment shows all the data and includes the cluster number
head(augment(kclust, pca_80),5) 
## # A tibble: 5 × 14
##   Year  Player       Pos   Tm      PC1   PC2    PC3     PC4    PC5    PC6    PC7
##   <chr> <chr>        <chr> <chr> <dbl> <dbl>  <dbl>   <dbl>  <dbl>  <dbl>  <dbl>
## 1 2001  AC Green     PF    MIA   -1.97 -4.55 -1.27   0.324   0.271  2.94   1.79 
## 2 2001  AJ Guyton    PG    CHI   -6.09  2.77 -0.551  2.40   -1.39   1.66  -0.866
## 3 2001  Aaron McKie  SG    PHI    1.69  2.68 -0.924 -3.82   -0.536  1.50  -1.52 
## 4 2001  Aaron Willi… PF    NJN    3.80 -5.72 -1.69  -0.0535  1.05  -1.11   1.34 
## 5 2001  Adam Keefe   PF    GSW   -4.55 -5.52 -2.11  -0.275   0.475  0.665  1.36 
## # ℹ 3 more variables: PC8 <dbl>, PC9 <dbl>, .cluster <fct>
augment(kclust, pca_80) %>%
  ggplot(aes(PC1, PC2, color = .cluster)) + # the PCx values can be changed as needed
  geom_point()

Glance the data

#glance gives us other useful metrics from the model
glance(kclust)
## # A tibble: 1 × 4
##     totss tot.withinss betweenss  iter
##     <dbl>        <dbl>     <dbl> <int>
## 1 488899.      291292.   197607.     4

Elbow Plot: find the optimal k

kclusts <- 
  tibble(k = 1:15) %>%
  mutate(
    kclust = map(k, ~ kmeans(pca_80 %>% select(-c(1:4)), .x)),
    tidied = map(kclust, tidy),
    glanced = map(kclust, glance),
    augmented = map(kclust, augment, pca_80 %>% select(-c(1:4)))
  )

kclusts %>%
  unnest(glanced) %>%
  ggplot(aes(k, tot.withinss)) +
  geom_line(alpha = 0.8) +
  geom_point(size = 2)

The elbow looks like it’s at 3. It may help to plot the change from each k to k+1, k+2, and k+3.

Elbow plots - k+1, k+2, k+3

This shows the difference from k to k+1, k to k+2, and k to k+3.

kclusts %>%
  unnest(glanced) %>%
  ggplot(aes(k, tot.withinss)) +

  # change from each k to k+1
  geom_line(aes(k, tot.withinss - lead(tot.withinss), color = "k+1"), alpha = 0.8) +
  geom_point(aes(k, tot.withinss - lead(tot.withinss)), size = 2, color = "red") +

  # change from each k to k+2
  geom_line(aes(k, tot.withinss - lead(tot.withinss, n = 2), color = "k+2"), alpha = 0.8) +
  geom_point(aes(k, tot.withinss - lead(tot.withinss, n = 2)), size = 2, color = "green") +

  # change from each k to k+3
  geom_line(aes(k, tot.withinss - lead(tot.withinss, n = 3), color = "k+3"), alpha = 0.8) +
  geom_point(aes(k, tot.withinss - lead(tot.withinss, n = 3)), size = 2, color = "blue") +
  
  labs(
    x = "Number of Clusters (k)",
    y = "tot.withinss", color = "",
    title = "Where does the data level off?"
  )

It’s a bit difficult to determine the optimal k. An argument could be made for 3 (similar to the result above without PCA), but it seems unreasonable to have just 3 positions for NBA players. It could be beneficial to have a team with players that all fall in the “best” cluster, but more analysis would have to be done on historically good teams.

An argument could be made for 7 or 8 when looking at the difference in tot.withinss from k to k+1/k+2/k+3 - it levels off around there.

Filtering the data down to fewer players might make things a bit easier to interpret.

K-Means w/ PCA - using less data pt. 1

From the initial cleaning, players averaging fewer than 12 minutes per game were removed. Raising the cutoff to 20 minutes per game could make things clearer.

model_df_mpg20 <-  NBACleanData %>%
  filter(MP_pg >= 20)

nba_rec_mpg20 <- recipe(~ ., data = model_df_mpg20) %>%
  update_role(Year, Player, Pos, Tm, new_role = "id") %>%
  step_normalize(all_predictors()) %>%
  step_pca(all_predictors())

nba_prep_mpg20 <- prep(nba_rec_mpg20)

tidied_pca_mpg20 <- tidy(nba_prep_mpg20, 2)

How much variance is being captured?

sdev <- nba_prep_mpg20$steps[[2]]$res$sdev

percent_variation <- sdev^2 / sum(sdev^2)

tibble(component = unique(tidied_pca_mpg20$component),
       percent_var = percent_variation) %>%
  filter(component %in% paste0("PC", 1:20)) %>%
  mutate(component = fct_inorder(component)) %>%
  ggplot(aes(component, percent_var)) +
  geom_col(position = "stack") +
  geom_text(aes(label = scales::percent((round(percent_var,4))),
            vjust = -0.5), size = 3, color = "red"
            ) +
  geom_text(aes(label = scales::percent(cumsum(round(percent_var,4))),
            vjust = -2), size = 4
            ) +
  scale_y_continuous(labels = scales::percent_format(), limits = c(0,.35))

  • The cumulative variance is listed at the top of each bar; the variance captured by each PC is below it

  • Initial method: the first 5 PCs capture 70% of the variance, the first 9 capture 80%, the first 16 capture 90%

  • Reduced-data method: the first 5 PCs capture 70% of the variance, the first 8 capture 80%, the first 14 capture 90%

  • More or less the same; it takes one fewer PC to capture 80% of the variance

I was hoping for a bigger change in the number of components needed to account for 80% of the variance, but that doesn’t appear to be the case.

K-Means w/ PCA - using less data pt. 2

There were 4 stat categories - Per Game, Per 100 Possessions, Shooting, and Advanced. Maybe running PCA on each of those separately will provide clearer results.

Advanced

model_df_adv <-  NBACleanData %>%
  select(Year, Player, Pos, Tm,contains("_adv"))

nba_rec_adv <- recipe(~ ., data = model_df_adv) %>%
  update_role(Year, Player, Pos, Tm, new_role = "id") %>%
  step_normalize(all_predictors()) %>%
  step_pca(all_predictors())

nba_prep_adv <- prep(nba_rec_adv)

tidied_pca_adv <- tidy(nba_prep_adv, 2)

sdev <- nba_prep_adv$steps[[2]]$res$sdev

percent_variation <- sdev^2 / sum(sdev^2)

tibble(component = unique(tidied_pca_adv$component),
       percent_var = percent_variation) %>%
  filter(component %in% paste0("PC", 1:10)) %>%
  mutate(component = fct_inorder(component)) %>%
  ggplot(aes(component, percent_var)) +
  geom_col(position = "stack") +
  geom_text(aes(label = scales::percent((round(percent_var,4))),
            vjust = -0.5), size = 4, color = "red"
            ) +
  geom_text(aes(label = scales::percent(cumsum(round(percent_var,4))),
            vjust = -2), size = 5
            ) +
  scale_y_continuous(labels = scales::percent_format(), limits = c(0,.50))

3 PCs account for 70% of the variance, 4 PCs for 80%, and 6 PCs for 90%

Per 100 possessions

model_df_pp <-  NBACleanData %>%
  select(Year, Player, Pos, Tm,contains("_pp"))

nba_rec_ppp <- recipe(~ ., data = model_df_pp) %>%
  update_role(Year, Player, Pos, Tm, new_role = "id") %>%
  step_normalize(all_predictors()) %>%
  step_pca(all_predictors())

nba_prep_pp <- prep(nba_rec_ppp)

tidied_pca_pp <- tidy(nba_prep_pp, 2)

sdev <- nba_prep_pp$steps[[2]]$res$sdev

percent_variation <- sdev^2 / sum(sdev^2)

tibble(component = unique(tidied_pca_pp$component),
       percent_var = percent_variation) %>%
  filter(component %in% paste0("PC", 1:10)) %>%
  mutate(component = fct_inorder(component)) %>%
  ggplot(aes(component, percent_var)) +
  geom_col(position = "stack") +
  geom_text(aes(label = scales::percent((round(percent_var,4))),
            vjust = -0.5), size = 4, color = "red"
            ) +
  geom_text(aes(label = scales::percent(cumsum(round(percent_var,4))),
            vjust = -2), size = 5
            ) +
  scale_y_continuous(labels = scales::percent_format(), limits = c(0,.35))

4 PCs account for 70% of the variance, 5 PCs for 80%, and 9 PCs for 90%

Shooting

model_df_sho <-  NBACleanData %>%
  select(Year, Player, Pos, Tm,contains("_sho"))

nba_rec_sho <- recipe(~ ., data = model_df_sho) %>%
  update_role(Year, Player, Pos, Tm, new_role = "id") %>%
  step_normalize(all_predictors()) %>%
  step_pca(all_predictors())

nba_prep_sho <- prep(nba_rec_sho)

tidied_pca_psho <- tidy(nba_prep_sho, 2)

sdev <- nba_prep_sho$steps[[2]]$res$sdev

percent_variation <- sdev^2 / sum(sdev^2)

tibble(component = unique(tidied_pca_psho$component),
       percent_var = percent_variation) %>%
  filter(component %in% paste0("PC", 1:10)) %>%
  mutate(component = fct_inorder(component)) %>%
  ggplot(aes(component, percent_var)) +
  geom_col(position = "stack") +
  geom_text(aes(label = scales::percent((round(percent_var,4))),
            vjust = -0.5), size = 3, color = "red"
            ) +
  geom_text(aes(label = scales::percent(cumsum(round(percent_var,4))),
            vjust = -2), size = 4
            ) +
    scale_y_continuous(labels = scales::percent_format(), limits = c(0,.35))

5 PCs account for 70% of the variance, 7 PCs for 80%, and 9 PCs for 90%

Per Game

model_df_pg <-  NBACleanData %>%
  select(Year, Player, Pos, Tm,contains("_pg"))

nba_rec_pg <- recipe(~ ., data = model_df_pg) %>%
  update_role(Year, Player, Pos, Tm, new_role = "id") %>%
  step_normalize(all_predictors()) %>%
  step_pca(all_predictors())

nba_prep_pg <- prep(nba_rec_pg)

tidied_pca_pg <- tidy(nba_prep_pg, 2)

sdev <- nba_prep_pg$steps[[2]]$res$sdev

percent_variation <- sdev^2 / sum(sdev^2)

tibble(component = unique(tidied_pca_pg$component),
       percent_var = percent_variation) %>%
  filter(component %in% paste0("PC", 1:10)) %>%
  mutate(component = fct_inorder(component)) %>%
  ggplot(aes(component, percent_var)) +
  geom_col(position = "stack") +
  geom_text(aes(label = scales::percent((round(percent_var,4))),
            vjust = -0.5), size = 3, color = "red"
            ) +
  geom_text(aes(label = scales::percent(cumsum(round(percent_var,4))),
            vjust = -2), size = 4
            ) +
    scale_y_continuous(labels = scales::percent_format(), limits = c(0,.65))

2 PCs account for 70% of the variance, 3 PCs for 80%, and 6 PCs for 90%

Per game stats capture the most variance with the fewest components. We’ll take a look at k-means using only per game statistics.

K-Means w/ PCA - using less data pt. 3 - only per game stats

# the template stores the PCA variables
pca_pg <- nba_prep_pg$template %>%
    mutate(Year = as.character(Year),
         Player = as.character(Player),
         Pos = as.character(Pos),
         Tm = as.character(Tm)) %>%
  as_tibble()

set.seed(525)

kclust <- kmeans(pca_pg %>% select(-c(1:4)), centers = 3, nstart = 1000, iter.max = 10000)

Elbow plot: find the optimal k

kclusts <- 
  tibble(k = 1:15) %>%
  mutate(
    kclust = map(k, ~ kmeans(pca_pg %>% select(-c(1:4)), .x)),
    tidied = map(kclust, tidy),
    glanced = map(kclust, glance),
    augmented = map(kclust, augment, pca_pg %>% select(-c(1:4)))
  )

kclusts %>%
  unnest(glanced) %>%
  ggplot(aes(k, tot.withinss)) +
  geom_line(alpha = 0.8) +
  geom_point(size = 2)

The elbow looks like it’s at 2. It may help to plot the change from each k to k+1, k+2, and k+3.

Elbow plots - k+1, k+2, k+3

This shows the difference from k to k+1, k to k+2, and k to k+3.

kclusts %>%
  unnest(glanced) %>%
  ggplot(aes(k, tot.withinss)) +

  # change from each k to k+1
  geom_line(aes(k, tot.withinss - lead(tot.withinss), color = "k+1"), alpha = 0.8) +
  geom_point(aes(k, tot.withinss - lead(tot.withinss)), size = 2, color = "red") +

  # change from each k to k+2
  geom_line(aes(k, tot.withinss - lead(tot.withinss, n = 2), color = "k+2"), alpha = 0.8) +
  geom_point(aes(k, tot.withinss - lead(tot.withinss, n = 2)), size = 2, color = "green") +

  # change from each k to k+3
  geom_line(aes(k, tot.withinss - lead(tot.withinss, n = 3), color = "k+3"), alpha = 0.8) +
  geom_point(aes(k, tot.withinss - lead(tot.withinss, n = 3)), size = 2, color = "blue") +
  
  labs(
    x = "Number of Clusters (k)",
    y = "tot.withinss", color = "",
    title = "Where does the data level off?"
  )

Hierarchical

I’ll use PCA for hierarchical clustering as well, to reduce the dimensionality of the data.

As a reminder, this plot is the same as the initial variance plot using all the data.

# recipe
nba_rec_hier <- recipe(~ ., data = model_df) %>%
  update_role(Year, Player, Pos, Tm, new_role = "id") %>%
  step_normalize(all_predictors()) %>%
  step_pca(all_predictors(), num_comp = 20)

#prep -  needed for tidied_pca and plot
nba_prep_hier <- prep(nba_rec_hier)

tidied_pca_hier <- tidy(nba_prep_hier,2)

sdev <- nba_prep_hier$steps[[2]]$res$sdev

percent_variation <- sdev^2 / sum(sdev^2)

tibble(component = unique(tidied_pca_hier$component),
       percent_var = percent_variation) %>%
  filter(component %in% paste0("PC", 1:20)) %>%
  mutate(component = fct_inorder(component)) %>%
  ggplot(aes(component, percent_var)) +
  geom_col(position = "stack") +
  geom_text(aes(label = scales::percent((round(percent_var,4))),
            vjust = -0.5), size = 3, color = "red"
            ) +
  geom_text(aes(label = scales::percent(cumsum(round(percent_var,4))),
            vjust = -2), size = 4
            ) +
  scale_y_continuous(labels = scales::percent_format(), limits = c(0,.35))

Hierarchical Clustering Methods

# recipe - using 9 PCs instead of 20
nba_rec_hier <- recipe(~ ., data = model_df) %>%
  update_role(Year, Player, Pos, Tm, new_role = "id") %>%
  step_normalize(all_predictors()) %>%
  step_pca(all_predictors(), num_comp = 9)

# re-prep so the bake below uses the 9-PC recipe (otherwise the 20-PC prep from above would be baked)
nba_prep_hier <- prep(nba_rec_hier)

# new dataframe
nba_bake_hier <- bake(nba_prep_hier, new_data = NULL) %>%
  mutate(across(where(is.factor), as.character))
  

# the dendrograms take too long to generate if all the data is included, so a sample is taken
model_df_hc <- nba_bake_hier %>%
  sample_n(1000) %>%
  select(-c(1:4)) %>% # drops the id columns
  select(c(1:9)) # keeps the 9 PCs

set.seed(42069)

# Distance between observations matrix
d <- dist(x=model_df_hc, method = "euclidean")

# Hierarchical clustering using Complete Linkage
nba_hclust_complete <- hclust(d, method = "complete")

# Hierarchical clustering using Average Linkage
nba_hclust_average <- hclust(d, method = "average")

# Hierarchical clustering using Ward Linkage
nba_hclust_ward <- hclust(d, method = "ward.D2")

Dendrogram of each

library(factoextra)

fviz_dend(nba_hclust_complete, main = "Complete")

fviz_dend(nba_hclust_average, main = "Average Linkage")

fviz_dend(nba_hclust_ward, main = "Ward")

The code below computes the agglomerative coefficient, which measures how strong the clustering structure of the dataset is. A value closer to 1 suggests a stronger, more clearly clustered structure.

Agglomerative Coefficients

library(cluster)

ac_metric <- list(
  complete_ac = agnes(model_df_hc, metric = "euclidean", method = "complete")$ac,
  average_ac = agnes(model_df_hc, metric = "euclidean", method = "average")$ac,
  ward_ac = agnes(model_df_hc, metric = "euclidean", method = "ward")$ac
)

ac_metric
## $complete_ac
## [1] 0.8994221
## 
## $average_ac
## [1] 0.7981196
## 
## $ward_ac
## [1] 0.9805245

Ward’s is the closest to 1, so that’s the linkage method we’ll use.

Let’s look at how the sample behaves from a clustering standpoint.

Dendrograms for k = 6, 7, and 8

fviz_dend(nba_hclust_ward, k = 6)

fviz_dend(nba_hclust_ward, k = 7)

fviz_dend(nba_hclust_ward, k = 8)

7 clusters looks to be the most balanced.
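To carry those 7 clusters back to the sampled observations, the Ward tree can be cut with cutree(); a sketch that just tabulates the resulting cluster sizes:

# Cut the Ward dendrogram into 7 clusters and count cluster sizes
hc_clusters <- cutree(nba_hclust_ward, k = 7)

model_df_hc %>%
  mutate(.cluster = factor(hc_clusters)) %>%
  count(.cluster)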

Results

- K-means w/o PCA: 3 is likely optimal, but it flattens out around 6 or 7
- K-means w/ PCA, all data: 3 may be optimal, but it flattens out around 7 or 8
- K-means w/ PCA, 20 minutes per game cutoff (instead of 12): more or less the same as with all data
- K-means w/ PCA, per game stats only: 2 may be optimal; levels off around 6 or 7
- Hierarchical w/ PCA: 2 may be optimal; levels off around 6, 7, or 8, and 7 or 8 are likely more balanced than 6

I’ve decided to go with 7 clusters for the final analysis.
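As a closing sketch (assuming the 9-PC data in pca_80 is what the final analysis will run on), the chosen 7-cluster fit might look like this, with nstart raised so the result is stable across runs:

set.seed(525)

# Final model sketch: k = 7 on the 9 principal components
final_kclust <- kmeans(pca_80 %>% select(-c(1:4)), centers = 7, nstart = 50)

# Attach cluster labels back to the players and inspect cluster sizes
augment(final_kclust, pca_80) %>%
  count(.cluster)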