library(tidymodels) # broom, dials, parsnip, tune, workflows, yardstick
library(tidyverse) #ggplot2, dplyr, tidyr, readr, purr, tibble, stringr, lubridate
library(gghighlight)
NBACleanData <- read_csv("Data/model0623.csv",show_col_types = FALSE) %>% select(-1)
model_df <- NBACleanData %>% select(-80) %>%
mutate(Year = as.character(Year)) %>%
mutate(across(where(is.numeric), ~as.numeric(scale(.))))
From Pt.2 of the project, I decided to use 7 clusters. Using PCA didn’t really provide improved results in the dataset, plus I need to be able to explain the results in a concise format. Therefore, I’ve opted to not use PCA to reduce dimensionality. I’ve also decided to use kmeans over hierarchical, as I think it will be a little bit easier to interpret.
From the last part, here is the model
set.seed(1234)
nba_clust <- kmeans(model_df %>% select(-c(1:4)),
iter.max = 1,
nstart = 10 ,
centers = 7)
# head(augment(nba_clust, model_df))[, c(80,1:9)],5)
nba_aug <- broom::augment(nba_clust, model_df) %>%
rename(Cluster = .cluster) %>%
select(-c(1:4)) %>%
select(Cluster, everything())
options(scipen = 99)
kmeans_centers <- data.frame(Cluster = c(paste0('Cluster ', 1:7)), nba_clust$centers) %>% #creates a column w/ the cluster name
pivot_longer(!Cluster, names_to = 'feature', values_to = 'center') %>% #pivots longer for easier graphing
mutate(feature = as.factor(feature)) %>% #makes the feature a factor
mutate(Cluster = as.factor(Cluster))
head(kmeans_centers)
## # A tibble: 6 × 3
## Cluster feature center
## <fct> <fct> <dbl>
## 1 Cluster 1 FG_pp 1.14
## 2 Cluster 1 FGA_pp 0.715
## 3 Cluster 1 FG_pct_pp 0.848
## 4 Cluster 1 ThrP_pp -0.807
## 5 Cluster 1 ThrPA_pp -0.839
## 6 Cluster 1 ThrP_pct_pp -0.360
final_df <- model_df %>%
select(c(1:4)) %>%
cbind(nba_aug) %>%
mutate(Cluster = as.character(Cluster))
df_2022 <- final_df %>%
filter(Year %in% '2022')
The plots below showcase the makeup of each cluster.
I’ve included a new position name for each cluster, a summary of each position (aka cluster), and listed some of the players from the 2022 season that fall into each position.
cluster_plots <- function(cluster_value) {
# Geom_point for rankings in each category
filtered_data <- kmeans_centers %>%
group_by(feature) %>%
mutate(rank = ifelse(as.character(feature) %in% c('PF_pg', 'PF_pp', 'TOV_pg', 'TOV_pp', 'TOV_pct_adv'),
rank(center),
rank(desc(center)))
)%>% # adjust rank for high values in less desriable stat categories
arrange(case_when(Cluster == cluster_value ~ 1, TRUE ~ 2), Cluster, rank, desc(center)) %>%
mutate(Cluster = as.character(Cluster)) %>%
filter(Cluster == cluster_value)
plot <- kmeans_centers %>%
group_by(feature) %>%
mutate(rank = ifelse(as.character(feature) %in% c('PF_pg', 'PF_pp', 'TOV_pg', 'TOV_pp', 'TOV_pct_adv'),
rank(center),
rank(desc(center)))
)%>% # adjust rank for high values in less desriable stat categories
arrange(case_when(Cluster == cluster_value ~ 1, TRUE ~ 2), Cluster, rank, desc(center)) %>%
ggplot(aes(x = factor(feature, levels = unique(feature)), y = center, color = cluster_value)) +
geom_point(color = "#5A2D81") +
geom_text(data = filtered_data, aes(label = rank), color = "black", vjust = -0.5) + # Add point labels for rank
theme_minimal() +
gghighlight(Cluster == cluster_value, use_direct_label = FALSE) +
theme(legend.position = "none",
axis.text.x = element_text(angle = 90, size = 9, hjust = 0.5),
axis.title = element_text(hjust = 0.5),
plot.title = element_text(hjust = 0.5),
plot.subtitle = element_text(hjust = 0.5)) +
labs(x = "Statistic", y = "Cluster Center",
title = "Visualizing K-Means Cluster Makeups",
subtitle = cluster_value)
print(plot)
# Bar Plot for Percentage of Stat Categories in Each Rank
filtered_data <- kmeans_centers %>%
group_by(feature) %>%
mutate(rank = ifelse(as.character(feature) %in% c('PF_pg', 'PF_pp', 'TOV_pg', 'TOV_pp', 'TOV_pct_adv'),
rank(center),
rank(desc(center)))
) %>%
arrange(case_when(Cluster == cluster_value ~ 1, TRUE ~ 2), Cluster, rank, desc(center)) %>%
mutate(Cluster = as.character(Cluster)) %>%
filter(Cluster == cluster_value)
plot <- kmeans_centers %>%
group_by(feature) %>%
mutate(rank = ifelse(as.character(feature) %in% c('PF_pg', 'PF_pp', 'TOV_pg', 'TOV_pp', 'TOV_pct_adv'),
rank(center),
rank(desc(center)))
) %>%
ungroup() %>%
select(Cluster, rank) %>%
group_by(Cluster, rank) %>%
summarise(count = n()) %>%
ungroup() %>%
group_by(Cluster) %>%
mutate(percentage = count / sum(count) * 100) %>%
filter(Cluster == cluster_value) %>%
ggplot(aes(x = as.factor(rank), y = percentage, fill = as.factor(rank))) +
geom_bar(stat = "identity") +
geom_text(aes(label = paste0(round(percentage, 0), "%")), vjust = -0.5, color = "black") +
labs(x = "Rank", y = "Percentage",
title = "Percentage of Stat Categories in Each Rank", subtitle = cluster_value) +
scale_fill_discrete(name = "Rank") +
scale_x_discrete(limits = as.character(1:7)) +
theme_minimal()
print(plot)
#Bar plot that shows count of traditional positions in each cluster
df_2022 <- final_df %>%
filter(Year %in% '2022')
count_df <- df_2022 %>%
filter(Cluster == substr(cluster_value, nchar(cluster_value), nchar(cluster_value))) %>%
group_by(Pos) %>%
summarise(count = n()) %>%
ungroup() %>%
arrange(desc(count))
plot <- ggplot(count_df, aes(x = reorder(Pos, -count), y = count, fill = Pos)) +
geom_bar(stat = "identity") +
geom_text(aes(label = count), vjust = -0.5, color = "black") +
scale_fill_manual(values = c("red", "blue", "green", "yellow", "orange")) +
labs(x = "Pos", y = "Count", title = paste0("Bar Plot of Traditional Positions in ", cluster_value))
print(plot)
}
cluster_plots('Cluster 1')
cluster_plots('Cluster 2')
cluster_plots('Cluster 3')
cluster_plots('Cluster 4')
cluster_plots('Cluster 5')
cluster_plots('Cluster 6')
cluster_plots('Cluster 7')
The below plot shows how the player types have evolved since the 2000-2001 season and a brief summary of each cluster is included.
ggplot(final_df %>% mutate(Year = as.numeric(Year)),
aes(x=Year,
fill=factor(Cluster))
) +
geom_bar(position="fill") +
geom_text(
aes(label=paste0(signif(..count.. / tapply(..count.., ..x.., sum)[as.character(..x..)]*100, digits=2),"%")),
stat="count",
position=position_fill(vjust=0.5),colour="black", size=3)+
labs(y="Percentage") +
scale_x_continuous("Year", breaks=seq(2001,2022,by=1)) +
scale_y_continuous(labels = scales::percent_format(accuracy = 1)) +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1)) +
scale_fill_discrete(name = "Player\nType", labels =c('MVP Bigs', '3-Point\nBackups', 'Game\nGenerals', '3-Point\nThreats','Shooting\nBigs', 'Franchise\nPlayers', 'Rebounding\nBigs')) +
theme(legend.key.height=unit(2, "cm"))
The plot below displays the count of each traditional player position (C, PF, SF, SG, PG) within each of the new clusters (i.e. new player types).
ggplot(final_df %>%
mutate(Year = as.numeric(Year),
Cluster = as.numeric(Cluster)) %>%
filter(Year == 2022),
aes(x = Cluster, fill = Pos)) +
geom_bar(aes(y = (..count..)),colour="white") +
geom_text(stat='count', aes(label=..count..),position = position_stack(vjust = 0.5), color = "black") +
ggtitle("Count of Traditional Position in Each New Player Type - 2022") +
theme(plot.title = element_text(hjust = 0.5)) +
scale_x_continuous(name = "New Position Name", breaks = c(1, 2, 3, 4, 5, 6, 7),
labels = c('MVP Bigs', '3-Point\nBackups', 'Game\nGenerals', '3-Point\nThreats', 'Shooting\nBigs', 'Franchise\nPlayers', 'Rebounding\nBigs')) +
labs(fill = "Traditional Position", y = "Count") +
theme(legend.key.height=unit(2, "cm"))
dat <- final_df %>% filter(Year ==2022)
dat1 <- as.data.frame(prop.table(table(dat$Pos, dat$Cluster), margin = 1))
colnames(dat1) <- c("Pos", "Cluster", "percent")
dat1 <- dat1 %>%
group_by(Pos) %>%
mutate(Pos_label_y = 1 - (cumsum(percent) - 0.5 * percent)) %>%
ungroup()
ggplot(dat1, aes(Pos, y = percent, fill = factor(Cluster))) +
geom_bar(data = . %>% filter(percent > 0), position = "fill", stat = "identity") +
scale_y_continuous(labels = scales::percent) +
geom_text(data = . %>% filter(percent > 0), aes(y = Pos_label_y, label = round(100 * percent, 1))) +
ggtitle("Percentage of Cluster within Each Traditional Position - 2022") +
theme(plot.title = element_text(hjust = 0.5)) +
scale_x_discrete(name = "Traditional Position") +
scale_fill_discrete(name = "New Position Name", labels = c('MVP Bigs', '3-Point\nBackups', 'Game\nGenerals', '3-Point\nThreats', 'Shooting\nBigs', 'Franchise\nPlayers', 'Rebounding\nBigs')) +
ylab("Percentage") +
theme(strip.background = element_blank(),
strip.text = element_blank(),
legend.key.height = unit(1, "cm"))
#teams by cluster
ggplot(df_2022 %>%
mutate(Year = as.numeric(Year),
Cluster = as.numeric(Cluster)) %>%
filter(Year == 2022),
aes(x = Tm, fill = factor(Cluster))) +
ggtitle("Count of New Position on Each Team - 2022") +
geom_bar(position = "fill") + ylab("Proportion") +
stat_count(geom = "text",
aes(label = stat(count)),
position=position_fill(vjust=0.5), colour="white") +
scale_fill_discrete(name = "New Position", labels =c('MVP Bigs', '3-Point\nBackups', 'Game\nGenerals', '3-Point\nThreats', 'Shooting\nBigs', 'Franchise\nPlayers', 'Rebounding\nBigs')) +
theme(legend.key.height=unit(1, "cm"))
## 2022 - New Positions Per Team - above .500
above500_2022 <- c('PHO','MEM','MIA','GSW','DAL','BOS','MIL','PHI','UTA','TOR','DEN','MIN','CHI','BRK','CLE','ATL','CHO','LAC')
ggplot(data = df_2022 %>% filter(Tm %in% above500_2022), aes(x = Tm, fill = factor(Cluster))) +
geom_bar(position = "fill") + ylab("Proportion by Count") +
stat_count(geom = "text",
aes(label = stat(count)),
position=position_fill(vjust=0.5), colour="Black") +
scale_fill_discrete(name = "New Position", labels =c('MVP Bigs', '3-Point\nBackups', 'Game\nGenerals', '3-Point\nThreats', 'Shooting\nBigs', 'Franchise\nPlayers', 'Rebounding\nBigs')) +
theme(legend.key.height=unit(1, "cm"))
### Takeaways * Each team with a record above .500 had at least 1
Franchise Player. * Boston, Brooklyn, Chicago, Milwaukee, Philadelphia,
and Phoenix had 2 Franchise Players * Boston, Dallas, Denver, Golden
State, LA Clippers, Minnesota, and Philadelphia didn’t have an MVP Big *
Cleveland had 2 MVP Bigs
below500_2022 <- c('NYK','NOP','WAS','SAS','LAL','SAC','POR','IND','OKC','DET','ORL','HOU')
ggplot(data = df_2022 %>% filter(Tm %in% below500_2022), aes(x = Tm, fill = factor(Cluster))) +
geom_bar(position = "fill") + ylab("Proportion by Count") +
stat_count(geom = "text",
aes(label = stat(count)),
position=position_fill(vjust=0.5), colour="Black") +
scale_fill_discrete(name = "New Position", labels =c('MVP Bigs', '3-Point\nBackups', 'Game\nGenerals', '3-Point\nThreats', 'Shooting\nBigs', 'Franchise\nPlayers', 'Rebounding\nBigs')) +
theme(legend.key.height=unit(1, "cm"))
### Takeaways * 5 of the 11 Teams with a record below .500 didn’t have a
Franchise Player. * Of the 5, Detroit and Indiana didn’t have an MVP
Big.
Further analysis needs to be done to determine which player types are included on successful teams. This analysis will be used to help determine what kind of players to target so that a team can be successful.