#clean the data, get rid of NA and join the datasets
nba_pre_processing <- function(d1,d2){
#remove NAs
d1<-na.omit(d1)
d2<-na.omit(d2)
#remove duplicate player stats for players that were traded
d1<-d1 %>% distinct(Player, .keep_all = TRUE)
#join datasets
full_nba_data = left_join(d1,d2)
full_nba_data<-full_nba_data%>%distinct(Player,.keep_all = TRUE)%>%filter(MP>50)
full_nba_data<-na.omit(full_nba_data)
#remove strange characters
full_nba_data$Player <- str_replace_all(full_nba_data$Player, "[[:punct:]]", " ")
full_nba_data$Player <- str_replace_all(full_nba_data$Player, "[^[:alnum:]]", " ")
full_nba_data
}
full_nba_data <- nba_pre_processing(nba_stats,nba_sal)
## Joining, by = "Player"
#add another scaled salary column in order to use salary as size in the visual
full_nba_data <- full_nba_data %>%
mutate(
sal = `2020-21` / 2000000,
)
#scales data to between 0 and 1 to make it normalized for clustering
scaler <- function(x){
(x-min(x))/(max(x)-min(x))
}
#normalize columns I am going to use that are not already in percent form
full_nba_data$Age<-scaler(full_nba_data$Age)
full_nba_data$G<-scaler(full_nba_data$G)
full_nba_data$MP<-scaler(full_nba_data$MP)
full_nba_data$ORB<-scaler(full_nba_data$ORB)
full_nba_data$DRB<-scaler(full_nba_data$DRB)
full_nba_data$TRB<-scaler(full_nba_data$TRB)
full_nba_data$AST<-scaler(full_nba_data$AST)
full_nba_data$BLK<-scaler(full_nba_data$BLK)
full_nba_data$STL<-scaler(full_nba_data$STL)
full_nba_data$TOV<-scaler(full_nba_data$TOV)
#cleaned and prepared datatable
datatable(full_nba_data)
#use only certain columns to cluster
clust_data1 <- full_nba_data[, c("Age", "G", "MP","FG%","3P%", "2P%", "eFG%", "FT%","TRB", "AST", "STL", "BLK")]
#run kmeans
set.seed(1)
kmeans_obj = kmeans(clust_data1, centers = 2,
algorithm = "Lloyd")
#add cluster as a column
full_nba_data$cluster <- kmeans_obj$cluster
library(NbClust)
#use NbClust to find the ideal number of clusters
nbclust_obj = NbClust(data = clust_data1, method = "kmeans")
## *** : The Hubert index is a graphical method of determining the number of clusters.
## In the plot of Hubert index, we seek a significant knee that corresponds to a
## significant increase of the value of the measure i.e the significant peak in Hubert
## index second differences plot.
##
## *** : The D index is a graphical method of determining the number of clusters.
## In the plot of D index, we seek a significant knee (the significant peak in Dindex
## second differences plot) that corresponds to a significant increase of the value of
## the measure.
##
## *******************************************************************
## * Among all indices:
## * 8 proposed 2 as the best number of clusters
## * 8 proposed 3 as the best number of clusters
## * 1 proposed 4 as the best number of clusters
## * 1 proposed 7 as the best number of clusters
## * 3 proposed 14 as the best number of clusters
## * 2 proposed 15 as the best number of clusters
##
## ***** Conclusion *****
##
## * According to the majority rule, the best number of clusters is 2
##
##
## *******************************************************************
#nbclust_obj
#find the best number of clusters
freq_k = nbclust_obj$Best.nc[1,]
freq_k = data.frame(freq_k)
#two clusters was the most frequent best
freq_k
## freq_k
## KL 14
## CH 2
## Hartigan 3
## CCC 2
## Scott 3
## Marriot 3
## TrCovW 4
## TraceW 3
## Friedman 15
## Rubin 14
## Cindex 15
## DB 2
## Silhouette 2
## Duda 3
## PseudoT2 3
## Beale 3
## Ratkowsky 2
## Ball 3
## PtBiserial 2
## Frey 1
## McClain 2
## Dunn 7
## Hubert 0
## SDindex 2
## Dindex 0
## SDbw 14
#create cluster shape
party_clusters = as.factor(kmeans_obj$cluster)
#create a plotting function
ploting_function<- function(var1,var2,data, cluster_shape, title, x_label, y_label){
ggplot(data, aes(x = var1,
y = var2,
text = Player,
color = cluster, #color by cluster number
shape = cluster_shape)) +
geom_point(size = data$sal) + #make size dependent on salary
ggtitle(title) +
xlab(x_label) +
ylab(y_label) +
scale_shape_manual(name = "Cluster",
labels = c("Cluster 1", "Cluster 2"),
values = c("1", "2")) +
theme_light()
}
#looking for young unrated players
age_efg <- ploting_function(full_nba_data$Age, full_nba_data$`eFG%`, full_nba_data, party_clusters, "Age vs Efective Field Goal Percentage", "Age", "Efective Field Goal Percentage")
ggplotly(age_efg)
#looking for players that excel offensively
assists_rebound <- ploting_function(full_nba_data$AST, full_nba_data$TRB, full_nba_data, party_clusters, "Assists vs Rebounds", "Assists", "Rebounds")
ggplotly(assists_rebound)
#looking for players that excel defensively
steals_blocks <- ploting_function(full_nba_data$STL, full_nba_data$BLK, full_nba_data, party_clusters, "Steals vs Blocks", "Steals", "Blocks")
ggplotly(steals_blocks)
#looking for strong shooters
efg_mins_played <- ploting_function(full_nba_data$`eFG%`, full_nba_data$MP, full_nba_data, party_clusters, "Effective Field Goal Percentage vs Minutes Played", "Effective Field Goal Percentage", "Minutes Played")
ggplotly(efg_mins_played)
# Assign colors by party in a new data frame.
color3D= data.frame(cluster = c(1, 2),
color = c("cluster 1", "cluster 2"))
# Join the new data frame to orginial data set.
cluster_color = inner_join(full_nba_data, color3D)
## Joining, by = "cluster"
#putting 3D plot into a function
plot_3d <- function(data, var1, var2, var3){
fig <- plot_ly(data,type = "scatter3d",mode="markers", x = ~var1, y = ~var2, z = ~var3, color = ~color, colors = c('#0C4B8E','#BF382A'), text = ~paste('Player:', data$Player))
fig
}
#offensive plot: looking for strong offensive players
assists_eFieldGoals_rebounds <- plot_3d(cluster_color, cluster_color$AST, cluster_color$`eFG%`, cluster_color$TRB)
assists_eFieldGoals_rebounds
## Warning: `arrange_()` is deprecated as of dplyr 0.7.0.
## Please use `arrange()` instead.
## See vignette('programming') for more help
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
#defensive plot: looking for strong defensive players
steals_blocks_rebounds <- plot_3d(cluster_color, cluster_color$STL, cluster_color$BLK, cluster_color$TRB)
steals_blocks_rebounds
#shooting plot: looking for strong shooters
shots <- plot_3d(cluster_color, cluster_color$`2P%`, cluster_color$`3P%`, cluster_color$`FT%`)
shots
Top Underrated Players:
“Effective Field Goal Percentage vs Minutes Played” Over 0.5 on both 1. Jarret Allen 2. Duncan Robinson 3. Mikal Bridges 4. Richaun Holmes 5. Chris Boucher 6. Luka Doncic
“Steals vs Blocks” Above 0.5 of both 1. Nerlens Noels 2. Jarret Allen 3. Chris Boucher 4. Richuan Holmes
“Assists vs Rebounds” Over 0.5 rebounds and assists 1. Bam Adebayo 2. Luka Doncic (most people have a low rebound assist combo)
“Age vs Efective Field Goal Percentage” Over 0.6 ef%, bottom 1/4 of age 1. Robert Williams 2. Alex Len 3. Chris Silva 4. Thomas Bryant 5. Nicolas Claxton 6. Frank Ntilikina 7. Ivica Zubac