nba_pre_processing <- function(d1,d2){
#omit missing vals in each data frame
d1<-na.omit(d1)
d2<-na.omit(d2)
#only keep "TOT" data from traded players to prevent multiple stats for each player
d1<-d1 %>% distinct(Player, .keep_all = TRUE)
#merge the information and salary dataframes together
full_nba_data = left_join(d1,d2)
#players should only be considered if >= 50 mins. played.
full_nba_data<-full_nba_data%>%distinct(Player,.keep_all =TRUE)%>%filter(MP>50)
full_nba_data<-na.omit(full_nba_data) #omit any more missing values, if present
#omit problematic str. values in Player column
full_nba_data$Player <- str_replace_all(full_nba_data$Player, "[[:punct:]]", " ")
full_nba_data$Player <- str_replace_all(full_nba_data$Player, "[^[:alnum:]]", " ")
full_nba_data
}
#save back to the data frame the result of running the function with the two data frames as input
full_nba_data <- nba_pre_processing(d1,d2)
## Joining, by = "Player"
#the merged dataframe containing player stats along with salary (`2020-21` column)
#produce additional column with standardized salaries; to be used in data viz
full_nba_data <- full_nba_data %>%
mutate(sal = `2020-21` / 2000000)
#full_nba_data
###Alter data frame by standardizing certain columns
range_vals<-function(x){(x-min(x))/(max(x)-min(x))} #this function will scale all values of interest in the data frame to a decimal value between 0 and 1
#https://stats.stackexchange.com/questions/70801/how-to-normalize-data-to-0-1-range
#feed the following columns of interest into the function to convert the columns to standardized proportions for comparison
full_nba_data$Age<-range_vals(full_nba_data$Age)
full_nba_data$G<-range_vals(full_nba_data$G)
full_nba_data$MP<-range_vals(full_nba_data$MP)
full_nba_data$ORB<-range_vals(full_nba_data$ORB)
full_nba_data$DRB<-range_vals(full_nba_data$DRB)
full_nba_data$TRB<-range_vals(full_nba_data$TRB)
full_nba_data$AST<-range_vals(full_nba_data$AST)
full_nba_data$STL<-range_vals(full_nba_data$STL)
full_nba_data$BLK<-range_vals(full_nba_data$BLK)
full_nba_data$TOV<-range_vals(full_nba_data$TOV)
datatable(full_nba_data)
#Subset standardized data frame to Variables of Interest (those to be used in evaluating player performance)
for_clustering<- full_nba_data[, c("Age", "G", "MP","FG%","3P%", "2P%", "eFG%", "FT%", "TRB", "AST", "STL", "BLK")]
#use syntax df[r,c]
#View(for_clustering)
#Create clusters object using kmeans
set.seed(1)
kmeans_obj_bball = kmeans(for_clustering, centers = 2, #use cluster variables to form the clusters
algorithm = "Lloyd")
#Produce cluster column with cluster classifications (1,2) in this scenario
full_nba_data$cluster<-kmeans_obj_bball$cluster
#Use NbClust to select a number of clusters that is best suited for the dataset
library(NbClust)
# Run NbClust.
nbclust_obj = NbClust(data = for_clustering, method = "kmeans")
## *** : The Hubert index is a graphical method of determining the number of clusters.
## In the plot of Hubert index, we seek a significant knee that corresponds to a
## significant increase of the value of the measure i.e the significant peak in Hubert
## index second differences plot.
##
## *** : The D index is a graphical method of determining the number of clusters.
## In the plot of D index, we seek a significant knee (the significant peak in Dindex
## second differences plot) that corresponds to a significant increase of the value of
## the measure.
##
## *******************************************************************
## * Among all indices:
## * 8 proposed 2 as the best number of clusters
## * 8 proposed 3 as the best number of clusters
## * 1 proposed 4 as the best number of clusters
## * 1 proposed 7 as the best number of clusters
## * 3 proposed 14 as the best number of clusters
## * 2 proposed 15 as the best number of clusters
##
## ***** Conclusion *****
##
## * According to the majority rule, the best number of clusters is 2
##
##
## *******************************************************************
# View the output of NbClust.
#nbclust_obj
freq_k = nbclust_obj$Best.nc[1,] #take the first row, which is number of clusters
freq_k = data.frame(freq_k) #turn the first row into a data frame using data.frame
#two clusters was the most recommended; must be relatively effective in forming groups of players for analysis based upon pairings of variables
freq_k
## freq_k
## KL 14
## CH 2
## Hartigan 3
## CCC 2
## Scott 3
## Marriot 3
## TrCovW 4
## TraceW 3
## Friedman 15
## Rubin 14
## Cindex 15
## DB 2
## Silhouette 2
## Duda 3
## PseudoT2 3
## Beale 3
## Ratkowsky 2
## Ball 3
## PtBiserial 2
## Frey 1
## McClain 2
## Dunn 7
## Hubert 0
## SDindex 2
## Dindex 0
## SDbw 14
###Data Visualization
b_clusters = as.factor(kmeans_obj_bball$cluster) #cast clusters to type factor
###Plot Visualization and function to produce each graphic used to evaluate performance
ploting_function<- function(var1,var2,data, cluster_shape, title, x_label, y_label){
ggplot(data, aes(x = var1,
y = var2,
text = Player,
color = cluster, #color varies by cluster classification
shape = cluster_shape)) +
geom_point(size = data$sal) + #size will vary based upon salary
ggtitle(title) +
xlab(x_label) +
ylab(y_label) +
scale_shape_manual(name = "Cluster",
labels = c("Cluster 1", "Cluster 2"), #Note: this portion of the code will only work for 2 clusters (not as robust for different datasets)
values = c("1", "2")) +
theme_light()
}
#Age versus Effective Field Goal Percentage:
#This data tells us what AGE is correlated with a high performing eFG%
age_efg<-ploting_function(full_nba_data$Age, full_nba_data$`eFG%`, full_nba_data, b_clusters, "Age vs Efective Field Goal Percentage", "Age", "Efective Field Goal Percentage")
ggplotly(age_efg)
#Assists versus Total Rebounds:
#This data tells us what players are performing highly as OFFENSIVE players
assists_rebound <- ploting_function(full_nba_data$AST, full_nba_data$TRB, full_nba_data, b_clusters, "Assists vs Rebounds", "Assists", "Rebounds")
ggplotly(assists_rebound)
#Steals versus Blocks:
#This data tells us what players are performing highly as DEFENSIVE players
steals_blocks <- ploting_function(full_nba_data$STL, full_nba_data$BLK, full_nba_data, b_clusters, "Steals vs Blocks", "Steals", "Blocks")
ggplotly(steals_blocks)
#EFG versus Minutes Played:
#This data tells us what players are accurate in SHOOTING THE BALL
efg_mins_played <- ploting_function(full_nba_data$`eFG%`, full_nba_data$MP, full_nba_data, b_clusters, "Effective Field Goal Percentage vs Minutes Played", "Effective Field Goal Percentage", "Minutes Played")
ggplotly(efg_mins_played)
# Assign colors by party in a new data frame.
color3D= data.frame(cluster = c(1, 2),
color = c("cluster 1", "cluster 2")) #Note: this portion of the code will only work for 2 clusters (not as robust)
# Join the new data frame to orginial data set.
cluster_color = inner_join(full_nba_data, color3D,by='cluster')
#putting 3D plot into a function
plot_3d <- function(data, var1, var2, var3){
fig <- plot_ly(data,type = "scatter3d",mode="markers", x = ~var1, y = ~var2, z = ~var3, color = ~color, colors = c('#0C4B8E','#BF382A'), text = ~paste('Player:', data$Player))
fig
}
#Assists versus Effective Field Goal percentage versus Total Rebounds:
#This data tells us what players are performing highly as OFFENSIVE players
assists_eFieldGoals_rebounds <- plot_3d(cluster_color, cluster_color$AST, cluster_color$`eFG%`, cluster_color$TRB)
assists_eFieldGoals_rebounds
## Warning: `arrange_()` is deprecated as of dplyr 0.7.0.
## Please use `arrange()` instead.
## See vignette('programming') for more help
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
#Steals versus Blocks versus Total Rebounds:
#This data tells us what players are performing highly as DEFENSIVE players
steals_blocks_rebounds <- plot_3d(cluster_color, cluster_color$STL, cluster_color$BLK, cluster_color$TRB)
steals_blocks_rebounds
#2 Point versus 3 Point versus Free Throw Percentages:
#This data tells us what players are accurate in SHOOTING THE BALL
shots <- plot_3d(cluster_color, cluster_color$`2P%`, cluster_color$`3P%`, cluster_color$`FT%`)
shots
Methodology: Size of data points correlates with Salary amount. Thus, smaller data points that are within groups of larger data points can be pinpointed as players that are high performing, yet underpaid. The following players were concluded to fit into this classification as follows:
*Based on Age 2D Plot:
1. Mikal Bridges and Robert Williams: both of these players are very young, and thus must have less game experience than do older players. Despite this, Bridges’s eFG% is 0.625 and Williams’s eFG% is 0.72, some of the highest in the plot. Their points are also some of the smallest on the plot, indicating a very low salary that is not reflective of their performance even compared to older players. Additionally, both of these players fall into the “circle shape” cluster which is shared by some of the most skilled and highly paid players. This means that overall, their classification based on the designated clustering variable set is more aligned with the classifications of highly paid players (they should be stolen for our team!)
2. Mason Plumlee and Darius Miller: According to the Age 2D plot, both of these players are relatively high in age but make a small salary, despite their high free throw percentage of 0.6 or higher. They are among several large data points, indicating the equity of their performance to more highly paid players.
*Based on Defensive 2D Plot:
1. Robert Covington: Robert has 0.84 for his steal value, and an about average 0.35 blocks value. Despite this, his data point is small/medium sized compared to his similarly performing counterparts. Some bigger data points/players have an even smaller block percentage and only a slightly larger steal percentage.
2. Myles Turner has almost a 1.0 value (the highest value in the plot) for his block proportion, but only earns a medium salary compared to players who have a higher average steal value but much lower block value. Both of these players were classified in the “circle” category along with many of the most highly paid players; their common classification and alignment of clustering variables across the board justifies that they should be drafted by our management and offered a higher salary.
*Based on Offensive 2D Plot:
1. Luka Doncic: Luka has a 0.83 Assist and 0.56 Rebound percentage; his small data point (indicating a relatively small salary for his high performance on both variables) is amongst some of the most well-known and well-paid players, including Lebron James. Doncic was classified in the “circle” category along with many of the most highly paid players; his common classification and alignment of clustering variables across the board justifies that he should be drafted by our management and offered a higher salary.
*Based on Shooting 2D Plot:
1. Joe Harris: Joe has a 0.69 eFG% and a 0.83 minutes played, which are both relatively high on the plot. This indicates that not only is he an accurate shooter, but he is valued by is team and has played many minutes. His point is relatively small compared to the larger point (and similar performance) of Rudy Gobert.
2. Jarrett Allen has both a 0.67 eFG% and 0.70 minutes played, which like Harris are high. Despite only a small difference in performance from Joe Harris, his point is comparatively much smaller. Allen is very underrated and is not being paid nearly the same amount as Harris, despite only a slightly smaller level of performance.
Both of these players were classified in the “circle” category along with many of the most highly paid players; their common classification and alignment of clustering variables across the board justifies that they should be drafted by our management and offered a higher salary.